# Makefile for creating a language sample
# Last edited on 2002-02-12 00:34:55 by stolfi

MAKERULES := lang-sample.make

# Every recipe below writes its target through a shell redirection, so a
# failing recipe would otherwise leave a truncated file that looks up to date.
.DELETE_ON_ERROR:

######################################################################
# Caller must define the following variables:
#
#   SAMPLE  = name for text sample, e.g. "engl/wow".
#   SUBSEC  = subsection tag, e.g. "cos.2" or "tot.1".
#   SIZETAG = "whole" or "trunc"
#
# The sentinel values below are assigned with "?=" so that they only
# apply when the caller (command line, environment, or an including
# makefile) has not provided a value.

SAMPLE ?= SAMPLE.IS.UNDEFINED
SUBSEC ?= SUBSEC.IS.UNDEFINED
SIZETAG ?= SIZETAG.IS.UNDEFINED

# The whole makefile is a no-op when SAMPLE was not provided.
ifneq "${SAMPLE}" "SAMPLE.IS.UNDEFINED"

# Top-level input and output directories:
SAMPLE_REP := dat
BIN := ${HOME}/bin

# Derived directories and file names:
SAMPLE_DIR := ${SAMPLE_REP}/${SAMPLE}
SOURCE_DIR := ${SAMPLE_DIR}/source
SOURCE_EVT := ${SOURCE_DIR}/main.evt
SAMPLE_AWK := ${SAMPLE_DIR}/sample-fns.gawk

# Token streams, with and without location codes:
RAW_LTS := ${SAMPLE_DIR}/${SUBSEC}/raw.lts
RAW_TKS := ${SAMPLE_DIR}/${SUBSEC}/raw.tks

# Word counts and frequencies:
RAW_WFR := ${SAMPLE_DIR}/${SUBSEC}/raw.wfr
GUD_WFR := ${SAMPLE_DIR}/${SUBSEC}/gud.wfr
BAD_WFR := ${SAMPLE_DIR}/${SUBSEC}/bad.wfr

DERIVED_FILES := ${RAW_LTS} ${RAW_TKS} ${RAW_WFR} ${GUD_WFR} ${BAD_WFR}

# "all" is a command name, not a file.
.PHONY: all

all: ${DERIVED_FILES}

# If there are subsections other than "tot.1", the "tot.1" subsection
# must be created from them, in order to get the right
# number of words from each subsection.

# List of subsection tags, read from the sample directory.  When the
# list is empty, TOT_SUBSEC is set to a value that matches no real
# subsection, so every subsection is extracted directly from the EVT file.
ALL_SUBSECS := ${strip ${shell cat ${SAMPLE_DIR}/subsections.tags}}
ifeq "/${ALL_SUBSECS}/" "//"
TOT_SUBSEC := *NONE*
else
TOT_SUBSEC := tot.1
endif

######################################################################
# The following applies only to subsections whose raw words are
# to be extracted directly from the EVT file:

ifneq "${SUBSEC}" "${TOT_SUBSEC}"

# Create partial EVT source file with this subsection only:

SAMPLE_EVT := ${SAMPLE_DIR}/${SUBSEC}/raw.evt

${SAMPLE_EVT}: ${SOURCE_EVT} ${MAKERULES} \
               select-evt-lines ${SAMPLE_AWK}
	@echo "ALL_SUBSECS = '${ALL_SUBSECS}'"
	@echo "TOT_SUBSEC = '${TOT_SUBSEC}'"
	ls -ld ${SOURCE_EVT}
	cat ${SOURCE_EVT} \
	  | ./select-evt-lines \
	      -f ${SAMPLE_AWK} \
	      -v sample=${SAMPLE} -v subsec=${SUBSEC} \
	  > $@

# Rebuild the source EVT file in the source repository
# if it is out of date:

SOURCE_SOURCES := \
  ${SOURCE_DIR}/main.org \
  ${SOURCE_DIR}/Makefile \
  ${SOURCE_DIR}/preprocess-org \
  ${BIN}/org-to-evt

# Use ${MAKE} (not a literal "make") so that -n, -j and the jobserver
# are propagated to the sub-make.
${SOURCE_EVT}: ${SOURCE_SOURCES}
	cd ${SOURCE_DIR} && ${MAKE} all

SAMPLE_TBL := ${SAMPLE_DIR}/word-map.tbl

RAWNUM_FILE := ${SAMPLE_DIR}/${SUBSEC}/${SIZETAG}-raw.num

# Deliberately deferred ("=" rather than ":="): the value is read when
# the recipe that uses it is about to run, i.e. after ${RAWNUM_FILE}
# (a prerequisite of that rule) is known to be up to date, instead of
# at makefile-parse time.
RAWNUM = ${shell cat ${RAWNUM_FILE}}

# Get raw tokens with locators from EVT file.
# The final gawk stage counts the lines whose second field is not 1 and
# stops the stream as soon as that count exceeds ${RAWNUM}; all lines
# seen before that point are copied through unchanged.
# NOTE(review): presumably field 2 is a token-type code in which 1 marks
# tokens that should not count towards the sample size -- confirm
# against evt-to-wds.
${RAW_LTS}: ${SAMPLE_EVT} ${SAMPLE_TBL} ${RAWNUM_FILE} ${MAKERULES} \
            ${BIN}/evt-to-wds fix-raw-words ${SAMPLE_AWK}
	cat ${SAMPLE_EVT} \
	  | ${BIN}/evt-to-wds \
	      -v showBreaks=0 \
	      -v showParags=1 \
	      -v showPuncts=1 \
	      -v smashSymbols=1 \
	      -v showLocation=1 \
	  | ./fix-raw-words \
	      -f ${SAMPLE_AWK} \
	      -v sample=${SAMPLE} \
	      -v table=${SAMPLE_TBL} \
	      -v field=3 \
	  | gawk \
	      -v num=${RAWNUM} \
	      '($$2 != 1){n++;} (n > num){exit 0;} //{print;}' \
	  > $@

# Remove locators leaving only the raw tokens (including punctuation).
# Only the third field of each non-empty line is kept.
${RAW_TKS}: ${RAW_LTS} ${MAKERULES}
	cat ${RAW_LTS} \
	  | gawk '/./ { print $$3; }' \
	  > $@

endif
# End rules for subsections extracted from the EVT file
######################################################################

######################################################################
# The following applies only for the subsection that is the total of the others:
#

ifeq "${SUBSEC}" "${TOT_SUBSEC}"

SUBSECDIRS := ${addprefix ${SAMPLE_DIR}/,${ALL_SUBSECS}}

# The total token stream (with locators) is the concatenation of the
# per-subsection streams, in the order given by ${ALL_SUBSECS}.
ALL_RAW_LTS := ${addsuffix /raw.lts,${SUBSECDIRS}}

${RAW_LTS}: ${ALL_RAW_LTS} ${MAKERULES}
	@echo "ALL_SUBSECS = '${ALL_SUBSECS}'"
	@echo "TOT_SUBSEC = '${TOT_SUBSEC}'"
	cat ${ALL_RAW_LTS} > $@

# Likewise for the token stream without locators.
ALL_RAW_TKS := ${addsuffix /raw.tks,${SUBSECDIRS}}

${RAW_TKS}: ${ALL_RAW_TKS} ${MAKERULES}
	@echo "ALL_SUBSECS = '${ALL_SUBSECS}'"
	@echo "TOT_SUBSEC = '${TOT_SUBSEC}'"
	cat ${ALL_RAW_TKS} > $@

endif
# End of code for subsection that is the total of others
######################################################################

# Count raw word occurrences and compute their rel. frequencies.
# Lines containing "=" are dropped before counting.  The counted list
# is ordered by decreasing count (field 1), then by word (field 2);
# "-k1,1nr -k2,2" is the current spelling of the obsolete
# "+0 -1nr +1 -2" sort keys.
${RAW_WFR}: ${RAW_TKS} ${MAKERULES} \
            compute-freqs
	@echo "${RAW_TKS} -> ${RAW_WFR}"
	cat ${RAW_TKS} \
	  | grep -E -v '=' \
	  | sort | uniq -c | expand \
	  | sort -b -k1,1nr -k2,2 \
	  | ./compute-freqs \
	  > $@

# Extract the good words (select-gud-bad-words with writeBad=0):
${GUD_WFR}: ${RAW_WFR} ${MAKERULES} \
            select-gud-bad-words ${SAMPLE_AWK}
	@echo "${RAW_WFR} -> ${GUD_WFR}"
	cat ${RAW_WFR} \
	  | ./select-gud-bad-words \
	      -f ${SAMPLE_AWK} \
	      -v inField=3 -v writeBad=0 \
	  > $@

# Extract the bad words (select-gud-bad-words with writeBad=1):
${BAD_WFR}: ${RAW_WFR} ${MAKERULES} \
            select-gud-bad-words ${SAMPLE_AWK}
	@echo "${RAW_WFR} -> ${BAD_WFR}"
	cat ${RAW_WFR} \
	  | ./select-gud-bad-words \
	      -f ${SAMPLE_AWK} \
	      -v inField=3 -v writeBad=1 \
	  > $@

# Generic rule to compute word frequencies for any other token file.
# Unlike the explicit ${RAW_WFR} rule above, this one does not filter
# out lines containing "=".
%.wfr: %.tks compute-freqs
	cat $*.tks \
	  | sort | uniq -c | expand \
	  | sort -b -k1,1nr -k2,2 \
	  | ./compute-freqs \
	  > $*.wfr

endif
# End of code for whole makefile
# ======================================================================