# Last edited on 2002-01-20 18:53:02 by stolfi

# Compute the token/word length histogram

# Name of this makefile.  Used both for the recursive ${MAKE} calls
# and as a prerequisite, so that editing the makefile forces a rebuild.
MAKEFILE := tw-length-hists.make

# Remove a target whose recipe failed, so that a truncated output file
# is never mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:

######################################################################
# Makefile for computing the token and word length
# histograms for a specified sample, quality subset
# (raw, good, or bad), and element factoring.
# Caller must define
#   ${LANG} = "voyn", "chin", etc.;
#   ${BOOK} = "wow", "vms", etc.
#   ${QUAL} = "raw", "gud", or "bad".
#   ${ELEM} = "bgly", "qoko", "viqr", etc.
#
# Example:
#   make -f tw-length-hists.make LANG=voyn BOOK=wow QUAL=gud ELEM=viqr all
#
# Each parameter is given a sentinel default below; a value supplied on
# the make command line overrides it.  A section is parsed only when all
# the parameters it needs were supplied, so calling this makefile with
# missing parameters defines no rules at all.
#
# NOTE(review): ${LANG} is also the usual locale environment variable.
# GNU make exports variables that came from the environment or the
# command line to recipes, so the tools below presumably run with
# LANG set to the language tag -- confirm that this is harmless.

LANG := LANG.IS.UNDEFINED
ifneq "${LANG}" "LANG.IS.UNDEFINED"

BOOK := BOOK.IS.UNDEFINED
ifneq "${BOOK}" "BOOK.IS.UNDEFINED"

SAMPLE_DIR := ${LANG}/${BOOK}
SAMPLE_TOT_DIR := ${LANG}/${BOOK}/tot.1

QUAL := QUAL.IS.UNDEFINED
ifneq "${QUAL}" "QUAL.IS.UNDEFINED"

# Input file for the chosen quality subset.
WFR_FILE := ${SAMPLE_TOT_DIR}/${QUAL}.wfr

ELEM := ELEM.IS.UNDEFINED
ifneq "${ELEM}" "ELEM.IS.UNDEFINED"

# Output of the factoring step; input of the histogram step.
CTS_FILE := ${SAMPLE_TOT_DIR}/${QUAL}-fact-${ELEM}.cts

.PHONY: all

# Build the counts file, then run this makefile again once for tokens
# (TKWD=t) and once for words (TKWD=w).  Stop at the first sub-make
# that fails instead of reporting only the status of the last one.
all: dat/${CTS_FILE}
	for tkwd in t w; do \
	  ${MAKE} LANG=${LANG} BOOK=${BOOK} QUAL=${QUAL} ELEM=${ELEM} TKWD=$$tkwd \
	    -f ${MAKEFILE} single-hist || exit 1; \
	done

# gawk library passed to factor-field-general with "-f".
FACTOR_LIB := dat/${SAMPLE_DIR}/factor-text-to-${ELEM}.gawk

# Pipe the input file through factor-field-general (reading field 3,
# writing field 4), keep fields 1, 3 and 4 of every line, and feed
# them to compute-elem-counts.  The tools are listed as prerequisites
# so that changing any of them triggers a rebuild.
dat/${CTS_FILE}: dat/${WFR_FILE} \
          factor-field-general ${FACTOR_LIB} \
          compute-elem-counts ${MAKEFILE}
	@echo "$< -> $@"
	cat $< \
	  | factor-field-general \
	      -f ${FACTOR_LIB} -v inField=3 -v outField=4 \
	  | gawk '//{ print $$1, $$3, $$4; }' \
	  | compute-elem-counts \
	  > $@

######################################################################
# Recursive make for each language, book,
# element type, sample quality, and token/word counting.
# Caller must define ${LANG}, ${BOOK}, ${ELEM}, ${QUAL} and also
#   ${TKWD} = "t" (tokens) or "w" (words).
#

TKWD := TKWD.IS.UNDEFINED
ifneq "${TKWD}" "TKWD.IS.UNDEFINED"

# Length histogram, and TeX macro file holding the average length.
LHI_FILE := ${SAMPLE_TOT_DIR}/${QUAL}-fact-${ELEM}-${TKWD}.lhi
AVG_TEX := ${SAMPLE_TOT_DIR}/${QUAL}-fact-${ELEM}-${TKWD}-avlen.tex

.PHONY: single-hist

single-hist: dat/${LHI_FILE} dat/${AVG_TEX}

# For TKWD=t the first field of each counts line is passed through;
# for any other TKWD it is replaced by 1 (presumably so that every
# distinct word counts once -- confirm against compute-elem-count-distrib).
# The resulting histogram is echoed to the terminal.
dat/${LHI_FILE}: dat/${CTS_FILE} \
          compute-elem-count-distrib \
          ${MAKEFILE}
	@echo "$< -> $@"
	cat $< \
	  | gawk -v tkwd="${TKWD}" \
	      '/./{ print (tkwd == "t" ? $$1 : 1), $$3, $$4; }' \
	  | compute-elem-count-distrib \
	  > $@
	cat $@

# Skip "#" lines of the histogram, accumulate t = sum of field 2 and
# e = sum of field 2 times field 1, and write e/t as a TeX macro named
# \<LANG><BOOK>Avg<Tk|Wd>N<ELEM>.  A histogram with no data lines makes
# gawk fail on the division e/t, which fails this recipe.  The result
# is echoed and then handed to update-paper-include together with its
# counterpart under exp/.
dat/${AVG_TEX}: dat/${LHI_FILE}
	cat $< \
	  | gawk \
	      -v lg=${LANG} -v bk=${BOOK} -v ek=${ELEM} -v tw=${TKWD} \
	      ' /^[#]/{next;} \
	        /./{ t+= $$2; e += $$2*$$1; } \
	        END { \
	          xtw = ( tw == "t" ? "Tk" : "Wd" ); \
	          printf "\\def\\%s%sAvg%sN%s{%.2f}\n", lg, bk, xtw, ek, e/t; \
	        } \
	      ' \
	  > $@
	cat $@
	update-paper-include $@ exp/${AVG_TEX}

endif
# End of ${LANG}/${BOOK}/${ELEM}/${QUAL}/${TKWD} recursion
######################################################################

endif
endif
endif
endif
# End of ${LANG}/${BOOK}/${ELEM}/${QUAL} recursion
######################################################################