# Last edited on 2004-07-24 03:03:54 by stolfi

# Compute the replicated-word lists, their frequency table and a TeX
# summary for one language sample.

# Name of this makefile.  It is listed as a prerequisite of every
# file-producing rule below, so editing the makefile forces a rebuild.
# Uncommenting the empty assignment removes that dependency.
MAKEFILE := dup-word-lists.make
# MAKEFILE :=

######################################################################
# Makefile for computing the repetitious word lists for a specified
# language sample.  Caller must define (on the "make" command line)
#   ${LANG} = "voyn", "chin", etc.;
#   ${BOOK} = "wow", "vms", etc.
#
# The defaults assigned here are sentinels.  Unless the caller
# overrides them, the two guards below skip the whole file and no
# rules are defined.
#
# NOTE(review): LANG is also the locale environment variable, and
# command-line variables are exported to recipe shells, so the tools
# below presumably run with LANG set to the sample name -- confirm
# that this is harmless for "sort" and "gawk".

LANG := LANG.IS.UNDEFINED
ifneq "${LANG}" "LANG.IS.UNDEFINED"
BOOK := BOOK.IS.UNDEFINED
ifneq "${BOOK}" "BOOK.IS.UNDEFINED"

# Remove a target whose recipe failed, so that a truncated or
# not-yet-published file is never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

SAMPLE_DIR := ${LANG}/${BOOK}
SAMPLE_TOT_DIR := ${LANG}/${BOOK}/tot.1

BIN := ${STOLFIHOME}/bin

# Decide source EVT file and word-cleanup filter.
# (EVT_FILE, FIX_WORDS, FIX_WORDS_DEPS and BIN are not referenced by
# any rule in this file; they are kept in case an includer uses them.)
ifeq "${LANG}" "voyn"
  EVT_FILE := ${SAMPLE_TOT_DIR}/raw.evt
  FIX_WORDS := /bin/cat
  FIX_WORDS_DEPS :=
else
  EVT_FILE := ${SAMPLE_DIR}/source/main.evt
  SAMPLE_AWK := ${SAMPLE_DIR}/sample-fns.gawk
  SAMPLE_TBL := ${SAMPLE_DIR}/word-map.tbl
  FIX_WORDS := ./fix-words \
    -f dat/${SAMPLE_AWK} \
    -v sample=${LANG}/${BOOK} \
    -v table=dat/${SAMPLE_TBL} \
    -v field=5
  FIX_WORDS_DEPS := \
    ./fix-words \
    dat/${SAMPLE_AWK} \
    dat/${SAMPLE_TBL}
endif

# Input (TLW) and output files, all relative to "dat/".
TLW_FILE := ${SAMPLE_TOT_DIR}/raw.tlw
REP_FILE := ${SAMPLE_TOT_DIR}/raw.rep
REPFM_FILE := ${SAMPLE_TOT_DIR}/raw.repfm
REPFR_FILE := ${SAMPLE_TOT_DIR}/raw.repfr

TEX_REP_SUMMARY := ${SAMPLE_TOT_DIR}/raw-rep-summary.tex

.PHONY: all rep-list

all: rep-list

rep-list: \
  dat/${REP_FILE} \
  dat/${REPFM_FILE} \
  dat/${REPFR_FILE} \
  dat/${TEX_REP_SUMMARY}

# Number of context words passed to list-replicate-words; the field
# index computed in the frequency rule below is derived from it.
CONTEXT := 3

# List word replications (unformatted).
# Keeps only the TLW records whose first field is "a", reorders their
# first three fields to (2, 1, 3), and pipes them through
# list-replicate-words with lineWidth=0.  The first 10 lines of the
# result are shown as a quick check.
dat/${REP_FILE}: dat/${TLW_FILE} \
    list-replicate-words \
    ${MAKEFILE}
	@echo "$< -> $@"
	cat $< \
	  | gawk '($$1 == "a"){ print $$2, $$1, $$3; }' \
	  | list-replicate-words \
	      -v sep='_' \
	      -v context=${CONTEXT} \
	      -v lineWidth=0 \
	  > $@
	head -n 10 $@

# List word replications (formatted).
# Same pipeline as the unformatted list, but with lineWidth=100.
dat/${REPFM_FILE}: dat/${TLW_FILE} \
    list-replicate-words \
    ${MAKEFILE}
	@echo "$< -> $@"
	cat $< \
	  | gawk '($$1 == "a"){ print $$2, $$1, $$3; }' \
	  | list-replicate-words \
	      -v sep='_' \
	      -v context=${CONTEXT} \
	      -v lineWidth=100 \
	  > $@
	head -n 10 $@

# Extract replicated words and count them.
# Prints field number 3 + 2*CONTEXT + 1 of every non-empty line of the
# unformatted list (presumably the replicated word itself -- the
# column layout is defined by list-replicate-words), counts the
# distinct values, sorts by decreasing count and then by word, and
# feeds the table to compute-freqs.
# The sort keys use the "-k" syntax; the obsolete "+0 -1nr +1 -2" form
# is rejected by current POSIX-conforming "sort" implementations.
dat/${REPFR_FILE}: dat/${REP_FILE} \
    compute-freqs \
    ${MAKEFILE}
	@echo "$< -> $@"
	cat $< \
	  | gawk -v ctx=${CONTEXT} \
	      'BEGIN { ctr = 3 + 2*ctx + 1; } /./ { print $$(ctr); }' \
	  | sort | uniq -c | expand \
	  | sort -b -k1,1nr -k2,2 \
	  | compute-freqs \
	  > $@
	head -n 10 $@

# Summarize the replication statistics as a TeX fragment, display it,
# and hand it to update-paper-include together with the matching
# "exp/" path.
dat/${TEX_REP_SUMMARY}: dat/${REPFR_FILE} dat/${TLW_FILE} \
    ${MAKEFILE} \
    summarize-rep-stats
	@echo "$< -> $@"
	summarize-rep-stats ${LANG} ${BOOK} tot.1 \
	  > $@
	cat $@
	update-paper-include $@ exp/${TEX_REP_SUMMARY}

endif
endif
# End of ${LANG}/${BOOK} recursion
######################################################################