# Last edited on 2004-07-24 03:02:23 by stolfi

# Compute the duplicate word lists for a language sample.
# (Header previously said "token/word length histogram" — stale copy-paste;
# the rules below build duplicate-word lists, as described next.)

MAKEFILE := dup-word-lists.make
# MAKEFILE :=

######################################################################
# Makefile for computing the duplicate word lists for a specified
# language sample. Caller must define
#   ${LANG} = "voyn", "chin", etc.;
#   ${BOOK} = "wow", "vms", etc.
#

# Sentinel default: a command-line "make LANG=... BOOK=..." overrides these
# file assignments, so the guarded body below runs only when the caller
# actually supplied both variables.
LANG := LANG.IS.UNDEFINED
ifneq "${LANG}" "LANG.IS.UNDEFINED"
BOOK := BOOK.IS.UNDEFINED
ifneq "${BOOK}" "BOOK.IS.UNDEFINED"

# Directory layout for this sample (all data lives under dat/, outputs
# mirrored under exp/ by update-paper-include).
SAMPLE_DIR := ${LANG}/${BOOK}
SAMPLE_TOT_DIR := ${LANG}/${BOOK}/tot.1

BIN := ${STOLFIHOME}/bin

# Decide source EVT file and word-cleanup filter.
# Voynichese samples use the raw EVT as-is (identity filter); other
# languages run ./fix-words with a per-sample gawk library and word map.
ifeq "${LANG}" "voyn"
  EVT_FILE := ${SAMPLE_TOT_DIR}/raw.evt
  FIX_WORDS := /bin/cat
  FIX_WORDS_DEPS :=
else
  EVT_FILE := ${SAMPLE_DIR}/source/main.evt
  SAMPLE_AWK := ${SAMPLE_DIR}/sample-fns.gawk
  SAMPLE_TBL := ${SAMPLE_DIR}/word-map.tbl
  FIX_WORDS := ./fix-words \
      -f dat/${SAMPLE_AWK} \
      -v sample=${LANG}/${BOOK} \
      -v table=dat/${SAMPLE_TBL} \
      -v field=5
  FIX_WORDS_DEPS := \
    ./fix-words \
    dat/${SAMPLE_AWK} \
    dat/${SAMPLE_TBL}
endif

# Input token/location file and the four derived outputs.
TLW_FILE := ${SAMPLE_TOT_DIR}/raw.tlw
DUP_FILE := ${SAMPLE_TOT_DIR}/raw.dup
DUPFM_FILE := ${SAMPLE_TOT_DIR}/raw.dupfm
DUPFR_FILE := ${SAMPLE_TOT_DIR}/raw.dupfr
TEX_DUP_SUMMARY := ${SAMPLE_TOT_DIR}/raw-dup-summary.tex

.PHONY: all dup-list

all: dup-list

dup-list: dat/${DUP_FILE} dat/${DUPFM_FILE} dat/${DUPFR_FILE} dat/${TEX_DUP_SUMMARY}

# List word duplications (unformatted)

# Number of context words kept on each side of a duplication
# (also read back by the DUPFR rule to locate the duplicated-word column).
CONTEXT := 3

# The gawk filter keeps only records whose first field is "a" and reorders
# the first three fields to "$2 $1 $3" — presumably location/type/word;
# exact .tlw record semantics are not visible here (TODO confirm).
dat/${DUP_FILE}: dat/${TLW_FILE} \
            list-duplicate-words \
            ${MAKEFILE}
	@echo "dat/${TLW_FILE} -> dat/${DUP_FILE}"
	cat dat/${TLW_FILE} \
	  | gawk '($$1 == "a"){ print $$2, $$1, $$3; }' \
	  | list-duplicate-words \
	      -v sep='_' \
	      -v context=${CONTEXT} \
	      -v lineWidth=0 \
	  > dat/${DUP_FILE}
	cat dat/${DUP_FILE} \
	  | head -10

# List word duplications (formatted)

# Same pipeline as the unformatted rule; only lineWidth differs (100 vs 0).
dat/${DUPFM_FILE}: dat/${TLW_FILE} \
            list-duplicate-words \
            ${MAKEFILE}
	@echo "dat/${TLW_FILE} -> dat/${DUPFM_FILE}"
	cat dat/${TLW_FILE} \
	  | gawk '($$1 == "a"){ print $$2, $$1, $$3; }' \
	  | list-duplicate-words \
	      -v sep='_' \
	      -v context=${CONTEXT} \
	      -v lineWidth=100 \
	  > dat/${DUPFM_FILE}
	cat dat/${DUPFM_FILE} \
	  | head -10

# Extract duplicated words and count them

# Column 3 + 2*ctx + 1 of each .dup line is picked out: three leading
# fields, ctx context words on each side, then the duplicated word itself.
# The second sort orders by count (field 1, numeric, descending) and then
# by word (field 2); "-k1,1nr -k2,2" is the POSIX spelling of the obsolete
# "+0 -1nr +1 -2" key syntax, which modern sort rejects.
dat/${DUPFR_FILE}: dat/${DUP_FILE} \
            compute-freqs \
            ${MAKEFILE}
	@echo "dat/${DUP_FILE} -> dat/${DUPFR_FILE}"
	cat dat/${DUP_FILE} \
	  | gawk -v ctx=${CONTEXT} \
	      ' BEGIN {ctr = 3 + 2*ctx + 1; } \
	        /./{ print $$(ctr); } \
	      ' \
	  | sort | uniq -c | expand \
	  | sort -b -k1,1nr -k2,2 \
	  | compute-freqs \
	  > dat/${DUPFR_FILE}
	cat dat/${DUPFR_FILE} \
	  | head -10

# Compute frequency of duplications

# summarize-dup-stats reads the dat/ files for ${LANG}/${BOOK}/tot.1 itself;
# update-paper-include copies the result into the exp/ tree for the paper.
dat/${TEX_DUP_SUMMARY}: dat/${DUPFR_FILE} dat/${TLW_FILE} \
            ${MAKEFILE} \
            summarize-dup-stats
	@echo "dat/${DUPFR_FILE} -> dat/${TEX_DUP_SUMMARY}"
	summarize-dup-stats ${LANG} ${BOOK} tot.1 \
	  > dat/${TEX_DUP_SUMMARY}
	cat dat/${TEX_DUP_SUMMARY}
	update-paper-include dat/${TEX_DUP_SUMMARY} exp/${TEX_DUP_SUMMARY}

endif
endif
# End of ${LANG}/${BOOK} recursion
######################################################################