#! /bin/csh -f
# Last edited on 2004-05-27 10:10:46 by stolfi

set cmd = "$0"; set cmd = "${cmd:t}"
set usage = "${cmd} LANG BOOK SUBSEC > SUMMARY.tex"

# Reads the tabulation dat/LANG/BOOK/SUBSEC/raw.dupfr of consecutively
# duplicated words and the list dat/LANG/BOOK/SUBSEC/raw.tlw of all
# words (with locations) of the sample. Writes to stdout a file of TeX
# definitions summarizing the duplications:
#
#   \<LANG><BOOK>DupMaxWd  the most commonly duplicated word
#   \<LANG><BOOK>DupMaxCt  its duplication count
#   \<LANG><BOOK>DupMaxFr  that count divided by the number of tokens
#   \<LANG><BOOK>DupTotCt  the total number of duplications
#   \<LANG><BOOK>DupTotFr  that total divided by the number of tokens
#
# where <LANG> and <BOOK> are the first two command line arguments.
#
# Arguments:
#   LANG, BOOK, SUBSEC   path components that select the input files.
#
# Exit status: 1 (with a usage message on stderr) when the number of
# arguments is not 3.

if ( $#argv != 3 ) then
  # Stdout is normally redirected to the generated TeX file, so the
  # message must go to stderr. Csh cannot redirect to stderr alone,
  # hence the detour through /bin/sh.
  /bin/sh -c 'echo "$1" 1>&2' sh "usage: ${usage}"
  exit 1
endif

set lang = "$1"; shift;
set book = "$1"; shift;
set subsec = "$1"; shift;

set sample = "${lang}/${book}"
set secdir = "${sample}/${subsec}"

set frfile = "dat/${secdir}/raw.dupfr"
set lwfile = "dat/${secdir}/raw.tlw"

# Prefix of every TeX macro defined below.
# NOTE(review): the original referenced the never-set variables LANG
# and BOOK here; the lower-case arguments are assumed to be what was
# meant -- confirm against the TeX sources that use these macros.
set macpfx = "${lang}${book}"

# Get most commonly duplicated word and its duplication count.
# The first line of ${frfile} is expected to have three fields, with
# the count in field 1 and the word in field 3 (presumably the file is
# sorted by decreasing count -- TODO confirm). Anything else, including
# an empty file, yields count 0 and the placeholder word "-".
set tmp = ( `head -n 1 "${frfile}"` )
if ( $#tmp == 3 ) then
  set maxdupct = "${tmp[1]}"
  set maxdupwd = "${tmp[3]}"
else
  set maxdupct = 0
  set maxdupwd = "-"
endif

# Compute the total number of replications (sum of field 1 over all
# non-empty lines). The "+0" makes the result "0" instead of an empty
# string when there are no such lines.
set pgm = '/./{ s+=$1; } END{ print s+0; }'
set totdupct = `gawk "${pgm}" "${frfile}"`

# Compute the total number of tokens (excluding punct and breaks):
# counts the lines of ${lwfile} whose field 2 is not 1.
set pgm = '($2 != 1){ s++; } END{ print s+0; }'
set tottokct = `gawk "${pgm}" "${lwfile}"`

# Duplication frequencies relative to the token count. A zero token
# count gives 0.00000 rather than a fatal division by zero in gawk.
set pgm = 'BEGIN{ printf "%7.5f", (tot+0 == 0 ? 0 : ct/tot); }'
set maxdupfr = `gawk -v tot="${tottokct}" -v ct="${maxdupct}" "${pgm}"`
set totdupfr = `gawk -v tot="${tottokct}" -v ct="${totdupct}" "${pgm}"`

# Output results
echo '% created by '"${cmd}"
echo '%'

# The word goes through the sample's own reencode-words-for-tex filter
# (presumably it rewrites field 1 into a TeX-safe form -- see that
# script) before being wrapped in its \def.
echo "${maxdupwd}" \
  | dat/${sample}/reencode-words-for-tex -v field=1 \
  | gawk -v pfx="${macpfx}" '//{ printf "\\def\\%sDupMaxWd{%s}\n", pfx, $1; }'

echo '\def\'"${macpfx}DupMaxCt{${maxdupct}}"
echo '\def\'"${macpfx}DupMaxFr{${maxdupfr}}"
echo '\def\'"${macpfx}DupTotCt{${totdupct}}"
echo '\def\'"${macpfx}DupTotFr{${totdupfr}}"