#! /bin/bash
# Last edited on 2012-05-05 14:42:46 by stolfilocal
#
# Usage: {CMD} {NWORDS} {LANG}/{BOOK}/{SUBSEC} > {OUTNAME}.wfr
#
# Produces a list of the NWORDS most frequent `good' words of the
# given language sample, with counts and frequencies, in the format
#   COUNT FREQ CUMCOUNT CUMFREQ WORD
#
# Input is read from "dat/{LANG}/{BOOK}/{SUBSEC}/gud.wfr", relative to the
# current directory. Fields 1 and 3 of every non-empty line are passed
# through the external filters "combine-counts" and "compute-cum-freqs",
# which must be on PATH (presumably they merge duplicate words and append
# the cumulative columns -- confirm against those tools).
#
# All diagnostics go to stderr, so stdout carries only the result table.
# Exit status: 1 on bad arguments or a missing/unreadable input file;
# otherwise the status of the final "head" stage of the pipeline.

# The original shebang carried "-f" (noglob in bash). It is set here so it
# also takes effect when the script is started as "bash script ...".
set -f

cmd=${0##*/}
usage="${cmd} {NWORDS} {LANG}/{BOOK}/{SUBSEC} > {OUTNAME}.wfr"

# Entry point.
# Arguments: $1 - NWORDS, number of output lines (non-negative integer)
#            $2 - sample section, in the form {LANG}/{BOOK}/{SUBSEC}
# Outputs:   the frequency table on stdout, diagnostics on stderr
# Returns:   1 on usage or input errors, else the pipeline's status
main() {
  if [[ $# -ne 2 ]]; then
    # stderr, not stdout: stdout is normally redirected to the .wfr file.
    printf '%s\n' "usage: ${usage}" 1>&2
    return 1
  fi

  local nwords="$1"
  local samplesec="$2"

  # Reject anything that "head" could mistake for a different option.
  if [[ ! "${nwords}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${cmd}: NWORDS must be a non-negative integer: ${nwords}" 1>&2
    printf '%s\n' "usage: ${usage}" 1>&2
    return 1
  fi

  # Split {LANG}/{BOOK}/{SUBSEC} into its components.
  local dir="dat/${samplesec}"
  local sample="${samplesec%/*}"
  local subsec="${samplesec##*/}"
  local lang="${sample%/*}"
  local book="${sample##*/}"

  printf '%s\n' "book = ${book} lang = ${lang} subsec = ${subsec}" 1>&2

  local ifile="${dir}/gud.wfr"
  if [[ ! -r "${ifile}" ]]; then
    printf '%s\n' 'no file "gud.wfr" in '"${dir}" 1>&2
    return 1
  fi

  # "pipefail" is deliberately not enabled: "head" closes the pipe early,
  # and the resulting SIGPIPE upstream must not become a script failure.
  gawk '/./{print $1, $3}' < "${ifile}" \
    | combine-counts \
    | sort -b -k1,1nr \
    | compute-cum-freqs \
    | head -n "${nwords}"
}

main "$@"