#! /bin/csh -f
# Last edited on 2012-05-05 14:42:28 by stolfilocal

set usage = "$0 NWORDS LANG/BOOK/SUBSEC > list.wfr"

# Writes to standard output a list of the NWORDS least frequent `good'
# words of the given language sample, with counts and frequencies,
# in the format
#
#   COUNT FREQ CUMCOUNT CUMFREQ WORD
#
# sorted by WORD.
#
# Arguments:
#   NWORDS            number of words to keep (passed to "head -n").
#   LANG/BOOK/SUBSEC  sample section; input is read from the file
#                     "dat/LANG/BOOK/SUBSEC/gud.wfr".
#
# Ties between words with equal counts are broken pseudo-randomly:
# each entry gets the auxiliary key sin(i)^2, where {i} is its line
# index in the output of "combine-counts".  The key is deterministic,
# so repeated runs on the same input give the same list.
#
# Diagnostics go to standard error, because standard output is meant
# to be redirected to the result file.  Exits with status 1 on a usage
# error or when the input file is not readable.

if ( $#argv != 2 ) then
  /bin/sh -c 'printf "%s\n" "$1" 1>&2' sh "usage: ${usage}"
  exit 1
endif

set nwords = "$argv[1]"
set samplesec = "$argv[2]"

set dir = "dat/${samplesec}"

if ( ! -r "${dir}/gud.wfr" ) then
  /bin/sh -c 'printf "%s\n" "$1" 1>&2' sh 'no "gud.wfr" in '"${dir}"
  exit 1
endif
set ifile = "${dir}/gud.wfr"

# Pipeline stages (csh does not allow comments between continuation lines):
#   1. Keep fields 1 and 3 of each non-empty input line
#      (presumably COUNT and WORD -- confirm against the "gud.wfr" format).
#   2. "combine-counts": project tool; presumably merges the counts of
#      repeated words -- confirm.
#   3. Insert the tie-breaking key sin(i)^2 between count and word.
#   4. Sort by increasing count, then by increasing tie-breaking key.
#   5. Drop the tie-breaking key.
#   6. "compute-cum-freqs": project tool; presumably emits the five-column
#      format described above -- confirm.  It runs before "head", so it
#      sees the whole list, not just the NWORDS entries kept.
#   7. Keep the first NWORDS lines, i.e. the least frequent words.
#   8. Sort the result by field 5 (WORD in the format above).

cat "${ifile}" \
  | gawk '/./{print $1, $3}' \
  | combine-counts \
  | gawk '/./{i++; s=sin(i); printf "%7d %10.8f %s\n", $1, s*s, $2; }' \
  | sort -b -k1,1n -k2,2g \
  | gawk '/./{print $1, $3}' \
  | compute-cum-freqs \
  | head -n "${nwords}" \
  | sort -b -k5,5