#! /bin/bash
# Last edited on 2012-05-05 14:59:26 by stolfilocal
#
# Produces a list of the NWORDS most common words and their
# frequencies, per section.  The arguments may be section tags
# like "bio", or subsection tags like "bio.1".
#
# Arguments:
#   NWORDS     maximum number of records to output per section tag.
#   SAMPLE     name of the sample; inputs are read from "dat/{SAMPLE}/".
#   SECTAG...  one or more section (or subsection) tags.
#
# For each SECTAG the input files are those matching
# "dat/{SAMPLE}/{SECTAG}.?/gud.wfr".  Progress messages go to stderr.
#
# The output records (on stdout) have the format
#
#   {SECTAG} {COUNT} {FREQ} {CUMCOUNT} {CUMFREQ} {WORD}
#
# Requires "gawk", and the helper commands "combine-counts" and
# "compute-cum-freqs" in the PATH.

cmd="${0##*/}"
usage="${cmd} {NWORDS} {SAMPLE} {SECTAG}... > {OUTNAME}.tex"

if [[ $# -lt 3 ]]; then
  echo "usage: ${usage}" 1>&2
  exit 1
fi

nwords="$1"; shift
sample="$1"; shift
secs=( "$@" )

datadir="dat/${sample}"

# Let an unmatched file pattern expand to nothing, so that a section
# without input files can be detected below.
shopt -s nullglob

for sec in "${secs[@]}"; do
  # The tag is passed as data, never as the printf format string.
  printf 'sec = %s ' "${sec}" 1>&2

  # Collect the input files with a shell glob (expanded in sorted order).
  ipaths=( "${datadir}/${sec}".?/gud.wfr )

  # File names relative to ${datadir}, used only for the log message.
  ifiles=( "${ipaths[@]#"${datadir}/"}" )
  printf '%s: {%s}\n' "${sec}" "${ifiles[*]}" 1>&2

  # With no operands "cat" would silently read this script's stdin,
  # so a section with no input files is reported and skipped instead.
  if (( ${#ipaths[@]} == 0 )); then
    printf '%s: ** no input files match %s/%s.?/gud.wfr, section skipped\n' \
      "${cmd}" "${datadir}" "${sec}" 1>&2
    continue
  fi

  # Pipeline: keep fields 1 and 3 of every non-empty input line
  # (presumably count and word -- confirm against the ".wfr" format),
  # merge the entries with "combine-counts", sort by decreasing numeric
  # first field (ties broken by the second field), add the remaining
  # columns with "compute-cum-freqs", keep the first ${nwords} lines,
  # and prefix each record with the section tag.
  cat -- "${ipaths[@]}" \
    | gawk '/./{ print $1,$3; }' \
    | combine-counts \
    | sort -b -k1,1nr -k2,2 \
    | compute-cum-freqs \
    | head -n "${nwords}" \
    | gawk -v sec="${sec}" '/./{ print sec, $1,$2,$3,$4,$5; }'
done