#! /bin/csh -f
# Last edited on 1999-12-10 04:14:23 by stolfi

set usage = "$0 [ -maxLines NN ] [ -dir DIR ] [-title TIT] [-out OUT] SEC1 SEC2 ... "

# Reads a bunch of item count files DIR/SECi.frq (as written by compute-freqs),
# where each line has a COUNT, a FREQ, and an ITEM, and generates various reports:
#
#   DIR/OUT.cmp-cts  shows raw counts of each item and file; one line per item,
#                    one column of counts per file, plus one "total" column
#                    and one shared column with the items; sorted by decreasing
#                    total counts.
#
#   DIR/OUT.cmp-frq  ditto, with relative frequencies ×9999 instead
#                    of raw counts.
#
#   DIR/OUT.cmp-top  shows the item frequency rankings in each file;
#                    two columns per file, one with relative
#                    frequencies ×999, the other with the item; each column
#                    pair sorted by its own count, decreasing.
#
# The "-dir DIR" option names the directory that holds the input files and
# receives the reports.  When omitted, file names get no directory prefix.
#
# The "-title" option specifies a title for the shared item column.
#
# The OUT defaults to "all".
#
# The "-maxLines NN" option truncates the output after "NN" lines
# (default 50)

# Option defaults.
set dir = ""
set tit = ""
set out = "all"
set maxLines = 50

# Consume leading options.  The "/" prefix on both sides of each comparison
# keeps an argument that starts with "-" from being read as an operator
# inside the csh expression.
while ( ( $#argv > 0 ) && ( "/$1" =~ /-* ) )
  if ( ( $#argv >= 2 ) && ( "/$1" == "/-dir" ) ) then
    # Store the directory with a trailing slash so it can be used as a prefix.
    set dir = "$2/"; shift; shift;
  else if ( ( $#argv >= 2 ) && ( "/$1" == "/-out" ) ) then
    set out = "$2"; shift; shift;
  else if ( ( $#argv >= 2 ) && ( "/$1" == "/-title" ) ) then
    set tit = "$2"; shift; shift;
  else if ( ( $#argv >= 2 ) && ( "/$1" == "/-maxLines" ) ) then
    set maxLines = "$2"; shift; shift;
  else
    # Unknown option, or a known option with its value missing.
    echo "invalid option $1"
    echo "usage: ${usage}"; exit 1
  endif
end

# At least two section names must remain after the options.
if ( $#argv < 2 ) then
  echo "usage: ${usage}"; exit 1
endif

# NOTE: "-dir" is optional.  An earlier test of $?dir was removed here: "dir"
# is always set above, so that test could never fire.

# Section names, and the matching input files DIR/SECi.frq.  The list is
# built directly rather than through sed, so that special characters in the
# directory name ("@", "&", "\") cannot corrupt the result.
set secs = ( $* )
set frfiles = ( )
foreach sec ( ${secs} )
  set frfiles = ( ${frfiles} "${dir}${sec}.frq" )
end

echo "Tabulating the raw word counts..."
# Common name prefix for this run's scratch files ($$ is the shell's pid).
set tmp = "/tmp/$$"

# Stage 1: raw counts.  For each input file write a scratch file holding
# "COUNT ITEM" (fields 1 and 3) for every non-empty line.
set ctfiles = ( )
foreach ffile ( ${frfiles} )
  # Scratch file name: input file name minus its directory and last extension.
  set name = ${ffile:t}; set name = ${name:r}
  set cfile = "${tmp}-${name}.cts"
  echo "${ffile} -> ${cfile}"
  gawk '/./{printf "%d %s\n", $1, $3;}' \
    < ${ffile} \
    > ${cfile}
  set ctfiles = ( $ctfiles $cfile )
end

# Merge the per-file counts into one table (external tool; see its own docs
# for the exact meaning of the options).
compare-counts \
    -titles "${secs} ${tit}" \
    -sort 1 \
    -totals \
    -maxlines ${maxLines} \
    -widths '6' \
    ${ctfiles} \
  > ${dir}${out}.cmp-cts

/bin/rm ${ctfiles}

echo "Tabulating the relative frequencies ( × 9999 ) per file..."

# Stage 2: relative frequencies.  Same as stage 1, but each line carries
# the FREQ field scaled by 9999 and truncated to an integer.
set refiles = ( )
foreach ffile ( ${frfiles} )
  set name = ${ffile:t}; set name = ${name:r}
  set rfile = "${tmp}-${name}.fri"
  echo "${ffile} -> ${rfile}"
  gawk '/./{printf "%d %s\n", int(9999*$2), $3;}' \
    < ${ffile} \
    > ${rfile}
  set refiles = ( $refiles $rfile )
end

# Column titles here are the section names with any "." characters removed.
compare-counts \
    -titles "`echo ${secs} | tr -d '.'` ${tit}" \
    -sort 1 \
    -maxlines ${maxLines} \
    -widths '4' \
    ${refiles} \
  > ${dir}${out}.cmp-frq

/bin/rm ${refiles}

echo "Listing the frequency-ranked items per file..."

# Stage 3: per-file rankings.  Sort each file by COUNT (field 1), numeric,
# decreasing; keep the first maxLines lines; print FREQ scaled by 999 and
# the item.  "-k 1,1nr" and "head -n" replace the obsolete "+0 -1nr" and
# "head -NN" spellings, which current sort/head implementations may reject.
set ppfiles = ( )
foreach ffile ( ${frfiles} )
  set name = ${ffile:t}; set name = ${name:r}
  set pfile = "${tmp}-${name}.pct"
  echo "${ffile} -> ${pfile}"
  sort -b -k 1,1nr \
    < ${ffile} \
    | head -n ${maxLines} \
    | gawk '/./{printf "%3d %s\n", int(999*$2), $3;}' \
    > ${pfile}
  set ppfiles = ( $ppfiles $pfile )
end

# Paste the per-file rankings side by side (external tool).
multicol \
    -v titles="${secs}" \
    -v colsep=" " \
    ${ppfiles} \
  > ${dir}${out}.cmp-top

/bin/rm ${ppfiles}

# Show the three reports that were just written.
ls -l ${dir}${out}.cmp-{cts,frq,top}