#! /bin/csh -f
# Last edited on 2000-10-11 18:57:24 by stolfi

set usage = "$0 GRCLASS SECTION OBSNAME GENNAME "

# Compares observed word frequencies with computed word probabilities.
# Writes a file with the results, and generates a plot of the same.
#
# Input files:
#
#   prob/obs/SECTION/OBSNAME.frq
#     Observed word frequencies, in the format
#     COUNT FREQ WORD.
#
#   prob/gen/GRCLASS/SECTION/GENNAME.prb
#     Computed probabilities, in the format PROB WORD.
#
# Output files:
#
#   prob/cmp/GRCLASS/SECTION/GENNAME.pr2
#     Comparison file, in the format
#       PROBS PRGEN PRTOT DELTA WORD
#     where PROBS, PRGEN are the two probabilities (a word missing
#     from one of the inputs gets 0.00000 on that side), PRTOT is
#     their sum, and DELTA is the log base 10 of PRGEN/PROBS
#     (fuzzified by 1/N, where N is the total observed word COUNT).
#     Lines are sorted by decreasing DELTA, then by WORD.
#
#   prob/cmp/GRCLASS/SECTION/GENNAME.gif
#     A plot of PRGEN against PROBS, produced by plot-joint-probs.
#
# Exits with status 1 on a usage error, an unreadable input file,
# or an observed-frequency file whose total COUNT is not positive.

if ( $#argv != 4 ) then
  echo "usage: ${usage}"; exit 1
endif

set grclass = "$1"; shift;
set sec = "$1"; shift;
set obsname = "$1"; shift;
set genname = "$1"; shift;

set obsfile = "prob/obs/${sec}/${obsname}.frq"
set genfile = "prob/gen/${grclass}/${sec}/${genname}.prb"
set cmpdir  = "prob/cmp/${grclass}/${sec}"
set cmpfile = "${cmpdir}/${genname}.pr2"
set cmpplot = "${cmpdir}/${genname}.gif"

# Fail early, before any temporary or output file is created.
foreach f ( "${obsfile}" "${genfile}" )
  if ( ! -r "${f}" ) then
    echo "${0}: cannot read input file ${f}"; exit 1
  endif
end

echo "comparing ${genfile} against ${obsfile}..."

# The output directory may not exist yet for a new class or section.
mkdir -p "${cmpdir}"

set tmp = "/tmp/$$"

# Reduce the observed file to FREQ WORD, sorted by WORD for the join.
gawk '/./{print $2,$3;}' "${obsfile}" \
  | sort -b -k2,2 \
  > "${tmp}-obs.prb"

# eps = 1/N, with N the total observed COUNT.  Nothing is printed when
# N is not positive, so that the division cannot abort gawk.
set eps = `gawk '/./{s+=$1} END{if (s > 0) print 1.0/s}' "${obsfile}"`
if ( $#eps == 0 ) then
  echo "${0}: total word count in ${obsfile} is not positive"
  /bin/rm -f "${tmp}-obs.prb"
  exit 1
endif

sort -b -k2,2 "${genfile}" \
  > "${tmp}-gen.prb"

# Full outer join on WORD (field 2 of both files).  The awk step adds
# PRTOT and DELTA; the backslash at the end of each line of the quoted
# program is required by csh to embed a newline in a quoted string.
join \
    -1 2 -2 2 -o '1.1,2.1,0' \
    -a1 -a2 -e 0.00000 \
    "${tmp}-obs.prb" "${tmp}-gen.prb" \
  | gawk -v eps="${eps}" \
      ' BEGIN {log10 = log(10); } \
        /./{ \
          obs=$1; gen=$2; w=$3; \
          d = log(sqrt(gen*gen + eps*eps)/sqrt(obs*obs + eps*eps))/log10; \
          printf "%s %s %7.5f %+4.1f %s\n", obs, gen, obs+gen, d, w; \
        } \
      ' \
  | sort -b -k4,4gr -k5,5 \
  > "${cmpfile}"

# Name the files explicitly: an unmatched glob would abort csh.
/bin/rm -f "${tmp}-obs.prb" "${tmp}-gen.prb"

plot-joint-probs \
    -eps ${eps} \
    -title "${sec}: obs/${obsname} x gen/${genname}" \
  < "${cmpfile}" \
  > "${cmpplot}"