#! /bin/csh -f
# Last edited on 2004-02-17 15:14:54 by stolfi

set usage = "$0 SAMPLE SUBSEC..."

# Computes the number of raw/good/bad tokens and words in
# text subset SAMPLE (usually dat/LANG/BOOK), in the listed sections.
# A SUBSEC can be "/" meaning a break in the table.
#
# For each SUBSEC and each class C in (raw, gud, bad) the script reads
# the file SAMPLE/SUBSEC/C.wfr and derives two numbers from it:
#   words  = number of lines in the file (`wc -l`);
#   tokens = sum of the first field over all non-empty lines.
# (Presumably each line is "COUNT WORD" -- confirm against the tool
# that writes the .wfr files.)
#
# Output (stdout): four "#"-prefixed header lines, then one row per
# SUBSEC with the raw/gud/bad counts and the gud and bad shares in
# parts per thousand ("ppt"), first for tokens and then for words.
# A SUBSEC of "/" produces a line containing only "/".
#
# Exit status: 1 (after printing the usage message) when fewer than
# two arguments are given.

if ( $#argv < 2 ) then
  echo "usage: ${usage}"; exit 1
endif

# First argument is the sample directory; the rest are section names.
set smp = "$1"; shift;
set secs = ( $* )

# NOTE(review): the column spacing of these header strings is kept
# exactly as found; verify that it lines up with the data rows.
printf "# tokens words \n"
printf "# ----------------------------- -----------------------------\n"
printf "# sec raw gud ppt bad ppt raw gud ppt bad ppt\n"
printf "# ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ----\n"

foreach sec ( ${secs} )
  # The "@" prefix keeps the comparison well-formed for any ${sec}.
  if ( "@${sec}" == "@/" ) then
    echo "/"
  else
    # Sets wdraw/wdgud/wdbad (word counts) and tkraw/tkgud/tkbad
    # (token counts); the variable name is built from the class name.
    foreach c ( raw gud bad )
      # `cat FILE | ...` is deliberate: if FILE is missing, cat
      # complains on stderr but the pipeline still yields a number.
      set wd${c} = `cat ${smp}/${sec}/${c}.wfr | wc -l`
      # "s+0" forces numeric output, so an empty or missing file gives
      # "0" instead of an empty string that would break the "@"
      # arithmetic below.
      set tk${c} = `cat ${smp}/${sec}/${c}.wfr | gawk '/./{s+=$1;} END{print s+0;}'`
    end

    # Parts per thousand, in csh integer arithmetic.  The "+ 1" in the
    # denominator presumably guards against division by zero when the
    # raw count is 0 (it slightly lowers the result otherwise).
    @ tkgudpc = ( 1000 * $tkgud ) / ( $tkraw + 1 )
    @ tkbadpc = ( 1000 * $tkbad ) / ( $tkraw + 1 )
    @ wdgudpc = ( 1000 * $wdgud ) / ( $wdraw + 1 )
    @ wdbadpc = ( 1000 * $wdbad ) / ( $wdraw + 1 )

    printf " %-6s %5d %5d %4d %5d %4d %5d %5d %4d %5d %4d\n" "${sec}" \
      "${tkraw}" "${tkgud}" "${tkgudpc}" "${tkbad}" "${tkbadpc}" \
      "${wdraw}" "${wdgud}" "${wdgudpc}" "${wdbad}" "${wdbadpc}"
  endif
end