#! /bin/csh -f
# Last edited on 2000-06-14 01:42:12 by stolfi

set usage = "$0 SECTION TAGPAIR SELTYPE PART"

# Extracts word component statistics from the frequency files prepared
# in Note 057.
#
# Arguments:
#   SECTION  section name; selects the input file (see below) and the
#            output directory prob/obs/SECTION.
#   TAGPAIR  subdirectory of stats/words or stats/labels to read from.
#   SELTYPE  component type tag to select, interpolated into a regular
#            expression; "w" means "take all entries".
#   PART     base name of the output file.
#
# Input file, chosen from SECTION:
#   "txt.n"        stats/words/TAGPAIR/tot.frq
#   "lab.n"        stats/labels/TAGPAIR/tot.frq
#   anything else  stats/words/TAGPAIR/SECTION.frq
#
# Each input entry should have the format
#
#   COUNT FREQ TYPE-VALUE
#
# where TYPE is a component type tag, as used in Note 057 (e.g. "pm"
# for crust+mantle prefix), and VALUE is the component itself.
#
# This script selects the entries with TYPE = SELTYPE, strips the
# "TYPE-" prefix, and pipes "COUNT VALUE" lines (an empty VALUE is
# written as ".") through sort and compute-freqs into
# prob/obs/SECTION/PART.frq.
# NOTE(review): compute-freqs is an external tool not shown here;
# presumably it adds the FREQ column -- confirm.
#
# Exits with status 1 and a usage message unless exactly four
# arguments are given.

if ( $#argv != 4 ) then
  echo "usage: ${usage}"; exit 1
endif

set sec = "$1"; shift
set tagpair = "$1"; shift
set seltype = "$1"; shift
set part = "$1"; shift

# "w" selects every entry. The tag pattern must not run across a
# hyphen, otherwise a VALUE that itself contains "-" would be
# truncated after its last hyphen instead of after the TYPE tag.
if ( "${seltype}" == "w" ) set seltype = '[^-]*'

# The pseudo-sections "txt.n" and "lab.n" read the per-tagpair totals.
if ( "${sec}" == "txt.n" ) then
  set infile = "stats/words/${tagpair}/tot.frq"
else if ( "${sec}" == "lab.n" ) then
  set infile = "stats/labels/${tagpair}/tot.frq"
else
  set infile = "stats/words/${tagpair}/${sec}.frq"
endif

set obsdir = "prob/obs/${sec}"
set frfile = "${obsdir}/${part}.frq"

# -p also creates prob/obs when it does not exist yet.
if ( ! -d "${obsdir}" ) mkdir -p "${obsdir}"

echo "${infile} -> ${frfile}"

# The awk program is single-quoted except for the two places where
# the selection pattern is spliced in; inside csh quotes each newline
# must be escaped with a backslash.
#
# Sort keys: field 1 numeric descending, then field 2. The second key
# carries no options of its own and so inherits the global -b -n,
# exactly as the obsolete "+0 -1nr +1 -2" form did; that form is no
# longer accepted by current sort implementations.
gawk \
    ' ($3 ~ /^'"${seltype}"'-/){ \
        c = $1; w = $3; \
        gsub(/^'"${seltype}"'-/,"",w); if (w==""){ w = "."; } \
        printf "%7d %s\n", c, w; next; \
      } \
    ' "${infile}" \
  | sort -b -n -k1,1nr -k2,2 \
  | compute-freqs \
  > "${frfile}"