#! /bin/csh -f
# Last edited on 2000-06-08 04:22:03 by stolfi

set usage = "$0 PTAG SEC NL NR"

# Tabulates the pairs of prefixes and suffixes from a frequency file
# of word feature PTAG for subsection SEC.  Assumes available the files
#
#   stats/{words,labels}/PTAG/SEC.frq
#     counts and frequencies of the PTAG pairs for section SEC.
#
# This script considers only words of the frequency file that contain
# exactly one instance of "-", and separates those words into a "left"
# and a "right" component at that character.  The "-" is kept on both
# components (left = "xxx-", right = "-yyy").  Outputs the following
# files:
#
#   stats/{words,labels}/PTAG/SEC-L.frq
#   stats/{words,labels}/PTAG/SEC-R.frq
#     counts and frequencies for the left and right components
#     of those pairs.
#
#   stats/{words,labels}/PTAG/SEC-L.fcm
#   stats/{words,labels}/PTAG/SEC-R.fcm
#     intermediate output of compute-cum-freqs for each side.
#
#   stats/{words,labels}/PTAG/SEC-L.dic
#     The NL most common left components.
#
#   stats/{words,labels}/PTAG/SEC-R.dic
#     The NR most common right components.
#
#   stats/{words,labels}/PTAG/SEC.mtx
#     Counts of PTAG pairs whose left component is in SEC-L.dic (rows)
#     and whose right component is in SEC-R.dic (columns), in tabular
#     format.
#
# Arguments:
#   PTAG  name of the word feature (directory under stats/{words,labels}).
#   SEC   name of the subsection (basename of the ".frq" file).
#   NL    maximum number of left components to keep in SEC-L.dic.
#   NR    maximum number of right components to keep in SEC-R.dic.
#
# Exits with status 1 on a usage error, when an input file is not
# readable, or when one of the dictionaries comes out empty.
#
# Depends on the external tools gawk, combine-counts, compute-freqs,
# compute-cum-freqs and count-diword-freqs, and on the gawk library
# file factor-table.gawk; their exact input/output formats are not
# visible here.
# NOTE(review): the code assumes that the ".frq" files have the count in
# field 1 and the word in field 3, and that compute-cum-freqs writes a
# frequency in field 2 and the word in field 5 -- confirm against those
# tools.

if ( $#argv != 4 ) then
  echo "usage: ${usage}"; exit 1
endif

set ptag = "$1"; shift;
set sec = "$1"; shift;
set nLeft = "$1"; shift;
set nRight = "$1"; shift;

foreach f ( words labels )

  set ifile = "stats/${f}/${ptag}/${sec}.frq"
  set lfile = "stats/${f}/${ptag}/${sec}-L.frq"
  set rfile = "stats/${f}/${ptag}/${sec}-R.frq"

  if ( ! ( -r ${ifile} ) ) then
    echo "${ifile} not found"; exit 1
  endif

  # Extract left and right components.
  #
  # The pattern /^[^-]*[-][^-]*$/ selects words with EXACTLY one "-",
  # as the header promises.  Words with several hyphens would be split
  # inconsistently by the three filters of this script (first hyphen
  # for the left part, last hyphen for the right part, every hyphen
  # for the pair table), so they are skipped everywhere.
  #
  # The sort keys are: count (field 1) numerically decreasing, then
  # the component (field 2) in collating order.

  echo "${lfile}"
  gawk '($3 ~ /^[^-]*[-][^-]*$/) { gsub(/[-].*$/, "-", $3); printf "%7d %s\n", $1, $3; }' \
      "${ifile}" \
    | combine-counts \
    | sort -b -k1,1nr -k2,2 \
    | compute-freqs \
    > ${lfile}

  echo "${rfile}"
  gawk '($3 ~ /^[^-]*[-][^-]*$/) { gsub(/^.*[-]/, "-", $3); printf "%7d %s\n", $1, $3; }' \
      "${ifile}" \
    | combine-counts \
    | sort -b -k1,1nr -k2,2 \
    | compute-freqs \
    > ${rfile}

  # Compute cumulative frequencies, and get the most common ones.
  #
  # Each loop item packs the side letter and its entry limit as
  # "SIDE.COUNT"; the csh modifiers ":r" (root) and ":e" (extension)
  # unpack them again.

  foreach sn ( L.${nLeft} R.${nRight} )
    set side = "${sn:r}"
    set num = "${sn:e}"

    set frqfile = "stats/${f}/${ptag}/${sec}-${side}.frq"
    set fcmfile = "stats/${f}/${ptag}/${sec}-${side}.fcm"
    set dicfile = "stats/${f}/${ptag}/${sec}-${side}.dic"

    # Keep only count and word of each non-empty line.
    echo "${fcmfile}"
    gawk '/./{print $1, $3;}' "${frqfile}" \
      | compute-cum-freqs \
      > ${fcmfile}

    # Take the first ${num} entries, dropping those whose field 2
    # is below 0.001, and keep only field 5.
    echo "${dicfile}"
    head -n ${num} "${fcmfile}" \
      | gawk '($2 >= 0.001){print $5;}' \
      > ${dicfile}
  end

  # Now tabulate the corresponding pairs:

  set prefs = "stats/${f}/${ptag}/${sec}-L.dic"
  if ( -z ${prefs} ) then
    echo "${prefs} is empty"; exit 1
  endif

  set suffs = "stats/${f}/${ptag}/${sec}-R.dic"
  if ( -z ${suffs} ) then
    echo "${suffs} is empty"; exit 1
  endif

  # Each selected word "xxx-yyy" is rewritten as the two tokens
  # "xxx-" and "-yyy", matching the entries of the dictionaries.
  set mfile = "stats/${f}/${ptag}/${sec}.mtx"
  echo "${mfile}"
  gawk '($3 ~ /^[^-]*[-][^-]*$/) {n=$1;w =$3; gsub(/[-]/, "- -", w); printf "%7d %s\n", n,w; }' \
      "${ifile}" \
    | count-diword-freqs \
        -f factor-table.gawk \
        -v counted=1 \
        -v digits=5 \
        -v rows=${prefs} \
        -v cols=${suffs} \
    > ${mfile}

end