#! /bin/csh -f
# Last edited on 1999-12-05 15:59:18 by stolfi

set usage = "$0 FTAG SPLITCHAR"

# Tabulates the pairs of prefixes and suffixes from a frequency
# file of word feature FTAG.  Assumes available the files
#
#   stats-subsecs/FTAG/tot.frq
#     counts and frequencies of the FTAG values over all sections
#     (field 1 and field 3 of each line are used; presumably these
#     are the count and the FTAG value -- confirm against the file).
#
#   FTAG-prefs.dic, FTAG-suffs.dic
#   FTAG-prefs-len.dic, FTAG-suffs-len.dic
#     lists of official prefixes and suffixes, for tabulation;
#     the "-len" files are used for the letter-smashed table.
#
#   smash-FTAG-letters
#     a filter (must be on the command path) that reads lines of
#     counts and FTAGs from standard input and collapses each
#     letter of the FTAG to a single "X" (preserving the SPLITCHAR
#     if any).
#
#   count-diword-freqs
#     external tabulator; receives the suffix list as "rows" and
#     the prefix list as "cols".
#
# This script considers only words of the frequency file that contain
# exactly one instance of the given SPLITCHAR, and separates those words
# into a "prefix" and a "suffix" at that character.  Outputs two files,
# stats-subsecs/FTAG/tot-pref-suff.mtx with the counts of
# prefix/suffix combinations, and a similar table
# stats-subsecs/FTAG/tot-pref-suff-len.mtx where the prefixes and
# suffixes have been filtered through smash-FTAG-letters.
#
# SPLITCHAR is inserted into a bracket expression for egrep and sed,
# so it should be a single character that is not special inside
# brackets.
#
# Exit status is 1 on a usage error or when an input file is missing.

if ( $#argv != 2 ) then
  echo "usage: ${usage}"; exit 1
endif

set ftag = "$1"; shift;
set splitchar = "$1"; shift;

set ifile = "stats-subsecs/${ftag}/tot.frq"
if ( ! ( -r ${ifile} ) ) then
  echo "${ifile} not found"; exit 1
endif

# Each list item is encoded as COMMAND.TAG: the root (:r) is the filter
# command to run and the extension (:e) is the tag appended to the
# dictionary and output file names (empty for the plain "cat" pass).
foreach cp ( cat. smash-${ftag}-letters.-len )
  set ccmd = "${cp:r}"
  set ctag = "${cp:e}"

  set prefs = "${ftag}-prefs${ctag}.dic"
  if ( ! ( -r ${prefs} ) ) then
    echo "${prefs} not found"; exit 1
  endif

  set suffs = "${ftag}-suffs${ctag}.dic"
  if ( ! ( -r ${suffs} ) ) then
    echo "${suffs} not found"; exit 1
  endif

  set ofile = "stats-subsecs/${ftag}/tot-pref-suff${ctag}.mtx"

  # Pipeline stages:
  #   1. keep fields 1 and 3 of every non-empty input line;
  #   2. pass them through the filter command (cat or the smasher);
  #   3. keep lines containing SPLITCHAR ...
  #   4. ... but drop those containing it two or more times;
  #   5. replace SPLITCHAR by "- -", so that a word P<c>S becomes the
  #      two fields "P-" and "-S";
  #   6. reorder the fields to: field 1, suffix, prefix;
  #   7. tabulate with suffixes as rows and prefixes as columns.
  cat ${ifile} \
    | gawk '/./{print $1, $3;}' \
    | ${ccmd} \
    | egrep '['"${splitchar}"']' \
    | egrep -v '['"${splitchar}"'].*['"${splitchar}"']' \
    | sed -e 's/['"${splitchar}"']/- -/g' \
    | gawk '/./{print $1, $3, $2;}' \
    | count-diword-freqs \
        -v counted=1 \
        -v digits=5 \
        -v rows=${suffs} \
        -v cols=${prefs} \
    > ${ofile}
end