#! /bin/csh -f
# Last edited on 1999-12-05 15:56:05 by stolfi

set usage = "$0 FTAG SPLITCHAR"

# Extracts prefixes and suffixes from a frequency file of
# word feature FTAG. Assumes available the files
#
#   stats-subsecs/FTAG/tot.frq
#     counts and frequencies of the FTAG values over all sections,
#
#   smash-FTAG-letters
#     a script that reads a file of counts and FTAGs,
#     and collapses each `letter' of the FTAG
#     to a single "X" (preserving the SPLITCHAR if any).
#
# This script considers only words of the frequency file that contain
# exactly one instance of the given SPLITCHAR, and separates those
# words into a "prefix" and a "suffix" at that character. Outputs
# files
#
#   stats-subsecs/FTAG/tot-{pref,suff}.frq
#     counts and freqs of prefixes and suffixes, separately.
#
#   stats-subsecs/FTAG/tot-{pref,suff}-len.frq
#     counts and freqs of prefix and suffix lengths, separately.
#
# Arguments:
#   FTAG       name of the word feature; selects the input directory
#              stats-subsecs/FTAG and the filter script smash-FTAG-letters.
#   SPLITCHAR  the character at which words are split. It is pasted
#              into grep bracket expressions and into a sed replacement,
#              so characters special in those contexts (such as "]",
#              "^", "/", "&" or a backslash) are presumably unsafe here.
#
# Exit status: 1 on a usage error or when the input file is unreadable.
#
# External commands used: gawk, egrep, sed, sort, plus combine-counts,
# compute-cum-freqs and smash-FTAG-letters, which must be on the
# command search path.

if ( $#argv != 2 ) then
  echo "usage: ${usage}"; exit 1
endif

set ftag = "$1"; shift;
set splitchar = "$1"; shift;

set ifile = "stats-subsecs/${ftag}/tot.frq"

if ( ! ( -r "${ifile}" ) ) then
  echo "${ifile} not found"; exit 1
endif

# Each outer item is "COMMAND.OUTTAG": the :r modifier yields the filter
# command and the :e modifier yields the tag appended to the output name.
#   cat.                      -> identity filter, empty tag
#   smash-FTAG-letters.-len   -> letter-collapsing filter, tag "-len"
foreach cp ( cat. smash-${ftag}-letters.-len )
  set ccmd = "${cp:r}"
  set ctag = "${cp:e}"

  # Each inner item is "FIELD.PART": the field number to keep after the
  # word has been split, and the part name used in the output file name.
  foreach ap ( 2.pref 3.suff )
    set atag = "${ap:e}"
    set afld = "${ap:r}"

    # Pipeline stages, in order:
    #   1. keep fields 1 and 3 of every non-empty input line
    #      (presumably the count and the word; confirm against tot.frq);
    #   2. run the filter command chosen by the outer loop;
    #   3. keep lines with at least one SPLITCHAR;
    #   4. drop lines with two or more SPLITCHARs;
    #   5. rewrite SPLITCHAR as "SPLITCHAR<space>SPLITCHAR", so that the
    #      prefix becomes field 2 and the suffix field 3, each one still
    #      carrying its own copy of SPLITCHAR;
    #   6. keep field 1 and the selected part;
    #   7. merge and tabulate with combine-counts and compute-cum-freqs,
    #      sorting by field 1 in decreasing numeric order in between.
    # The sort key -k1,1nr is the current spelling of the obsolescent
    # "+0 -1nr" form, which modern sort implementations no longer accept.
    cat "${ifile}" \
      | gawk '/./{print $1, $3;}' \
      | ${ccmd} \
      | egrep '['"${splitchar}"']' \
      | egrep -v '['"${splitchar}"'].*['"${splitchar}"']' \
      | sed -e 's/['"${splitchar}"']/'"${splitchar}"' '"${splitchar}"'/g' \
      | gawk -v fld="${afld}" '/./{print $1, $(fld);}' \
      | combine-counts \
      | sort -b -k1,1nr \
      | compute-cum-freqs \
      > "stats-subsecs/${ftag}/tot-${atag}${ctag}.frq"
  end
end