#! /bin/gawk -f
# Last edited on 2004-01-24 01:40:05 by stolfi

# Reads a word frequency list with lines of the form
#
#   COUNT FREQ WORD
#
# Blank lines and lines whose first non-space character is "#" are skipped.
#
# Extracts from each WORD a string SLOT by calling the function
# "slot_extract" from the package SLOTFNS.gawk (loaded with "-f"),
# and folds the result to lower case.
#
# Writes to standard output one line per distinct SLOT, in the form
#
#   TCOUNT PSLOT LENGTH
#
# where
#
#   TCOUNT is the sum of the COUNTs of all WORDs with that SLOT string;
#
#   PSLOT is the SLOT string factored into "{}"-bracketed "elements"
#     by calling the function "slot_factor" from the same package
#     (written as "_" when the factored string is empty);
#
#   LENGTH is the number of "{}"-bracketed elements in PSLOT, computed
#     as half the number of brace characters in PSLOT.
#
# The total count accumulated for the empty SLOT (if any) is also
# reported on standard error.
#
# NOTE(review): an earlier version of this header described a fourth
# column, the raw SLOT, between TCOUNT and PSLOT, and the code replaced
# an empty SLOT by "_" without ever printing it.  The three-column
# output is kept here so existing consumers are not broken -- confirm
# which format is actually wanted.

BEGIN {
  # These two globals are not used in this file; they are kept because
  # the package loaded with "-f" may refer to them (TODO confirm).
  abort = -1;
  usage = ( \
    "extract-slot -v tkwd=TKWD -f SLOTFNS.gawk \\\n" \
    " < INFILE.wfr > OUTFILE.cts" \
  );

  # Make sure "t" (total count per slot) starts out as an empty array.
  split("", t);
}

# Skip blank lines and "#" comment lines.
/^ *([#]|$)/ { next; }

# Data line: add this word's count to the total of its slot.
/./ {
  c = $1;
  w = $3;
  s = tolower(slot_extract(w));
  t[s] += c;
  next;
}

END {
  # Report the count for the empty slot.  The membership test matters:
  # a bare reference to t[""] would CREATE that element, and the loop
  # below would then emit a spurious zero-count line for it.
  empty_count = (("" in t) ? t[""] : "");
  printf "t[empty] = %s\n", empty_count > "/dev/stderr";

  for (s in t) {
    ct = t[s];
    fs = slot_factor(s);
    if (fs == "") { fs = "_"; }

    # Keep only the braces of the factored slot; each element
    # contributes one "{" and one "}".
    ts = fs;
    gsub(/[^{}]/, "", ts);

    printf "%7d %s %d\n", ct, fs, length(ts)/2;
  }
}