#! /usr/bin/gawk -f # Last edited on 2001-01-17 22:13:55 by stolfi BEGIN { abort = -1; usage = ( "extract-and-count-prefixes \\\n" \ " [ -v spaceMarker=STRING ] \\\n" \ " [ -v showBadWords=BOOL ] \\\n" \ " < INFILE > OUTFILE" \ ); # Assumes the input records have fields # # COUNT WORD # # where WORD is plain EVA, with capitalized ligatures and elements # marked off by {}; and COUNT is its token count. For each input record # the program appends to each end of WORD a `spaceMarker' element # in braces, and then writes out several records of the form # # COUNT PREFIX # # where the PREFIX consists of the first K elements of # the padded WORD, including the padding elements, # for all K from 1 to the maximum number. if (spaceMarker == "") { spaceMarker = "_"; } spaceMarkerWBraces = ( "{" spaceMarker "}" ); if (showBadWords == "") { showBadWords = 0; } # indexed with the capitalized and bracketed word prefix: split("", prefixCt); nwordGud = 0; nwordShort = 0; ntokenGud = 0; ntokenShort = 0; } (abort >= 0) { exit abort; } /^ *([#]|$)/ { next; } /./ { if (NF != 2) { data_error("bad line format"); } ct = $1; worig = $2; if (worig !~ /^[{}a-zA-Z?]+$/) { data_error("bad word"); } w = worig; # split word into elements: gsub(/[}][{]/, "} {", w); ne = split(w, welem, " "); # Append and prepend a space to the word: welem[0] = spaceMarkerWBraces; welem[ne+1] = spaceMarkerWBraces; # Element consistency check: wordBad = 0; for (i = 1; i <= ne; i++) { e = welem[i]; if (e !~ /^[{][a-zA-Z?]+[}]$/) { data_error(("badly formed elem \"" e "\"")); } } if (ne < order - 1) { nwordShort++; ntokenShort += ct; next; } else { nwordGud++; ntokenGud += ct; } extract_prefixes(welem, ne, ct, worig); next; } function extract_prefixes(welem, ne, ct, worig, i,ei,p) { # tallies `ct' occurrences of the prefixes `welem[0..k]' # for all `k' starting from 1. # `worig' is the original word (for debugging). p = welem[0]; prefixCt[p] += ct; for (k = 1; k <= ne+1; k++) { e = welem[k]; p = ( p e ); if (! (p in prefixCt)) { prefixCt[p] = 0; } prefixCt[p] += ct; } } END { if (abort >= 0) { exit abort; } if (nwordShort > 0) { printf "%d short words (%d short tokens) found:\n", nwordShort, ntokenShort > "/dev/stderr"; } for (p in prefixCt) { ct = prefixCt[p]; printf "%7d %s\n", ct, p; } } function arg_error(msg) { printf "%s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; abort = 1; exit 1; } function data_error(msg) { printf "line %d: %s\n", FNR, msg > "/dev/stderr"; abort = 1; exit 1; } function table_error(msg) { printf "error in elem table: %s\n", msg > "/dev/stderr"; abort = 1; exit 1; }