#! /usr/bin/gawk -f
# Last edited on 2001-01-17 02:59:59 by stolfi

# Computes a per-position entropy profile from a table of word prefixes.
#
# Setup: initializes the abort flag, the usage text and the tally tables.
BEGIN {
  abort = -1;
  usage = ( "compute-token-entropy-profile \\\n" \
    " < INFILE > OUTFILE" \
  );

  # Assumes the input records have fields
  #
  #   COUNT ENTROPY PREFIX
  #
  # where
  #
  #   PREFIX is an EVA word prefix, with capitalized ligatures and
  #     elements marked off by {}, and prefixed with a word-start
  #     marker {_};
  #
  #   COUNT is the number of tokens that have that prefix;
  #
  #   ENTROPY is the conditional entropy of the next symbol (including
  #     word stop) following that PREFIX.
  #
  # Blank lines and lines whose first non-blank character is "#"
  # are ignored.
  #
  # The program combines the data for all prefixes of the same
  # length (defined as number of {}-delimited elements),
  # and writes out for them a single record with the format
  #
  #   CHARPOS POSCOUNT FREQ AVENTROPY WTENTROPY
  #
  # where
  #
  #   CHARPOS is a character position index within the word,
  #     starting with 1;
  #
  #   POSCOUNT is the number of tokens that have
  #     at least CHARPOS-1 elements.  It is equal to
  #     the sum of COUNT for all PREFIXes with
  #     CHARPOS elements (including the initial "{_}").
  #
  #   FREQ is the fraction of those tokens, relative to
  #     all tokens (POSCOUNT divided by the POSCOUNT of CHARPOS 1).
  #
  #   AVENTROPY is the average entropy of the CHARPOSth
  #     element of a token (including word-stop), computed
  #     only among the tokens that have at least CHARPOS-1
  #     elements.  It is equal to the sum of ENTROPY*COUNT
  #     for all PREFIXes with CHARPOS elements (including "{_}"),
  #     divided by POSCOUNT.  It is printed as 0 when POSCOUNT is 0.
  #
  #   WTENTROPY is the same, weighted by FREQ.
  #
  # Note that the sum of WTENTROPY for all CHARPOS
  # must be equal to the token entropy.

  # Tables indexed by character position:
  split("", posCt);     # `posCt[k]' = tot count of prefixes with `k' elems.
  split("", posCtEnt);  # `posCtEnt[k]' = sum of COUNT*ENTROPY for those prefixes.
  maxPos = 0;           # Largest character position seen so far.
}

# Stop reading as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Skip blank lines and comment lines.
/^ *([#]|$)/ { next; }

# Data line: validate the prefix and add its contribution to the tallies.
/./ {
  if (NF != 3) { data_error("bad line format"); }
  ct = $1; ent = $2; pref = $3;
  if (pref !~ /^[{}_a-zA-Z?]+$/) { data_error("bad prefix"); }

  # Split prefix into elements, and compute the next char position.
  # A blank is inserted at every "}{" boundary so that `split' can
  # separate the elements; `pos' is the number of elements found.
  w = pref;
  gsub(/[}][{]/, "} {", w);
  pos = split(w, welem, " ");

  # Element consistency check:
  if (welem[1] != "{_}") { data_error("no leading {_}"); }
  for (i = 2; i <= pos; i++) {
    e = welem[i];
    if (e !~ /^[{][a-zA-Z?]+[}]$/) {
      data_error(("badly formed elem \"" e "\""));
    }
  }

  # Tally entry:
  posCt[pos] += ct;
  posCtEnt[pos] += ent*ct;
  if (pos > maxPos) { maxPos = pos; }
  next;
}

# Writes one output record per character position, 1 through `maxPos'.
END {
  if (abort >= 0) { exit abort; }

  # All tokens share the bare "{_}" prefix, so its count is the grand total.
  totCt = ((1 in posCt) ? posCt[1] : 0) + 0;

  # Without a positive total every FREQ and WTENTROPY would be a division
  # by zero (a fatal gawk error); report it as a data error instead.
  # Empty input (maxPos == 0) is still accepted and produces no output.
  if ((maxPos >= 1) && (totCt <= 0)) {
    data_error("no tokens counted for the word-start prefix {_}");
  }

  for (pos = 1; pos <= maxPos; pos++) {
    # A position may have no prefixes at all (a gap in the input);
    # treat its tallies as zero rather than dividing by an unset count.
    ct = ((pos in posCt) ? posCt[pos] : 0) + 0;
    ctEnt = ((pos in posCtEnt) ? posCtEnt[pos] : 0) + 0;
    if (ct > totCt) { data_error("inconsistent prefix counts"); }
    freq = ct/totCt;
    avEnt = (ct != 0 ? ctEnt/ct : 0);
    wtEnt = ctEnt/totCt;
    printf "%3d %7d %7.5f %6.3f %6.3f\n", pos, ct, freq, avEnt, wtEnt;
  }
}

# Reports a command line error `msg' plus the usage text on stderr,
# and terminates the program with status 1.
function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports an input data error `msg' on stderr, tagged with the current
# input line number (FNR), and terminates the program with status 1.
function data_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports an element table error `msg' on stderr,
# and terminates the program with status 1.
function table_error(msg)
{
  printf "error in elem table: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}