#! /usr/bin/gawk -f # Last edited on 2001-01-17 03:27:20 by stolfi BEGIN { abort = -1; usage = ( "compute-hk-entropies \\\n" \ " < INFILE > OUTFILE" \ ); # Assumes the input records have fields # # COUNT ENTROPY NGRAM # # where # # NGRAM is a string of zero or more EVA elements, with capitalized # ligatures and elements marked off by {}, and prefixed with a # dummy marker "{...}" # # COUNT is the number of tokens that have that prefix; # # ENTROPY is the conditional entropy of the next symbol (including # word stop) following that NGRAM. # # The program combines the data for all NGRAMs of the same # length (defined as number of {}-delimited elements), # and writes out for them a single record with the format # # ORDER TOTCOUNT AVENTROPY # # where # # ORDER is the length of the NGRAMs that produced # this entry, starting with 1 (the "{}" marker). # # TOTCOUNT is the total count of those NGRAMs. # # AVENTROPY is the average entropy of the next symbol, # given the ORDER-1 previous symbols. It is equal to sum of ENTROPY*COUNT # for all NGRAMs that contributed to this entry, # divided by TOTCOUNT. # indexed by character position: split("", totCt); # `totCt[k]' = tot count of ngrams with `k' elems. split("", totCtEnt); # `totCtEnt[k]' = sum of COUNT*ENTROPY for those ngrams. maxk = 0; } (abort >= 0) { exit abort; } /^ *([#]|$)/ { next; } /./ { if (NF != 3) { data_error("bad line format"); } ct = $1; ent = $2; ngram = $3; if (ngram !~ /^[{}_a-zA-Z?]+$/) { data_error("bad ngram"); } w = ngram; # split ngram into elements, and compute the element count `k': gsub(/[}][{]/, "} {", w); k = split(w, welem, " "); # Element consistency check: if (welem[1] != "{}") { data_error("no leading {}"); } for (i = 2; i <= k; i++) { e = welem[i]; if (e !~ /^[{][_a-zA-Z?]+[}]$/) { data_error(("badly formed elem \"" e "\"")); } } # Tally entry: totCt[k] += ct; totCtEnt[k] += ent*ct; if (k > maxk) { maxk = k; } next; } END { if (abort >= 0) { exit abort; } totCtPrev = totCt[1]; for (k = 1; k <= maxk; k++) { ct = totCt[k]; if (ct > totCtPrev) { data_error("inconsistent ngram counts"); } avEnt = totCtEnt[k]/ct; printf "%3d %7d %6.3f\n", k, ct, avEnt; } } function arg_error(msg) { printf "%s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; abort = 1; exit 1; } function data_error(msg) { printf "line %d: %s\n", FNR, msg > "/dev/stderr"; abort = 1; exit 1; } function table_error(msg) { printf "error in elem table: %s\n", msg > "/dev/stderr"; abort = 1; exit 1; }