#! /usr/bin/gawk -f # Last edited on 2001-01-04 19:33:38 by stolfi BEGIN { abort = -1; usage = ( \ "compute-cond-entropy < INFILE > OUTFILE" \ ); # Expects the input to have records of the form # # COUNT EVENT SYMB # # representing COUNT occurrences of SYMB after the specified EVENT. # The records must be sorted by WORD. # # Outputs for each EVENT one line with format # # TOTCOUNT ENTROPY EVENT # # where TOTCOUNT is the sum of all COUNT for the EVENT, and ENTROPY is the # conditional entropy of the next SYMB given that EVENT. Assumes that # all events are mutually exclusive, and all SYMB for each EVENT are # distinct. curEvent = ""; curEventCt = 0; # Total count for the current event. nSymb = 0; # Count of symbols for this event. split("", symbCt); # Counts of each symbol after the current event. } (abort >= 0) { exit abort; } /./ { if (NF != 3) { data_error("bad field count"); } event = $2; if (event != curEvent) { totalize_event_entropy(); curEvent = event; } count = $1; if (count !~ /^[+]*[.0-9]*[0-9][.0-9]*$/) { data_error("bad count format"); } symbCt[nSymb] = count; nSymb++; curEventCt += count; next; } END { totalize_event_entropy(); } function totalize_event_entropy( i, entropy) { if (curEvent != "") { for (i = 0; i < nSymb; i++) { symbCt[i] /= curEventCt; } entropy = 0; for (i = 0; i < nSymb; i++) { if (symbCt[i] > 0) { entropy += -symbCt[i]*log(symbCt[i]); } } entropy /= log(2.0); printf "%7d %8.4f %s\n", curEventCt, entropy, curEvent; } split("", symbCt); nSymb = 0; curEventCt = 0; } function data_error(msg) { printf "line %d: %s\n", NR, msg >> "/dev/stderr"; abort = 1; exit 1; } function arg_error(msg) { printf "%s\n", msg >> "/dev/stderr"; abort = 1; exit 1; }