#! /usr/bin/gawk -f
# Last edited on 1998-07-15 23:22:48 by stolfi

BEGIN {
  usage = ( "compute-cond-tuple-info \\\n" \
    "  -v order=ORDER \\\n" \
    "  -v place=PLACE \\\n" \
    "  [ -v alphabetSize=NUM ] \\\n" \
    "  < COUNTFILE > INFOTBL" \
  );

  # The file COUNTFILE must have entries COUNT WORD where COUNT
  # is a positive occurrence count for the string WORD in some text.
  # The length of all WORDs must be ORDER.
  #
  # Computes the conditional information content of the PLACEth
  # character of each WORD in COUNTFILE, assuming the rest of WORD
  # is known.  Outputs entries INFO WORD for each input WORD.
  #
  # The optional "alphabetSize" value is used as the number of
  # possible categories in probability estimation.  If given, the
  # input should not have more than that number of distinct letters
  # in the PLACEth position of all tuples.  If not given, the actual
  # number of distinct letters in that position is used instead.
  #
  # Also prints to "stderr" a summary of the data.
  # (See the worked example at the end of this file.)

  abort = -1;
  if (order == "") { error("should define \"order\""); }
  if ((order < 1) || (order > 20)) { error("funny \"order\""); }
  if (place == "") { error("should define \"place\""); }
  if ((place < 1) || (place > order)) { error("funny \"place\""); }
  if (alphabetSize == "") { alphabetSize = 0; }
  if ((alphabetSize < 0) || (alphabetSize > 168)) { error("funny \"alphabetSize\""); }
  split("", full_count);
  split("", part_count);
  split("", letter_count);
  n_letters = 0;
  tot_count = 0;
  n_tuples = 0;
}

// {
  if (abort >= 0) { exit(abort); }
  if (NF != 2) { error(("line " NR ": format error")); }
  c = $1; w = $2;
  if (length(w) != order) { error(("line " NR ": wrong word length")); }
  if (w in full_count) { error(("line " NR ": tuple \"" w "\" repeated")); }
  full_count[w] = c;
  # The context is WORD with the PLACEth character deleted:
  p = ( substr(w,1,place-1) substr(w,place+1,order-place) );
  part_count[p] += c;
  a = substr(w,place,1);
  if (! (a in letter_count)) { n_letters++; }
  letter_count[a] += c;
  tot_count += c;
  n_tuples++;
}

END {
  if (abort >= 0) { exit(abort); }
  # Print entropy table:
  if (alphabetSize == 0)
    { n_cats = n_letters; }
  else
    { n_cats = alphabetSize;
      if (n_letters > alphabetSize) { error(("true alphabet size = " n_letters)); }
    }
  scale = -1/log(2.0);
  tot_info = 0;
  for (w in full_count)
    { p = ( substr(w,1,place-1) substr(w,place+1,order-place) );
      a = substr(w,place,1);
      cf = full_count[w];
      cp = part_count[p];
      # Assume that, "a priori", all histograms are equally likely;
      # i.e. add-one (Laplace) smoothing over n_cats categories:
      fr = (cf + 1)/(cp + n_cats);
      info = scale*log(fr);
      printf "%7.3f %s\n", info, w;
      tot_info += cf*info;
    }
  # Print summary:
  print_summary(tot_info, n_tuples, order, place, alphabetSize, n_letters);
}

function print_summary(tot_info,n_tuples,order,place,a_nom,a_obs,   err,nch,hmed,left,right)
{
  err = "/dev/stderr";
  hmed = tot_info/tot_count;
  left = place-1; right = order-place;
  nch = tot_count + order - 1;
  printf "# full text: %d characters (including fillers)\n", nch > err;
  printf "# context: left = %d right = %d (order = %d)\n", left, right, order > err;
  printf "# alphabet size:" > err;
  if (a_nom != 0) { printf " nominal = %d", a_nom > err; }
  printf " actual = %d\n", a_obs > err;
  printf "# mean conditional entropy h_%d: %6.3f bits/character\n", order, hmed > err;
}

function error(msg)
{
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}
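
# ------------------------------------------------------------------------
# Worked example (hypothetical numbers, chosen only to illustrate the
# formulas above): with order=3 and place=2, the two COUNTFILE entries
#
#   7 aba
#   1 aca
#
# share the context "aa" (the WORD minus its 2nd character), so
# part_count["aa"] = 7 + 1 = 8.  Only "b" and "c" occur in position 2,
# so with no alphabetSize given, n_cats = 2, and
#
#   fr("aba") = (7+1)/(8+2) = 0.8   info = -log2(0.8) = 0.322 bits
#   fr("aca") = (1+1)/(8+2) = 0.2   info = -log2(0.2) = 2.322 bits
#
# The summary would then report h_3 = (7*0.322 + 1*2.322)/8, about
# 0.572 bits/character.  A typical invocation (the file names here are
# only placeholders):
#
#   compute-cond-tuple-info -v order=3 -v place=2 < sample.cts > sample.inf
# ------------------------------------------------------------------------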