#! /n/gnu/bin/gawk -f # Last edited on 1998-07-12 05:07:10 by stolfi BEGIN { usage = ( "compute-tuple-info \\\n" \ " -v alphabetSize=NUM \\\n" \ " < COUNTFILE > INFOTBL" \ ); # Reads a file of COUNT WORD pairs, as produced by "uniq -c", # containing the observed number of occurrences of # each substring of a fixed size K in some finite text sample. # Outputs a similar file with INFO WORD lines, where INFO is # the information content of WORD (i.e. log base 2 of the # estimated frequency of WORD in the infinite text. # # If S is the total number of K-grams in the sample (that is, the # sample had length S+K-1) then the estimated frequency of a word # that occurs W times is (W + 1)/(S + A^K) where A is the alphabet # size (168 for the printable subset of ISO Latin-1). # # Note that words that do NOT occur in the sample still have # estimated probabilities 1/(S + A^K), hence their information # contents is log_2(S + A^K). abort = -1; if (alphabetSize == "") { error("should define \"alphabetSize\""); } if ((alphabetSize < 1) || (alphabetSize > 168)) { error("funny \"alphabetSize\""); } A = aphabetSize; S = 0; K = 0; split("", count); } // { if (abort >= 0) { exit(abort); } if (NF != 2) { fatal_error(("line " NF ": bad input format")); } c = $1; w = $2; count[w] += c; S += $1; if (K == 0) { K = length(w); } else if (K != length(w)) { fatal_error(("line " NF ": incongruent word length")); } next; } END { if (abort >= 0) { exit(abort); } # Computes logden = -log_2(1/(S + A^K)) # = log_2(S + A^K) = log_2(A^K) + log_2(1 + S/A^K) logden = K*log(A); if (log(S) - logden > log(0.0001)) { logden = logden + log(1 + exp(log(S)-logden)); } scale = -1/log(2.0); for (w in count) { # Computes info = -log2((count[w] + 1)/(S + A^K)) info = scale*(- log(count[w]+1) + logden); printf "%7.3f %s\n", info, w; } } function error(msg) { printf "%s\n", msg >> "/dev/stderr"; abort = 1; exit 0; }