#! /usr/bin/gawk -f # Last edited on 1999-07-28 02:01:26 by stolfi # Reads a a sequence of records of the form COUNT WORD FNUM # where COUNT is the number of occurrences of WORD on page FNUM. # # Assumes each page is a finite random sample of some # frequency distribution over all words, that is characteristic # of that page. The distribution is estimated by Bayesian # inference assuming all distributions over the full vocabulary # (plus one "absent" word) are equally likely. # # Outputs for each WORD a line with TOTCOUNT BITS WORD, where TOTCOUNT # is the total number of occurrences of WORD, and BITS is the estimated # number of bits of information that the occurrence of WORD in a gives # about the page's identity, computed from those estimated per-page # frequency distributions. BEGIN { abort = -1; split("", ct_wp); split("", ct_p); split("", ct_w); nw = 0; np = 0; printf "reading per-page word counts...\n" > "/dev/stderr"; } (abort >= 0) { exit abort; } /./{ if (NF != 3) { error("bad NF"); } c = $1; w = $2; p = $3; if ((w,p) in ct_wp) { error(("repeated pair " w " " p)); } ct_wp[w,p] += c; if (! (p in ct_p)) { np++; } ct_p[p] += c; if (! (w in ct_w)) { nw++; } ct_w[w] += c; } END { if(abort >= 0) { exit abort; } printf "computing information per word...\n" > "/dev/stderr"; for (w in ct_w) { # Below, "f" if the estimated frequency of word "w" in the # language of page "p", and # "totf" is the sum of "f" for word "w" over all pages "p". # Also "toth" if the sum of "f*ln(f)" over # The estimated probability "q" of page "p" given that a randomly # picked word is "w" is then "f/totf". totf = 0; toth = 0; for (p in ct_p) { if ((w,p) in ct_wp) { f = (ct_wp[w,p]+1)/(ct_p[p] + nw + 1); } else { f = 1/(ct_p[p] + nw + 1); } # printf "%8.5f %s\n", f, p > "/dev/stderr"; totf += f; toth += -f*log(f) } # The estimated entropy of the page "p", given # the word "w", is then the sum of "q*lg(q)" over all # pages; which turns out to be h = (toth / totf + log(totf))/log(2); # printf "------- -----------\n%7.5f %s\n\n", h, w > "/dev/stderr"; # The information given by "w" about "p" is then bits = log(np)/log(2) - h; printf "%7d %7.4f %s\n", ct_w[w], bits, w; } } function error(msg) { printf "%s\n", msg > "/dev/stderr"; abort = 1; exit 1; }