#! /usr/bin/gawk -f
# Last edited on 1999-07-28 01:50:25 by stolfi

# Reads a file with records
#
#   COUNT PAGE WORD
#
# where COUNT is the number of occurrences of WORD on page PAGE.
# Blank lines are ignored; any other line that does not have exactly
# three fields, or that repeats a (PAGE, WORD) pair, aborts the run
# with a message on stderr and exit status 1.
#
# The frequency of each word on each page is estimated with add-one
# smoothing:
#
#   pwfr[p,w] = (count of w on p + 1) / (total count on p + number of distinct words)
#
# Assuming all pages are equally likely a priori, Bayes' rule then gives
# the posterior probability P(p|w) of each page p given one occurrence
# of word w.
#
# Writes, for each word w, one line
#
#   H(w) PAGE PROB WORD
#
# where H(w) = -sum_p P(p|w)*log2(P(p|w)) measures (in bits) how the
# occurrences of w are spread over the pages, PAGE is the page with the
# largest P(p|w), and PROB is that largest posterior probability.
# PAGE is printed with "%03d", so page identifiers are expected to be
# numeric.  Output lines follow awk's "for (w in ...)" order, which is
# unspecified; sort the output if a fixed order is needed.

BEGIN {
  abort = -1;        # Exit status requested by file_error(); negative means "no error".
  split("", pwct);   # pwct[p,w] = occurrences of word w on page p.
  split("", wct);    # wct[w]    = occurrences of word w over all pages.
  split("", pct);    # pct[p]    = occurrences of all words on page p.
  nwords = 0;        # Number of distinct words seen.
  npages = 0;        # Number of distinct pages seen.
  ln2 = log(2);      # Converts natural logarithms to base 2.
}

# Stop reading as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Blank (or whitespace-only) lines carry no record; skip them instead of
# reporting them as malformed.
(NF == 0) { next; }

(NF != 3) { file_error("wrong num of fields"); }

# Every line that reaches this rule has exactly three fields.
{
  n = $1; p = $2; w = $3;
  if ((p,w) in pwct) { file_error("repeated word/page pair"); }
  pwct[p,w] += n;
  if (! (w in wct)) { nwords++; }
  wct[w] += n;
  if (! (p in pct)) { npages++; }
  pct[p] += n;
}

END {
  # "exit" in a main rule still runs END; do not report on partial data.
  if (abort >= 0) { exit abort; }

  # pwfr[p,w] is the estimated frequency of word w on the language of page p.
  # wfr[w] is the "a priori" probability of w assuming all pages
  # are equally likely.
  split("", pwfr);
  split("", wfr);
  for (w in wct) {
    wfr[w] = 0;
    for (p in pct) {
      # Test membership first so that unseen pairs are not created in pwct.
      cnt = ((p,w) in pwct) ? pwct[p,w] : 0;
      pwfr[p,w] = (cnt + 1)/(pct[p] + nwords);
      wfr[w] += pwfr[p,w];
    }
    wfr[w] /= npages;
    # printf "%7.5f %s\n", wfr[w], w > "/dev/stderr";
  }

  # wh[w] is the H(w) value described in the header; maxp/maxg track the
  # page with the largest posterior probability and that probability.
  split("", wh);
  for (w in wct) {
    wh[w] = 0; maxg = 0; maxp = 0;
    for (p in pct) {
      # Posterior P(p|w) by Bayes' rule with a uniform prior 1/npages.
      # The smoothing makes pwfr > 0, so log(wpg) is always defined.
      wpg = (pwfr[p,w]/npages)/wfr[w];
      wh[w] += -wpg*log(wpg)/ln2;
      if (wpg > maxg) { maxg = wpg; maxp = p; }
    }
    printf "%7.5f %03d %7.5f %s\n", wh[w], maxp, maxg, w;
  }
}

# Reports MSG on stderr, prefixed by the current input file name and
# line number, records exit status 1 in the global "abort", and exits.
function file_error(msg)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}