#! /usr/bin/gawk -f
# Last edited on 1999-07-18 07:43:53 by stolfi

# Reads a sequence of records of the form
#
#   COUNT WORD FNUM
#
# where COUNT is the number of occurrences of WORD on page FNUM.
#
# Assumes each page is a finite random sample of some
# frequency distribution over all words that is characteristic
# of that page. The distribution is estimated by Bayesian
# inference, assuming all distributions over the full vocabulary
# (plus one "absent" word) are equally likely.
#
# Outputs for each WORD a line with
#
#   TOTCOUNT NPAGES INDEX WORD
#
# where TOTCOUNT is the total number of occurrences of WORD, NPAGES
# is the number of pages where the word occurs, and INDEX is an
# indicator of how lumpily the WORD is distributed over the pages.

BEGIN {
  abort = -1;
  split("", ct_wp);    # ct_wp[w,p] = count of word w on page p
  split("", ct_p);     # ct_p[p] = total word count of page p
  split("", ct_w);     # ct_w[w] = total count of word w over all pages
  ct = 0;              # grand total of all counts
  nw = 0; np = 0;      # number of distinct words and pages seen
  split("", log_comb); # (declared but not used below)
  printf "reading per-page word counts...\n" > "/dev/stderr";
}

(abort >= 0) { exit abort; }

/./ {
  if (NF != 3) { error("bad NF"); }
  c = $1; w = $2; p = $3;
  if ((w,p) in ct_wp) { error(("repeated pair " w " " p)); }
  ct_wp[w,p] += c;
  if (! (p in ct_p)) { np++; }
  ct_p[p] += c;
  if (! (w in ct_w)) { nw++; }
  ct_w[w] += c;
  ct += c;
}

END {
  if (abort >= 0) { exit abort; }
  printf "computing information per word...\n" > "/dev/stderr";
  for (w in ct_w) {
    N = ct_w[w];
    totq = 0;
    nwp = 0;  # number of pages where w occurs
    for (p in ct_p) {
      n = ct_p[p];  # total count of page p (not used in the index)
      if ((w,p) in ct_wp) { m = ct_wp[w,p]; nwp++; } else { m = 0; }
      q = (m+1)*m*(m-1);
      # if (q > 0) { printf "%7d %7d %8.4f %s\n", m, n, q, w > "/dev/stderr"; }
      totq += q;
    }
    x = totq/(N*N*N);
    # printf "------- ------- -------- -----------\n" > "/dev/stderr";
    # printf "%7d %7d %8.4f %s\n\n", ct_w[w], nwp, x, w > "/dev/stderr";
    if (x > 0) { printf "%7d %7d %8.4f %s\n", ct_w[w], nwp, x, w; }
  }
}

function error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}
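
# A minimal usage sketch (the script file name and the sample words
# below are hypothetical, in the spirit of the per-page word counts
# this script expects):
#
#   gawk -f word-lumpiness.gawk counts.txt > index.txt
#
# where counts.txt contains lines like
#
#   3 daiin f1r
#   1 daiin f1v
#   2 chedy f1r
#
# Worked example of the index: for "daiin", N = 4 with per-page counts
# m = 3 and m = 1, so totq = (4*3*2) + (2*1*0) = 24 and
# x = 24/4^3 = 0.3750.  Had the same 4 occurrences been spread evenly
# (m = 2 and m = 2), totq = (3*2*1)*2 = 12 and x = 12/64 = 0.1875;
# concentration on fewer pages yields a larger INDEX.  Note that a word
# occurring at most once per page gets q = 0 on every page, hence
# x = 0, and is suppressed by the "if (x > 0)" filter above.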