#! /usr/bin/gawk -f
# Last edited on 1999-07-28 01:50:54 by stolfi

# Reads a file with records
#
#   COUNT PAGE WORD
#
# where COUNT is the number of occurrences of WORD on page PAGE.
# Every input line must have exactly three fields, and each
# (PAGE, WORD) pair may appear at most once; otherwise the script
# reports the offending line on stderr and exits with status 1.
#
# Writes one record per distinct WORD, of the form
#
#   WFR MAXFR PMAX SPECF PWCT PCT WCT WORD
#
# where
#   WFR   = (WCT + 1)/(total count + number of distinct words), the
#           smoothed frequency of WORD over the whole input;
#   MAXFR = the largest value, over all pages P, of
#           (count of WORD on P + 1)/(total count on P + number of distinct words);
#   PMAX  = a page that attains MAXFR (printed with "%03d", so page
#           identifiers are presumably numeric -- TODO confirm);
#   SPECF = MAXFR/WFR;
#   PWCT  = the count of WORD on page PMAX;
#   PCT   = the total count of all words on page PMAX;
#   WCT   = the total count of WORD over all pages.
#
# Records are emitted in the traversal order of "for (w in wct)",
# which AWK leaves unspecified.

BEGIN {
  abort = -1;          # exit status requested by file_error(); negative means "no error"
  split("", pwct);     # pwct[p,w] = count of word w on page p
  split("", wct);      # wct[w]    = total count of word w
  split("", pct);      # pct[p]    = total count of all words on page p
  nwords = 0;          # number of distinct words seen
  npages = 0;          # number of distinct pages seen
  ct = 0;              # grand total of all counts
}

# Stop consuming input once an error has been flagged.
(abort >= 0) { exit abort; }

# Every record (blank lines included) must have exactly three fields.
(NF != 3) { file_error("wrong num of fields"); }

# Accumulate the counts of one COUNT PAGE WORD record.
/./ {
  n = $1; p = $2; w = $3;
  if ((p, w) in pwct) { file_error("repeated word/page pair"); }
  pwct[p, w] += n;
  if (! (w in wct)) { nwords++; }
  wct[w] += n;
  if (! (p in pct)) { npages++; }
  pct[p] += n;
  ct += n;
}

END {
  # "exit" inside a main rule still runs END, so re-check the error flag
  # and leave with the recorded status before printing anything.
  if (abort >= 0) { exit abort; }

  # wfr is the estimated frequency of w in the overall language.
  # pwfr is the estimated frequency of w in the "language" of page p.
  # Both add 1 to the count and nwords to the total.
  for (w in wct) {
    wfr = (wct[w] + 1)/(ct + nwords);
    maxfr = -1;
    for (p in pct) {
      # Test membership before reading: a bare pwct[p,w] reference would
      # create an empty element for every (page, word) pair that never
      # occurred, inflating the table to npages*nwords entries.
      cnt = ((p, w) in pwct) ? pwct[p, w] : 0;
      pwfr = (cnt + 1)/(pct[p] + nwords);
      if (pwfr > maxfr) { maxfr = pwfr; pmax = p; maxct = cnt; }
    }
    specf = maxfr/wfr;
    printf "%7.5f %7.5f %03d %7.5f %d %d %d %s\n",
      wfr, maxfr, pmax, specf, maxct, pct[pmax], wct[w], w;
  }
}

# Reports msg on stderr, prefixed by the current input file name and
# line number, records exit status 1 in the global "abort", and exits.
function file_error(msg)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}