#! awk -f # Last edited on 1998-07-17 06:20:41 by stolfi BEGIN{ usage = "compute-chunkiness < TABLE > LIST" # Reads a table with entries of the form COUNT FNUM WORD PNUM # Prints a list FLATNESS WORD wher FLATNESS is 0 for # words that occur only in one page,and maximum for # evenly distributed words. abort = -1; # Number of distinct pages and words nw = 0; np = 0; # Number of distinct pages per word and distinct words per pages split("", nw_p); split("", np_w); # Number of word occurrences no = 0; # Number of word occurrences per page and per word split("", no_w); split("", no_p); # Number of word occurrences per (page,word) pair split("", no_wp); # Lazy table "n -> log(n!)" split("", lfac); lfac[0] = 0; } /./{ if (abort >= 0) { exit abort; } if(NF != 4) { error(("line " NR ": bad format")); } o = $1; p = $2; f = $3; w = $4; if((w,p) in no_wp) { error(("line " NR ": repeated entry")); } no_wp[w,p] = o; if (! (p in no_p)) { ff[np] = f; pp[np] = p; np++; no_p[p] = 0; } no_p[p] += o; if (! (w in no_w)) { ww[nw] = w; nw++; no_w[w] = 0; } no_w[w] += o; no += o; next; } END{ if (abort >= 0) { exit abort; } for (w in no_w) { # Compute the strangeness of word "w" m = no; mw = no_w[w]; wen = logfac(mw) - (mw/m)*logfac(m); printf " %8.3f %3d/%3d %-6s %-15s\n", \ wen/log(2), no_w[w], no, "", w > "/dev/stderr"; for (p in no_p) { mwp = no_wp[w,p]; mp = no_p[p]; wpen = (mw/m)*logfac(mp) - logfac(mwp); printf " %8.3f %3d/%3d %-6s %-15s\n", \ wpen/log(2), no_wp[w,p], no_p[p], p, w > "/dev/stderr"; wen += wpen; } wen = wen/log(2); printf"%8.3f %s\n", wen, w; printf"%8.3f %s\n", wen, w > "/dev/stderr"; } } function logcomb(v, i,k,m,s) { # log of the number of ways of tossing "m" items # into "k" buckets such that the total counts # are "v[i]", where "k" is the number of entries # of "v" and "m" is their sum. k = 0; m = 0; for (i in v) { k++; m += v[i]; } s = logfac(m); for (i in v) { s -= logfac(v[i]); } return (s); } function logfac(n, i,s) { # log of factorial of "n" i = n + 0; while (!(i in lfac)) { if (i == 0) { error("bang " n); } i--; } s = lfac[i]; while (i < n) { i++; s += log(i); lfac[i] = s; } return (s); } function error(msg) { printf "%s\n", msg >> "/dev/stderr"; abort = 1; exit 0; }