#! /usr/bin/gawk -f
# Last edited on 1999-01-18 22:34:17 by stolfi

# Reads a file with records
#
#   COUNT PAGE WORD
#
# where COUNT is the number of occurrences of WORD on page PAGE.
#
# Computes the histogram of each word over all pages.
#
# Writes a file with records of the form
#
#   WORD TOTCT NPAGES NMISS SHAPE
#
# where TOTCT is the total occurrence count of the word, NPAGES is
# the number of pages where the word occurs, NMISS is the number of
# pages where the word doesn't occur, and SHAPE is the shape of the
# word's per-page distribution, defined here as the multiset of the
# nonzero per-page counts of that word, sorted in decreasing order.
#
# Error protocol: "abort" is -1 while all is well.  file_error() sets
# it to a non-negative status and calls exit; since exit still runs the
# END rule, END checks "abort" first so that no output is produced
# after an input error.

BEGIN {
  abort = -1;
  split("", pwct);   # pwct[p,w] = count of word w on page p
  split("", wct);    # wct[w]    = total count of word w
  split("", pct);    # pct[p]    = total count of all words on page p
  nwords = 0;        # number of distinct words seen
  npages = 0;        # number of distinct pages seen
}

# Propagate a pending abort before touching any further input.
(abort >= 0) { exit abort; }

# Every record, blank lines included, must have exactly three fields.
(NF != 3) { file_error("wrong num of fields"); }

# Accumulate one COUNT PAGE WORD record.
/./ {
  n = $1; p = $2; w = $3;
  if ((p, w) in pwct) { file_error("repeated word/page pair"); }
  pwct[p, w] += n;
  if (! (w in wct)) { nwords++; }
  wct[w] += n;
  if (! (p in pct)) { npages++; }
  pct[p] += n;
  ct += n;           # grand total of all counts
}

END {
  if (abort >= 0) { exit abort; }

  # The order in which "for (w in wct)" visits words is unspecified.
  for (w in wct) {
    # Collect the nonzero per-page counts of word w.
    split("", shape);
    ns = 0;
    for (p in pct) {
      # Test membership first: merely referencing pwct[p,w] would create
      # an empty element for every (page, word) pair never seen in input.
      if (((p, w) in pwct) && (pwct[p, w] != 0)) {
        shape[ns] = pwct[p, w];
        ns++;
      }
    }

    # Sort entries in decreasing order (insertion sort).
    # NOTE(review): the original sort loop was lost past "for (i=0; i";
    # this is a reconstruction that follows the header comment.
    for (i = 1; i < ns; i++) {
      t = shape[i];
      for (j = i - 1; (j >= 0) && (shape[j] < t); j--) {
        shape[j + 1] = shape[j];
      }
      shape[j + 1] = t;
    }

    # Emit WORD TOTCT NPAGES NMISS SHAPE.
    # NOTE(review): the original output statement was lost.  The field
    # order follows the header comment; writing SHAPE as a single
    # comma-separated field is an assumption -- confirm against whatever
    # consumes this file.
    printf "%s %d %d %d ", w, wct[w], ns, npages - ns;
    for (i = 0; i < ns; i++) {
      printf "%s%d", (i > 0 ? "," : ""), shape[i];
    }
    printf "\n";
  }
}

# Reports an input error on stderr and terminates with status 1.
# NOTE(review): only the tail of this function survived in the source
# ('> "/dev/stderr"; abort = 1; exit 1; }'); the name and the single
# message parameter come from the call sites above, while the exact
# message format is a reconstruction.
function file_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}