#! /usr/bin/gawk -f
# Last edited on 1999-01-18 23:20:30 by stolfi

# Reads a file with records COUNT PAGE WORD where
# COUNT is the number of occurrences of WORD on page PAGE.
#
# Outputs a file in the same format where the per-PAGE and
# per-WORD totals are correct but the pair counts have been
# randomized.
#
# Method: every word is replicated as many times as its total count,
# the resulting list is shuffled, and consecutive runs of the shuffled
# list are dealt out to the pages, each page taking as many entries as
# its total count.
#
# NOTE(review): this file had been collapsed onto a single line (which
# turns everything after the first "#" into a comment) and had lost the
# text between each "<" and the following ">".  The line structure was
# restored and the missing spans were rebuilt from the surviving logic;
# every rebuilt span is marked NOTE(review) below and should be checked
# against a pristine copy of the script if one exists.

BEGIN {
  abort = -1;          # negative means "no error so far"

  split("", pwct);     # pwct[p,w] = count of word w on page p
  split("", wct);      # wct[w]    = total count of word w
  split("", pct);      # pct[p]    = total count of page p
  nwords = 0;          # number of distinct words seen
  npages = 0;          # number of distinct pages seen
}

# Stop reading as soon as an error has been recorded.
(abort >= 0) { exit abort; }

# Every record must have exactly three fields.  This also rejects blank
# lines (NF is 0 there), so the rule below only sees 3-field records.
(NF != 3) { file_error("wrong num of fields"); }

# Accumulate the pair count and the per-word / per-page totals.
/./ {
  n = $1; p = $2; w = $3;
  if ((p,w) in pwct) { file_error("repeated word/page pair"); }
  pwct[p,w] += n;
  if (! (w in wct)) { nwords ++; }
  wct[w] += n;
  if (! (p in pct)) { npages ++; }
  pct[p] += n;
  ct += n;             # grand total of all counts
}

END {
  if (abort >= 0) { exit abort; }

  # The input pair counts are no longer needed; only the totals are used.
  split("", pwct);

  printf "expanding word sample...\n" > "/dev/stderr";
  split("", tx); ntx = 0;
  for (w in wct) {
    m = wct[w];
    # NOTE(review): loop condition and body rebuilt -- one tx entry per
    # occurrence of w, which is what the shuffle below operates on.
    for (i = 0; i < m; i++) { tx[ntx] = w; ntx++; }
  }

  # NOTE(review): the text of this progress message was lost; only the
  # redirection to /dev/stderr survived.
  printf "shuffling...\n" > "/dev/stderr";
  # Swap each tx[i] with a randomly chosen tx[j], 0 <= j <= i.
  # rand() is never seeded with srand() in this script, so gawk yields
  # the same permutation on every run for the same input.
  for (i = ntx-1; i >= 1; i--) {
    j = int((i+1)*rand() - 0.000001);
    if (j != i) { tmp = tx[i]; tx[i] = tx[j]; tx[j] = tmp; }
  }

  printf "collecting into pages...\n" > "/dev/stderr";
  k = 0;
  split("", pwct);
  for (p in pct) {
    m = pct[p];
    # NOTE(review): loop condition and body rebuilt -- page p takes the
    # next m shuffled words, so its total stays pct[p] while each word
    # is still used exactly wct[w] times overall.
    for (i = 0; i < m; i++) { w = tx[k]; k++; pwct[p,w]++; }
  }

  # NOTE(review): the output stage was lost entirely.  It is rebuilt to
  # emit COUNT PAGE WORD as the header comment requires; the field width
  # and the record order of the original are unknown.
  for (pw in pwct) {
    split(pw, key, SUBSEP);
    printf "%7d %s %s\n", pwct[pw], key[1], key[2];
  }
}

# Reports an input error on stderr, records the failure in `abort`, and
# terminates the script with exit status 1.
#
# NOTE(review): the function header and the start of the printf were
# lost; only `> "/dev/stderr"; abort = 1; exit 1; }` survived.  The name
# and single argument come from the two call sites; the message layout
# is a reconstruction.
function file_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}