#! /usr/bin/gawk -f
# Last edited on 1999-01-30 17:30:58 by stolfi

# Reads from INFILE records in the format
#
#   COUNT SEC PNUM FNUM WORD
#
# where COUNT is the number of occurrences of WORD in the page, SEC is
# a section tag, and PNUM is the page's p-number.
# NOTE(review): the original description was cut off after PNUM; FNUM is
# presumably the page's f-number -- confirm.
#
# Assumes the records are grouped by SEC and PNUM and then sorted by
# COUNT decreasing.  Writes for each page a record of the form
#
#   SEC PNUM FNUM W1(C1) W2(C2) ... Wn(Cn)
#
# where W1, W2, ... Wn are the NUM (default 1) most popular WORDs
# in page PNUM, and C1, C2, ... Cn are the respective COUNTs.

BEGIN {
  # abort < 0 means "no error so far"; error()/arg_error() set it to the
  # exit status so that the END rule can propagate it.
  abort = -1;
  usage = ( "list-page-champs [ -v maxChamps=NUM ] < INFILE > OUTFILE" );

  if (maxChamps == "") { maxChamps = 1; }
  # Reject values that are not plain non-negative integers; otherwise the
  # (nChamps < maxChamps) test below would silently misbehave.
  if (maxChamps !~ /^[0-9]+$/) {
    arg_error(("invalid maxChamps = \"" maxChamps "\"\nusage: " usage));
  }
  maxChamps = maxChamps + 0;

  cur_key = "";        # key (SEC PNUM FNUM) of the page being collected
  nChamps = 0;         # number of champions stored for the current page
  split("", top_wd);   # top_wd[0..nChamps-1] = champion words
  split("", top_ct);   # top_ct[0..nChamps-1] = their counts
}

# Stop processing input as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Every non-empty line is one input record.
/./ {
  ct = $1; sc = $2; pn = $3; fn = $4; wd = $5;
  key = (sc " " pn " " fn);

  if (key != cur_key) {
    # First record of a new page: emit the previous page and start over.
    flush_page();
    nChamps = 0;
    cur_key = key;
  } else if ((ct + 0) > last_ct) {
    # Within a page the counts must be non-increasing.
    error("out of order");
  }
  # Remember this count (forced numeric) for the ordering check on the
  # next record.  The original never updated last_ct, so the check was
  # effectively disabled.
  last_ct = ct + 0;

  if (nChamps < maxChamps) {
    top_wd[nChamps] = wd;
    top_ct[nChamps] = ct;
    nChamps++;
  }
}

END {
  if (abort >= 0) { exit abort; }
  flush_page();
}

# Writes the output record for the page currently held in cur_key,
# top_wd and top_ct.  Does nothing if no page has been read yet.
# The parameter i is a local variable, not an argument.
function flush_page(   i) {
  if (cur_key != "") {
    printf "%-14s", cur_key;
    # NOTE(review): the loop body was lost in the damaged source; it is
    # reconstructed from the documented "W1(C1) W2(C2) ..." output
    # format -- confirm the exact spacing against a known-good output.
    for (i = 0; i < nChamps; i++) {
      printf " %s(%s)", top_wd[i], top_ct[i];
    }
    printf "\n";
  }
}

# Reports a problem with the input data on stderr and terminates with
# status 1 (the exit runs the END rule, which sees abort >= 0).
# NOTE(review): the header and message format of this function were lost
# in the damaged source; the "line N:" prefix is a reconstruction -- confirm.
function error(msg) {
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports a problem with the command-line parameters on stderr and
# terminates with status 1.
function arg_error(msg) {
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}