#! /n/gnu/bin/gawk -f
# Last edited on 1999-01-16 07:13:50 by stolfi

# Reads a file containing records with one of the formats
#
#    FREQ PNUM FNUM STRING          FREQ PNUM STRING
#    1    2    3    4        or     1    2    3
#
# where
#
#    PNUM   is a sequential page number, "001" to "234".
#
#    FNUM   is an extra per-page label, copied through to the output
#           (presumably the folio name of the page -- TODO confirm).
#           When the record has only three fields, "f???" is used.
#
#    STRING is a non-empty string without embedded blanks.
#
#    FREQ   is a count of occurrences of STRING on page PNUM.
#
# All records of the same STRING must be consecutive (e.g. the input
# is sorted by STRING): a new output record is started whenever
# STRING differs from that of the previous input record.
#
# Outputs a file containing, for each run of records with the same
# STRING, one record with the format
#
#    TOTFR MAXFR SPECF PMAX FMAX STRING
#    1     2     3     4    5    6
#
# where
#
#    TOTFR  is the total occurrences of STRING.
#
#    MAXFR  is the maximum occurrence count in any page.
#
#    SPECF  is the ratio MAXFR/TOTFR (0 if TOTFR is 0).
#
#    PMAX   is one of the pages where STRING occurs with freq. MAXFR
#           (the last such page in input order).
#
#    FMAX   is the FNUM label of page PMAX.

function clear_counts()
{
  # Resets the total and the per-page maximum for a new STRING.
  totct = 0;
  maxct = 0;
  maxpg = "???";
  maxfl = "f???";
}

function output_word(   specf)
{
  # Writes the output record for the current STRING ({curwd}).
  # {specf} is a local variable (awk extra-parameter idiom).
  # Guard the ratio: dividing by a zero total is a fatal error in awk.
  specf = (totct != 0 ? maxct/totct : 0);
  printf "%7d %7d %5.3f %s %s %s\n", totct, maxct, specf, maxpg, maxfl, curwd;
}

BEGIN {
  abort = -1;
  curwd = "";
  clear_counts();
}

# Once {abort} is set, stop reading input (the END rule still runs).
(abort >= 0) { exit; }

/./ {
  ct = $1; pg = $2;
  # Accept both record layouts; see the header comment.
  if (NF >= 4)
    { fl = $3; wd = $4; }
  else
    { fl = "f???"; wd = $3; }
  if (wd != curwd)
    {
      # STRING changed: flush the previous one and start afresh.
      if (curwd != "") output_word();
      clear_counts();
      curwd = wd;
    }
  totct += ct;
  # ">=" so that the first record of a STRING always sets the page,
  # even when its count is zero; ties go to the latest page.
  if (ct >= maxct) { maxct = ct; maxpg = pg; maxfl = fl; }
  next;
}

END {
  # Do not emit a partial record after an abort.
  if (abort >= 0) { exit abort; }
  # Flush the last STRING, which has no following record to trigger it.
  if (curwd != "") output_word();
}