#! /usr/bin/gawk -f
# Last edited on 1999-07-28 01:50:54 by stolfi

BEGIN {
  # Input: records of the form "COUNT PAGE WORD", where COUNT is the
  # number of occurrences of WORD on page PAGE.
  #
  # Output: one record per distinct WORD, of the form
  #
  #   WFR MAXFR PMAX SPECF PWCT PCT WCT WORD
  #
  # where WFR is the smoothed frequency of WORD over the whole input,
  # MAXFR is its largest smoothed per-page frequency, PMAX is the page
  # where that maximum occurs, SPECF is the ratio MAXFR/WFR, PWCT is
  # the count of WORD on page PMAX, PCT is the total count on page
  # PMAX, and WCT is the total count of WORD.

  # Exit status requested so far; a negative value means "none".
  abort = -1;

  # Number of distinct words and distinct pages seen.
  nwords = npages = 0;

  # Tallies, all starting out as empty arrays:
  split("", pct);    # pct[p]    = total count on page p
  split("", wct);    # wct[w]    = total count of word w
  split("", pwct);   # pwct[p,w] = count of word w on page p
}

# Once an exit status has been recorded in `abort`, stop with that status.
abort >= 0 { exit abort }

# Every record must have exactly three fields (COUNT PAGE WORD);
# anything else, including a blank line (NF == 0), is reported as an error.
NF != 3 { file_error("wrong num of fields") }

# Fold one "COUNT PAGE WORD" record into the tallies.
/./ {
  count = $1;
  page  = $2;
  word  = $3;

  # A given (page, word) pair may appear in the input only once.
  if ((page, word) in pwct) { file_error("repeated word/page pair"); }

  # Count first-time words and pages before their tallies are created.
  if (!(word in wct)) { nwords++; }
  if (!(page in pct)) { npages++; }

  pwct[page, word] += count;   # occurrences of this word on this page
  wct[word]        += count;   # occurrences of this word on all pages
  pct[page]        += count;   # occurrences of all words on this page
  ct               += count;   # grand total of occurrences
}

# For each word, find the page on which its smoothed frequency is highest
# and print one summary record:
#
#   WFR MAXFR PMAX SPECF PWCT PCT WCT WORD
#
END {
  if (abort >= 0) { exit abort; }
  
  # wfr is the estimated frequency of word w in the overall language.
  # pwfr is the estimated frequency of word w in the "language" of page p.
  # Both use add-one smoothing: (count + 1)/(total + nwords).
  for (w in wct)
    { maxfr = -1;
      wfr = (wct[w] + 1)/(ct + nwords);
      for (p in pct)
        { # Merely referencing pwct[p,w] would create that element, growing
          # the array to npages*nwords entries; test membership instead and
          # treat an absent pair as a count of zero.
          pwc = ((p,w) in pwct) ? pwct[p,w] : 0;
          pwfr = (pwc + 1)/(pct[p] + nwords);
          # Strict ">" keeps the first page encountered among ties.
          if (pwfr > maxfr) { maxfr = pwfr; pmax = p; maxct = pwc; }
        }
      # Ratio of the best per-page frequency to the overall frequency.
      specf = maxfr/wfr;
      printf "%7.5f %7.5f %03d %7.5f %d %d %d %s\n", 
             wfr, maxfr, pmax, specf, maxct, pct[pmax], wct[w], w ;
    }
}
  
# Reports `msg` on standard error, prefixed with the current input file
# name and line number, records exit status 1 in `abort`, and exits.
function file_error(msg)
{
  # Record the status first so that the END rule, which still runs after
  # `exit`, sees it and terminates without producing output.
  abort = 1;
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  exit abort;
}