#! /usr/bin/gawk -f
# Last edited on 1999-07-28 01:50:25 by stolfi

BEGIN {
  # Input: one record per line, with three fields
  #
  #     COUNT PAGE WORD
  #
  # where COUNT is the number of occurrences of WORD on page PAGE.
  #
  # From these counts the script derives a Bayes estimate of the
  # frequency of each word on each page, and from that, how much a
  # single occurrence of a word says about which page it came from.
  #
  # Output: one line per word w, with four fields
  #
  #     H(w) PAGE PROB WORD
  #
  # where H(w) is the information measure computed for w, PAGE is the
  # page most likely to contain w, and PROB is the estimated
  # probability of that page given w.

  # Running totals of distinct pages and distinct words seen so far.
  npages = 0;
  nwords = 0;

  # Count tables, declared as (empty) arrays up front:
  #   pct[p]    = total word occurrences on page p
  #   wct[w]    = total occurrences of word w over all pages
  #   pwct[p,w] = occurrences of word w on page p
  delete pct;
  delete wct;
  delete pwct;

  # Exit status requested by an error handler; negative means "no error".
  abort = -1;
}

# Per-record processing, written as a single rule with the sanity
# checks up front.
{
  # An error status has already been recorded: stop reading input.
  if (abort >= 0) { exit abort; }

  # Every record must consist of exactly COUNT PAGE WORD.
  if (NF != 3) { file_error("wrong num of fields"); }

  if ($0 ~ /./)
    { n = $1; p = $2; w = $3;

      # Each (page, word) pair may appear at most once in the input.
      if ((p,w) in pwct) { file_error("repeated word/page pair"); }
      pwct[p,w] += n;

      # First sighting of this page: bump the distinct-page count.
      if (! (p in pct)) { ++npages; }
      pct[p] += n;

      # First sighting of this word: bump the distinct-word count.
      if (! (w in wct)) { ++nwords; }
      wct[w] += n;
    }
}

END {
  # An input check failed and already reported its message; just
  # propagate the exit status without printing any statistics.
  if (abort >= 0) { exit abort; }

  # Loop-invariant divisor that converts natural logarithms to bits.
  ln2 = log(2);

  # pwfr[p,w] is the estimated frequency of word w in the language of page p,
  # computed with add-one smoothing: (count + 1)/(page total + nwords).
  # wfr[w] is the "a priori" probability of w assuming all pages
  # are equally likely, i.e. the mean of pwfr[p,w] over all pages.
  split("", pwfr);
  split("", wfr);
  for (w in wct)
    { wfr[w] = 0;
      for (p in pct)
        { # Test membership before reading: merely referencing pwct[p,w]
          # for a pair that never occurred would insert an empty element
          # and inflate the count table to npages*nwords entries.
          cnt = ((p,w) in pwct) ? pwct[p,w] : 0;
          pwfr[p,w] = (cnt + 1)/(pct[p] + nwords);
          wfr[w] += pwfr[p,w];
        }
      wfr[w] /= npages;
      # printf "%7.5f %s\n", wfr[w], w > "/dev/stderr";
    }

  # wh[w] is the entropy, in bits, of the distribution of the page number
  # given one occurrence of word w.  For each page p, wpg is the
  # probability of p given w (uniform prior 1/npages over pages); the
  # wpg values for a given w sum to 1.  maxp/maxg track the most
  # probable page for w and its probability.
  split("", wh);
  for (w in wct)
    { wh[w] = 0;
      maxg = 0; maxp = 0;
      for (p in pct)
        { wpg = (pwfr[p,w]/npages)/wfr[w];
          # wpg > 0 always because of the +1 smoothing, so log() is safe.
          wh[w] += -wpg*log(wpg)/ln2;
          if (wpg > maxg) { maxg = wpg; maxp = p; }
        }
      # Output fields: entropy, most likely page, its probability, word.
      printf "%7.5f %03d %7.5f %s\n", wh[w], maxp, maxg, w;
    }
}
  
# file_error(msg): report a fatal problem with the current input record.
#
# Writes "file FILENAME, line FNR: msg" to standard error, records exit
# status 1 in the global `abort` (which the END rule checks so that it
# skips its output), and terminates processing with status 1.
function file_error(msg)
{
  abort = 1;
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  exit 1;
}