#! /n/gnu/bin/gawk -f
# Last edited on 1998-07-12 05:07:10 by stolfi

BEGIN {
  usage = ( "compute-tuple-info \\\n" \
            "  -v alphabetSize=NUM \\\n" \
            "  < COUNTFILE > INFOTBL" \
          );

  # Reads a file of COUNT WORD pairs, as produced by "uniq -c",
  # containing the observed number of occurrences of 
  # each substring of a fixed size K in some finite text sample.
  # Outputs a similar file with INFO WORD lines, where INFO is
  # the information content of WORD (i.e. log base 2 of the 
  # estimated frequency of WORD in the infinite text.
  #
  # If S is the total number of K-grams in the sample (that is, the
  # sample had length S+K-1) then the estimated frequency of a word
  # that occurs W times is (W + 1)/(S + A^K) where A is the alphabet
  # size (168 for the printable subset of ISO Latin-1).
  #
  # Note that words that do NOT occur in the sample still have 
  # estimated probabilities 1/(S + A^K), hence their information
  # contents is log_2(S + A^K). 

  abort = -1;
  if (alphabetSize == "") { error("should define \"alphabetSize\""); } 
  if ((alphabetSize < 1) || (alphabetSize > 168)) { error("funny \"alphabetSize\""); } 
  A = aphabetSize;
  S = 0;
  K = 0;
  split("", count);
  
}

// {
  if (abort >= 0) { exit(abort); }
  if (NF != 2) { fatal_error(("line " NF ": bad input format")); }
  c = $1;
  w = $2;
  count[w] += c;
  S += $1;
  if (K == 0)
    { K = length(w); }
  else if (K != length(w))
    { fatal_error(("line " NF ": incongruent word length")); }
  next;  
}

END {
  if (abort >= 0) { exit(abort); }
  # Computes logden = -log_2(1/(S + A^K)) 
  #   = log_2(S + A^K) = log_2(A^K) + log_2(1 + S/A^K)
  logden = K*log(A);
  if (log(S) - logden > log(0.0001))
    { logden = logden + log(1 + exp(log(S)-logden)); }
  scale = -1/log(2.0);
  for (w in count)
    { 
      # Computes info = -log2((count[w] + 1)/(S + A^K))
      info = scale*(- log(count[w]+1) + logden);
      printf "%7.3f %s\n", info, w;
    }
}

function error(msg)
{ printf "%s\n", msg >>  "/dev/stderr";
  abort = 1; exit 0;
}