#! /usr/gawk -f
# Last edited on 2004-09-26 10:30:33 by stolfi

BEGIN{
  # NOTE(review): {abort} starts at -1 here; presumably the error helpers
  # and an END rule (outside this view) update and test it -- confirm.
  abort = -1;

  # Usage text.  The leading blank in the literal separates the program
  # name in ARGV[0] from the "-v" option.
  usage = ( ARGV[0] " -v N={N} < SAMPLE.flo > SAMPLE.prr" ); 
  
  # Reads a file with one record per distinct word, in the format
  #   
  #   "{COUNT} {FIRST} {LAST} {WORD}"
  #   
  # as produced by find-first-last-occs.
  # Writes one record per word in the format
  #   
  #   "{COUNT} {FFRAC} {LFRAC} {LPROB} {WORD}"
  #   
  # where
  #   
  #   {FFRAC} and {LFRAC} are {FIRST} and {LAST} mapped
  #   linearly from 0..N-1 to [0_1];
  #   
  #   {LPROB} is derived from {PROB}, the probability that a 
  #   word with {K} = {COUNT} occurrences randomly distributed 
  #   in {N} tokens will have a range width {LAST-FIRST} less 
  #   than or equal to the one observed for {WORD}.
  #   NOTE(review): whether {LPROB} is {PROB} itself or some
  #   transform of it (e.g. a logarithm) is not visible here -- confirm.
  #
  # The variable {N} (total number of tokens) must be supplied
  # by the caller with "-v N=...".
  
  if (N == "") 
    { arg_error("must define N"); }
  else if ((N + 0) < 2) 
    { # The per-record code divides by {N-1}, so {N = 1} would be a 
      # division by zero and {N < 1} would yield negative fractions.
      arg_error(("bad N = " N ", must be at least 2")); 
    }
  
}

/./ { 
  if (NF != 4) { data_error(("bad NF = " NF)); }
  ct = $1; fo = $2; lo = $3; wd = $4;
  fp = fo/(N-1); lp = lo/(N-1);