#! /usr/gawk -f
# Last edited on 2004-09-26 10:30:33 by stolfi

BEGIN {
    abort = -1;

    # Command-line synopsis. ARGV[0] is the name awk was invoked under,
    # so the literal must begin with a space to keep it apart from "-v".
    usage = ( ARGV[0] " -v N={N} < SAMPLE.flo > SAMPLE.prr" );

    # Reads a file with one record per distinct word, in the format
    #
    #   "{COUNT} {FIRST} {LAST} {WORD}"
    #
    # as produced by find-first-last-occs.
    # Writes one record per word in the format
    #
    #   "{COUNT} {FFRAC} {LFRAC} {LPROB} {WORD}"
    #
    # where
    #
    #   {FFRAC} and {LFRAC} are {FIRST} and {LAST} mapped
    #   linearly from 0..N-1 to [0_1];
    #
    #   {PROB} is the probability that a word with {K}
    #   occurrences randomly distributed in {N} tokens
    #   will have a range width {LAST-FIRST} less than
    #   or equal to the one observed for {WORD}
    #
    # NOTE(review): the output format names the field {LPROB} while the
    # description above defines {PROB}; confirm how the two are related.
    # {K} is presumably the word's {COUNT} -- TODO confirm.

    # N is mandatory and must be a number greater than 1: the main rule
    # divides by (N-1), so N = 1 would be a fatal division by zero and a
    # non-numeric N would silently produce meaningless fractions.
    # (N + 0) forces a numeric comparison even when N is not number-like.
    if (N == "") {
        arg_error("must define N");
    } else if ((N + 0) < 2) {
        arg_error(("bad N = " N " (must be at least 2)"));
    }
}

# Process every non-empty input record.
/./ {
    if (NF != 4) { data_error(("bad NF = " NF)); }
    ct = $1; fo = $2; lo = $3; wd = $4;
    fp = fo/(N-1);
    lp = lo/(N-1);