#! /usr/bin/gawk -f # Last edited on 1999-07-21 17:41:38 by stolfi BEGIN { usage = "estimate-freqs -v nItems=NITEMS < INFILE > OUTFILE"; # Reads a file of COUNT ITEM pairs, as produced by "uniq -c", # that respresents a finite sample of some discrete random variable. # Outputs a similar file with COUNT FREQ ITEM lines, where FREQ is # the estimated frequency of ITEM in the variable. # The estimator used is (COUNT + 1) / (TOTCOUNT + NITEMS), # where TOTCOUNT is the sampel size (sum of all COUNTs), and # NITEMS is the number of distinct values that the variable # may nominally assume. # # Also outputs an extra line with COUNT = 0, # FREQ = 1/(TOTCOUNT + NITEMS), and ITEM = "*ETC*", # representing all ITEM values that are not listed # explicitly. abort = -1; if (nItems == "") { fatal_error("must define \"nItems\""); } totCount = 0; n = 0; xItems = 0; } (abort >=0 ) { exit abort; } ($2 == "*ETC*") { next; } /^([#]|[ ]*$)/ { ct[n] = "#"; it[n] = $0; n++; next; } // { if (NF != 2) { fatal_error(("line " NF ": bad input format")); } totCount += $1; ct[n] = $1; it[n] = $2; n++; xItems++; next; } END { if (abort >= 0) { exit abort; } if (xItems > nItems) { fatal_error("too many items"); } den = (totCount + nItems); for (i=0; i "/dev/stderr"; abort = 1; exit 1; }