#! /usr/bin/gawk -f
# Last edited on 1999-07-28 01:40:00 by stolfi

# Usage: $0 -v dic=DICFILE < COUNTFILE > PROBFILE
#
# Reads a list of words DICFILE, in some specific order, and a file
# of word counts, as produced by "uniq -c".  Outputs another file
# PROBFILE whose lines have the form PROB WORD, where WORD
# enumerates all the words of DICFILE, in their proper order, and
# PROB is an estimate of the probability of WORD in the infinite
# text.
#
# The estimate is (count + 1) / (sizeSD + sizeD), where sizeD is the
# number of lines read from DICFILE and sizeSD is the sum of the counts
# of those COUNTFILE entries whose word occurs in DICFILE.
#
# Lines of COUNTFILE that start with "#" and empty lines are skipped.
# Progress information and error messages are written to /dev/stderr.

# Reports a data error on stderr and terminates with exit status 1.
# The `abort' flag tells the END rule (which still runs after `exit')
# to stop immediately instead of printing results.
function error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit(1);
}

# Reports a command-line or dictionary-file error; same mechanics as error().
function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit(1);
}

BEGIN {
  # abort < 0 means "no error so far"; otherwise it is the exit status.
  abort = -1;
  printf "**" > "/dev/stderr";
  usage = "$0 -v dic=DICFILE < COUNTFILE > PROBFILE";

  # No file operands are accepted: the counts come from standard input.
  if (ARGC != 1) { arg_error(("usage: " usage "\n")); }
  if (dic == "") { arg_error(("usage: " usage "\n")); }

  # wd[0..sizeD-1] holds the dictionary words in file order;
  # ct[w] accumulates the count of each dictionary word w.
  split("", wd);
  split("", ct);
  sizeD = 0;
  while ((status = (getline < dic)) > 0)
    {
      wd[sizeD] = $1;
      sizeD++;
      if (NF != 1) { arg_error((dic ", line " sizeD ": bad format")); }
      ct[$1] = 0;
    }
  # getline yields 0 at end of file and a negative value on failure.
  # Test that instead of ERRNO, which is an empty string (not "0")
  # when no error has occurred.
  if (status < 0) { arg_error((dic ": " ERRNO)); }
  printf "size(D) = %6d ", sizeD > "/dev/stderr";
  close(dic);

  sizeSD = 0;
}

# Stop processing input once an error has been flagged.
(abort >= 0) { exit abort; }

# Skip comment lines.
/^#/ { next; }

# Every other non-empty line must be "COUNT WORD".
/./ {
  if (NF != 2)
    {
      # `counts' may optionally be supplied with -v to name the input
      # in diagnostics; otherwise the input is standard input.
      src = (counts != "" ? counts : "stdin");
      error((src ", line " NR ": bad format"));
    }
  c = $1;
  w = $2;
  # Words that are not in the dictionary are ignored.
  if (w in ct) { ct[w] += c; sizeSD += c; }
  next;
}

END {
  if (abort >= 0) { exit abort; }
  printf "size(S.D) = %7d\n", sizeSD > "/dev/stderr";
  for (i = 0; i < sizeD; i++)
    {
      w = wd[i];
      c = ct[w];
      # Add-one estimate over the dictionary.
      efr = ((c + 1) / (sizeSD + sizeD));
      printf "%7.5f %s\n", efr, w;
    }
}