#! /usr/bin/gawk -f
# Last edited on 2000-02-03 12:02:38 by stolfi

# Reads from stdin a bunch of pairs of the form PROB WORD, one per
# line (laid out like the output of "uniq -c", but with a decimal
# PROB in place of the integer count), where the same WORD may occur
# several times.  Adds all PROBs for the same WORD, and writes the
# resulting TOTPROB WORD pairs to standard output, in some order.
#
# Input details:
#   - PROB is an unsigned decimal number that must contain a "." and
#     at least one digit ("0.25", ".25", "1."); it may be preceded by
#     blanks or tabs and must be followed by at least one.
#   - WORD is everything after the blanks that follow PROB, so it may
#     itself contain blanks.
#   - Empty lines are ignored.
#
# Output details:
#   - Each line is printed with the format "%7.5f %s".
#   - Words whose total is below 0.000005 are omitted.
#
# On a usage or format error, prints a message to /dev/stderr, writes
# no totals, and exits with status 1.

BEGIN {
  # abort < 0 means "no error so far"; otherwise it is the exit status.
  abort = -1;
  usage = "combine-probs < INFILE > OUTFILE";
  # The script takes no command-line arguments; data comes from stdin.
  if (ARGC != 1) { error(("usage: " usage)); }
  # Make sure "pr" exists as an (empty) array: pr[WORD] = sum of PROBs.
  split("", pr);
}

# Safety net: stop reading input as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Every non-empty line must be "PROB WORD".
/./ {
  # Require at least one digit in PROB, so that a bare "." is reported
  # as malformed instead of being silently accumulated as zero.
  if (! match($0, /^[ \t]*([0-9]+[.][0-9]*|[.][0-9]+)[ \t]+/)) {
    error(("line " NR ": bad format \"" $0 "\""));
  }
  # match() leaves in RLENGTH the length of the "PROB + blanks" prefix,
  # so the rest of the line (blanks included) is the WORD.
  p = $1;
  w = substr($0, RLENGTH+1);
  pr[w] += p;
}

END {
  # "exit" inside BEGIN or a main rule still runs END; when an error
  # was flagged, propagate its status and print no totals.
  if (abort >= 0) { exit abort; }
  for (w in pr) {
    # Skip totals that would print as 0.00000.
    if (pr[w] >= 0.000005) { printf "%7.5f %s\n", pr[w], w; }
  }
}

# Prints MSG to the standard error stream, records a failure status
# in the global "abort", and terminates with exit status 1.
function error(msg) {
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}