#! /usr/bin/gawk -f
# Last edited on 1998-08-02 19:52:06 by stolfi

# Setup: records the usage text, defaults the "cum" and "rem" options to 0
# when they were not given on the command line, rejects the case where both
# are set, and clears the accumulators used by the rules below.
BEGIN {
  # A negative value means no abort has been requested; every rule exits
  # with this value as status once it becomes >= 0.
  abort = -1;
  usage = "compute-int-freqs [ -v cum=BOOL | -v rem=BOOL ] < INFILE > OUTFILE";
  #
  # Reads a file of COUNT ITEM pairs, as produced by "uniq -c".
  # Outputs a similar file with FREQ ITEM lines, where FREQ is
  # the fraction of COUNT relative to the total of all COUNTs, scaled
  # to [1..999] and rounded.
  #
  # If "cum" is set, FREQ is computed from the sum of all COUNTs up to
  # this one (inclusive).  If "rem" is set, FREQ is computed from the
  # sum of all counts following this one (exclusive).

  if (cum == "") { cum = 0; }
  if (rem == "") { rem = 0; }
  if (cum && rem) {
    # NOTE(review): error() is called here but fatal_error() is called in the
    # data rule; neither definition is visible in this part of the file.
    # Confirm that both exist (or unify them).
    error(("compute-int-freqs: you may specify only one of \"cum\" and \"rem\""));
  }

  total = 0;   # running sum of the COUNT field over all data lines
  k = 0;       # number of lines stored so far (data, comment and blank)
  nitems = 0;  # number of data lines stored so far
}

# Comment lines (starting with "#") and lines that are empty or contain only
# spaces: stored verbatim in it[k], with "#" in ct[k] marking them as
# non-data entries.
/^([#]|[ ]*$)/ {
  if (abort >= 0) { exit abort; }
  ct[k] = "#";
  it[k] = $0;
  k++;
  next;
}

# Every other line must be a COUNT ITEM pair: the count is added to the
# total and the pair is stored in ct[k] / it[k] for the END rule.
// {
  if (abort >= 0) { exit abort; }
  if (NF != 2) {
    # Report the record number (NR); the field count (NF) is not a line number.
    fatal_error(("line " NR ": bad input format"));
  }
  total += $1;
  nitems++;
  ct[k] = $1;
  it[k] = $2;
  k++;
  next;
}

# Final pass over the stored lines.
# NOTE(review): the source available for review ends in the middle of the
# loop header below; the visible fragment is reproduced unchanged and the
# remainder of this rule is not shown here.
END {
  if (abort >= 0) { exit abort; }
  den = total;
  # num starts at 0 in "cum" mode and at the grand total in "rem" mode;
  # it is not initialised here when neither option is set.
  if (cum) { num = 0; } else if (rem) { num = den; }
  for (i=0; i