#! /usr/bin/gawk -f
# Last edited on 2002-01-20 21:32:57 by stolfi

# Tabulates, for each distinct LENGTH found in the input, the total COUNT,
# its relative frequency, and one or more sample WORDs of that length.

BEGIN {
  # "abort" is -1 while everything is fine; data_error() sets it to the
  # exit status so that later rules (including END) can bail out early.
  abort = -1;

  usage = ( \
    "compute-elem-count-distrib \\\n" \
    " [ -v sampleSep=SEP ] \\\n" \
    " < WORDFILE > FREQFILE " \
  );

  # Input records must be
  #
  #   COUNT WORD LENGTH
  #
  # where WORD is a nonempty string, and LENGTH is some integer.
  # Blank lines and lines whose first non-blank character is "#"
  # are ignored.  Outputs, for each LENGTH from 0 up to one past the
  # largest LENGTH seen, a line
  #
  #   LENGTH COUNT FREQ XWORD
  #
  # where COUNT is the total count for that LENGTH, FREQ is the
  # relative frequency, and XWORD is (by default) a sample WORD of
  # that length (the first one encountered).
  #
  # Optionally, if "sampleSep" is set to a non-empty string SEP, the
  # output XWORD is the concatenation of all WORDs of the same
  # LENGTH, separated by the string SEP.

  hi = -1;             # Largest LENGTH seen so far.
  totct = 0;           # Sum of all COUNT fields.
  split("", mct);      # mct[m] = total count of words with LENGTH m.
  split("", wsample);  # wsample[m] = sample word(s) with LENGTH m.
}

# Stop reading as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Skip comments and blank lines.
/^ *([#]|$)/ { next; }

# Data record: accumulate the count under its LENGTH.
/./ {
  if (NF != 3) { data_error(("wrong number of fields")); }
  ct = $1;
  w = $2;

  # LENGTH must be an integer; anything else cannot be tabulated by the
  # integer loop in END and would corrupt the "hi" comparison.
  if ($3 !~ /^[-+]?[0-9]+$/) { data_error(("length field is not an integer")); }

  # Force a numeric key so that spellings such as "03" or "+3" land in
  # the same bucket that the END loop looks up (array subscripts are
  # strings, and "03" != "3").
  m = $3 + 0;

  if (! (m in mct)) {
    # First word of this length: it becomes the sample.
    wsample[m] = w;
  } else if (sampleSep != "") {
    # Collecting all words: append with the separator.
    wsample[m] = ( wsample[m] sampleSep w );
  }

  mct[m] += ct;
  totct += ct;
  if (m > hi) { hi = m; }
}

END {
  # "exit" inside a main rule (or data_error) still runs END; do not
  # print a partial table after an error, just propagate the status.
  if (abort >= 0) { exit abort; }

  printf "# len count freq %s\n", (sampleSep == "" ? "example" : "strings");
  printf "# --- ------ ------ ------------------\n";

  # The table runs through hi+1, i.e. it ends with one all-zero row past
  # the largest LENGTH seen (behavior kept from the original script).
  for (m = 0; m <= hi+1; m++) {
    ct = ((m in mct) ? mct[m] : 0);
    # Guard against empty input or all-zero counts: division by zero is
    # a fatal error in gawk.
    fr = (totct != 0 ? ct/totct : 0);
    printf " %3d %6d %6.4f %s\n", m, ct, fr, ((m in wsample) ? wsample[m] : "");
  }
}

# Reports "msg" on stderr with the current file name and line number,
# records exit status 1 in "abort", and terminates input processing.
function data_error(msg)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}