#! /usr/bin/gawk -f
#
# Usage: "$0 -v nmin=NNN -v mw=N.NNN -v mc=N.NNN"
#
# Computes the ratio of two counts for a list of patterns.
# The input must be the output of compare-freqs, in the
# format " NT FT NL FL patt", where "NT" and "NL" are two
# counts, and "FT" and "FL" the corresponding relative
# frequencies.  The output has the format
# " NT FT NL FL rat mk patt", where "rat = (NL+1)/(NT+mc)".
#
# The "mk" field is a class code derived from the two counts
# and the parameters "mw", "mc" and "nmin".  The tests are
# tried in this order and the first one that holds wins:
#
#   "-?"  NT < nmin (unimportant), NL on the low side
#   "+?"  NT < nmin (unimportant), NL on the high side
#   "oo"  NL practically zero
#   "##"  NL practically NT
#   "||"  NL practically the maximum expected
#   "--"  NL on the low side
#   "++"  NL on the high side
#   "!!"  none of the above (program error)
#
# Input lines starting with "##" or "# " are reformatted as
# header lines; lines containing a "digit.digit" sequence are
# treated as data; any other line produces no output.

# Returns the class code for the counts NT and NL.
# "low" and "high" are local variables (awk's extra-parameter
# convention); callers pass only the first five arguments.
function classify(NT, NL, nmin, mw, mc,    low, high)
{
  # The low/high decision is shared by the "?" codes and the
  # plain "--"/"++" codes.  The two tests are kept separate so
  # that, if neither holds, control still reaches "!!".
  low  = (2*mw*(NL+1) <  (NT+mc))
  high = (2*mw*(NL+1) >= (NT+mc))

  if (NT < nmin) {
    # Too few occurrences to be important; only report the side.
    if (low)  return "-?"
    if (high) return "+?"
  }

  if (mw*(NL+1) < nmin)       return "oo"   # NL practically zero
  if ((NL-1) > NT - nmin)     return "##"   # NL practically NT
  if (mw*(NL-1) > NT - nmin)  return "||"   # NL practically maximum expected
  if (low)                    return "--"   # NL on the low side
  if (high)                   return "++"   # NL on the high side

  return "!!"                               # program error
}

# Title header line: drop the "##" marker (which re-splits the
# fields) and re-emit it with the two extra column titles.
/^##/ {
  sub(/^##/, "")
  printf("##%11.11s %11.11s RelFr MK %s\n", $1, $2, $3)
  next
}

# Rule header line: same treatment, with dashes in the new columns.
/^# / {
  sub(/^# /, "")
  printf("# %11.11s %11.11s ----- -- %s\n", $1, $2, $3)
  next
}

# Data line: recognized by a "digit.digit" sequence anywhere in it.
/[0-9]\.[0-9]/ {
  # The parameters are checked here rather than in BEGIN, so they
  # are only demanded once a data line is actually seen.
  if (mw == 0) {
    print "must define mw" > "/dev/stderr"
    exit 1
  }
  if (mc == 0) {
    print "must define mc" > "/dev/stderr"
    exit 1
  }
  if (nmin == 0) {
    print "must define nmin" > "/dev/stderr"
    exit 1
  }

  NT = $1
  NL = $3
  ratio = (NL + 1) / (NT + mc)
  class_code = classify(NT, NL, nmin, mw, mc)

  printf(" %5d %5.3f %5d %5.3f %6.3f %s %s\n", $1, $2, $3, $4, ratio, class_code, $5)
  next
}