#! /usr/bin/gawk -f
#
# Usage: "$0 -v nmin=NNN -v mw=N.NNN -v mc=N.NNN"
#
# Annotates a list of patterns with the ratio of two counts and a
# class code derived from that ratio.
#
# Input:  the output of compare-freqs, one record per line, in the
#         format " NT FT NL FL patt", where "NT" and "NL" are two
#         counts and "FT" and "FL" are the corresponding relative
#         frequencies.
# Output: " NT FT NL FL rat mk patt", where "rat = NL/(NT+2)" and
#         "mk" is the class code described below.
#
# Lines starting with "##" or "# " are treated as header lines and
# are re-emitted with two extra column titles.  Only lines that
# contain a decimal number (digit, ".", digit) are treated as data.
#
# Class codes ("mk"), chosen from the ratio and the parameters
# "mw", "mc" and "nmin":
#
#   rat >= 1/mw            NT >= nmin   "++"  probably a word break
#                          otherwise    "+?"  unimportant, but looks like a word break
#   0.005 <= rat < 1/mw    NL >= nmin   "::"  possible syllable break
#                          otherwise    ":?"  uncertain, but looks like a syllable break
#   rat < 0.005            2*NT < mc    "??"  too rare, cannot tell
#                          NT < 2*mc    "-?"  uncertain, but looks like a non-break
#                          otherwise    "--"  non-break

# Returns the two-character class code for one pattern.
#   NT, NL  - the two counts of the record
#   ratio   - the precomputed ratio NL/(NT+2)
#   nmin    - minimum count for a "certain" verdict in the first two bands
#   mw      - the word-break band starts at ratio >= 1/mw
#   mc      - count threshold used in the low-ratio band
function classify(NT, NL, ratio, nmin, mw, mc)
{
  # Highest band: the ratio reaches the word-break threshold.
  if (ratio >= 1.0/mw)
    return (NT >= nmin) ? "++" : "+?"

  # Middle band: small but non-negligible ratio.
  if (ratio >= 0.005)
    return (NL >= nmin) ? "::" : ":?"

  # Lowest band: the verdict depends only on how large NT is.
  if (2*NT < mc)
    return "??"
  if (NT < 2*mc)
    return "-?"
  return "--"
}

# Aborts the run with exit status 1 when a required parameter compares
# equal to zero (which is also the case when it was never assigned).
function require_param(name, value)
{
  if (value == 0) {
    print "must define " name > "/dev/stderr"
    exit 1
  }
}

# Title line: drop the leading "##", then insert the new column titles.
/^##/ {
  sub(/^##/, "")
  printf "##%11.11s %11.11s RelFr MK %s\n", $1, $2, $3
  next
}

# Ruler line: drop the leading "# ", then insert matching dashes.
/^# / {
  sub(/^# /, "")
  printf "# %11.11s %11.11s ----- -- %s\n", $1, $2, $3
  next
}

# Data line: anything that contains a decimal number.
/[0-9]\.[0-9]/ {
  # The parameters are validated here, on the first data record,
  # in the order mw, mc, nmin.
  require_param("mw", mw)
  require_param("mc", mc)
  require_param("nmin", nmin)

  NT = $1
  NL = $3
  rat = NL / (NT + 2)
  mark = classify(NT, NL, rat, nmin, mw, mc)

  printf " %5d %5.3f %5d %5.3f %6.3f %s %s\n", $1, $2, $3, $4, rat, mark, $5
  next
}