#! /usr/bin/gawk -f
# Last edited on 1999-07-28 01:42:43 by stolfi

# Usage: compute-lang-index -v coeffs=CFILE < SFILE
#
# Each CFILE line holds exactly two fields: a signed coefficient and a
# word from the reference dictionary.  SFILE (standard input) holds the
# sample text, one word per line; empty lines are ignored.
#
# After reading both files, prints to stdout one line with three numbers:
#
#   count = number of SFILE words that are listed in CFILE;
#
#   index = sum over all CFILE words of  coefficient * freq,  divided by
#           sqrt(sum of squared coefficients), where
#             freq = (occurrences in SFILE + 1) / (count + CFILE lines);
#
#   error = nominal uncertainty of the "index" value, computed as
#           sqrt((1/6) / (count + CFILE lines)).
#
# Diagnostics go to /dev/stderr and the exit status is 1 on any error.
#
# Note: if a word is repeated in CFILE only its last coefficient is kept
# in c[], but every line still contributes to sizeD and sumC2.

BEGIN {
  ok = 1;
  usage = "compute-lang-index -v coeffs=CFILE < SFILE";

  if (coeffs == "") { error(("usage: " usage)); }

  # c[w]     = coefficient of dictionary word w.
  # count[w] = occurrences of w seen so far in the sample.
  split("", c);
  split("", count);
  sizeD = 0;   # Number of lines read from CFILE.
  sumC2 = 0;   # Sum of squared coefficients.

  # Keep the getline status so that a read/open failure (negative) can
  # be told apart from a normal end of file (zero).  Testing ERRNO
  # against "0" is not reliable: gawk leaves it empty when there is
  # no error.
  while ((status = (getline < coeffs)) > 0) {
    sizeD++;
    if (NF != 2) { error((coeffs ", line " sizeD ": bad format")); }
    cf = $1;
    w = $2;
    c[w] = cf;
    sumC2 += (cf * cf);
    count[w] = 0;
  }
  if (status < 0) { error((coeffs ": " ERRNO)); }
  close(coeffs);

  # With no nonzero coefficient the normalization in END would divide
  # by zero; report it here instead of aborting there.
  if (sumC2 == 0) { error((coeffs ": no nonzero coefficients")); }

  sizeSD = 0;  # Number of sample words found in the dictionary.
}

# Sample text: one word per non-empty line.
/./ {
  if (! ok) { exit(1); }
  if (NF != 1) { error(("line " NR ": bad word")); }
  w = $1;
  if (w in c) {
    sizeSD++;
    count[w]++;
  }
  next;
}

END {
  # error() sets ok = 0 and calls exit, which still runs this rule;
  # bail out here so that no result line is printed after a failure.
  if (! ok) { exit(1); }

  sumCF = 0;
  for (w in c) {
    # Occurrence count plus one, over (matched words + CFILE lines).
    fr = (count[w] + 1)/(sizeSD + sizeD);
    sumCF += c[w]*fr;
  }
  scale = sqrt(sumC2);
  printf "%7d %+8.5f %7.5f\n", sizeSD, sumCF/scale, sqrt((1.0/6.0)/(sizeSD + sizeD));
}

# Prints msg to /dev/stderr, marks the run as failed, and exits with
# status 1 (the END rule sees ok == 0 and exits without output).
function error(msg) {
  printf "%s\n", msg > "/dev/stderr";
  ok = 0;
  exit(1);
}