#! /usr/bin/gawk -f # Last edited on 2004-01-20 03:56:52 by stolfi BEGIN { usage = ( ARGV[0] "\\\n" \ " -v bias=NUM \\\n" \ " -v minskew=NUM -v minhits=NUM \\\n" \ " < INFILE.wfr > OUTFILE.pms" \ ); # Input is a midfix-prefix-suffix frequency file, with fields {COUNT MID PRE SUF}. # # Gathers all record with same {MID}, then for each pair of # {PRE}s {A,B} and each pair of {SUF}s {X,Y}, evaluates the # skewness of the quadruple. # {COUNT MID PRE SUF} where {PRE·MID·SUF = WORD}. # # Only generates combinations where the {MID} field has at least # {minmid} elems; {PRE} has between 1 and {maxpre} elems, and # similarly for {SUF}. abort = -1; if (bias == "") { bias = 1; } if (minskew == "") { minskew = 0.10; } if (minhits == "") { minhits = 3; } omid = ""; L2 = log(2.0); clear_tables(); } (abort >= 0) { exit abort; } /^[ ]*[0-9]/ { if (NF != 4) { data_error(("bad NF = " NF)); } ct = $1; mid = $2; pre = $3; suf = $4; if (mid != omid) { if (omid != "") { process_midfix(); clear_tables(); } omid = mid; } if (! (pre in ct_pre)) { ct_pre[pre] = 0; pre_s[npre] = pre; npre++; } if (! (suf in ct_suf)) { ct_suf[suf] = 0; suf_s[nsuf] = suf; nsuf++; } ct_pre[pre] += ct; ct_suf[suf] += ct; ct_pair[pre,suf] = ct; ct_mid += ct; next; } END { if (abort >= 0) { exit abort; } if (omid != "") { process_midfix(); } } function clear_tables() { npre = 0; split("", pre_s); split("", ct_pre); nsuf = 0; split("", suf_s); split("", ct_suf); split("", ct_pair); ct_mid = 0; } function process_midfix( i,j,A,B,X,Y,nAX,nAY,nBX,nBY,nA,nB,nX,nY,n,d,s) { if ((npre < 2) || (nsuf < 2) || (ct_mid < minhits)) { return; } printf "%7d %s\n", ct_mid, omid > "/dev/stderr"; for (ia = 0; ia < npre; ia++) { A = pre_s[ia]; if (ct_pre[A] >= minhits) { for (ib = ia+1; ib < npre; ib++) { B = pre_s[ib]; if (ct_pre[B] >= minhits) { for (jx = 0; jx < nsuf; jx++) { X = suf_s[jx]; if (ct_suf[X] >= minhits) { nAX = ct_pair[A,X]; nBX = ct_pair[B,X]; nX = nAX + nBX; for (jy = 0; jy < nsuf; jy++) { Y = suf_s[jy]; if (ct_suf[Y] >= minhits) { nAY = ct_pair[A,Y]; nBY = ct_pair[B,Y]; nY = nAY + nBY; nA = nAX + nAY; nB = nBX + nBY; if ((nA >= minhits) && (nB >= minhits) && (nX >= minhits) && (nY >= minhits)) { n = nAX+nAY+nBX+nBY; d = (log(nAX+bias) - log(nAY+bias) - log(nBX+bias) + log(nBY+bias))/L2; s = d*d; if (s > minskew) { printf "%7.2f %d %d %d %d %d %s %s %s %s %s\n", \ s, n, nAX, nAY, nBX, nBY, omid, A, B, X, Y; } } } } } } } } } } } function data_error(msg) { printf "line %d: %s\n", NR, msg >> "/dev/stderr"; abort = 1; exit 1; } function arg_error(msg) { printf "%s\n", msg >> "/dev/stderr"; abort = 1; exit 1; }