#! /n/gnu/bin/gawk -f
# Usage: "$0 [-v maxlen=MAXLEN] [-v wdefs=WDEFS] [-v ctwd=CTWD] [-v nblocks=NBLOCKS] \
#            [-v percent={0|1}] [-v avg={0|1}] < INFILE > OUTFILE"
#
# This script reads from standard input a list of triples BLOCK WORD OCC,
# where BLOCK is a block number (from 1), WORD is a word occurring
# in that block, and OCC is WORD as it actually occurred in the block. The
# file should be sorted by WORD and then OCC.
#
# If "wdefs" is specified, it also reads from the file called WDEFS a
# list of records of the form
#
#   WORD CLASS SKEY DESCR
#
# where WORD is a Voynich word, CLASS is a short (3 bytes or less) tag
# for WORD, SKEY is a short key for sorting (4 bytes or less,
# e.g. page number), and DESCR is an arbitrary string (without
# embedded blanks) for WORD. In case of multiple entries for the same
# word, the last one takes precedence.
#
# The script prints a table of the form
#
#   WORD OCC CLASS SKEY AVP DEV TOTCT XXX...XXX YYY...YYY DESCR
#
# where
#
#   WORD is one word occurring in the input data;
#
#   OCC is WORD as it actually occurred in the input data;
#
#   CLASS is the class of WORD as specified in the WDEFS file, or "-" if
#     no class was specified;
#
#   SKEY is the sorting key associated with the word in the WDEFS file,
#     or "-" if none;
#
#   AVP and DEV (printed only if avg=1) are the mean and the estimated
#     standard deviation of the block position for this word;
#
#   TOTCT is the total number of occurrences of WORD;
#
#   XXX...XXX are the counts of how many times that word
#     occurred in each block (with '.' meaning 0);
#
#   YYY...YYY (printed only if PERCENT is 1) are the same counts
#     expressed as percentages of TOTCT;
#
#   DESCR is the description string associated with the word
#     in the WDEFS file, or "-" if none.
#
# Each count is printed with CTWD bytes. If CTWD > 1 then the maximum
# value printed MAXCT is 10^(CTWD-1)-1, with at least one leading blank;
# else MAXCT is 9. The percentages are scaled from
# [0% _ 100%] to [0 _ MAXCT] and rounded.
BEGIN {
  # State of the (WORD, OCC) group currently being accumulated.
  word = ""; wocc = "";
  # Set when a fatal input error is found, so that END prints nothing more.
  failed = 0;

  # Defaults for the command-line ("-v") parameters.
  if (maxlen == 0) maxlen = 16;
  if (nblocks == 0) nblocks = 20;
  if (ctwd == 0) ctwd = 1;

  # Largest value that fits in a count column: 9 for single-byte columns,
  # otherwise 10^(ctwd-1)-1 so that at least one leading blank remains.
  if (ctwd == 1) {
    maxct = 9;
  } else {
    maxct = 1;
    for (i = 1; i < ctwd; i++) maxct *= 10;
    maxct -= 1;
  }

  # Optional word definitions: one "WORD CLASS SKEY DESCR" record per line.
  # A later record for the same word overrides an earlier one.
  if (wdefs != "") {
    while ((rdstat = (getline lin < wdefs)) > 0) {
      split(lin, wfld);
      # Exactly four fields are required.
      if ((! (4 in wfld)) || (5 in wfld)) {
        printf "bad wdefs = %s\n", lin > "/dev/stderr";
        continue;
      }
      wd = wfld[1];
      wdclas[wd] = wfld[2];
      wdskey[wd] = wfld[3];
      wddesc[wd] = wfld[4];
    }
    # getline returns -1 when the file cannot be opened or read; report it
    # instead of silently running without any definitions.
    if (rdstat < 0)
      printf "cannot read wdefs file = %s\n", wdefs > "/dev/stderr";
    close(wdefs);
  }
}

function avp(c,    i, s, n) {
  # Computes the average word position from histogram "c".
  # Block "i" is taken to sit at position i - 0.5.
  # The histogram must contain at least one occurrence.
  s = 0.0;
  n = 0;
  for (i in c) { s += (i - 0.5)*c[i]; n += c[i]; }
  return s/n;
}

function dev(c, a,    i, d, bias, slop, ss, n) {
  # Computes the estimated standard deviation of the word position from the
  # histogram "c" and average position "a".
  # The bias term tries to fix the deviation so that
  # rare words do not come out looking localized.
  ss = 0.0;
  n = 0;
  for (i in c) { d = (i - 0.5) - a; ss += (d*d)*c[i]; n += c[i]; }
  slop = (nblocks - 1.0)/n;
  bias = (1.0 + slop*slop)/12.0;
  return sqrt(ss/n + bias);
}

function blocknum(s,    n) {
  # Returns the block number denoted by the text "s" as a number in
  # 1..nblocks, or 0 if "s" is not a plain decimal integer in that range.
  # Converting to a number matters: array subscripts are strings, so a
  # field such as "03" would otherwise index a different element than 3.
  if (s !~ /^[0-9]+$/) return 0;
  n = s + 0;
  if ((n < 1) || (n > nblocks)) return 0;
  return n;
}

function printword(w, o, t, c,    i, a, d, clas, skey, desc) {
  # Prints one table row: word "w", occurrence "o", class and sort key,
  # optionally the average position and deviation, total count "t",
  # the per-block location map "c", optionally the scaled percentages,
  # and finally the description.
  printf "%-*s ", maxlen, w;
  printf "%-*s ", maxlen, o;

  # Look up the word's attributes; "-" stands for a missing value.
  if (w in wdclas) {
    clas = wdclas[w]; skey = wdskey[w]; desc = wddesc[w];
  } else {
    clas = ""; skey = ""; desc = "";
  }
  printf "%3.3s ", (clas == "" ? "-" : clas);
  printf "%4.4s ", (skey == "" ? "-" : skey);

  if (avg) {
    a = avp(c);
    d = dev(c, a);
    printf "%5.1f %5.1f ", a, d;
  }
  printf "%5d ", t;

  # Absolute counts, clipped at maxct; '.' means zero.
  for (i = 1; i <= nblocks; i++) {
    if (c[i] == 0)
      printf "%*s", ctwd, ".";
    else if (c[i] >= maxct)
      printf "%*d", ctwd, maxct;
    else
      printf "%*d", ctwd, c[i];
  }

  # Counts as fractions of the total, scaled to 0..maxct and rounded.
  if (percent != 0) {
    printf " ";
    for (i = 1; i <= nblocks; i++) {
      if (c[i] == 0)
        printf "%*s", ctwd, ".";
      else if (c[i] >= t)
        printf "%*d", ctwd, maxct;
      else
        printf "%*d", ctwd, int((c[i]*maxct)/t + 0.5);
    }
  }

  printf " %s", (desc == "" ? "-" : desc);
  printf "\n";
}

/./ {
  # Input is sorted by WORD and then OCC, so a change in either field
  # ends the current group: flush it and start a fresh histogram.
  if (($2 != word) || ($3 != wocc)) {
    if (word != "") printword(word, wocc, totct, wmap);
    for (i = 1; i <= nblocks; i++) wmap[i] = 0;
    totct = 0;
    word = $2; wocc = $3;
  }

  # Validate before counting, so a bad record never reaches the totals.
  block = blocknum($1);
  if (block == 0) {
    # Print the offending text itself ("%d" would show any non-number as 0).
    printf "bad block number = %s\n", $1 > "/dev/stderr";
    failed = 1;
    exit 1;
  }
  totct++;
  wmap[block]++;
}

END {
  # "exit" in a rule still runs END; do not emit a partial row after an error.
  if (failed) exit 1;
  if (word != "") printword(word, wocc, totct, wmap);
}