#! /n/gnu/bin/gawk -f

# usage
#   compute-strangeness \
#     -v wordCounts=WCNTFILE \
#     -v pageCounts=PSIZFILE \
#     < WORDPAGE.pwct \
#     > WORDPAGE.pwst
#
# The input file must have fields WPCOUNT FNUM WORD
# where WPCOUNT is the number of occurrences of WORD in FNUM.
#
# The output will have fields WPCOUNT WCOUNT PCOUNT STRANG FNUM WORD
# where PCOUNT is the number of words in the page,
# WCOUNT is the total count for WORD in the whole book,
# and STRANG is a measure of how anomalous is the
# WPCOUNT, given the page's pcount and overall probability
# of WORD.
#
# WCNTFILE and PSIZFILE must contain exactly two fields per line,
# KEY COUNT, with no repeated keys.  The sum of the counts must be
# the same in both files.

function error(msg)
{
  # Reports `msg` on stderr, prefixed with the current input record
  # number (0 while still inside BEGIN), and terminates the script
  # with a non-zero exit status so that callers can detect the failure.
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  exit 1
}

function loadCounts(file, optName, tbl,    lin, fld, status, total)
{
  # Loads a two-column "KEY COUNT" table from `file` into the array
  # `tbl` (which is cleared first) and returns the sum of all counts.
  # `optName` is the name of the "-v" option that supplied `file`;
  # it is used only to build the error messages.
  #
  # Terminates through error() if `file` is empty, cannot be read,
  # contains a line without exactly two fields, or repeats a key.
  if (file == "") { error("must specify \"-v " optName "=FILE\"\n"); }
  split("", tbl);
  total = 0;
  while ((status = (getline lin < file)) > 0)
    {
      split(lin, fld);
      if ((3 in fld) || ! (2 in fld)) { error("bad " optName " entry = \"" lin "\""); }
      if (fld[1] in tbl) { error("repeated key = \"" lin "\""); }
      tbl[fld[1]] = fld[2];
      total += fld[2];
    }
  # getline yields -1 (not 0) when the file cannot be opened or read;
  # without this check such a file would look like an empty table.
  if (status < 0) { error("cannot read " optName " file \"" file "\": " ERRNO); }
  close (file);
  return total;
}

BEGIN {
  abort = 0;

  # Read word count table:
  split("", wCount);
  nOccsW = loadCounts(wordCounts, "wordCounts", wCount);
  nWords = length(wCount);
  printf "loaded %6d word counts (%d word occurrences)\n", nWords, nOccsW > "/dev/stderr";

  # Read word count per page table:
  split("", pCount);
  nOccsP = loadCounts(pageCounts, "pageCounts", pCount);
  nPages = length(pCount);
  printf "loaded %6d page sizes (%d word occurrences)\n", nPages, nOccsP > "/dev/stderr";

  # Both tables must account for the same total number of occurrences.
  if (nOccsW != nOccsP) { error("incongruent occurrence counts\n"); }
  nOccs = nOccsW;

  # printf "%6s %6s %6s %6s %-6s %s\n",
  #   "#[w,p]", "#[w]", "#[p]", "str", "fnum", "word" > "/dev/stderr";
  # printf "%6s %6s %6s %6s %-6s %s\n",
  #   "------", "------", "------", "------", "------", "------" > "/dev/stderr";
}

function strangeness(m, n, p,    q,f,g,s,t,e)
{
  # Computes the strangeness of having m or more occurrences
  # of a word in n trials, given that the word does occur
  # at least once, and given that the overall frequency
  # of the word in the book is p.
  #
  # The result is the difference of the log-odds of the observed
  # frequency f = m/(n+1) and of the smoothed overall frequency:
  # log(f/p) - log((1-f)/(1-p)).  It is zero when the two agree.
  #
  # Preconditions visible from the arithmetic: m must be at least 1
  # (otherwise log(f/p) is log(0)), and the smoothed p must be below 1
  # (otherwise q is 0 and g/q divides by zero).

  e = 1/(n+1);
  # Smooth p so that it is never below roughly e, i.e. one
  # occurrence per page-sized sample.
  p = sqrt((p*p + e*e)/(1 + e*e));
  q = 1-p;
  f = m/(n+1);
  g = 1-f;
  s = log(f/p);
  t = log(g/q);
  return s-t;
}

# Main rule: one non-empty input record "WPCOUNT FNUM WORD" yields one
# output record "WPCOUNT WCOUNT PCOUNT STRANG FNUM WORD".
/./ {
  wpct = $1; fnum = $2; word = $3;
  if (!(fnum in pCount)) { error("page \"" fnum "\" not in page size table\n"); }
  pct = pCount[fnum];
  if (!(word in wCount)) { error("word \"" word "\" not in word count table\n"); }
  wct = wCount[word];
  # wct/nOccs is the overall probability of the word in the book.
  str = strangeness(wpct, pct, wct/nOccs);
  print wpct, wct, pct, sprintf("%6.4f", str), fnum, word;
  next;
}