#! /usr/bin/gawk -f
# Last edited on 2000-07-10 08:44:59 by stolfi

# Reads a file with entries for all patterns and all pages
# of one section, of the form
#
#   PTOTCT PFREQ STOTCT SFREQ STRANG FNUM KNUM PATTERN WORDLIST
#
# where PTOTCT STOTCT are the counts of the PATTERN in the page FNUM
# (= page KNUM of section) and in the whole section; PFREQ and SFREQ
# are the PATTERN's estimated frequencies in the page and section;
# STRANG is a measure of how anomalous the PTOTCT is, and WORDLIST is
# the comma-separated list of all WORD values associated to PATTERN
# in the page.
#
# Repeatedly picks the not-yet-chosen pattern with the highest
# strangeness among (pattern, page) pairs whose page count is at least 2
# and whose page has not yet reached the "maxDensity" fraction of
# colored tokens.  Stops after "maxPatterns" patterns, when the best
# remaining strangeness drops below "minStrangeness", or when no
# eligible pair remains.
#
# Writes to standard output one line "HUE PATTERN" per chosen pattern,
# where HUE is derived from the KNUM of the page in which the pattern
# has its largest count.  Progress and per-page densities are written
# to "/dev/stderr".

BEGIN {
  # "abort" is the pending exit status; negative means "no error so far".
  abort = -1;
  usage = ( "choose-peculiar-words \\\n" \
    " -v maxPatterns=NUM \\\n" \
    " -v maxDensity=FRAC \\\n" \
    " -v minStrangeness=NUM \\\n" \
    " < PATTERNS.strx \\\n" \
    " > WORDS.dic" \
  );

  # All three parameters are mandatory (and must be nonzero).
  if (maxPatterns == 0) { error("must specify \"-v maxPatterns=NUM\""); }
  if (maxDensity == 0) { error("must specify \"-v maxDensity=FRAC\""); }
  if (minStrangeness == 0) { error("must specify \"-v minStrangeness=NUM\""); }

  # split("", a) forces "a" to be an (empty) array.
  split("", tot_pf);  # Page count (PTOTCT) per pattern and page.
  split("", str_pf);  # Strangeness (STRANG) per pattern and page.
  split("", tot_p);   # Section count (STOTCT) per pattern.
  split("", tot_f);   # Sum of page counts per page.
  split("", pat_w);   # Pattern of each word.
  split("", knum_f);  # k-num of each fnum.
  split("", fnum_k);  # f-number of each knum.
  kMax = 0;           # Maximum knum.
  tot1 = 0;           # Total tokens (from per-section counts).
  tot2 = 0;           # Total tokens (from per-page counts).
  nPairs = 0;         # Number of (pattern, page) entries read.
}

# Stop reading as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# One input entry per non-empty line.
/./ {
  if (NF != 9) { error("bad entry = \"" $0 "\""); }
  # pgFr and scFr are read only to document the field layout.
  pgCt = $1; pgFr = $2; scCt = $3; scFr = $4; strang = $5;
  fnum = $6; knum = $7; pat = $8; wdlist = $9;

  # Each (pattern, page) pair may appear only once.
  if ((pat,fnum) in tot_pf) { error("repeated key = \"" $0 "\""); }
  tot_pf[pat,fnum] = pgCt;

  # The section count of a pattern must agree across all its entries;
  # it is added to tot1 only the first time the pattern is seen.
  if (pat in tot_p) {
    if (tot_p[pat] != scCt) { error(("inconsistent section totals")); }
  } else {
    tot_p[pat] = scCt;
    tot1 += scCt;
  }
  tot_f[fnum] += pgCt;

  # A knum must always map to the same fnum.
  if ((knum in fnum_k) && (fnum_k[knum] != fnum)) { error("bad knum"); }
  knum_f[fnum] = knum;
  fnum_k[knum] = fnum;
  if (knum > kMax) { kMax = knum; }

  str_pf[pat,fnum] = strang;

  # Each word must belong to a single pattern (pat_w is used only for
  # this consistency check).
  nwd = split(wdlist, wd, ",");
  for (i = 1; i <= nwd; i++) {
    w = wd[i];
    if (w in pat_w) {
      if (pat != pat_w[w]) { error(("inconsistent pattern " w " " pat)); }
    } else {
      pat_w[w] = pat;
    }
  }

  tot2 += pgCt;
  nPairs++;
}

END {
  if (abort >= 0) { exit abort; }
  # The per-section and per-page counts must add up to the same total.
  if (tot1 != tot2) { error(("inconsistent totals = " tot1 "," tot2)); }
  printf "loaded %6d patern-page counts (%d tokens)\n", nPairs, tot1 > "/dev/stderr";

  # (pat in chosen) holds if pattern pat has been chosen for coloring.
  split("", chosen);
  nPatChosen = 0;

  # nColored[fnum] is the number of tokens already colored in page fnum.
  split("", nColored);

  while (nPatChosen < maxPatterns) {
    # Choose another pattern to color:
    #   pMax is the strangest not-yet-chosen pattern,
    #   fMax is the fnum where that strangeness occurs,
    #   sMax is its strangeness (only values above -1 qualify).
    sMax = -1; pMax = ""; fMax = ""; found = 0;
    for (pf in tot_pf) {
      split(pf, pfx, SUBSEP);
      pat = pfx[1]; fnum = pfx[2];
      pfct = tot_pf[pf];
      maxColored = maxDensity * tot_f[fnum];
      # Referencing nColored[fnum] here also registers the page for the
      # density report at the end, even if nothing gets colored in it.
      if ((pfct >= 2) && (! (pat in chosen)) && (nColored[fnum] < maxColored)) {
        strn = str_pf[pf];
        if (strn > sMax) {
          sMax = strn; pMax = pat; fMax = fnum; found = 1;
        }
      }
    }

    # No eligible pair left: stop instead of reusing stale pMax/fMax.
    if (! found) { break; }

    printf " * %-15s %-6s %8.4f\n", pMax, fMax, sMax > "/dev/stderr";
    if (sMax < minStrangeness) { break; }

    # Mark the pattern as chosen, charge its tokens to every page where
    # it occurs, and remove its entries from tot_pf.  Along the way find
    # kTop, the knum of the page where the pattern has its largest count.
    chosen[pMax] = 1;
    nPatChosen++;
    kTop = 0; nTop = 0;
    for (fnum in tot_f) {
      if ((pMax,fnum) in tot_pf) {
        pgCt = tot_pf[pMax,fnum];
        nColored[fnum] += pgCt;
        if (pgCt > nTop) { nTop = pgCt; kTop = knum_f[fnum]; }
        delete tot_pf[pMax,fnum];
      }
    }

    # Compute the hue from kTop, scaled by the largest knum seen:
    hue = (kTop + 0.5)/(kMax + 1);
    printf "%6.4f %s\n", hue, pMax;
  }

  printf "%7d patterns chosen\n", nPatChosen > "/dev/stderr";
  printf "densities of colored words per page:\n" > "/dev/stderr";
  for (fnum in nColored) {
    dens = nColored[fnum]/tot_f[fnum];
    # "+" flags pages whose colored density exceeds maxDensity.
    printf "%-6s %7.5f %s\n", fnum, dens, (dens > maxDensity ? "+" : "-") > "/dev/stderr";
  }
}

# Reports "msg" (with the current record number) and the usage text on
# "/dev/stderr", then terminates with exit status 1.
function error(msg) {
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  # Pass the status explicitly: a bare "exit" issued from the END rule
  # would terminate with a success status.
  exit abort;
}