# Last edited on 2000-07-11 10:39:16 by stolfi

function strangeness(c_pf, pgSize, c_p, scSize,   f,g,p,q,pgEx,pr0,pr1)
{
  # Computes the strangeness of having "c_pf" or more occurrences
  # of a word in a page with "pgSize" tokens, given that the word
  # occurs "c_p" times in a section with "scSize" tokens.
  #
  # The per-token probability "p" is estimated as (c_p+1)/(scSize+1).
  # The result is the difference between two "log_binom" values: one
  # at the observed count "c_pf" and one at the expected count "pgEx"
  # (p*pgSize rounded down when "c_pf" is below it, rounded up
  # otherwise).  The difference is taken as (observed - expected) in
  # the first case and (expected - observed) in the second.
  #
  # NOTE(review): "log_binom" and "error" are defined outside this
  # section; "log_binom" presumably returns a log-probability -- confirm.
  # The locals "f" and "g" are unused but kept so the parameter list
  # is unchanged.

  p = (c_p+1)/(scSize+1);
  q = 1-p;
  if (c_pf < p*pgSize)
    {
      # Observed count is below the expectation: compare with floor(p*pgSize).
      pgEx = int(p*pgSize);
      if (c_pf > pgEx) { error(("bad floor " p " " pgSize " " pgEx " " c_pf)); }
      pr0 = log_binom(p, pgEx, q, pgSize - pgEx);
      pr1 = log_binom(p, c_pf, q, pgSize - c_pf);
      return(pr1 - pr0);
    }
  else
    {
      # Observed count is at or above the expectation: compare with
      # the value of p*pgSize rounded up.
      pgEx = pgSize - int(pgSize - p*pgSize);
      if (c_pf < pgEx) { error(("bad ceiling " p " " pgSize " " pgEx " " c_pf)); }
      pr0 = log_binom(p, pgEx, q, pgSize - pgEx);
      pr1 = log_binom(p, c_pf, q, pgSize - c_pf);
      return(pr0 - pr1);
    }
}

function read_page_counts(   lin,fld,nfld,c,p,wds,w)
{
  # Read word / pattern count per page table from the file named by
  # the global "pageCounts".
  #
  # Lines starting with "#" are skipped.  In every other line each ":"
  # is replaced by a blank, and the line must then have exactly three
  # fields: COUNT WORD PATTERN, where COUNT is a string of digits.
  #
  # Sets the globals:
  #   pgtot_p[p]  sum of COUNT over the entries with pattern "p";
  #   pglst_p[p]  the words first seen with pattern "p", each preceded by ",";
  #   pat_w[w]    the pattern associated with word "w";
  #   nPatsP      number of entries read;
  #   pgSize      sum of all COUNTs.
  #
  # Calls "error" if a word appears with a pattern different from the
  # one already recorded in "pat_w".  Note that "pat_w" is not cleared here.
  #
  # "w" is declared local so that the word field does not leak into
  # (or clobber) a global of the same name.  "wds" is unused but kept
  # so the parameter list stays backward-compatible.

  if (pageCounts == "") { error("must specify \"pageCounts\""); }
  split("", pgtot_p);
  split("", pglst_p);
  nPatsP = 0;
  pgSize = 0;
  while ((getline lin < pageCounts) > 0)
    {
      if (! match(lin, /^[#]/))
        {
          gsub(/[:]/, " ", lin);
          nfld = split(lin, fld);
          if (nfld != 3) { error("bad pageCounts entry = \"" lin "\""); }
          c = fld[1]; w = fld[2]; p = fld[3];
          if (c !~ /^[0-9]+$/) { error(("bad format:" lin)); }
          if (! (p in pgtot_p)) { pgtot_p[p] = 0; pglst_p[p] = ""; }
          pgtot_p[p] += c;
          if (w in pat_w)
            {
              # A word must always map to the same pattern.
              if (p != pat_w[w]) { error(("inconsistent pattern :" p)); }
            }
          else
            {
              pglst_p[p] = (pglst_p[p] "," w);
              pat_w[w] = p;
            }
          nPatsP++;
          pgSize += c;
        }
    }
  close(pageCounts);
  printf "loaded %6d word/pattern pairs in page (%d tokens)\n", nPatsP, pgSize > "/dev/stderr";
}

function read_sec_counts(   lin,fld,nfld,c,p,w)
{
  # Read word count table from the file named by the global "secCounts".
  #
  # Lines starting with "#" are skipped.  In every other line each ":"
  # is replaced by a blank, and the line must then have exactly three
  # fields: COUNT WORD PATTERN, where COUNT is a string of digits.
  #
  # Sets the globals:
  #   sctot_p[p]  sum of COUNT over the entries with pattern "p";
  #   nPatsS      number of entries read;
  #   scSize      sum of all COUNTs.
  #
  # "w" is declared local so that the word field does not leak into
  # (or clobber) a global of the same name.

  if (secCounts == "") { error("must specify \"secCounts\""); }
  split("", sctot_p);
  nPatsS = 0;
  scSize = 0;
  while ((getline lin < secCounts) > 0)
    {
      if (! match(lin, /^[#]/))
        {
          gsub(/[:]/, " ", lin);
          nfld = split(lin, fld);
          if (nfld != 3) { error("bad secCounts entry = \"" lin "\""); }
          c = fld[1]; w = fld[2]; p = fld[3];
          if (c !~ /^[0-9]+$/) { error(("bad format:" lin)); }
          if (! (p in sctot_p)) { sctot_p[p] = 0; }
          sctot_p[p] += c;
          nPatsS++;
          scSize += c;
        }
    }
  # Close the file that was actually read (the original closed "wordCounts",
  # leaving "secCounts" open).
  close(secCounts);
  printf "loaded %6d word/pattern pairs in section (%d tokens)\n", nPatsS, scSize > "/dev/stderr";
}

function compute_hues(   k,pat,ksum)
{
  # For each pattern "pat" in the global "selected" whose section
  # count sctot_p[pat] exceeds 1, sets
  #
  #   whue[pat] = (S/sctot_p[pat] - 0.5)/nfnums
  #
  # where S is the sum of k*pgtot_pf[pat,fnum_k[k]] over the indices
  # k in 1..nfnums for which that entry exists.
  #
  # "ksum" is reset for every pattern; otherwise each hue would also
  # include the sums of the patterns visited before it, and the result
  # would depend on the (unspecified) traversal order of "for (pat in ...)".
  #
  # Note: "scCt", "fnum" and "pgCt" are not in the local list, so they
  # are assigned as globals, as in the original code.

  for (pat in selected)
    {
      scCt = sctot_p[pat];
      if (scCt > 1)
        {
          ksum = 0;
          for (k = 1; k <= nfnums; k++)
            {
              fnum = fnum_k[k];
              if ((pat,fnum) in pgtot_pf)
                {
                  pgCt = pgtot_pf[pat,fnum];
                  ksum += pgCt*k;
                }
            }
          whue[pat] = (ksum/scCt - 0.5)/nfnums;
        }
    }
}