#! /usr/bin/gawk -f
# Last edited on 2000-07-11 10:43:18 by stolfi

# Usage:
#
#   compute-strangeness \
#     -v secCounts=SECFILE.pwct \
#     -v pageCounts=PAGEFILE.pwct \
#     < INFILE.pwct \
#     > OUTFILE.strf
#
# NOTE(review): the variables secCounts and pageCounts appear in the
# usage line but are never read by this script; all statistics are
# derived from the standard input alone.
#
# Blank lines and lines whose first non-blank character is "#" are
# ignored.  Every other input record must have exactly five fields,
# which the script reads as
#
#   COUNT FNUM KNUM PATTERN WORD
#
# where COUNT is the number of occurrences of WORD (which belongs to
# class PATTERN) in the page FNUM (whose index in the section is
# KNUM).
#
# NOTE(review): an earlier version of this header listed the last two
# fields as "WORD PATTERN"; the code has always taken the pattern from
# field 4 and the word from field 5.  Confirm against real input data.
#
# The output has one line for every PATTERN and every page of the
# section (including pages where the PATTERN does not occur), with
# fields
#
#   PCOUNT PFREQ SCOUNT SFREQ LUMPY STRANG FNUM KNUM PATTERN WORDLIST
#
# where PCOUNT and SCOUNT are the counts of the PATTERN in the page
# FNUM and in the whole section, PFREQ and SFREQ are the PATTERN's
# estimated (add-one smoothed) frequencies in the page and in the
# section, LUMPY is a measure of the lumpiness of the PATTERN's
# distribution over the pages of the section, STRANG is the per-page
# indicator stored by compute_strangs, and WORDLIST is the
# comma-separated list of all WORD values associated with PATTERN in
# the page (empty if there are none).
#
# Each WORD must be associated with only one PATTERN, each FNUM with
# only one KNUM, and vice versa; violations abort with exit status 1.

BEGIN {
  abort = -1;          # Exit status requested by error(); negative means "none".
  split("", pat_w);    # Pattern corresponding to each word.
  split("", tot_pf);   # Token counts of each pattern in each page.
  split("", tot_p);    # Token counts of each pattern.
  split("", tot_f);    # Token counts in each page.
  tot = 0;             # Total token count.
  split("", lst_pf);   # List of words matching each pattern in each page.
  split("", knum_f);   # Index (k-number) of each page in the section.
  split("", fnum_k);   # Label (f-number) of the page with a given index (k-number).
  split("", np_f);     # Number of patterns occurring in each page.
  split("", nf_p);     # Number of pages where each pattern appears.
  nf = 0;              # Number of distinct pages.
  np = 0;              # Number of distinct patterns.
}

# Skip comment lines and blank lines.
/^ *[#]/ { next; }

/^ *$/ { next; }

# Data record: validate it and accumulate the counts.
/./ {
  if (NF != 5) { error(("bad input format: " $0)); }
  c = $1; fnum = $2; knum = $3; pat = $4; w = $5;
  if (c !~ /^[0-9]+$/) { error(("bad count: " $0)); }

  # A word may belong to one pattern only.
  if ((w in pat_w) && (pat != pat_w[w])) {
    error(("inconsistent word/pattern :" $0));
  }

  # The knum <-> fnum correspondence must be one-to-one.
  if (knum in fnum_k) {
    if (fnum != fnum_k[knum]) { error(("inconsistent knum/fnum :" $0)); }
  } else {
    fnum_k[knum] = fnum;
  }
  if (fnum in knum_f) {
    if (knum != knum_f[fnum]) { error(("inconsistent fnum/knum :" $0)); }
  } else {
    knum_f[fnum] = knum;
    nf++;   # Each new page is counted exactly once.
  }

  # Each word is stored with a leading comma; output_strangs strips the first one.
  lst_pf[pat,fnum] = (lst_pf[pat,fnum] "," w);
  pat_w[w] = pat;

  tot_f[fnum] += c;
  if (! (pat in tot_p)) { np++; }
  tot_p[pat] += c;
  if (! ((pat,fnum) in tot_pf)) {
    tot_pf[pat,fnum] = 0;
    np_f[fnum]++;
    nf_p[pat]++;
  }
  tot_pf[pat,fnum] += c;
  tot += c;
}

(abort >= 0) { exit abort; }

END {
  # error() sets abort and calls exit, which still runs this END rule.
  if (abort >= 0) { exit abort; }
  compute_strangs();
  output_strangs();
  exit 0;
}

function compute_strangs(   pat,fnum,c_f,c_pf,r_pf,e_fp,r_fp,s_p,r_p,c_p,e_p,m_p)
{
  # Fills the global tables lump_p, strang_fp and prob_fp from the
  # counts accumulated while reading the input.
  split("", lump_p);    # Lumpiness indicator of the page distribution of each pattern.
  split("", strang_fp); # Per-page indicator of each pattern (see NOTE below).
  split("", prob_fp);   # Prob. of page for each pattern, corrected for page size.

  for (pat in tot_p) {
    c_p = tot_p[pat];
    r_p = (c_p + 1)/(tot + np);   # Smoothed section-wide frequency of the pattern.

    # Smoothed per-page frequency, using r_p as the prior pseudo-count.
    s_p = 0.0;
    for (fnum in knum_f) {
      c_f = tot_f[fnum];
      c_pf = tot_pf[pat,fnum];
      r_pf = (c_pf + r_p)/(c_f + 1);
      prob_fp[fnum,pat] = r_pf;
      s_p += r_pf;
    }

    # Normalize over the pages so that the values add up to 1.
    if (s_p+0 != 0) {
      for (fnum in knum_f) { prob_fp[fnum,pat] /= s_p; }
    }

    # Entropy of the pattern's distribution over the pages.
    e_p = 0;
    for (fnum in knum_f) {
      r_fp = prob_fp[fnum,pat];
      e_fp = - r_fp*log(r_fp);
      # NOTE(review): the file header once described STRANG as "the
      # contribution of this page to LUMPY", but what is stored here is
      # the normalized page probability itself.  Left unchanged.
      strang_fp[fnum,pat] = r_fp;
      e_p += e_fp;
    }

    # Lumpiness = entropy deficit relative to a uniform distribution
    # over the nf pages, whose entropy log(nf) is the largest possible.
    m_p = log(nf) - e_p;
    if (m_p < 0) { m_p = 0; }   # Guard against roundoff.
    lump_p[pat] = m_p;
  }
}

function output_strangs(   pat,knum,fnum,c_pf,c_f,r_pf,c_p,r_p,c,wds,str,lmp)
{
  # Writes one line per (pattern, page) pair, in the format described
  # in the file header, to the standard output.
  c = tot;
  for (pat in tot_p) {
    c_p = tot_p[pat];
    r_p = (c_p + 1)/(c + np);
    lmp = lump_p[pat];
    for (fnum in knum_f) {
      knum = knum_f[fnum];
      c_pf = tot_pf[pat,fnum];
      c_f = tot_f[fnum];
      r_pf = (c_pf + 1)/(c_f + np);
      wds = substr(lst_pf[pat,fnum], 2);   # Drop the leading comma.
      str = strang_fp[fnum,pat];
      printf "%7d %7.5f %7d %7.5f %6.4f %6.4f %s %02d %s %s\n",
        c_pf, r_pf, c_p, r_p, lmp, str, fnum, knum, pat, wds;
    }
  }
  fflush("/dev/stdout");
}

function error(msg)
{
  # Reports msg on stderr, tagged with the current input line number,
  # and terminates the run; the END rule turns abort into the exit status.
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  exit
}