#! /usr/bin/gawk -f
# Last edited on 2000-07-10 08:12:35 by stolfi

# Reports a fatal error and stops processing.
#
# Prints MSG on standard error, prefixed with the current input file
# name and record number, sets the global flag "abort" to 1, and calls
# "exit".  The status actually returned to the shell is decided by the
# END rule, which tests "abort".
function error(msg) {
  abort = 1;
  printf("file %s, line %d: %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  exit;
}

# Usage:
#
#   create-color-tables \
#     -v colorPatterns=FILE.dic \
#     -v outDir=ODIR \
#     [ -v minStrangeness=MINSTR ] \
#     [ -v minLum=MINLUM ] \
#     [ -v maxStrangeness=MAXSTR ] \
#     [ -v maxLum=MAXLUM ] \
#     [ -v uniqueColor=UNICOLOR ] \
#     < INFILE
#
# Each record of INFILE must have the nine fields
#
#   PTOTCT PFREQ STOTCT SFREQ STRANG FNUM KNUM PATTERN WORDLIST
#
# where PTOTCT and STOTCT are the counts of PATTERN in the page FNUM
# and in the whole section, PFREQ and SFREQ are the estimated
# frequencies of PATTERN in the page and in the section, STRANG is a
# measure of how anomalous PTOTCT is, and WORDLIST is the
# comma-separated list of all WORD values associated with PATTERN in
# the page.
#
# Two files are written for each page: ODIR/FNUM.clr (the
# pattern-to-color table) and ODIR/FNUM.spw (the special patterns
# table).
#
# ODIR/FNUM.spw has fields
#
#   PTOTCT STOTCT STRANG COLOR FNUM KNUM PATTERN WORDLIST
#
# and contains only those patterns listed in the colorPatterns file
# that occur in the page.
#
# ODIR/FNUM.clr has fields
#
#   PATWD COLOR
#
# where PATWD is either a PATTERN or a WORD.  It includes all words
# from patterns listed in the colorPatterns file, plus those of
# patterns whose global STOTCT is 1 (colored UNICOLOR).
# Start-up: checks the command-line variables, fills in defaults,
# clears the global tables and loads the pattern/hue dictionary.
BEGIN {
  abort = -1;  # Negative means "no error so far"; error() sets it to 1.
  if (colorPatterns == "") { error("must define \"colorPatterns\""); }
  if (outDir == "") { outDir = "."; }
  if (minStrangeness == "") { minStrangeness = 0.0; }
  if (minLum == "") { minLum = 0.30; }
  # A maximum below the minimum asks the END rule to use the largest
  # STRANG value seen in the input instead.
  if (maxStrangeness == "") { maxStrangeness = -1.0; }
  if (maxLum == "") { maxLum = 0.70; }
  if (uniqueColor == "") { uniqueColor = "5577ff"; }

  # Scratch RGB triple shared by the color routines (components in [0,1]).
  rgb[0] = 0; rgb[1] = 0; rgb[2] = 0;

  # NOTE(review): "phi" and "nexthue" are not used anywhere in the
  # visible code; kept in case another part of the project reads them.
  phi = (sqrt(5)-1)/2;
  nexthue = 0;

  split("", whue);      # whue[PATTERN] = hue assigned in the dictionary.
  split("", selected);  # selected[PATTERN] = 1 for dictionary patterns.
  nPatterns = 0;
  maxStrang = 0.0;      # Largest STRANG field seen in the input.

  split("", pgtot_pf);  # pgtot_pf[PATTERN,FNUM] = PTOTCT.
  split("", sctot_p);   # sctot_p[PATTERN] = STOTCT.
  split("", str_pf);    # str_pf[PATTERN,FNUM] = STRANG.
  split("", wds_pf);    # wds_pf[PATTERN,FNUM] = WORDLIST.
  split("", knum_f);    # knum_f[FNUM] = KNUM.

  read_patterns();
}

# Stop reading input as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Every non-empty record: validate it and store its fields in the tables.
/./ {
  if (NF != 9) { error("bad format"); }
  pgCt = $1; pgFr = $2; scCt = $3; scFr = $4; strang = $5;
  fnum = $6; knum = $7; pat = $8; wdlist = $9;

  pgtot_pf[pat,fnum] = pgCt;
  str_pf[pat,fnum] = strang;
  wds_pf[pat,fnum] = wdlist;

  # All records of one page must carry the same KNUM.
  if ((fnum in knum_f) && (knum != knum_f[fnum])) { error("inconsistent knum"); }
  knum_f[fnum] = knum;

  # All records of one pattern must carry the same section count.
  # (The test must use "pat"; an undefined name here would silently
  # disable the check.)
  if (pat in sctot_p) {
    if (scCt != sctot_p[pat]) { error(("inconsistent section count " pat)); }
  } else {
    sctot_p[pat] = scCt;
  }

  # NOTE(review): the maximum is taken over the signed STRANG, while
  # y_from_strangeness() works on abs(STRANG) -- confirm this is intended.
  if (strang > maxStrang) { maxStrang = strang; }
  next;
}

# After all input: write the .clr and .spw tables of every page.
END {
  if (abort >= 0) { exit abort; }
  if (maxStrangeness < minStrangeness) { maxStrangeness = maxStrang; }
  for (fnum in knum_f) {
    setfile(fnum);
    for (pat in sctot_p) {
      if (!((pat,fnum) in pgtot_pf)) { continue; }
      pgCt = pgtot_pf[pat,fnum];
      strang = str_pf[pat,fnum];
      wdlist = wds_pf[pat,fnum];
      scCt = sctot_p[pat];
      if (scCt == 1) {
        # Pattern occurs only once in the whole section.
        putcolor(fnum, pat, wdlist, uniqueColor);
      } else if (pat in selected) {
        # Dictionary pattern: hue from the dictionary, luminance from
        # the strangeness of the pattern in this page.
        hue = whue[pat];
        rgb_from_hue(rgb, hue);
        y = y_from_strangeness(strang);
        rgb_fix_y(y, rgb);
        color = xcolor_from_rgb(rgb);
        putcolor(fnum, pat, wdlist, color);
        # Use the KNUM recorded for this page, not the "knum" global
        # left over from the last input record.
        putdata(pgCt, scCt, strang, color, fnum, knum_f[fnum], pat, wdlist);
      }
    }
  }
}

# Returns the absolute value of X.
function abs(x) {
  return (x >= 0 ? x : -x);
}

# Returns the luminance of the triple RGB (indices 0..2), computed as
# the weighted sum 0.30*R + 0.60*G + 0.10*B.
function y_from_rgb(rgb) {
  return 0.30*rgb[0] + 0.60*rgb[1] + 0.10*rgb[2];
}

# Mixes white or black into RGB (modified in place) so that its
# luminance, as given by y_from_rgb(), becomes Y.  The parameters after
# "rgb" are local variables.
function rgb_fix_y(y, rgb,    yy, ar, aw) {
  yy = y_from_rgb(rgb);
  if (yy < y) {
    # Too dark: blend with white (1,1,1).
    ar = (1-y)/(1-yy);
    aw = (y-yy)/(1-yy);
    rgb[0] = ar*rgb[0] + aw;
    rgb[1] = ar*rgb[1] + aw;
    rgb[2] = ar*rgb[2] + aw;
  } else if (yy > y) {
    # Too bright: blend with black (0,0,0), i.e. just scale down.
    ar = y/yy;
    rgb[0] = ar*rgb[0];
    rgb[1] = ar*rgb[1];
    rgb[2] = ar*rgb[2];
  }
}

# Output transfer function applied to each component before it is
# quantized; currently the identity.
function gamma(r) {
  return r
}

# Returns the triple RGB (components in [0,1]) as a six-digit lowercase
# hexadecimal string "rrggbb".  The parameters after "rgb" are locals.
function xcolor_from_rgb(rgb,    rr, gg, bb) {
  rr = int(gamma(rgb[0])*255 + 0.5);
  gg = int(gamma(rgb[1])*255 + 0.5);
  bb = int(gamma(rgb[2])*255 + 0.5);
  return sprintf("%02x%02x%02x", rr, gg, bb);
}

# Stores in RGB the color of hue H, where H is taken modulo 1 and the
# hue circle is split into six equal segments of linear interpolation
# between (1,0,0), (1,1,0), (0,1,0), (0,1,1), (0,0,1) and (1,0,1).
# The parameters after "h" are local variables.
function rgb_from_hue(rgb, h,    hf, hi) {
  # Reduce H to the range [0,1).
  while (h >= 1) { h = h - 1; }
  while (h < 0) { h = h + 1; }
  h = 6*h;
  hi = int(h);   # Segment index, 0..5.
  hf = h - hi;   # Position inside the segment, in [0,1).
  if (hi == 0) {
    rgb[0] = 1;    rgb[1] = hf;   rgb[2] = 0;
  } else if (hi == 1) {
    rgb[0] = 1-hf; rgb[1] = 1;    rgb[2] = 0;
  } else if (hi == 2) {
    rgb[0] = 0;    rgb[1] = 1;    rgb[2] = hf;
  } else if (hi == 3) {
    rgb[0] = 0;    rgb[1] = 1-hf; rgb[2] = 1;
  } else if (hi == 4) {
    rgb[0] = hf;   rgb[1] = 0;    rgb[2] = 1;
  } else if (hi == 5) {
    rgb[0] = 1;    rgb[1] = 0;    rgb[2] = 1-hf;
  }
}

# Makes FNUM the current output page.  If it differs from the previous
# one, closes the previous pair of output files, announces the page on
# standard error, and starts outDir/FNUM.clr and outDir/FNUM.spw, each
# with a "#" header line.  Uses the globals "ofnum", "cfile", "dfile".
function setfile(fnum) {
  if (fnum != ofnum) {
    if (cfile != "") { close(cfile); }
    if (dfile != "") { close(dfile); }
    printf "%s...\n", fnum > "/dev/stderr";
    ofnum = fnum;
    cfile = (outDir "/" fnum ".clr");
    printf "#\n" > cfile;
    dfile = (outDir "/" fnum ".spw");
    printf "#\n" > dfile;
  }
}

# Writes to the .clr file of page FNUM one "PATWD COLOR" line for PAT
# and one for each word of the comma-separated WDLIST that differs from
# PAT.  The parameters after "color" are local variables.
function putcolor(fnum, pat, wdlist, color,    nw, w, i) {
  setfile(fnum);
  print pat, color > cfile;
  nw = split(wdlist, w, ",");
  for (i = 1; i <= nw; i++) {
    if (w[i] != pat) { print w[i], color > cfile; }
  }
}

# Writes to the .spw file of page FNUM one record with the fields
# PTOTCT STOTCT STRANG COLOR FNUM KNUM PATTERN WORDLIST.
function putdata(pgCt, scCt, strang, color, fnum, knum, pat, wdlist) {
  setfile(fnum);
  print pgCt, scCt, strang, color, fnum, knum, pat, wdlist > dfile;
}

# Maps the strangeness STRANG to a luminance: abs(STRANG) is rescaled
# from [minStrangeness, maxStrangeness] to [0,1], clipped, and then
# mapped linearly to [minLum, maxLum].  The parameters after "strang"
# are local variables.
function y_from_strangeness(strang,    y, span) {
  span = maxStrangeness - minStrangeness;
  if (span == 0) {
    # Zero-width range: dividing would abort the run, so treat the
    # range as a step at minStrangeness.
    y = (abs(strang) > minStrangeness ? 1 : 0);
  } else {
    y = (abs(strang) - minStrangeness)/span;
  }
  if (y > 1) { y = 1; }
  if (y < 0) { y = 0; }
  return minLum + y * (maxLum - minLum);
}

# Reads the list of colored patterns from the file named by
# "colorPatterns" and records their hues.  Lines starting with "#" are
# skipped; every other line must have exactly two fields, HUE PATTERN.
# Fills the global tables "whue" and "selected", and sets "nPatterns".
# All parameters are local variables.
function read_patterns(    lin, fld, nfld, pat, hue, status) {
  split("", selected);
  split("", whue);
  nPatterns = 0;
  # Keep the result of getline: 1 = got a line, 0 = end of file,
  # -1 = could not open or read the file.
  while ((status = (getline lin < colorPatterns)) > 0) {
    if (lin !~ /^[#]/) {
      nfld = split(lin, fld);
      if (nfld != 2) { error("bad pattern format"); }
      hue = fld[1];
      pat = fld[2];
      if (pat in selected) { error("duplicate pattern in colorPattern file"); }
      whue[pat] = hue;
      selected[pat] = 1;
      nPatterns++;
    }
  }
  # Test the getline status, not the contents of ERRNO, whose "no
  # error" value is not the same in all gawk versions.
  if (status < 0) { error((colorPatterns ": " ERRNO)); }
  close(colorPatterns);
  printf "read %d patterns to color\n", nPatterns > "/dev/stderr";
}