#! /usr/bin/gawk -f
# Last edited on 1999-01-06 06:43:08 by stolfi

# Usage:
#
#   cat RAWMAP \
#     | format-word-location-map \
#         -v nblocks=NBLOCKS \
#         [-v maxlen=MAXLEN] \
#         [-v ctwd=CTWD] \
#         [-v html={0|1}] \
#         [-v title=TITLE] \
#         [-v blockHeadings=TITLE] \
#         [-v totOnly={0|1}] \
#         [-v showProps={0|1}] \
#         [-v showPattern={0|1}] \
#         [-v showLineNumber={0|1}] \
#         [-v showAbsCounts={0|1}] \
#         [-v showRelCounts={0|1}] \
#         [-v showAvgPos={0|1}] \
#     > OUTFILE
#
# The script reads a raw word location map, as produced by
# make-word-location-map, consisting of records of the form
#
#   TOTCT XXX...XXX PATTN STRING TAG PNUM LOC OBS LANG
#
# and formats it nicely, adding the following fields:
#
#   AVP and DVP   the mean and standard deviation
#                 of the block number for this STRING;
#
#   YYY...YYY     the per-block occurrence counts,
#                 divided by the line's TOTCT.
#
# The "nblocks" variable is mandatory; the script stops with an error
# if it is not given.
#
# The following options request specific fields to be printed:
#
#   showAvgPos=1      prints the fields AVP and DVP
#
#   showAbsCounts=1   prints the absolute counts per block XXX...XXX
#
#   showRelCounts=1   prints the relative counts per block YYY...YYY
#
#   showPattern=1     prints the PATTN field in addition to the STRING.
#
#   showLineNumber=1  prints a sequential line number (starting from 0).
#
#   showProps=1       prints the fields TAG PNUM LOC OBS LANG
#
# If TAG = "=" the line is interpreted as a pattern total,
# and formatted specially.
#
# If "totOnly" is 1 then only lines with TAG = "=" are printed.
#
# Blank lines in the input produce blank lines in the output.
#
# Each per-block count is printed with CTWD bytes. If CTWD > 1 then
# the maximum value printed MAXCT is 10^(CTWD-1)-1, with at least one
# leading blank; else MAXCT is 9. The percentages are scaled from [0%
# _ 100%] to [0 _ MAXCT] and rounded.
# NOTE(review): this section looks damaged by tag stripping: text between a
# "<" and the following ">" seems to be missing throughout (the printf
# strings in html_beg_line/html_end_line are empty, several loops read
# "for (bn=0; bn= maxct)", and the page-head strings are broken across
# lines).  Restore it from a pristine copy before relying on it.
#
# Functions defined here, as far as the visible text shows:
#   html_beg_line(tg)  - selects one printf by tag ("-", "+", "=", "h", or
#                        anything else); every string is empty as it stands.
#   html_end_line(tg)  - one printf of an empty string; "tg" is not used.
#   avp(c)             - weighted mean of (i-0.5) over the histogram c[i];
#                        divides by the total count with no guard against 0.
#   dev(c, a)          - sqrt(ss/n + bias), where ss/n is the mean squared
#                        distance from "a" and bias = (1 + slop^2)/12 with
#                        slop = (nblocks-1)/n; reads the global "nblocks".
#   print_line(...)    - prints one output row, honouring the show* flags and
#                        "html", then increments the global "line_count".
#                        NOTE(review): it calls avp(c) and dev(c, av), but "c"
#                        is not a parameter or local here (the counts arrive
#                        as "ct") - confirm whether a global "c" is intended.
#   print_headings_major(), print_headings_minor() - heading rows using the
#                        same field widths as print_line.
#   The last function (its header is missing from this text; presumably the
#   HTML page head, judging by the title strings) ends by printing the
#   column headings.
# print_dashes(), html_block_headings() and error() are called in this file
# but their definitions are not visible in this text.
function html_beg_line(tg) { if (tg == "-") { printf ""; } else if (tg == "+") { printf ""; } else if (tg == "=") { printf ""; } else if (tg == "h") { printf ""; } else { printf ""; } } function html_end_line(tg) { printf ""; } function avp(c, i, s, n) { # Computes the average string position from histogram "c" s = 0.0 n = 0 for (i in c) { s += (i-0.5)*c[i]; n += c[i] } return s/n } function dev(c, a, i, d, bias, slop, ss, n) { # Computes the estimated standard deviation of the string position # from the histogram "c" and average position "a" # The biasterm tries to fix the deviation so that # rare strings do not come out looking localized. ss = 0.0 n = 0 for (i in c) { d = (i-0.5) - a; ss += (d*d)*c[i]; n += c[i] } slop = (nblocks-1.0)/n bias = (1.0 + slop*slop)/12.0 return sqrt(ss/n + bias) } function print_line(pa, st, tg, tt, ct, pn, lc, ob, lg, av,dv,bn) { # Prints a line for pattern "pa", string "st", tag "tg", total count "tt", # per-block counts "ct[0..nblocks-1]", p-number "pn", location "lc", # obs "ob", language "lg". # Also computes the average position and deviation. # Also increments the line counter. if (html) html_beg_line(tg); if (showLineNumber) printf "%5d ", line_count; if (showPattern) printf "%-*s ", maxlen, pa; printf "%-*s ", maxlen, st; if (showAvgPos) { av = avp(c); dv = dev(c, av); printf "%5.1f %5.1f ", av, dv; } printf "%5d ", tt; if (showAbsCounts) { printf " "; for (bn=0; bn= maxct) printf "%*d", ctwd, maxct else printf "%*d", ctwd, ct[bn] } printf " "; } if (showRelCounts) { printf " "; for (bn=0; bn= maxct) printf "%*d", ctwd, maxct else printf "%*d", ctwd, rct } printf " "; } if (showProps) { printf " %1s", (lg == "-" ? "?" : lg); printf " %-4s", (pn == "-" ? "" : ("p" pn)); printf " %-15s", (lc == "-" ? "" : lc); printf " %s", (ob == "-" ? "" : ob); } if (html) html_end_line(tg); printf "\n"; line_count++; } function print_headings_major( av,dv,bn) { # Prints the major column headings. # Must match print_line. 
if (html) html_beg_line("h"); if (showLineNumber) printf "%5s ", " "; if (showPattern) printf "%-*s ", maxlen, " "; printf "%-*s ", maxlen, " "; if (showAvgPos) { printf "%5s %5s ", " ", " "; } printf "%5s ", " "; if (showAbsCounts) { printf " "; printf "%-*s", nblocks*ctwd, "abs counts"; printf " "; } if (showRelCounts) { printf " "; printf "%-*s", nblocks*ctwd, "rel counts"; printf " "; } if (showProps) { printf " %1s", " "; printf " %-4s", " "; printf " %-15s", " "; printf " %s", " "; } if (html) html_end_line("h"); printf "\n" } function print_headings_minor( av,dv,bn) { # Prints the minor column headings. # Must match print_line. if (html) html_beg_line("h"); if (showLineNumber) printf "%5s ", "line"; if (showPattern) printf "%-*s ", maxlen, "pattern"; printf "%-*s ", maxlen, "word(s)"; if (showAvgPos) { printf "%5s %5s ", "av.bl", "dv.bl"; } printf "%5s ", "totct"; if (showAbsCounts) { printf " "; for (bn=0; bn 0) { html_beg_line("h"); for (i=1;i\n"; printf "Voynich Manuscript - %s\n", title; printf "\n"; printf "

Voynich Manuscript

\n"; printf "

%s

\n", title; printf "\n"; printf "
\n";
  # Emit the heading rows: major headings, then (only when blockHeadings is
  # non-empty) print_dashes() and the block headings, then the minor
  # headings followed by print_dashes().
  print_headings_major();
  if (blockHeadings != "")
    { print_dashes(); html_block_headings(blockHeadings); }
  print_headings_minor();
  print_dashes();
} 

# html_tail(): issues four printf calls; in this text their strings hold
# nothing but newlines.
# NOTE(review): presumably these carried the closing HTML tags, which appear
# to have been stripped - restore them from a pristine copy.
# NOTE(review): the BEGIN rule that starts on the last line of this span is
# cut off mid-statement in this text.
function html_tail()
{
  printf "
\n"; printf "
\n"; printf "\n"; printf "\n"; } BEGIN { abort = 0; if (maxlen == 0) maxlen=16; if (nblocks == 0) error("must specify \"-v nblocks\""); if (title == "") title = "Word occurrence map"; if (ctwd == 0) ctwd = 1 if (ctwd == 1) { maxct = 9 } else { maxct=1; for (i=1;i