#! /usr/bin/gawk -f # Last edited on 1999-07-20 04:47:42 by stolfi # Usage: # # cat RAWMAP \ # | format-label-parag-map \ # -v nParags=NPARAGS \ # [-v blockTable=BLOCKTABLE] \ # [-v maxLen=MAXLEN] \ # [-v countWidth=CTWD] \ # [-v countSeparator=CHAR] \ # [-v html={0|1}] \ # [-v title=TITLE] \ # [-v blockHeadings=TITLE] \ # [-v totOnly={0|1}] \ # [-v mostBlocks=FRACTION] \ # [-v showPattern={0|1}] \ # [-v showLineNumber={0|1}] \ # [-v showAbsCounts={0|1}] \ # [-v showRelCounts={0|1}] \ # [-v showAvgPos={0|1}] \ # > OUTFILE" # # The script reads a raw word location map, as produced by # make-word-location-map, consisting of records of the form # # TOTCT COUNTS PATT STRING TAG # # where the COUNTS are NPARAGS counts of occurrences of STRING or PATT # per paragraph. # # This script reads a table that maps the paragraphs into "blocks", # which could be pages, folios, bifolios, or other arbitrary units. # The script condenses the COUNTS into ABSCTS of occurrences per block. # The script prints for each line the following data, in # order: # # TAG LINE PATT STRING TOTCT BLTOT NMOST AVB ENT ABSCTS RELCTS # * * * * * * # # where TAG, PATT, STRING and TOTCT are as read from # the input, and # # LINE is a sequential line number; # # BLTOT is the total occurrences in all listed blocks # which may be less than TOTCT (see below); # # NMOST is the minimum number of distinct blocks that account # for 80% or more of all occurrences in the listed blocks; # # AVB is the mean block index; # # ENT is the entropy of the word's distribution (bits); # # ABSCTS are the input COUNTS, condensed ber block; # # RELCTS are the ABSCTS divided by the line's TOTCT and # scaled to fit the output format. # # The output fields marked "*" are optional see below. # # If TAG = "=" the line is interpreted as a pattern total, # and formatted specially. # # If "totOnly" is 1 then only lines with TAG = "=" are printed. # # Blank lines and #-lines in the input are ignored. # # Actually only a subset of the paragraph COUNTS is totalled and # printed. The paragraphs to print and their order are read from the # BLOCKTABLE. Each record of the BLOCKTABLE must contain fields # # PARAG BLTAG # # where PARAG is a paragraph number, as in the main input, and BLTAG # is an arbitrary string identifying the block. The BLOCKTABLE must be # sorted by BLTAG. Blank lines in this file generate a blank column in # the corresponding place of the printout. #-lines are ignored. If the # BLOCKTABLE is not specified, assumes each paragraph is a block by # itself. # # The following options request specific fields to be printed: # # showAvgPos=1 prints the fields AVG and DVP # # showAbsCounts=1 prints the absolute counst per block XXX...XXX # # showRelCounts=1 prints the relative counts per block YYY...YYY # # showPattern=1 prints the PATT field in addition to the STRING. # # showLineNumber=1 prints a sequential line number (starting from 0). # # Each per-block count is printed with CTWD bytes. If CTWD > 1 then # the maximum value printed MAXCT is 10^(CTWD-1)-1, with at least one # leading blank; else MAXCT is 9. The percentages are scaled from [0% # _ 100%] to [0 _ MAXCT] and rounded. # # # # function error(msg) { printf "line %d: %s\n", NR, msg > "/dev/stderr"; abort = 1; exit 1; } function html_beg_line(tg) { if (tg == "-") { printf ""; } else if (tg == "+") { printf ""; } else if (tg == "=") { printf ""; } else if (tg == "h") { printf ""; } else { printf ""; } } function html_end_line(tg) { printf ""; } function avp(c, i, s, n) { # Computes the average block index from the block count vector "c" s = 0.0; n = 0; for (i in c) { s += (i-0.5)*c[i]; n += c[i]; } return (n == 0 ? 0 : s/n); } function dev(c, a, i, d, bias, slop, ss, n, q) { # Computes the estimated standard deviation of the block index # from the per-block count vector "c" and average position "a" # The bias term tries to fix the deviation so that # rare strings do not come out looking localized. ss = 0.0; n = 0; for (i in c) { d = (i-0.5) - a; ss += (d*d)*c[i]; n += c[i]; } slop = (nBlocks-1.0)/(n == 0 ? 1 : n); bias = (1.0 + slop*slop)/12.0; q = (n == 0 ? 0 : ss/n); return sqrt(q + bias); } function ent(c, i,h,N,P,q) { # Computes the entropy of the probability distribution # implied by the histogram "c". h = 0.0; N = 0; P = 0; for (i in c) { P++; N += c[i]; } for (i in c) { q = c[i]; if(q > 1) { h += q*log(q); } } h /= (N*log(2)); return h; } function nmost(c,tc, i,j,b,nb,tb,sb,ci,tmp) { # Computes the minimum number of entries in the given per-block # count vector "c" that add to at least "tc" occurrences. nb = 0; split("", b); # "b[0..nb-1]" are the current block index set. tb = 0; # Total of "c[b[0..nb-1]]". sb = 0; # "b[0..sb-1]" are sorted by c[b[i]], decreasing. for (i in c) { ci = c[i]; if (ci > 0) { if (tb >= tc) { # Bubble the smallest entry in "b" to the last position: for (j=sb; j 1)) { sb = j-1; } } } # Discard it: nb--; tb -= c[b[nb]]; } # Add this entry: b[nb] = i; tb += ci; nb++; } } if (tb < tc) { error("nmost: inconsistent tc"); } split("", b); return (nb); } function print_line(pa, st, tg, tt, ct, tb,nm,av,dv,en,bn,pn,xc) { # Prints a line for pattern "pa", string "st", tag "tg", total count "tt", # per-block counts "ct[0..nBlocks-1]". # Also computes the number of blocks with most of the # occurrences, the average block index, and the entropy of the # distribution. # Also increments the line counter. if (html) { html_beg_line(tg); } else { printf "%s ", tg; } if (showLineNumber) printf "%5d ", line_count; if (showPattern) printf "%-*s ", maxLen, pa; printf "%-*s ", maxLen, st; tb = 0; for(bn in ct) { tb += ct[bn]; } nm = nmost(ct, mostBlocks*tb); printf "%5d %5d %3d ", tt, tb, nm; if (showAvgPos) { av = avp(ct); # dv = dev(ct, av); en = ent(ct); printf "%5.1f %5.2f ", av, en; } if (showAbsCounts) { printf " "; for (bn=0; bn= maxCount) { xc = maxCount; } else { xc = ct[bn]; } } printf "%*s", countWidth, xc; } printf " "; } if (showRelCounts) { printf " "; for (bn=0; bn= maxCount) { xc = maxCount; } else { xc = rct; } } printf "%*s", countWidth, xc; } printf " "; } if (html) { html_end_line(tg); } printf "\n"; line_count++; } function print_headings_major( av,dv,bn) { # Prints the major column headings. # Must match print_line. if (html) { html_beg_line("h"); } else { printf " "; } if (showLineNumber) printf "%5s ", " "; if (showPattern) printf "%-*s ", maxLen, " "; printf "%-*s ", maxLen, " "; printf "%5s %5s %3s ", " ", " ", " "; if (showAvgPos) { printf "%5s %5s ", " ", " "; } if (showAbsCounts) { printf " "; printf "%-*s", nBlocks*countWidth, "abs counts"; printf " "; } if (showRelCounts) { printf " "; printf "%-*s", nBlocks*countWidth, "rel counts"; printf " "; } if (html) { html_end_line("h"); } printf "\n" } function print_headings_minor( av,dv,bn) { # Prints the minor column headings. # Must match print_line. if (html) { html_beg_line("h"); } else { printf "t "; } if (showLineNumber) printf "%5s ", "line"; if (showPattern) printf "%-*s ", maxLen, "pattern"; printf "%-*s ", maxLen, "word(s)"; printf "%5s %5s %3s ", "totct", "totbl", "mbl"; if (showAvgPos) { printf "%5s %5s ", "av.bl", "en.bl"; } if (showAbsCounts) { printf " "; printf "%-*s", nBlocks*countWidth, ""; printf " "; } if (showRelCounts) { printf " "; printf "%-*s", nBlocks*countWidth, ""; printf " "; } if (html) { html_end_line("h"); } printf "\n" } function print_dashes( i,bn) { # Prints dashes for all fields. # Must match print_line. if (html) { html_beg_line("h"); } else { printf "- "; } if (showLineNumber) printf "----- "; if (showPattern) { for (i=0;i 0) { if (html) { html_beg_line("h"); } for (i=1;i\n"; printf "Voynich Manuscript - %s\n", title; printf "\n"; printf "

Voynich Manuscript

\n"; printf "

%s

\n", title; printf "\n"; printf "
\n";
    }
  else
    { printf "# %s\n", title;
      printf "\n";
    }
  print_headings_major();
  if (blockHeadings != "")
    { print_dashes(); print_block_headings(blockHeadings); print_dashes(); }
  print_headings_minor();
  print_dashes();
} 

function print_tail()
{
  if (html)
    { printf "
\n"; printf "
\n"; printf "\n"; printf "\n"; } print_dashes(); } function load_block_table(file, lin,pn,bt,bn,fld,nfld,bft) { # Loads the (partial) mapping of paragraphs into blocks, # and the block order. Defines "nBlocks", the tables # "block_from_parag" and "parags_in_block". nBlocks = 0; split("", parags_in_block); split("", block_from_parag); split("", bft); while (getline lin < file) { if (! match(lin, /^[#]/)) { nfld = split(lin, fld); if (nfld == 0) { # dummy block bn = nBlocks; nBlocks++; parags_in_block[bn] = 0; } else if (nfld == 2) { pn = fld[1]; bt = fld[2]; if (bt in bft) { bn = bft[bt]; } else { bn = nBlocks; nBlocks++; bft[bt] = bn; parags_in_block[bn] = 0; } block_from_parag[pn] = bn; parags_in_block[bn]++; } } } if (ERRNO != "0") { error((file ": " ERRNO)); } } function make_default_block_table( pn,bn,pi) { # Creates the default list of the paragraphs that should be printed # and their order. Defines "nBlocks" and the # tables "parags_in_block" and "block_from_parag". nBlocks = nParags; split("", parags_in_block); split("", block_from_parag); for (pi = 1; pi <= nParags; pi++) { pn = sprintf("%04d", pi); bn = pi-1; block_from_parag[pn] = bn; parags_in_block[bn] = 1; } } BEGIN { abort = -1; if (maxLen == 0) maxLen=16; if (nParags == "") { error("must specify \"-v nParags\""); } if (title == "") { title = "Word-paragraph occurrence map"; } if (blockTable != "") { load_block_table(blockTable); } else { make_default_block_table(); } if (countWidth == 0) { countWidth = 1; } if (countWidth == 1) { maxCount = 9; } else { maxCount=1; for (i=1;i= 0) exit abort; if (NF != 4 + nParags) error("wrong number of fields"); ttct = $1; patt = $(nParags+2); strn = $(nParags+3); tagg = $(nParags+4); if (totOnly && (tagg != "=")) next; for (bn=0; bn <= nBlocks; bn++) { blct[bn] = 0; } for (pi=1; pi <= nParags; pi++) { pn = sprintf("%04d", pi); if (pn in block_from_parag) { bn = block_from_parag[pn]; blct[bn] += $(pi+1); } } if ((patt != prev_patt) && (prev_patt != "") && (! totOnly)) printf "\n"; print_line(patt, strn, tagg, ttct, blct); prev_patt = patt; } END { if (abort >= 0) { exit abort; } print_tail(); }