#! /usr/bin/gawk -f
# Last edited on 1999-07-14 21:10:20 by stolfi

# Usage:
#   cat INFILE.evt \
#     | colorize-text -f eva2erg.gawk \
#         -v verbose=BOOL \
#         -v indent=INDENT \
#         -v headers=HEADERS \
#         -v colorTable=COLORTABLE \
#         -v commentColor=COMMCOLOR \
#         [-v defaultColor=DEFCOLOR] \
#         -v comments=COMMENTS \
#         [EQUIVOPTIONS] \
#     > OUTFILE.html
#
# This script turns an EVA text INFILE.evt into an HTML file with
# colorized words.  Each word is looked up in a user-provided color
# dictionary COLORTABLE.  If the word is not found, it is reduced by
# some equivalence function and looked up again.
#
# The input may be in EVT format (with location code in columns 1-19)
# or in pure text format.
#
# An extra blank line is printed after each paragraph terminator "=".
# The spacing is done with "\n" not "<BR>", assuming
# the output will be inserted in a <PRE>...</PRE>
# environment.
#
# If HEADERS is true, provides also the HTML headers and
# the <PRE> directive.
#
# Lines beginning with "#" are assumed to be comments.
# If COMMENTS is 1 they are printed with COMMCOLOR.
# Otherwise they are treated as blank lines.  
#
# In any case, blank lines or blank comments are suppressed when they
# occur between non-comment lines.
#
# In other contexts, multiple consecutive blank lines (or blank
# comments) are collapsed to a single blank line or blank comment,
# depending on the context.
#
# The COLORTABLE file should have entries PATTERN COLOR, where PATTERN
# is an EVA string and COLOR is an HTML color (six hexadecimal
# digits).  If COLOR is omitted, DEFCOLOR is assumed.
#
# Words that are not found in the table are left uncolored.
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of eva2erg.gawk, and BOOL is 0 or 1.

BEGIN {
  # Start-up: fills in option defaults, resets the line-state flags,
  # loads the color table into "dic" (and "rdic" when showSimilar is set),
  # and prints the HTML head when "headers" is true.

  abort = 0;
  # Sentinel color meaning "no color": used for uncolored words and as
  # the initial value of "current_color".
  noColor = "------";
  if (commentColor == "") { commentColor = noColor; }
  if (defaultColor == "") { defaultColor = "ff0000"; }
  if (version == "") { version = "*"; }
  if (showSimilar == "") { showSimilar = 0; }
  current_color = noColor;
  if (verbose) 
    { 
      printf "options:\n" > "/dev/stderr";
      if (erase_ligatures)     printf "  erase_ligatures\n"     > "/dev/stderr";
      if (erase_plumes)        printf "  erase_plumes\n"        > "/dev/stderr";
      if (ignore_gallows_eyes) printf "  ignore_gallows_eyes\n" > "/dev/stderr";
      if (join_ei)             printf "  join_ei\n"             > "/dev/stderr";
      if (equate_aoy)          printf "  equate_aoy\n"          > "/dev/stderr";
      if (collapse_ii)         printf "  collapse_ii\n"         > "/dev/stderr";
      if (equate_eights)       printf "  equate_eights\n"       > "/dev/stderr";
      if (equate_pt)           printf "  equate_pt\n"           > "/dev/stderr";
      if (erase_q)             printf "  erase_q\n"             > "/dev/stderr";
      if (erase_word_spaces)   printf "  erase_word_spaces\n"   > "/dev/stderr";
    }
  
  # "lastpar" tells whether previous non-blank line was a paragraph end.
  lastpar = 0;
  # "lastblank" tells whether previous line was blank (comment or not).
  lastblank = 0;
  # "lastcomm" tells whether the previous line (ignoring blanks) was a comment.
  lastcomm = 0;
  
  # Read color table:
  if (colorTable == "") 
    { error("must specify \"-v colorTable=FILE\"\n"); }
  split("", dic);
  split("", rdic);
  nMap = 0;
  # "status" keeps the last getline result: 1 = line read, 0 = end of
  # file, negative = the file could not be opened or read.
  while ((status = (getline lin < colorTable)) > 0) 
    { 
      # Table lines starting with "#" are comments.
      if (! match(lin, /^[#]/)) 
        {
          nfld = split(lin, fld);
          if (nfld == 0) 
            { continue; }
          else if (nfld > 2)
            { tbl_error("bad entry = \"" lin "\""); }

          w = fld[1];
          if (nfld == 2) 
            { c = fld[2];
              # An optional leading "#" on the color is accepted and dropped.
              gsub(/^#/, "", c);
              if (! match(c, /^[0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]$/))
                { tbl_error("bad color = \"" c "\""); }
            }
          else
            { c = defaultColor; }
          if (w in dic) { tbl_error("repeated key = \"" lin "\""); }
          dic[w] = c;
          if (showSimilar)
            { rdic[reduce_word(w)] = reduce_color(c); }
          nMap++;
        }
    }
  # Detect a read failure from the getline result, not from ERRNO:
  # ERRNO is not guaranteed to be "0" after a successful read (it is the
  # empty string in current gawk), so comparing it with "0" would report
  # an error on every run.
  if (status < 0) { error((colorTable ": " ERRNO)); }
  close (colorTable);
  if (verbose)
    { printf "loaded %6d color table entries\n", nMap > "/dev/stderr"; }
  if (headers) { print_html_head("Colored text"); }
}

# Lines starting with "##" are dropped without changing any state.
/^##/ { next; }

# An empty line, a line of spaces only, or a "#" followed only by
# spaces: just remember that a blank line was seen and print nothing.
/^#?[ ]*$/ { lastblank = 1; next; }

/^#/ { 
  # Remaining "#" lines: printed in "commentColor" when "comments" is
  # set, otherwise counted as blank lines.
  if (abort) { exit; }
  txt = $0;

  if (! comments)
    { # Comments are not shown: treat the line as a blank one.
      lastcomm = 0; lastblank = 1;
      next;
    }

  if (lastblank || lastpar)
    { # Emit one separator line first: a lone "#" when the preceding
      # printed line was a comment, an empty line otherwise.
      if (lastcomm) { print_word("#", commentColor); }
      printf "\n"; 
    }
  print_word(txt, commentColor);
  printf "\n";

  lastpar = 0; lastblank = 0; lastcomm = 1;
  next;
}

/./ {
  # Text line: optionally starts with a location code in angle brackets,
  # followed by the text.  Prints the indentation, the location code (in
  # "commentColor") and the colorized words, then updates the line-state
  # flags.
  if (abort) { exit; }
  
  # Extracts the location code and transcriber code:
  
  if (match($0, /^<[^<>]*>/)) 
    { loc = substr($0, 1, RLENGTH);
      # "ver" is the single character just before the closing ">".
      ver = substr(loc, RLENGTH-1, 1);
      # Unless version is ".", keep only lines whose code matches it.
      if ((version != ".") && (ver != version)) { next; }
      skip = RLENGTH;
    }
  else if (substr($0,1,1) == "<") 
    { # A "<" that does not open a well-formed "<...>" code.
      file_error("bad location code");
    }
  else 
    { # Pure text line: no location code to skip.
      loc = ""; 
      skip = 0;
    }

  # Print blanks if appropriate:
  
  if (version == ".")
    { if (lastblank) { printf "\n"; } }
  else
    { if ((lastblank && lastcomm) || lastpar) { printf "\n"; } }
  
  printf "%*s", indent, "";
  if (loc != "") { print_word(sprintf("%-19s", loc), commentColor); }

  # Reset "txt" for every line: when nothing follows the location code
  # the block below is skipped, and "lastpar" must not be computed from
  # the text of some earlier line.
  txt = "";
  if (skip < length($0)) 
    { txt = erg_erase_comments(substr($0,1+skip));
      # Erase EVA fillers:
      gsub(/[!%]/, "", txt);
      # Replace ".," by spaces
      gsub(/[.,]/, " ", txt);
      # Insert spaces around "-" and "="
      gsub(/[-]/, " - ", txt);
      gsub(/[=]/, " = ", txt);
      # Remove spurious spaces
      gsub(/^  */, "", txt);
      gsub(/  *$/, "", txt);
      gsub(/   */, " ", txt);
      # Now process word by word:
      process_line(txt, dic, rdic);
    }
  printf "\n";
  # A line whose cleaned text ends with "=" closes a paragraph.
  lastpar = (substr(txt,length(txt),1) == "=");
  lastcomm = 0; lastblank = 0;
  next;
}

END { 
  # "current_color" differs from "noColor" only while a colored word is
  # the last thing printed; emit the closing tag for that color element
  # (an empty printf here would leave it open).
  if (current_color != noColor)
    { printf "</font>"; }
  # Finish the HTML page when the head was printed too.
  if (headers) { print_html_tail(); }
}

function print_html_head(title)
{
  # Prints the opening part of the HTML page: document head with the
  # page title, a heading showing "title", and the opening <pre> that
  # the colorized text is written into.
  # NOTE(review): plain tags are used; add attributes (e.g. page colors)
  # to <body> here if the page needs them.
  printf "<html><head>\n";
  printf "<title>Voynich Manuscript - %s</title>\n", title;
  printf "</head><body>\n";
  printf "<h1>%s</h1>\n", title;
  printf "<pre>\n";
}

function print_html_tail(title)
{
  # Prints the closing part of the HTML page: ends the <pre> block,
  # the body and the document.  The "title" parameter is not used.
  printf "</pre>\n";
  printf "</body>\n";
  printf "</html>\n";
}

function tbl_error(msg)
{
  # Reports an error found in the color table file.  Does not return.
  error(("color table: " msg));
}

function file_error(msg)
{
  # Reports an error in the current input line (number FNR).
  # Does not return.
  error(("line " FNR ": " msg));
}

function error(msg)
{
  # Prints "msg" on its own output line in color "ffdd00", sets "abort"
  # so that the pattern rules stop, and exits with status 1.
  printf "\n";
  print_word(msg, "ffdd00");
  printf "\n";
  abort = 1;
  exit 1;
}

function iso_to_html(str)
{
  # Converts an ISO Latin-1 string to HTML.
  # Basically, protects the characters [<>&].
  # "&" must be handled first, or the "&" of the entities produced for
  # "<" and ">" would be escaped again.  In a gsub replacement "\\&"
  # stands for a literal "&" (a bare "&" would mean the matched text).
  gsub(/&/, "\\&amp;", str);
  gsub(/</, "\\&lt;", str);
  gsub(/>/, "\\&gt;", str);
  return str;
}

function print_word(w, color)
{
  # Prints word "w" in the given color.
  # Assumes the current color is "current_color"; when "color" differs,
  # closes the open <font> element (if any), opens a new one (unless
  # "color" is "noColor") and updates "current_color".
  if (color != current_color)
    { if (current_color != noColor) { printf "</font>"; }
      if (color != noColor) { printf "<font color=\"#%s\">", color; }
      current_color = color;
    }
  printf "%s", iso_to_html(w);
}

function reduce_word(str)
{
  # Converts a textstring with no embedded "{}"s to the
  # requested encoding, by applying each erg_* reduction whose option
  # variable is set, in the fixed order below.  Returns the result of
  # erg_pack on the reduced string.
  str = (erase_ligatures     ? erg_erase_ligatures(str)     : str);
  str = (erase_plumes        ? erg_erase_plumes(str)        : str);
  str = (ignore_gallows_eyes ? erg_ignore_gallows_eyes(str) : str);
  str = (join_ei             ? erg_join_ei(str)             : str);
  str = (equate_aoy          ? erg_equate_aoy(str)          : str);
  str = (collapse_ii         ? erg_collapse_ii(str)         : str);
  str = (equate_eights       ? erg_equate_eights(str)       : str);
  str = (equate_pt           ? erg_equate_pt(str)           : str);
  str = (erase_q             ? erg_erase_q(str)             : str);
  str = (erase_word_spaces   ? erg_erase_word_spaces(str)   : erg_unify_word_spaces(str));
  return erg_pack(str);
}

function reduce_color(col)
{
  # Returns a dimmed version of the color "col".
  # For the time being, just returns "col" itself.
  return (col);
}

function process_word(w, dic, rdic,   \
  x, color)
{
  # Prints word "w" colorized according to the given color tables
  # "dic" (exact matches) and "rdic" (similar words).
  # The separators "-" and "=" are never colored.  A word missing from
  # "dic" is looked up in "rdic" by its reduced form only when
  # "showSimilar" is set; otherwise it is left uncolored.
  # Assumes the current color is "current_color".
  if ((w == "-") || (w == "="))
    { color = noColor; }
  else if (w in dic)
    { color = dic[w]; }
  else if (showSimilar)
    { x = reduce_word(w);
      if (x in rdic) { color = rdic[x]; } else { color = noColor; }
    }
  else
    { color = noColor; }
  print_word(w, color);
}

function process_line(str, dic, rdic,   \
  k, kb, m, n, b, c)
{
  # Prints line "str" with each word colorized according to the
  # given color tables "dic" (exact matches) and "rdic" (similar words).
  # Assumes "str" has been cleaned of comments, and
  # words are separated by spaces.
  # Assumes the current color is "current_color".
  #
  # The string is padded with one space at each end, so every word is
  # bracketed by a space-to-nonspace and a nonspace-to-space transition.
  str = (" " str " ");
  m = length(str);
  n = 0;   # Number of words printed so far (local to this call).
  b = substr(str,1,1);
  # file_error does not return, so no further "exit" is needed here.
  if (b != " ") { file_error("internal padding error"); }
  for (k = 2; k <= m; k++)
    { c = substr(str,k,1);
      # Word start: remember where it begins.
      if ((b == " ") && (c != " ")) { kb = k; }
      # Word end: print it, preceded by a single space if not the first.
      if ((b != " ") && (c == " "))
        { if (n > 0) printf " ";
          process_word(substr(str, kb, k-kb), dic, rdic);
          n++;
        }
      b = c;
    }
  if (c != " ") { file_error("internal padding error"); }
}