#! /usr/bin/gawk -f
# Last edited on 2000-07-10 08:14:10 by stolfi

# Usage:
#
#   cat INFILE.evt \
#     | colorize-text -f eva2erg.gawk \
#         -v verbose=BOOL \
#         -v indent=INDENT \
#         -v colorTable=COLORTABLE \
#         -v textColor=TEXTCOLOR \
#         [-v defaultColor=DEFCOLOR] \
#         [EQUIVOPTIONS] \
#     > OUTFILE.html
#
# This script turns an EVA text INFILE.evt into an HTML file with
# colorized words.  Each word is reduced by some equivalence function
# and looked up in a user-provided color dictionary COLORTABLE.
#
# The input may be in EVT format (with location code in columns 1-19)
# or in pure text format.
#
# Lines are separated on output by "\n", or "\n\n" after a "=".  This
# is OK if the output is to be inserted in a <pre>...</pre>
# environment; in other contexts, it may be necessary to insert a line break (<br>)
# at the end of each line.
#
# The COLORTABLE file should have entries PATTERN COLOR, where PATTERN
# is an EVA string and COLOR is an HTML color (six hexadecimal
# digits).
#
# The script assumes that words without explicit <font>
# directive will appear in TEXTCOLOR.  If a
# word is not found in the table, it is set in DEFCOLOR (a six-digit
# hex string, TEXTCOLOR if not specified).
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of eva2erg.gawk, and BOOL is 0 or 1.

function iso_to_html(str)
{
  # Converts an ISO Latin-1 string to HTML.
  # Basically, protects the characters [<>&].
  # The "&" must be handled first, so that the entities produced
  # for "<" and ">" are not escaped a second time.
  # In a gsub replacement, "\\&" yields a literal "&".

  gsub(/&/, "\\&amp;", str);
  gsub(/</, "\\&lt;", str);
  gsub(/>/, "\\&gt;", str);
  return str;
}

function print_word(w, color)
{
  # Prints word "w" in the given color (six hex digits, no "#").
  # Assumes the current color is the global "current_color", and
  # updates it.  Text in "textColor" is printed with no <font>
  # wrapper; any other color gets a <font> element that stays open
  # until the color changes (or until the END rule closes it).

  if (color != current_color)
    { if (current_color != textColor) { printf "</font>"; }
      if (color != textColor) { printf "<font color=\"#%s\">", color; }
      current_color = color;
    }
  printf "%s", iso_to_html(w);
}

function reduce_word(str)
{
  # Converts a textstring with no embedded "{}"s to the
  # requested encoding, by applying each equivalence that was
  # enabled through the corresponding "-v OPTION=1" variable.
  # The "erg_..." functions are not defined in this file;
  # presumably they come from "eva2erg.gawk" (see the usage above).

  str = (erase_ligatures ? erg_erase_ligatures(str) : str);
  str = (erase_plumes ? erg_erase_plumes(str) : str);
  str = (ignore_gallows_eyes ? erg_ignore_gallows_eyes(str) : str);
  str = (join_ei ? erg_join_ei(str) : str);
  str = (equate_aoy ? erg_equate_aoy(str) : str);
  str = (collapse_ii ? erg_collapse_ii(str) : str);
  str = (equate_eights ? erg_equate_eights(str) : str);
  str = (equate_pt ? erg_equate_pt(str) : str);
  str = (erase_q ? erg_erase_q(str) : str);
  str = (erase_word_spaces ? erg_erase_word_spaces(str) : erg_unify_word_spaces(str));
  return erg_pack(str);
}

function process_word(w, dic,    x, color)
{
  # Prints word "w" colorized according to the given "dic" table,
  # which maps reduced words to colors.  The line/paragraph
  # markers "-" and "=" are always printed in "textColor"; words
  # missing from "dic" are printed in "defaultColor".
  # Assumes the current color is "current_color".

  if ((w == "-") || (w == "="))
    { color = textColor; }
  else
    { x = reduce_word(w);
      if (x in dic) { color = dic[x]; } else { color = defaultColor; }
    }
  print_word(w, color);
}

function process_line(str, dic,    words, nw, i)
{
  # Prints line "str" with each word colorized according to the
  # given "dic" table; words are separated on output by one space.
  # Assumes "str" has been cleaned of comments, and
  # words are separated by spaces.
  # Assumes the current color is "current_color".

  # Trim the ends so that "split" produces no empty fields:
  gsub(/^ +/, "", str);
  gsub(/ +$/, "", str);
  # Only blanks separate words (not tabs), hence the explicit regex:
  nw = split(str, words, / +/);
  for (i = 1; i <= nw; i++)
    { if (i > 1) { printf " "; }
      process_word(words[i], dic);
    }
}

BEGIN {
  abort = 0;
  if (textColor == "") { error("must specify \"-v textColor=...\""); }
  if (defaultColor == "") { defaultColor = textColor; }
  current_color = textColor;

  if (verbose)
    { printf "options:\n" > "/dev/stderr";
      if (erase_ligatures) printf " erase_ligatures\n" > "/dev/stderr";
      if (erase_plumes) printf " erase_plumes\n" > "/dev/stderr";
      if (ignore_gallows_eyes) printf " ignore_gallows_eyes\n" > "/dev/stderr";
      if (join_ei) printf " join_ei\n" > "/dev/stderr";
      if (equate_aoy) printf " equate_aoy\n" > "/dev/stderr";
      if (collapse_ii) printf " collapse_ii\n" > "/dev/stderr";
      if (equate_eights) printf " equate_eights\n" > "/dev/stderr";
      if (equate_pt) printf " equate_pt\n" > "/dev/stderr";
      if (erase_q) printf " erase_q\n" > "/dev/stderr";
      if (erase_word_spaces) printf " erase_word_spaces\n" > "/dev/stderr";
    }

  if (colorTable == "") { error("must specify \"-v colorTable=FILE\"\n"); }

  # Start with an empty dictionary:
  split("", dic);

  # Read color table (lines whose first non-blank is "#" are comments):
  nMap = 0;
  while ((status = (getline lin < colorTable)) > 0)
    { if (! match(lin, /^ *[#]/))
        { nfld = split(lin, fld);
          if (nfld != 2) { error("bad colorTable entry = \"" lin "\""); }
          if (fld[1] in dic) { error("repeated key = \"" lin "\""); }
          dic[fld[1]] = fld[2];
          nMap++;
        }
    }
  # getline returns -1 when the file cannot be opened or read:
  if (status < 0) { error("cannot read colorTable \"" colorTable "\""); }
  close(colorTable);
  if (verbose)
    { printf "loaded %6d color table entries\n", nMap > "/dev/stderr"; }
}

# Comment lines are copied to the output (HTML-escaped) in "textColor".
/^#/ {
  if (abort) exit;
  txt = iso_to_html($0);
  print_word(txt, textColor);
  printf "\n";
  next;
}

# Non-empty text lines: print the location code, then the colorized words.
/./ {
  if (abort) exit;

  # Reset, so that the paragraph test below never sees the text
  # of a previous line when this line has only a location code:
  txt = "";

  # Extracts the location code:
  if (match($0, /^<[^>]*>/))
    { loc = sprintf("%-19s", substr($0, 1, RLENGTH));
      skip = RLENGTH;
    }
  else if (substr($0, 1, 1) == "<")
    { error("bad location code"); }
  else
    { loc = ("");
      skip = 0;
    }
  printf "%*s", indent, "";
  print_word(loc, textColor);

  if (skip < length($0))
    { txt = erg_erase_comments(substr($0, 1 + skip));
      # Erase EVA fillers:
      gsub(/[!%]/, "", txt);
      # Replace ".," by spaces
      gsub(/[.,]/, " ", txt);
      # Insert spaces around "-" and "="
      gsub(/[-]/, " - ", txt);
      gsub(/[=]/, " = ", txt);
      # Remove spurious spaces
      gsub(/^ */, "", txt);
      gsub(/ *$/, "", txt);
      gsub(/ +/, " ", txt);
      # Now process word by word:
      process_line(txt, dic);
    }
  printf "\n";
  # A line ending in "=" gets an extra blank line after it:
  if (substr(txt, length(txt), 1) == "=") { printf "\n"; }
  next;
}

END {
  # Close the <font> element left open by the last colored word, if any:
  if (current_color != textColor) { printf "</font>"; }
}

function error(msg)
{
  # Reports "msg" on stderr, tagged with the current input line
  # number, sets the global "abort" flag, and terminates the
  # script with exit status 1.  It does not return to the caller.

  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}