#! /usr/bin/gawk -f

# Usage:
#
#   cat INFILE.evt \
#     | colorize-text -f word-equiv.gawk \
#         -v colorTable=COLORTABLE \
#         [-v missing=MISSCOLOR] \
#         [-v default=DEFCOLOR] \
#         [EQUIVOPTIONS] \
#     > OUTFILE.html
#
# This script turns an EVA text INFILE.evt into an HTML file with
# colorized words.  Each word is mapped by weq_reduce and looked up in
# a user-provided color dictionary COLORTABLE.
#
# The input may be in EVT format (with location code in columns 1-19)
# or in pure text format.
#
# Lines are separated on output by "\n", or "\n\n" after a "=".  This
# is OK if the output is to be inserted in a <pre>...</pre>
# environment; in other contexts, it may be necessary to insert a
# <br> at the end of each line.
#
# The COLORTABLE file should have entries PATTERN COLOR, where PATTERN
# is an EVA string and COLOR is an HTML color (six hexadecimal
# digits).
#
# If a word is not found in the table, it is set in MISSCOLOR (a
# six-digit hex string, 000000 if not specified).  The script assumes
# that words without explicit color will be shown in DEFCOLOR (000000).
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of word-equiv.gawk, and BOOL is 0 or 1.
#
# External names used but not defined here: weq_reduce,
# weq_erase_comments and error.  They are presumably supplied by
# word-equiv.gawk and/or the colorize-text driver -- TODO confirm.
# The "abort" flag tested by the rules below is only ever cleared in
# this file; presumably error() sets it.
#
# NOTE(review): "default" is a reserved word in gawk builds that
# support the switch statement (standard since gawk 4.0, as far as I
# know).  Such builds reject both the identifier and "-v default=..."
# unless run in traditional mode.  Renaming the variable would change
# the command-line interface, so it is left untouched here; confirm
# the gawk version this script is meant to run under.

function iso_to_html(str)
{
  # Converts an ISO Latin-1 string to HTML by protecting the
  # characters [<>&].  Returns the escaped copy.
  #
  # "&" must be handled first so that the entities produced for
  # "<" and ">" are not escaped again.  In the replacement text
  # "\\&" denotes a literal ampersand (a bare "&" would stand for
  # the matched text).
  gsub(/&/, "\\&amp;", str);
  gsub(/</, "\\&lt;", str);
  gsub(/>/, "\\&gt;", str);
  return str;
}

function print_word(w, color)
{
  # Prints word "w" (HTML-escaped) in the given color.
  # Assumes the current color is the global "current_color" and
  # updates it.  Text in the "default" color is printed bare; any
  # other color is wrapped in a font element that stays open until
  # the color changes.
  #
  # NOTE(review): the tag literals were missing from the copy of this
  # script that was reviewed; the <font color="#RRGGBB"> ... </font>
  # markup below is a reconstruction -- confirm against the original.
  if (color != current_color)
    {
      if (current_color != default) { printf "</font>"; }
      if (color != default) { printf "<font color=\"#%s\">", color; }
      current_color = color;
    }
  printf "%s", iso_to_html(w);
}

function process_word(w, dic,   x, color)
{
  # Prints word "w" colorized according to the given "dic" table
  # (reduced word -> color).  The separators "-" and "=" are always
  # printed in the default color; words absent from "dic" are printed
  # in the "missing" color.
  # Assumes the current color is "current_color".
  if ((w == "-") || (w == "="))
    { color = default; }
  else
    {
      x = weq_reduce(w);
      if (x in dic) { color = dic[x]; } else { color = missing; }
    }
  print_word(w, color);
}

function process_line(str, dic,   k, kb, m, n, b, c)
{
  # Prints line "str" with each word colorized according to the
  # given "dic" table, with one blank between consecutive words.
  # Assumes "str" has been cleaned of comments, and
  # words are separated by spaces.
  # Assumes the current color is "current_color".

  # Pad with blanks so every word has a blank on both sides; the
  # scan below then only has to look for blank/non-blank transitions.
  str = (" " str " ");
  m = length(str);
  n = 0;   # Number of words printed so far.
  b = substr(str, 1, 1);
  if (b != " ") { error("internal padding error"); exit; }
  for (k = 2; k <= m; k++)
    {
      c = substr(str, k, 1);
      # Blank -> non-blank: a word starts at position k.
      if ((b == " ") && (c != " ")) { kb = k; }
      # Non-blank -> blank: the word occupies positions kb..k-1.
      if ((b != " ") && (c == " "))
        {
          if (n > 0) { printf " "; }
          process_word(substr(str, kb, k - kb), dic);
          n++;
        }
      b = c;
    }
  if (c != " ") { error("internal padding error"); exit; }
}

BEGIN {
  abort = 0;
  if (default == "") { default = "000000"; }
  if (missing == "") { missing = "000000"; }

  split("", dic);
  nMap = 0;
  if (colorTable == "")
    { error("must specify \"-v colorTable=FILE\"\n"); }
  else
    {
      # Read color table: exactly two fields per line, PATTERN COLOR.
      while ((status = (getline lin < colorTable)) > 0)
        {
          split(lin, fld);
          if ((3 in fld) || ! (2 in fld))
            { error("bad colorTable entry = \"" lin "\""); }
          if (fld[1] in dic)
            { error("repeated key = \"" lin "\""); }
          dic[fld[1]] = fld[2];
          nMap++;
        }
      # getline yields a negative value when the file cannot be read;
      # report that instead of silently running with an empty table.
      if (status < 0)
        { error("cannot read colorTable \"" colorTable "\""); }
      close(colorTable);
    }
  printf "loaded %6d color table entries\n", nMap > "/dev/stderr";
  current_color = default;
}

# Comment lines are copied through in the default color.
/^#/ {
  if (abort) exit;
  # print_word does the HTML escaping itself; escaping here as well
  # would turn "&" into "&amp;amp;".
  print_word($0, default);
  printf "\n";
  next;
}

# Non-empty text lines, with or without a leading <...> location code.
/./ {
  if (abort) exit;
  # Reset per-line state so nothing leaks in from the previous line.
  txt = "";
  loc = "";
  skip = 0;
  # Extracts the location code:
  if (match($0, /^<[^>]*>/))
    {
      loc = sprintf("%-19s", substr($0, 1, RLENGTH));
      skip = RLENGTH;
    }
  else if (substr($0, 1, 1) == "<")
    { error("bad location code"); }
  print_word(loc, default);
  if (skip < length($0))
    {
      txt = weq_erase_comments(substr($0, 1 + skip));
      # Erase EVA fillers:
      gsub(/[!%]/, "", txt);
      # Replace ".," by spaces
      gsub(/[.,]/, " ", txt);
      # Insert spaces around "-" and "="
      gsub(/[-]/, " - ", txt);
      gsub(/[=]/, " = ", txt);
      # Remove spurious spaces
      gsub(/^ */, "", txt);
      gsub(/ *$/, "", txt);
      # One or more blanks; a pattern that can match the empty string
      # would insert a blank between every pair of characters.
      gsub(/ +/, " ", txt);
      # Now process word by word:
      process_line(txt, dic);
    }
  printf "\n";
  # A trailing "=" gets an extra blank line.
  if (substr(txt, length(txt), 1) == "=") { printf "\n"; }
  next;
}

END {
  # Close the font element left open by the last colored word, if any.
  if (current_color != default) { printf "</font>"; }
}