#! /usr/bin/gawk -f
# Last edited on 1998-12-08 06:10:53 by stolfi
#
# A filter that removes purely aesthetic EVA capitalization (where the
# ligature is implied by a subsequent "h" character) from an EVMT
# format file.
#
# Each input line is handled by the first rule below that claims it:
#   - lines starting with "#" and page/unit header lines are copied
#     through unchanged;
#   - lines starting with "<" are split at the first ">" into a locator
#     and the text; the text is decapitalized and the line is re-emitted
#     with the locator left-justified in an 18-column field;
#   - any other line is decapitalized as a whole.
# Text inside "{...}" inline comments is never modified.

BEGIN { abort = -1; }

# Stop with the recorded status once an error has been flagged.
// {
  if (abort >= 0) { exit abort; }
}

# "#" comment lines: copy through untouched.
/^[#]/ {
  print;
  next;
}

# Page/unit header lines: copy through untouched.
/^[<]f[0-9]+[rv]?[0-6]?(|[.][A-Za-z][A-Za-z0-9]?)[>]/ {
  print;
  next;
}

# Normal EVMT-format text lines: "<locator>" followed by the text.
/^[<]/ {
  close_pos = index($0, ">");
  if (close_pos == 0) { format_error("unmatched `<'"); }
  locator = substr($0, 1, close_pos);
  body = substr($0, close_pos + 1);
  # Drop blanks between the locator and the text; the pattern is
  # anchored, so a single substitution is all that can ever apply.
  sub(/^[ ]*/, "", body);
  printf "%-18s %s\n", locator, cleanup(body);
  next;
}

# Non-EVMT text lines: decapitalize the whole line.
// {
  print cleanup($0);
}

# cleanup(txt): returns "txt" with the ligature capitals lowered in every
# stretch of text outside "{...}" comments.  The comments themselves are
# copied verbatim.  A "{" without a matching "}" is reported through
# format_error, which terminates the program.
# (out, brace, piece, note are local variables.)
function cleanup(txt,    out, brace, piece, note)
{
  out = "";
  while (txt != "") {
    # Peel off the comment-free piece at the front of "txt", together
    # with the comment (if any) that immediately follows it.
    brace = index(txt, "{");
    if (brace == 0) {
      piece = txt;
      note = "";
      txt = "";
    } else {
      piece = substr(txt, 1, brace - 1);
      txt = substr(txt, brace);
      brace = index(txt, "}");
      if (brace == 0) { format_error("unclosed `{'"); }
      note = substr(txt, 1, brace);
      txt = substr(txt, brace + 1);
    }
    out = (out decapitalize(piece) note);
  }
  return (out);
}

# decapitalize(chunk): returns "chunk" (which must be comment-free) with
# every capital that is tied to a later "h" replaced by its lower-case
# form.  The rewriting is repeated until a pass changes nothing, because
# lowering one capital can expose another one further to the left
# (e.g. "CTh" -> "Cth" -> "cth").
# (before is a local variable.)
function decapitalize(chunk,    before)
{
  before = "";
  while (chunk != before) {
    before = chunk;

    # Most common cases: the capital sits right next to the "h".
    gsub(/[H]h/, "hh", chunk);
    gsub(/[C]h/, "ch", chunk);
    gsub(/[S]h/, "sh", chunk);
    if (chunk != before) { continue; }

    # More complicated cases: filler characters (and, for C/S/I,
    # lower-case gallows letters) may separate the capital from the "h".
    chunk = gensub(/[H]([!%]*)h/, "h\\1h", "g", chunk);
    chunk = gensub(/[T]([!%]*)h/, "t\\1h", "g", chunk);
    chunk = gensub(/[K]([!%]*)h/, "k\\1h", "g", chunk);
    chunk = gensub(/[P]([!%]*)h/, "p\\1h", "g", chunk);
    chunk = gensub(/[F]([!%]*)h/, "f\\1h", "g", chunk);
    chunk = gensub(/[C]([!%ktpf]*)h/, "c\\1h", "g", chunk);
    chunk = gensub(/[S]([!%ktpf]*)h/, "s\\1h", "g", chunk);
    chunk = gensub(/[I]([!%ktpf]*)h/, "i\\1h", "g", chunk);
  }
  return (chunk);
}

# format_error(msg): reports a problem with the current input line on
# standard error, prefixed by file name and line number, then terminates
# with status 1.
function format_error(msg)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

# arg_error(msg): reports a bare message on standard error, then
# terminates with status 1.
function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}