#! /usr/bin/gawk -f
# Last edited on 2001-01-02 01:54:46 by stolfi
#
# Replaces 8-bit weirdo codes by "*{&NNN}" where NNN is the decimal
# code of the weirdo.
#
# Also replaces any capitalized (ligated) EVA letter "C" by "c{&C}"
# where "c" is the lower-case equivalent.
#
# Numeric references already present in the text ("&NNN", "&#NNN",
# optionally followed by ";") are replaced by the table entry of the
# corresponding 8-bit character.
#
# Text inside "{...}" comments is copied through unchanged.
#
# NOTE(review): the keys of the recoding table are single bytes, so the
# script presumably has to run with byte (not multibyte) semantics,
# e.g. under LC_ALL=C -- confirm for the target environment.
#
# See also "remove-needless-capitalization"

BEGIN {
  abort = -1;

  # Recoding table: maps an input token (one character, or one character
  # followed by a quote mark) to its replacement string.

  # Capitalized letters:
  tbl["A"] = "a{&A}";
  tbl["E"] = "e{&E}";
  tbl["F"] = "f{&F}";
  tbl["H"] = "h{&H}";
  tbl["I"] = "i{&I}";
  tbl["K"] = "k{&K}";
  tbl["O"] = "o{&O}";
  tbl["P"] = "p{&P}";
  tbl["S"] = "s{&S}";
  tbl["R"] = "r{&R}";
  tbl["T"] = "t{&T}";
  tbl["Y"] = "y{&Y}";

  # Letters followed by a single or double quote mark:
  tbl["c'"]  = "s{&c'}";
  tbl["e'"]  = "s{&e'}";
  tbl["I'"]  = "r{&I'}";
  tbl["I\""] = "r{&I\"}";
  tbl["o'"]  = "o{&o'}";
  tbl["O'"]  = "o{&O'}";
  tbl["O\""] = "o{&O\"}";
  tbl["q\""] = "q{&q\"}";
  tbl["y'"]  = "y{&y'}";

  # 8-bit codes; the key is the octal escape, the tag is the decimal code:
  tbl["\202"] = "*{&130}";
  tbl["\203"] = "*{&131}";
  tbl["\204"] = "*{&132}";
  tbl["\205"] = "*{&133}";
  tbl["\206"] = "*{&134}";
  tbl["\207"] = "*{&135}";
  tbl["\210"] = "*{&136}";
  tbl["\211"] = "k{&137}";
  tbl["\212"] = "*{&138}";
  tbl["\213"] = "*{&139}";
  tbl["\214"] = "*{&140}";
  tbl["\215"] = "*{&141}";
  tbl["\216"] = "*{&142}";
  tbl["\217"] = "r{&143}";
  tbl["\220"] = "*{&144}";
  tbl["\221"] = "*{&145}";
  tbl["\222"] = "*{&146}";
  tbl["\223"] = "*{&147}";
  tbl["\224"] = "*{&148}";
  tbl["\225"] = "*{&149}";
  tbl["\226"] = "c{&150}";
  tbl["\227"] = "*{&151}";
  tbl["\230"] = "*{&152}";
  tbl["\231"] = "*{&153}";
  tbl["\232"] = "*{&154}";
  tbl["\233"] = "*{&155}";
  tbl["\234"] = "*{&156}";
  tbl["\235"] = "*{&157}";
  tbl["\236"] = "*{&158}";
  tbl["\237"] = "*{&159}";
  # "\240"=&160 is the non-breaking space; it has no table entry
  tbl["\241"] = "*{&161}"; # "¡"
  tbl["\242"] = "k{&162}"; # "¢"
  tbl["\243"] = "*{&163}"; # "£"
  tbl["\244"] = "r{&164}"; # "¤"
  tbl["\245"] = "m{&165}"; # "¥"
  tbl["\246"] = "*{&166}"; # "¦"
  tbl["\247"] = "q{&167}"; # "§"
  tbl["\250"] = "*{&168}"; # "¨"
  tbl["\251"] = "*{&169}"; # "©"
  tbl["\252"] = "*{&170}"; # "ª"
  tbl["\253"] = "*{&171}"; # "«"
  tbl["\254"] = "r{&172}"; # "¬"
  tbl["\255"] = "y{&173}"; # (soft hyphen)
  tbl["\256"] = "*{&174}"; # "®"
  tbl["\257"] = "*{&175}"; # "¯"
  tbl["\260"] = "k{&176}"; # "°"
  tbl["\261"] = "p{&177}"; # "±"
  tbl["\262"] = "t{&178}"; # "²"
  tbl["\263"] = "p{&179}"; # "³"
  tbl["\264"] = "p{&180}"; # "´"
  tbl["\265"] = "p{&181}"; # "µ"
  tbl["\266"] = "*{&182}"; # "¶"
  tbl["\267"] = "*{&183}"; # "·"
  tbl["\270"] = "f{&184}"; # "¸"
  tbl["\271"] = "x{&185}"; # "¹"
  tbl["\272"] = "p{&186}"; # "º"
  tbl["\273"] = "*{&187}"; # "»"
  tbl["\274"] = "*{&188}"; # "¼"
  tbl["\275"] = "y{&189}"; # "½"
  tbl["\276"] = "*{&190}"; # "¾"
  tbl["\277"] = "*{&191}"; # "¿"
  tbl["\300"] = "*{&192}"; # "À"
  tbl["\301"] = "*{&193}"; # "Á"
  tbl["\302"] = "*{&194}"; # "Â"
  tbl["\303"] = "*{&195}"; # "Ã"
  tbl["\304"] = "*{&196}"; # "Ä"
  tbl["\305"] = "*{&197}"; # "Å"
  tbl["\306"] = "*{&198}"; # "Æ"
  tbl["\307"] = "*{&199}"; # "Ç"
  tbl["\310"] = "h{&200}"; # "È"
  tbl["\311"] = "d{&201}"; # "É"
  # Octal 312 is decimal 202.  The tag must be unique per character,
  # because the reverse table "oct" below is keyed by it.
  tbl["\312"] = "t{&202}"; # "Ê"
  tbl["\313"] = "*{&203}"; # "Ë"
  tbl["\314"] = "*{&204}"; # "Ì"
  # "\315"=&205 not in use
  tbl["\316"] = "*{&206}"; # "Î"
  tbl["\317"] = "*{&207}"; # "Ï"
  tbl["\320"] = "*{&208}"; # "Ð"
  tbl["\321"] = "*{&209}"; # "Ñ"
  tbl["\322"] = "*{&210}"; # "Ò"
  tbl["\323"] = "*{&211}"; # "Ó"
  tbl["\324"] = "*{&212}"; # "Ô"
  tbl["\325"] = "*{&213}"; # "Õ"
  tbl["\326"] = "*{&214}"; # "Ö"
  tbl["\327"] = "t{&215}"; # "×"
  tbl["\330"] = "*{&216}"; # "Ø"
  # "\331"=&217 to "\373"=&251 not in use
  tbl["\374"] = "*{&252}"; # "ü"
  tbl["\375"] = "*{&253}"; # "ý"
  tbl["\376"] = "*{&254}"; # "þ"
  tbl["\377"] = "*{&255}"; # "ÿ"

  # Reverse table: oct["NNN"] is the 8-bit character whose replacement
  # carries the three-digit decimal tag NNN.  Entries whose tag is not
  # three digits (the letter entries above) are skipped.
  # The braces are written as bracket expressions so that they cannot be
  # taken for an interval expression.
  for (c in tbl) {
    d = tbl[c];
    if (match(d, /^.[{][&][0-9][0-9][0-9][}]$/)) {
      d = substr(d,4,3);
      oct[d] = c;
    }
  }
}

# Stop reading input once an error has been flagged.
// {
  if (abort >= 0) { exit abort; }
}

# "#"-comment lines are copied unchanged.
/^[#]/ {
  print;
  next;
}

/^[<]f[0-9]+[rv]?[0-6]?(|[.][A-Za-z][A-Za-z0-9]?)[>]/ {
  # Page/unit header lines
  print;
  next;
}

/^[<]/ {
  # Normal EVMT-format text lines
  lin = $0;
  p = index(lin, ">");
  if (p == 0) { format_error("unmatched `<'"); }
  # Split into the "<...>" locator and the text that follows it.
  loc = substr(lin,1,p);
  txt = substr(lin, p+1);
  # Drop the blanks between locator and text; the printf re-pads them.
  sub(/^[ ]*/, "", txt);
  printf "%-18s %s\n", loc, basify_text(txt);
  next;
}

// {
  # Non-EVMT text lines
  print basify_text($0);
}

function basify_text(txt,   k,cmt,chunk,res)
{
  # Returns "txt" with every stretch outside "{...}" comments recoded by
  # "basify_chunk"; the comments themselves are copied verbatim.
  # Calls "format_error" (which exits) on a "{" without a matching "}".
  res = "";
  while (txt != "") {
    # break off from "txt" a comment-free chunk and the following comment:
    k = index(txt, "{");
    if (k == 0) {
      chunk = txt; cmt = ""; txt = "";
    } else {
      chunk = substr(txt, 1, k-1);
      txt = substr(txt, k);
      k = index(txt, "}");
      if (k == 0) { format_error("unclosed `{'"); }
      cmt = substr(txt, 1, k);
      txt = substr(txt, k+1);
    }
    # Basify the chunk and add it to the result.  Chunks made only of
    # these characters are passed through without a call.
    if (match(chunk, /[^-.,=*a-z]/)) { chunk = basify_chunk(chunk); }
    res = (res chunk cmt);
  }
  return (res);
}

function basify_chunk(txt,   n,i,c,d,skip,res)
{
  # Returns the comment-free string "txt" with table tokens replaced:
  #   - a character outside "\040".."\176" becomes its "tbl" entry
  #     (error if it has none);
  #   - "&NNN", "&#NNN", optionally followed by ";", becomes the "tbl"
  #     entry of the character with decimal tag NNN (error if none);
  #   - any other character, joined with a directly following quote
  #     mark (' or "), is replaced if the result is a "tbl" key and
  #     copied as is otherwise.
  # Errors go through "format_error", which exits.

  # Nothing to do when the chunk has only characters of this set.
  if (! match(txt, /[^-a-vx-z,.=*?!%]/)) { return txt; }
  res = "";
  n = length(txt);
  for (i=1;i<=n;i++) {
    c = substr(txt,i,1);
    if ((c < "\040") || (c > "\176")) {
      # Character outside the "\040".."\176" range.
      if (c in tbl) {
        c = tbl[c];
      } else {
        format_error("invalid 8-bit char");
      }
    } else if ((c == "&") && match(substr(txt,i), /^[&][#]?([0-2][0-9][0-9])[;]?/)) {
      # Numeric reference: "skip" is what follows the "&".
      skip = RLENGTH-1;
      c = substr(txt,i+1,skip);
      gsub(/[#]/, "", c);
      gsub(/[;]$/, "", c);
      if (c in oct) {
        c = tbl[oct[c]];
        i += skip;
      } else {
        format_error("invalid &-code");
      }
    } else {
      # Join the character with a quote mark that directly follows it.
      if ((i < n) && (index("'\"", (d = substr(txt,i+1,1))) != 0)) {
        c = (c d);
        i++;
      }
      if (c in tbl) { c = tbl[c]; }
    }
    res = (res c);
  }
  return (res);
}

function format_error(msg)
{
  # Reports "msg" with the current input file and line on stderr, then
  # terminates with status 1.
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function arg_error(msg)
{
  # Reports "msg" on stderr, then terminates with status 1.
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}