#! /bin/gawk -f
# Last edited on 2004-05-26 07:53:20 by stolfi

# Usage: see the `usage` string in the BEGIN rule.
#
# Assumes that field number `field` of each input record is a word in
# Koi8r encoding, possibly factored into letters by braces "{}".
#
# Blank lines and lines whose first non-blank character is "#" are copied
# through unchanged.  For every other record the selected field is
# validated, passed through reencode_koi8r_for_tex(), stored back into
# the record, and the record is printed.
#
# The reencoding currently only puts a backslash in front of characters
# that are special to TeX; the mapping of Koi8r codes to TeX cyrillic
# sequences is still a commented-out sketch (see the function below).

BEGIN {
  # `abort` stays negative until an error handler sets an exit status.
  abort = -1;
  usage = ( "reencode-words-koi8r-for-tex \\\n" \
    " [ -v field=NUM ] \\\n" \
    " < INFILE.wct > OUTFILE.tex" \
  );

  if (field == "") { arg_error("must specify \"field\""); }
}

# Once an error has been recorded, stop with the recorded status.
(abort >= 0) { exit abort; }

# Comment lines and blank lines are passed through untouched.
/^ *([#]|$)/ { print; next; }

# Data records: validate and reencode the selected field.
/./ {
  if (NF < field) { data_error("bad NF"); }
  w = $(field);
  # Accept Koi8r letters plus the braces that may factor the word into
  # letters; without "{}" here the documented brace-factored input would
  # be rejected before the brace escaping below could ever run.
  if (w !~ /^[{}£ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß³àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ]+$/) {
    data_error(("bad word \"" w "\""));
  }
  w = reencode_koi8r_for_tex(w);
  $(field) = w;
  print;
  next;
}

# reencode_koi8r_for_tex(wd)
#   wd: the word to reencode.
#   Returns a copy of `wd` with a backslash inserted in front of each of
#   the characters { } # & $ %.
#   Calls data_error() (which exits) if the result contains the "·"
#   marker character.
#   The second parameter `w` is not an argument: it is the awk idiom for
#   a local variable, so the caller's global `w` is left alone.
function reencode_koi8r_for_tex(wd,    w)
{
  w = wd;

  # Protect special characters:
  gsub(/[{]/, "\\{", w);
  gsub(/[}]/, "\\}", w);
  gsub(/[#]/, "\\#", w);   # Just in case
  # In a gsub replacement "\\&" means a literal "&" (i.e. no change);
  # "\\\\&" is a literal backslash followed by the matched text.
  gsub(/[&]/, "\\\\&", w); # Just in case
  gsub(/[$]/, "\\$", w);   # Just in case
  gsub(/[%]/, "\\%", w);   # Just in case

  # Do nothing for now
  # # Put a marker in front of each letter to identify unmapped ones:
  # w = gensub(/(.)/, "·\\1", "g", w);
  # # Map lowercase letters to TeX escapes:
  # gsub(/[·][a]/, "\\alpha", w);
  # gsub(/[·][b]/, "\\beta", w);
  # gsub(/[·][c][·][h]/, "\\chi", w);
  # gsub(/[·][ç]/, "\\psi", w);
  # ....
  # gsub(/[·][X]/, "\\Xi", w);
  # gsub(/[·][Z]/, "\\Zeta", w);

  # Sanity check: a leftover marker means some letter was not mapped.
  if (w ~ /[·]/) { data_error(("bad word \"" wd "\"")); }
  return w;
}

# arg_error(msg)
#   Prints `msg` and the usage text to /dev/stderr, records exit status 1
#   in `abort`, and exits.
function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

# data_error(msg)
#   Prints `msg` prefixed with the current input line number (FNR) to
#   /dev/stderr, records exit status 1 in `abort`, and exits.
function data_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}