#! /bin/gawk -f
# Last edited on 2004-02-18 10:38:26 by stolfi

# Reads records from standard input, re-encodes one whitespace-separated
# field (selected with "-v field=NUM") from an ad-hoc Latin-letter Greek
# encoding into TeX math-greek control sequences, and prints the record.
# Blank lines and lines starting with "#" are copied through unchanged.

BEGIN {
  abort = -1;
  usage = ( "reencode-words-greek-for-tex \\\n" \
    " [ -v field=NUM ] \\\n" \
    " < INFILE.wct > OUTFILE.tex" \
  );

  # Assumes that field number FIELD of the input is a
  # word in ad-hoc Greek encoding,
  # possibly factored into letters by braces "{}"
  # Allows both expanded and condensed encodings:
  #
  #   eh ë (eta)    th ð (theta)   ch q (chi)
  #   ph f (phi)    ow ô (omega)   ps ç ß (psi)
  #
  # (The table lists "ß" for psi, but the input check and the mapping
  # below only accept "ç".)
  #
  # Does not allow for breathings, final sigma, or other diacritics.
  # Adds `\' in front of braces, maps letters to TeX math-greek sequences
  # (Should use Greek font instead, but...).
  #
  # NOTE(review): the non-ASCII letters and the "·" marker are matched
  # character-wise; this presumably requires the script, the input and
  # the locale to agree on one encoding -- confirm.

  if (field == "") { arg_error("must specify \"field\""); }
  # A non-numeric or zero value would make $(field) refer to the whole record.
  if ((field !~ /^[0-9]+$/) || (field + 0 < 1))
    { arg_error("\"field\" must be a positive integer"); }
}

# Safety net: stop at once if an error has already been flagged.
(abort >= 0) { exit abort; }

# Comment lines and blank lines are copied verbatim.
/^ *([#]|$)/ { print; next; }

# Data lines: validate the selected field, re-encode it, print the record.
# Assigning to $(field) makes awk rebuild the record with OFS.
/./ {
  if (NF < field) { data_error("bad NF"); }
  w = $(field);
  if (w !~ /^[-*{}a-ik-uwxzëôðçA-IK-UWXZËÔÐÇ]+$/)
    { data_error(("bad word \"" w "\"")); }
  w = reencode_greek_for_tex(w);
  $(field) = w;
  print;
  next;
}

function reencode_greek_for_tex(wd,    w)
{
  # Returns the word {wd} with every encoded Greek letter replaced by a
  # TeX control sequence ("a" -> "\alpha", "th" or "ð" -> "\theta", ...)
  # and with the TeX-special characters "{", "}", "#", "&", "$", "%"
  # escaped by a backslash.  The characters "-" and "*" pass through.
  # Calls data_error (which exits) if any character remains unmapped,
  # e.g. an "h" that is not part of a digraph.
  # {w} is a local work variable, not an argument.
  #
  # NOTE(review): "\omicron" and several capitals emitted below ("\Alpha",
  # "\Beta", "\Eta", ...) are presumably macros supplied by the target
  # document rather than standard TeX names -- confirm.

  # Put a marker in front of each letter to identify unmapped ones.
  # Punctuation that is passed through or escaped must NOT be marked,
  # otherwise its marker would never be consumed and the sanity check
  # at the end would reject every word containing braces, "-" or "*".
  # Marking happens before escaping so that the added backslashes are
  # not marked either.
  w = gensub(/([^-*{}#&$%])/, "·\\1", "g", wd);

  # Protect special characters:
  gsub(/[{]/, "\\{", w);
  gsub(/[}]/, "\\}", w);
  gsub(/[#]/, "\\#", w);    # Just in case
  gsub(/[&]/, "\\\\&", w);  # Just in case; "\\&" would yield a bare "&"
  gsub(/[$]/, "\\$", w);    # Just in case
  gsub(/[%]/, "\\%", w);    # Just in case

  # Map lowercase letters to TeX escapes.
  # Digraphs must be tried before their first letter alone.
  gsub(/[·][a]/, "\\alpha", w);
  gsub(/[·][b]/, "\\beta", w);
  gsub(/[·][c][·][h]/, "\\chi", w);
  gsub(/[·][ç]/, "\\psi", w);
  gsub(/[·][d]/, "\\delta", w);
  gsub(/[·][e][·][h]/, "\\eta", w);
  gsub(/[·][e]/, "\\epsilon", w);
  gsub(/[·][ë]/, "\\eta", w);
  gsub(/[·][f]/, "\\phi", w);   # "f" is the condensed form of "ph" (phi)
  gsub(/[·][g]/, "\\gamma", w);
  gsub(/[·][i]/, "\\iota", w);
  gsub(/[·][k]/, "\\kappa", w);
  gsub(/[·][l]/, "\\lambda", w);
  gsub(/[·][m]/, "\\mu", w);
  gsub(/[·][n]/, "\\nu", w);
  gsub(/[·][o][·][w]/, "\\omega", w);
  gsub(/[·][o]/, "\\omicron", w);
  gsub(/[·][ô]/, "\\omega", w);
  gsub(/[·][p][·][h]/, "\\phi", w);
  gsub(/[·][p][·][s]/, "\\psi", w);
  gsub(/[·][p]/, "\\pi", w);
  gsub(/[·][q]/, "\\chi", w);
  gsub(/[·][r]/, "\\rho", w);
  gsub(/[·][s]/, "\\sigma", w);
  gsub(/[·][t][·][h]/, "\\theta", w);
  gsub(/[·][t]/, "\\tau", w);
  gsub(/[·][ð]/, "\\theta", w);
  gsub(/[·][u]/, "\\upsilon", w);
  gsub(/[·][x]/, "\\xi", w);
  gsub(/[·][z]/, "\\zeta", w);

  # Ditto for uppercase (second letter of a digraph may be either case):
  gsub(/[·][A]/, "\\Alpha", w);
  gsub(/[·][B]/, "\\Beta", w);
  gsub(/[·][C][·][hH]/, "\\Chi", w);
  gsub(/[·][Ç]/, "\\Psi", w);
  gsub(/[·][D]/, "\\Delta", w);
  gsub(/[·][E][·][hH]/, "\\Eta", w);
  gsub(/[·][E]/, "\\Epsilon", w);
  gsub(/[·][Ë]/, "\\Eta", w);
  gsub(/[·][F]/, "\\Phi", w);   # "F" is the condensed form of "Ph" (Phi)
  gsub(/[·][G]/, "\\Gamma", w);
  gsub(/[·][I]/, "\\Iota", w);
  gsub(/[·][K]/, "\\Kappa", w);
  gsub(/[·][L]/, "\\Lambda", w);
  gsub(/[·][M]/, "\\Mu", w);
  gsub(/[·][N]/, "\\Nu", w);
  gsub(/[·][O][·][wW]/, "\\Omega", w);
  gsub(/[·][O]/, "\\Omicron", w);
  gsub(/[·][Ô]/, "\\Omega", w);
  gsub(/[·][P][·][hH]/, "\\Phi", w);
  gsub(/[·][P][·][sS]/, "\\Psi", w);
  gsub(/[·][P]/, "\\Pi", w);
  gsub(/[·][Q]/, "\\Chi", w);
  gsub(/[·][R]/, "\\Rho", w);
  gsub(/[·][S]/, "\\Sigma", w);
  gsub(/[·][T][·][hH]/, "\\Theta", w);
  gsub(/[·][T]/, "\\Tau", w);
  gsub(/[·][Ð]/, "\\Theta", w);
  gsub(/[·][U]/, "\\Upsilon", w);
  gsub(/[·][X]/, "\\Xi", w);
  gsub(/[·][Z]/, "\\Zeta", w);

  # Sanity check: a surviving marker means some character was not mapped.
  if (w ~ /[·]/) { data_error(("bad word \"" wd "\"")); }
  return w;
}

function arg_error(msg)
{
  # Reports a command-line error {msg} plus the usage text on stderr,
  # flags the failure in {abort}, and exits with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input-data error {msg}, tagged with the current line
  # number, on stderr; flags the failure in {abort}; exits with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}