#! /bin/gawk -f
# Last edited on 2012-05-05 19:53:44 by stolfilocal

# Reads a text file from standard input and rewrites one of its
# whitespace-separated fields so that it can be typeset with TeX
# (with proper fonts).
#
# Assumes that field number FIELD of the input is a
# word in Chinese pinyin encoding, with numeric suffix tones,
# possibly factored into letters by braces "{}",
# and possibly followed by "." and a numeric disambiguating code.
# Allows for both the "ü"/"ê" and the "u:"/"e^" conventions.
# Adds `\' in front of some characters, turns the special letters
# into TeX accent groups, and wraps the tone and code in a "\tn" tag.
#
# Blank lines and lines that start with "#" (after optional blanks)
# are copied to the output unchanged.

BEGIN {
  # {abort} stays negative until an error is reported.
  abort = -1;
  usage = ( "reencode-words-pinyin-for-tex \\\n" \
    " [ -v field=NUM ] \\\n" \
    " < INFILE.wct > OUTFILE.tex" \
  );

  if (field == "") { arg_error("must specify \"field\""); }
}

# Stop with the recorded status if an error was already reported.
(abort >= 0) { exit abort; }

# Comment lines and blank lines are passed through untouched.
/^ *([#]|$)/ { print; next; }

# Any other line: validate and re-encode field number {field}.
/./ {
  if (NF < field) { data_error("bad NF"); }
  w = $(field);
  # Multibyte letters are listed as alternatives (not inside a bracket
  # expression) so that they match as whole sequences in any locale.
  # "ë"/"Ë" are still accepted, as before, but are not converted.
  if (w !~ /^([-*{}a-zA-Z0-9?:^.]|ü|ê|ë|Ü|Ê|Ë)+$/) {
    data_error(("bad word \"" w "\""));
  }
  w = reencode_pinyin_for_tex(w);
  # Assigning to a field makes awk rebuild the line with OFS separators.
  $(field) = w;
  print;
  next;
}

function reencode_pinyin_for_tex(w,    tone, code, tag)
{
  # Returns the pinyin word {w} re-encoded for TeX.
  #
  # The result is the word body, with special characters protected
  # and special letters turned into accent groups, followed by
  # a tag "\tn{TONE}{CODE}" where TONE is the trailing tone digit
  # (or empty) and CODE is the numeric code after a final "." (or empty).
  # A word that has neither tone nor code gets "\tn{}{}", unless it
  # ends with a literal brace, in which case no tag is added.
  # NOTE(review): the brace exception is kept from the previous
  # version; confirm that it is intended.
  #
  # {tone}, {code} and {tag} are local variables, not parameters.

  # Split off the tone and code from the raw word.  This is done before
  # the accent groups are inserted, so that the "}" that closes a final
  # accent group is not mistaken for a literal brace of the input.
  tag = "";
  if (match(w, /[0-9][.][0-9]+$/)) {
    # Tone digit, ".", code.
    tone = substr(w, RSTART, 1);
    code = substr(w, RSTART + 2);
    w = substr(w, 1, RSTART - 1);
    tag = ("\\tn{" tone "}{" code "}");
  } else if (match(w, /[.][0-9]+$/)) {
    # Code but no tone.
    code = substr(w, RSTART + 1);
    w = substr(w, 1, RSTART - 1);
    tag = ("\\tn{}{" code "}");
  } else if (match(w, /[0-9]$/)) {
    # Tone but no code.
    tone = substr(w, RSTART, 1);
    w = substr(w, 1, RSTART - 1);
    tag = ("\\tn{" tone "}{}");
  } else if (w ~ /[^{}]$/) {
    # Neither tone nor code.
    tag = "\\tn{}{}";
  }

  # Protect special characters:
  gsub(/[{]/, "\\{", w);
  gsub(/[}]/, "\\}", w);
  gsub(/[#]/, "\\#", w);   # Just in case
  # In a replacement, "\\&" means a literal "&"; the doubled
  # backslash is needed to get a backslash before the matched "&".
  gsub(/[&]/, "\\\\&", w); # Just in case
  gsub(/[$]/, "\\$", w);   # Just in case
  gsub(/[%]/, "\\%", w);   # Just in case

  # Change special letters to TeX accents.  This must come after the
  # brace protection above, so that the new braces are not escaped.
  gsub(/e[\^]|ê/, "{\\^e}", w);
  gsub(/u:|ü/, "{\\\"u}", w);
  gsub(/E[\^]|Ê/, "{\\^E}", w);
  gsub(/U:|Ü/, "{\\\"U}", w);

  # Append the tone and code tag for possible special processing:
  return (w tag);
}

function arg_error(msg)
{
  # Reports a command line error {msg} and the usage text
  # to the standard error output, then exits with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input data error {msg}, with the current line number,
  # to the standard error output, then exits with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}