#! /bin/gawk -f
# Last edited on 2004-05-26 07:53:20 by stolfi

BEGIN {
  # Exit status requested by an error routine; negative means "no error yet".
  abort = -1;
  usage = ( "reencode-words-koi8r-for-tex \\\n" \
    "  [ -v field=NUM ] \\\n" \
    "  < INFILE.wct > OUTFILE.tex" \
  );

  # Assumes that field number FIELD of each input record is a
  # word in Koi8r encoding, possibly factored into letters
  # by braces "{}".
  #
  # Adds `\' in front of braces (and other TeX specials), and is
  # meant to map Koi8r codes to TeX cyrillic sequences.
  #
  # The client must set the variable "field" on the command line
  # (e.g. "-v field=3").

  if (field == "")
    { arg_error("must specify \"field\""); }
  else if ((field !~ /^[0-9]+$/) || ((field + 0) < 1))
    { # A zero index would rewrite the whole record ($0), and a negative
      # or non-numeric one is not a valid field index at all.
      arg_error(("invalid \"field\" = \"" field "\""));
    }
}

# If an error routine has already set a non-negative "abort" code,
# stop processing and exit with that code:
abort >= 0 { exit abort }

# Lines that are blank (or only spaces), or whose first non-space
# character is "#", are copied to the output unchanged:
/^ *([#]|$)/ { print $0; next }

/./ {
  # Data record: re-encode the word in field number "field" and
  # print the modified record.

  # The record must have at least "field" fields:
  if (field > NF) { data_error("bad NF"); }

  # The word must consist only of the Koi8r letter codes listed below.
  # NOTE(review): this class does not include "{" or "}", so words
  # factored with braces are rejected here -- confirm whether intended.
  w = $(field);
  if (! (w ~ /^[£ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß³àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ]+$/))
    { data_error(("bad word \"" w "\"")); }

  # Replace the field by its re-encoded form and write the record out:
  $(field) = w = reencode_koi8r_for_tex(w);
  print $0;
  next;
}

function reencode_koi8r_for_tex(wd,    w)
{
  # Returns a copy of the word {wd} with the TeX special characters
  # "{", "}", "#", "&", "$" and "%" each preceded by a backslash.
  # The mapping of Koi8r letter codes to TeX sequences is not
  # implemented yet (see the commented-out sketch below).
  #
  # Calls {data_error} if the result contains the marker character
  # used by the (currently disabled) unmapped-letter check.
  #
  # {w} is a local work variable (extra parameter), so the caller's
  # global variables are not disturbed.

  w = wd;
  # Protect special characters:
  gsub(/[{]/,  "\\{", w);
  gsub(/[}]/,  "\\}", w);
  gsub(/[#]/,  "\\#", w);   # Just in case
  # In a gsub replacement, "\&" means a literal "&", which would leave
  # the text unchanged.  We need "\\&": a literal backslash followed
  # by the matched text (which is the "&" itself).
  gsub(/[&]/,  "\\\\&", w); # Just in case
  gsub(/[$]/,  "\\$", w);   # Just in case
  gsub(/[%]/,  "\\%", w);   # Just in case
  # Do nothing else for now
  
  # # Put a marker in front of each letter to identify unmapped ones:
  # w = gensub(/(.)/, "·\\1", "g", w);
  # # Map lowercase letters to TeX escapes:
  # gsub(/[·][a]/,        "\\alpha", w);
  # gsub(/[·][b]/,        "\\beta", w);
  # gsub(/[·][c][·][h]/,  "\\chi", w);
  # gsub(/[·][ç]/,        "\\psi", w);
  # ....
  # gsub(/[·][X]/,        "\\Xi", w);
  # gsub(/[·][Z]/,        "\\Zeta", w);

  # Sanity check: no marker may remain in the result.
  if (w ~ /[·]/) { data_error(("bad word \"" wd "\"")); }
  return w;
}

function arg_error(msg)
{
  # Reports a command-line argument error: writes {msg} and the
  # global {usage} text to the standard error stream, records the
  # failure in the global {abort}, and terminates with status 1.
  printf "%s\nusage: %s\n", msg, usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input data error: writes {msg}, prefixed with the
  # current record number {FNR} of the input file, to the standard
  # error stream; then records the failure in the global {abort}
  # and terminates with status 1.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}