#! /usr/bin/gawk -f
# Last edited on 1999-02-01 09:55:18 by stolfi

# Usage:
#
#   cat INFILE \
#     | collapse-text -f eva2erg.gawk \
#         [-v describe_equiv=BOOL] \
#         [-v append_tilde=BOOL] \
#         [-v field=FIELDNUM] \
#         [EQUIVOPTIONS] \
#     > OUTFILE
#
# Maps each piece of text in INFILE to a reduced alphabet,
# with a certain amount of "error tolerance", by a
# built-in equivalence function.
#
# If "append_tilde" is set, appends a tilde "~" to the mapped
# text, to indicate that it is an elem class and not a
# raw elem.
#
# If "field" is specified, only the indicated field gets
# mapped to its equivalence class. Otherwise, the
# input may be in EVT format (with location code in columns 1-19)
# or in pure text format; either way, all text elems are mapped.
#
# EQUIVOPTIONS are assignments of the form -v OPTION=BOOL where OPTION
# is an option variable of eva2erg.gawk, and BOOL is 0 or 1.
#
# If "describe_equiv" is set, prints to stderr a description
# of the equivalence used, and exits without processing any records.

function error(msg)
{
  # Prints "msg" on stderr and terminates the program with status 1.
  # Also sets the global "abort" to 1, so that any rule that still
  # gets to run exits immediately with that status.
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function print_equiv()
{
  # Writes to stderr one line per equivalence option that is
  # currently set (non-zero, non-empty), preceded by a title line.
  printf "element equivalence:\n" > "/dev/stderr";
  if (erase_ligatures)     printf " erase_ligatures\n"     > "/dev/stderr";
  if (map_ee_to_ch)        printf " map_ee_to_ch\n"        > "/dev/stderr";
  if (map_sh_to_ch)        printf " map_sh_to_ch\n"        > "/dev/stderr";
  if (erase_plumes)        printf " erase_plumes\n"        > "/dev/stderr";
  if (ignore_gallows_eyes) printf " ignore_gallows_eyes\n" > "/dev/stderr";
  if (join_ei)             printf " join_ei\n"             > "/dev/stderr";
  if (equate_aoy)          printf " equate_aoy\n"          > "/dev/stderr";
  if (collapse_ii)         printf " collapse_ii\n"         > "/dev/stderr";
  if (equate_eights)       printf " equate_eights\n"       > "/dev/stderr";
  if (equate_pt)           printf " equate_pt\n"           > "/dev/stderr";
  if (erase_q)             printf " erase_q\n"             > "/dev/stderr";
  if (erase_word_spaces)   printf " erase_word_spaces\n"   > "/dev/stderr";
  if (unify_word_spaces)   printf " unify_word_spaces\n"   > "/dev/stderr";
  if (crush_invalid_words) printf " crush_invalid_words\n" > "/dev/stderr";
  if (append_tilde)        printf " append_tilde\n"        > "/dev/stderr";
}

function reduce_text(str)
{
  # Converts a text string with no embedded "{}"s to the
  # requested encoding, and returns the converted string.
  #
  # Each enabled option applies the "erg_*" function of the same
  # name (defined outside this file), in the fixed order below;
  # "erg_pack" is always applied last.
  str = (erase_ligatures     ? erg_erase_ligatures(str)     : str);
  str = (map_ee_to_ch        ? erg_map_ee_to_ch(str)        : str);
  str = (map_sh_to_ch        ? erg_map_sh_to_ch(str)        : str);
  str = (erase_plumes        ? erg_erase_plumes(str)        : str);
  str = (ignore_gallows_eyes ? erg_ignore_gallows_eyes(str) : str);
  str = (join_ei             ? erg_join_ei(str)             : str);
  str = (equate_aoy          ? erg_equate_aoy(str)          : str);
  str = (collapse_ii         ? erg_collapse_ii(str)         : str);
  str = (equate_eights       ? erg_equate_eights(str)       : str);
  str = (equate_pt           ? erg_equate_pt(str)           : str);
  str = (erase_q             ? erg_erase_q(str)             : str);
  str = (erase_word_spaces   ? erg_erase_word_spaces(str)   : str);
  str = (unify_word_spaces   ? erg_unify_word_spaces(str)   : str);
  str = (crush_invalid_words ? erg_crush_invalid_words(str) : str);
  str = erg_pack(str);
  if (append_tilde)
    {
      # Append "~" to every maximal run of characters other than
      # "-", "=", ",", ".", blank and line break (plus any tildes
      # that already follow the run).
      # NOTE(review): the bracket expression contained a raw line-break
      # character, which is not valid inside an awk regex literal; it
      # is written as "\n" here -- confirm against the upstream file.
      str = gensub(/([^-=,. \n]+[~]*)/, "\\1~", "g", str);
    }
  return str;
}

BEGIN {
  # "abort" < 0 means "keep running"; otherwise it is the exit status
  # that every rule must return as soon as it is entered.
  abort = -1;
  if (field == "") { field = 0; }
  if (describe_equiv) { print_equiv(); abort = 0; exit 0; }
}

# Comment lines are copied to the output unchanged.
/^#/ {
  if (abort >= 0) exit abort;
  print;
  next;
}

# Every other non-empty line gets mapped.
/./ {
  if (abort >= 0) exit abort;
  if (field == 0)
    {
      # Whole-line mode: split off the location code, if any.
      # "txt" is cleared for every record, so that a line holding
      # nothing but a location code does not re-emit the text of
      # the previous record.
      loc = ""; skip = 0; txt = "";
      # NOTE(review): this pattern reached us as /^]*>/, which can never
      # match a line starting with "<" (so every such line fell through
      # to the error below). Restored to "a leading <...> item" --
      # confirm against the upstream file.
      if (match($0, /^<[^<>]*>/))
        {
          loc = sprintf("%-19s", substr($0, 1, RLENGTH));
          skip = RLENGTH;
        }
      else if (substr($0, 1, 1) == "<")
        { error(("line " NR ": bad location code")); }

      if (skip < length($0))
        {
          txt = erg_erase_comments(substr($0, 1 + skip));
          # Get rid of spurious spaces by turning each blank into "!"
          # (presumably a filler that the mapping discards -- confirm
          # against erg_pack).
          gsub(/[ ]/, "!", txt);
          # Now map elems:
          txt = reduce_text(txt);
        }
      printf "%s%s\n", loc, txt;
    }
  else
    {
      # Single-field mode: only field number "field" is mapped.
      if (field > NF)
        { error(sprintf("** line %d: not enough fields", NR)); }
      $(field) = reduce_text($(field));
      print;
    }
  next;
}