#! /usr/bin/gawk -f
# Must specify -f eva2erg.gawk
# Last edited on 1999-01-05 22:14:45 by stolfi

# Adds a match key that can be used to find similar words in Voynichese
# text.
#
#   cat INFILE \
#     | add-match-key -f eva2erg.gawk \
#         [ -v inField=IFLDNUM ] \
#         [ -v outField=OFLDNUM ] \
#     > OUTFILE
#
# This script reads from stdin one or more records that contain a
# Voynichese word KEY. It outputs the same records, each augmented with a
# "reduced" version RKEY of the same word.
#
# The KEY word is assumed to be field IFLDNUM of each record
# (the first field by default), and the RKEY is inserted as field
# OFLDNUM (first by default).
#
# NOTE(review): the erg_* functions called below are not defined in this
# file; presumably they come from eva2erg.gawk, which the usage above
# loads with a second -f option.

# Start-up: apply defaults for the command-line variables, switch on the
# reduction options, and report the active options on stderr.
BEGIN {
  abort = 0;
  # Variables not supplied with -v compare equal to "", so default them.
  if (inField == "") inField = 1;
  if (outField == "") outField = 1;
  printf "options:\n" > "/dev/stderr";

  # Select options:
  erase_ligatures = 1;
  erase_plumes = 1;
  ignore_gallows_eyes = 1;
  join_ei = 1;
  equate_bn = 1;
  equate_aoy = 1;
  collapse_ii = 1;
  equate_eights = 1;
  equate_pt = 1;
  erase_q = 1;
  erase_word_spaces = 1;

  # Print options:
  if (erase_ligatures) printf " erase_ligatures\n" > "/dev/stderr";
  if (erase_plumes) printf " erase_plumes\n" > "/dev/stderr";
  if (ignore_gallows_eyes) printf " ignore_gallows_eyes\n" > "/dev/stderr";
  if (join_ei) printf " join_ei\n" > "/dev/stderr";
  if (equate_aoy) printf " equate_aoy\n" > "/dev/stderr";
  if (equate_bn) printf " equate_bn\n" > "/dev/stderr";
  if (collapse_ii) printf " collapse_ii\n" > "/dev/stderr";
  if (equate_eights) printf " equate_eights\n" > "/dev/stderr";
  if (equate_pt) printf " equate_pt\n" > "/dev/stderr";
  if (erase_q) printf " erase_q\n" > "/dev/stderr";
  if (erase_word_spaces) printf " erase_word_spaces\n" > "/dev/stderr";
}

# Main rule: runs for every record that contains at least one character.
# Takes field inField, strips comments with erg_erase_comments, reduces it
# with recode_text, and prints the record with the result inserted as
# field outField.
/./ {
  if (abort) exit;
  if (NF < inField) { error("not enough input fields\n"); }
  printout(recode_text(erg_erase_comments($(inField))), outField);
  next;
}

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current record number,
  # sets the global "abort" flag and terminates the program.
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  # Exit with a non-zero status so that callers can detect the failure;
  # a bare "exit" here would report success.
  exit 1;
}

function recode_text(str)
{
  # Converts a textstring with no embedded "{}"s to the
  # requested encoding.
  #
  # "str" is the text to reduce; the value returned is erg_pack() applied
  # to the text after all the enabled reductions below.

  # Delete EVMT fillers:
  str = gensub(/[!% ]/, "", "g", str);

  # Normalize spaces and surround with one space
  # (every run of the characters - / = , . becomes a single ".", and a
  # "." is placed at each end of the string).
  str = gensub(/[-/=,.]+/, ".", "g", str);
  str = gensub(/^[-/=,.]*/, ".", "g", str);
  str = gensub(/[-/=,.]*$/, ".", "g", str);

  # Special hacks not handled by eva2erg.gawk routines:
  gsub(/u/, "en", str);
  gsub(/z/, "k", str);

  # Apply each reduction whose option flag was set in the BEGIN rule.
  str = (erase_ligatures ? erg_erase_ligatures(str) : str);
  str = (erase_plumes ? erg_erase_plumes(str) : str);
  str = (ignore_gallows_eyes ? erg_ignore_gallows_eyes(str) : str);
  str = (join_ei ? erg_join_ei(str) : str);
  str = (equate_aoy ? erg_equate_aoy(str) : str);
  str = (equate_bn ? erg_equate_bn(str) : str);
  str = (collapse_ii ? erg_collapse_ii(str) : str);
  str = (equate_eights ? erg_equate_eights(str) : str);
  str = (equate_pt ? erg_equate_pt(str) : str);
  str = (erase_q ? erg_erase_q(str) : str);
  str = (erase_word_spaces ? erg_erase_word_spaces(str) : erg_unify_word_spaces(str));

  return erg_pack(str);

  # NOTE(review): everything from here to the end of the function is
  # unreachable because of the "return" just above. It is kept unchanged
  # because it is not clear from this file which ending is intended --
  # confirm, then delete one of the two.

  # Ensure no words have disappeared
  str = gensub(/[.][.]/, ".o.", "g", str);
  str = gensub(/[.][.]/, ".o.", "g", str);
  # Delete spaces
  str = gensub(/[.]/, "", "g", str);
  # Guard against empty string:
  if (str == "") { str = "o"; }
  return str;
}

function printout(mw, fn, i)
{
  # prints $0 with "mw" inserted as field "$(fn)"
  # ("i" is a local loop variable, not an argument.)
  # NOTE(review): this definition is cut off in the reviewed source right
  # after "for (i=1;i". The fragment below is reproduced exactly as found
  # and has not been reconstructed.
  if (NF < fn-1) { error("not enough output fields\n"); }
  if (fn == 1) { print mw, $0; }
  else if (fn == NF+1) { print $0, mw; }
  else { for (i=1;i