#! /usr/bin/gawk -f
# Needs -f eva2erg.gawk
# Last edited on 1999-01-05 22:06:30 by stolfi

# Adds a match key adequate for gathering together "similar" Geez phrases.
#
#   cat INFILE \
#     | add-eno-match-key -f eva2erg.gawk \
#         [ -v inField=IFLDNUM ] \
#         [ -v outField=OFLDNUM ] \
#     > OUTFILE
#
# This script reads from stdin one or more records that contain a Geez
# (classical Ethiopian) word KEY.  It outputs the same records, each
# preceded by a "reduced" version RKEY of the same word (and one
# blank).
#
# The KEY word is assumed to be field IFLDNUM of each record
# (the first field by default).

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current record number NR,
  # sets the global flag "abort" to 1, and terminates the main loop.
  # NOTE(review): "abort" is presumably tested by an END rule that is
  # not visible in this part of the file -- confirm.
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  exit
}

function recode_text(str)
{
  # Converts a text string with no embedded "{}"s to its "reduced"
  # match key and returns it.  The result is never empty: a string
  # that reduces to nothing yields "u".
  #
  # The steps below are order-dependent; do not rearrange them.

  # Delete EVMT fillers ("!", "%" and blanks):
  str = gensub(/[!% ]/, "", "g", str);

  # Normalize word separators: every run of [-/=,.] becomes a single
  # ".", and the string is forced to begin and end with exactly one ".":
  str = gensub(/[-/=,.]+/, ".", "g", str);
  str = gensub(/^[-/=,.]*/, ".", "g", str);
  str = gensub(/[-/=,.]*$/, ".", "g", str);

  # Remove the phonetic markers [`W] (a "`" is dropped only when it
  # is followed by a letter; other "`"s are handled as digits below):
  str = gensub(/W/, "", "g", str);
  str = gensub(/[`]([a-zA-Z])/, "\\1", "g", str);

  # Collapse all numbers (Arabic and Ethiopian) to "0":
  str = gensub(/[`0-9]+/, "0", "g", str);

  # Letter "a" is used (rarely) instead of "e".
  # NOTE(review): this runs before the case folding below, so an
  # uppercase "A" ends up as "a", not "e" -- confirm this is intended.
  str = gensub(/a/, "e", "g", str);

  # Map uppercase to lowercase:
  str = tolower(str);

  # Collapse similar(?) consonants:
  str = gensub(/[jyzx]/, "j", "g", str);
  str = gensub(/[khqgc]/, "k", "g", str);
  str = gensub(/[pb]/, "b", "g", str);
  str = gensub(/[fvw]/, "v", "g", str);

  # Collapse similar(?) vowels:
  str = gensub(/[ei]/, "e", "g", str);
  str = gensub(/[ou]/, "u", "g", str);

  # Ensure no words have disappeared: a word that was reduced to
  # nothing leaves ".." behind and is replaced by "u".  The
  # substitution is applied twice because matches cannot overlap
  # (e.g. "..." -> ".u.." -> ".u.u."):
  str = gensub(/[.][.]/, ".u.", "g", str);
  str = gensub(/[.][.]/, ".u.", "g", str);

  # Delete the word separators and the syllable separator [']:
  str = gensub(/[.']/, "", "g", str);

  # Guard against empty string:
  if (str == "") { str = "u"; }
  return str;
}

function printout(mw, fn,   i)
{
  # prints $0 with "mw" inserted as field "$(fn)"
  if (NF < fn-1) { error("not enough output fields\n"); }
  if (fn == 1) {
    print mw, $0;
  } else if (fn == NF+1) {
    print $0, mw;
  } else {
    for (i=1;i