#! /usr/bin/gawk -f
# Needs -f eva2erg.gawk
# Last edited on 1999-01-05 22:04:00 by stolfi

# Adds a match key for bringing together "similar" phrases
# in badly spellt Engllishe toungge. Based on phonetic similarity.
#
#   cat INFILE \
#     | add-lac-match-key -f eva2erg.gawk \
#         [ -v inField=IFLDNUM ] \
#         [ -v outField=OFLDNUM ] \
#     > OUTFILE
#
# This script reads from stdin one or more records that contain an
# Engllishe word KEY. It outputs the same records, each augmented with a
# "reduced" version RKEY of the same word.
#
# The KEY word is assumed to be field IFLDNUM of each record
# (the first field by default), and the RKEY is inserted as field
# OFLDNUM (first by default).

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current record number (NR),
  # sets the global flag "abort" and stops processing with "exit".
  # NOTE(review): "abort" is presumably tested by an END rule elsewhere
  # in the script -- confirm.
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  exit
}

function recode_text(str)
{
  # Returns the phonetically "reduced" match key for the text "str"
  # (which must contain no embedded "{}"s).
  #
  # Words are delimited internally by "."; any run of the characters
  # "-/=,." in the input counts as one word separator. The result has
  # all separators removed, and is never empty ("o" is returned when
  # nothing else is left).
  #
  # The rewrite rules below are order-dependent: each rule sees the
  # output of all the previous ones.

  # Delete EVMT fillers:
  str = gensub(/[!% ]/, "", "g", str);

  # Normalize word separators and surround the text with one separator:
  str = gensub(/[-/=,.]+/, ".", "g", str);
  str = gensub(/^[-/=,.]*/, ".", "g", str);
  str = gensub(/[-/=,.]*$/, ".", "g", str);

  # Letter "x" by its "ks" sound (from here on "x" is free to be reused):
  str = gensub(/x/, "ks", "g", str);

  # Groups "sch", "ch", "sh" become "x". The "sch" rule must come first:
  # otherwise the "ch" inside it is rewritten and "sch" turns into "sx".
  str = gensub(/sch/, "x", "g", str);
  str = gensub(/[cs]h/, "x", "g", str);

  # Group "ph" becomes "f":
  str = gensub(/ph/, "f", "g", str);

  # Group "wr" becomes "r":
  str = gensub(/wr/, "r", "g", str);

  # Letter "f" often sounds as "v" and fice-fersa:
  str = gensub(/v/, "f", "g", str);

  # Group "ight" is equivalent to "ite":
  str = gensub(/ight/, "ite", "g", str);

  # Group "ought" is equivalent to "aut":
  str = gensub(/ought/, "aut", "g", str);

  # Otherwise the group "gh" is soundless except before vowels:
  str = gensub(/gh([.bcdfghjklmnpqrstvwxz])/, "\\1", "g", str);

  # Otherwise the non-initial letter "h" is often soundless:
  str = gensub(/([^.])h/, "\\1", "g", str);

  # Letters "qu" and "gu" are "k" and "g", usually:
  # NOTE(review): dropping the "u" of "gu" exposes the "g" to the
  # softening rule further down ("guess" ends up with "j") -- confirm
  # whether that is intended.
  str = gensub(/qu/, "k", "g", str);
  str = gensub(/gu([aeiou])/, "g\\1", "g", str);

  # Soften "c" before "e" and "i". This must precede the "[ck]" collapse
  # below, which leaves no "c" in the string:
  str = gensub(/[s][c]([ei])/, "s\\1", "g", str);
  str = gensub(/[c]([ei])/, "s\\1", "g", str);

  # Remove duplicated letters, merging c/k, m/n and s/z.
  # (No rule for "v" is needed: every "v" has already become "f".)
  str = gensub(/[ck]+/, "k", "g", str);
  str = gensub(/[b]+/, "b", "g", str);
  str = gensub(/[d]+/, "d", "g", str);
  str = gensub(/[f]+/, "f", "g", str);
  str = gensub(/[g]+/, "g", "g", str);
  str = gensub(/[l]+/, "l", "g", str);
  str = gensub(/[mn]+/, "n", "g", str);
  str = gensub(/[p]+/, "p", "g", str);
  str = gensub(/[r]+/, "r", "g", str);
  str = gensub(/[sz]+/, "s", "g", str);
  str = gensub(/[t]+/, "t", "g", str);

  # Soften "g" before "e" and "i":
  str = gensub(/[g]([ei])/, "j\\1", "g", str);

  # Remove plural endings, possessives, 3rd person:
  str = gensub(/[i][e][s][.]/, "y.", "g", str);
  str = gensub(/([^.])[e][s][.]/, "\\1.", "g", str);
  str = gensub(/([^.])[s][.]/, "\\1.", "g", str);

  # Remove past tense endings:
  str = gensub(/([^.])[ei][dt][.]/, "\\1.", "g", str);
  str = gensub(/([^.])[d][.]/, "\\1.", "g", str);

  # Remove present continuous endings:
  str = gensub(/([^.])ing[.]/, "\\1.", "g", str);

  # Remove final "e", if not alone:
  str = gensub(/([^.])[e][.]/, "\\1.", "g", str);

  # The "pt" group is reduced to "p".
  # NOTE(review): the original remark here read 'may be spelled "t"',
  # which suggests the replacement should be "t" -- confirm.
  str = gensub(/pt/, "p", "g", str);

  # Groups "ai", "ay", "ea" become "e":
  str = gensub(/a[iy]/, "e", "g", str);
  str = gensub(/ea/, "e", "g", str);

  # Collapse back vowels (with any adjacent front ones) to "o":
  str = gensub(/[eiy]*[oawu]+[oaueiwy]*/, "o", "g", str);

  # Delete back vowels between consonants. The rule is applied twice
  # because matches cannot overlap: in "CoCoC" the middle consonant is
  # consumed by the first match.
  str = gensub(/([b-df-hj-np-tvxz])o([b-df-hj-np-tvxz])/, "\\1\\2", "g", str);
  str = gensub(/([b-df-hj-np-tvxz])o([b-df-hj-np-tvxz])/, "\\1\\2", "g", str);

  # Collapse front vowels to "e":
  str = gensub(/[eiy]+/, "e", "g", str);

  # Ensure no words have disappeared: an empty word becomes "o".
  # Applied twice so that runs of three separators are handled too.
  str = gensub(/[.][.]/, ".o.", "g", str);
  str = gensub(/[.][.]/, ".o.", "g", str);

  # Delete all word separators:
  str = gensub(/[.]/, "", "g", str);

  # Guard against empty string:
  if (str == "") { str = "o"; }

  return str;
} function printout(mw, fn, i) { # prints $0 with "mw" inserted as field "$(fn)" if (NF < fn-1) { error("not enough output fields\n"); } if (fn == 1) { print mw, $0; } else if (fn == NF+1) { print $0, mw; } else { for (i=1;i