#! /usr/bin/gawk -f
# Needs -f eva2erg.gawk
# Last edited on 1999-01-05 22:07:03 by stolfi

# Adds a match key adequate for gathering together "similar" English
# phrases.  Collapses doubled letters to single letters, then
# identifies letters that have similar shapes, regardless of sounds.
#
#   cat INFILE \
#     | add-wow-match-key -f eva2erg.gawk \
#         [ -v inField=IFLDNUM ] \
#         [ -v outField=OFLDNUM ] \
#     > OUTFILE
#
# This script reads from stdin one or more records that contain an
# English word KEY.  It outputs the same records, each augmented with a
# "reduced" version RKEY of the same word.
#
# The KEY word is assumed to be field IFLDNUM of each record
# (the first field by default), and the RKEY is inserted as field
# OFLDNUM (first by default).

function error(msg)
{
  # Writes "line <NR>: <msg>" to /dev/stderr, sets the global flag
  # "abort" to 1 and leaves the main loop through "exit".
  # NOTE(review): nothing in this part of the file reads "abort";
  # presumably an END rule elsewhere turns it into an exit status --
  # confirm.
  printf("line %d: %s\n", NR, msg) > "/dev/stderr";
  abort = 1;
  exit;
}

function recode_text(str)
{
  # Returns the reduced (shape-based) match key for the word(s) in "str".
  # "str" is a by-value scalar parameter, so the in-place gsub() calls
  # below never touch the caller's variable.  The steps are order
  # dependent: doubled letters are collapsed BEFORE look-alike letters
  # are merged, so pairs created by the merge (e.g. "py" -> "pp")
  # survive in the key.

  # Step 1: drop the filler characters "!", "%" and blank.
  gsub(/[!% ]/, "", str)

  # Step 2: squeeze each run of separators (- / = , .) into a single
  # "." and make the string begin and end with a ".".
  gsub(/[-/=,.]+/, ".", str)
  gsub(/^[-/=,.]*/, ".", str)
  gsub(/[-/=,.]*$/, ".", str)

  # Step 3: collapse doubled letters.  Each pass replaces
  # non-overlapping PAIRS, so a triple such as "bbb" shrinks to "bb",
  # not to "b".
  gsub(/[b][b]/, "b", str)
  gsub(/[c][c]/, "c", str)
  gsub(/[d][d]/, "d", str)
  gsub(/[e][e]/, "e", str)
  gsub(/[f][f]/, "f", str)
  gsub(/[g][g]/, "g", str)
  gsub(/[k][k]/, "k", str)
  gsub(/[l][l]/, "l", str)
  gsub(/[m][m]/, "m", str)
  gsub(/[n][n]/, "n", str)
  gsub(/[o][o]/, "o", str)
  gsub(/[p][p]/, "p", str)
  gsub(/[r][r]/, "r", str)
  gsub(/[s][s]/, "s", str)
  gsub(/[t][t]/, "t", str)
  gsub(/[v][v]/, "v", str)
  gsub(/[z][z]/, "z", str)

  # Step 4: merge each group of similarly shaped letters into one
  # representative letter.
  gsub(/[ao]/, "o", str)
  gsub(/[pqgy]/, "p", str)
  gsub(/[mnwuv]/, "n", str)
  gsub(/[hbdk]/, "d", str)
  gsub(/[tf]/, "t", str)
  gsub(/[ce]/, "e", str)
  gsub(/[lij]/, "i", str)

  # Step 5: put an "o" between any two adjacent separators so that no
  # word vanishes.  The pass is run twice because one pass cannot
  # rewrite overlapping pairs (as in "...").
  # NOTE(review): step 2 already leaves only single dots, so these two
  # passes look like a pure safeguard -- confirm before removing.
  gsub(/[.][.]/, ".o.", str)
  gsub(/[.][.]/, ".o.", str)

  # Step 6: remove the separators themselves.
  gsub(/[.]/, "", str)

  # An empty key is never returned; "o" stands in for it.
  return (str == "") ? "o" : str
}

function printout(mw, fn, i)
{
  # prints $0 with "mw" inserted as field "$(fn)"
  if (NF < fn-1) { error("not enough output fields\n"); }
  if (fn == 1) {
    print mw, $0;
  } else if (fn == NF+1) {
    print $0, mw;
  } else {
    for (i=1;i