# A library of GAWK functions for removing
# easily confused details from EVA text.
# Last edited on 1999-02-01 00:07:27 by stolfi

# ========================================================================
# Functions for erasing comments:

function erg_erase_string(str)
{
  # Returns a string of "!"s with the same length as "str".
  return gensub(/./, "!", "g", str);
}

function erg_erase_comments(old,    new, i)
{
  # Replaces every '{...}' comment in "old" by a run of "!"s of the
  # same length, so that the result has the same length as "old".
  # "new" and "i" are local variables.
  #
  # If a "{" has no matching "}" (or the comment contains a nested "{"),
  # a diagnostic is written to /dev/stderr and the remainder of the
  # string is copied unchanged.
  new = "";
  while (length(old) != 0)
    {
      i = index(old, "{");
      if (i == 0)
        {
          # No more comments: copy the rest verbatim.
          new = (new old);
          old = "";
        }
      else if (i > 1)
        {
          # Copy the plain text that precedes the next "{".
          new = (new substr(old, 1, i-1));
          old = substr(old, i);
        }
      else
        {
          # "old" starts with "{": try to consume one whole comment.
          match(old, /^{[^{}]*}/);
          if (RSTART > 0)
            {
              new = (new erg_erase_string(substr(old, 1, RLENGTH)));
              old = substr(old, RLENGTH + 1);
            }
          else
            {
              printf "line %d, bad {}-comment\n", NR > "/dev/stderr";
              new = (new old);
              old = "";
            }
        }
    }
  return new;
}

# ========================================================================
# Functions for finding similar words:
#
# The following recoding operations may map two or more EVA
# characters to a smaller number of characters.  In that case
# the balance is filled with " ", not "!", for the
# benefit of erg_char_offsets below.  Every function in this
# section therefore returns a string of the same length as its input.
#
# These functions should be applied in the order in which they
# are declared here.

function erg_map_ee_to_ch(txt)
{
  # Maps "ee" to "ch" and "se" to "sh", i.e. provides a ligature.
  # "se" is handled first, so "see" becomes "she", not "sch".
  gsub(/se/, "sh", txt);
  gsub(/ee/, "ch", txt);
  return txt
}

function erg_map_sh_to_ch(txt)
{
  # Maps "sh" to "ch", i.e. erases the plume on "sh" (but not on "s").
  gsub(/sh/, "ch", txt);
  return txt
}

function erg_erase_ligatures(txt)
{
  # Erases the ligature information, namely the
  # difference between "sh" and "se", "ch" and "ee",
  # "ckh" and "eke", etc., by mapping both "c" and "h" to "e".
  gsub(/c/, "e", txt);
  gsub(/h/, "e", txt);
  return txt
}

function erg_erase_plumes(txt)
{
  # Erases the plume on "s", "r", by converting them to "e".
  # Also erases the isolated plumes ['"+] that follow one of
  # the letters [oayechi], replacing the plume by a " ".
  # Also converts "sh" to "ch", if it has not been
  # eliminated by erg_erase_ligatures or erg_map_sh_to_ch.
  # Also maps "u" to "a".
  # Does not affect "n".
  gsub(/sh/, "ch", txt);
  gsub(/s/, "e", txt);
  gsub(/r/, "e", txt);
  gsub(/u/, "a", txt);
  txt = gensub(/([oayechi])['"+]/, "\\1 ", "g", txt);
  return txt
}

function erg_ignore_gallows_eyes(txt)
{
  # Erases the "eye" information from all gallows,
  # by mapping "k" to "t" and "f" to "p".
  gsub(/k/, "t", txt);
  gsub(/f/, "p", txt);
  return txt
}

function erg_join_ei(txt)
{
  # Collapses "ei" to "a", followed by a " " to preserve the length.
  gsub(/ei/, "a ", txt);
  return txt
}

function erg_equate_aoy(txt)
{
  # Erases the differences between "a", "o", "y",
  # mapping them all to "o".
  gsub(/a/, "o", txt);
  gsub(/y/, "o", txt);
  return txt
}

function erg_equate_bn(txt)
{
  # Maps "b" to "n".
  gsub(/b/, "n", txt);
  return txt
}

function erg_collapse_ii(txt)
{
  # Maps "ii" and "iii" and "iiii" to just "i".
  # Each run is padded with " "s up to its original width, so that
  # the length of "txt" (and hence every character offset) is preserved.
  # Longest runs are handled first so they are not split into shorter ones.
  gsub(/iiii/, "i   ", txt);
  gsub(/iii/, "i  ", txt);
  gsub(/ii/, "i ", txt);
  return txt
}

function erg_equate_eights(txt)
{
  # Identifies "j", "g", "m" with "d".
  gsub(/j/, "d", txt);
  gsub(/g/, "d", txt);
  gsub(/m/, "d", txt);
  return txt
}

function erg_equate_pt(txt)
{
  # Identifies "p" with "t".
  gsub(/p/, "t", txt);
  return txt
}

function erg_erase_q(txt)
{
  # Replaces "qX" by "X ", in a chunk of comment-free EVA,
  # where X is one of [oayeclktp].
  # Adds a space after the letter to preserve length.
  # Note that the space comes after to ensure we compute the correct offsets.
  # Better call "erg_join_ei" before this one.
  return gensub(/q([oayeclktp])/, "\\1 ", "g", txt);
}

# ========================================================================
# Functions for dealing with word spaces:

# These functions consider "/" a space, too.
function erg_unify_word_spaces(str)
{
  # Replaces all EVA word spaces (any of "-", "/", "=", ",", ".") by "."s.
  return gensub(/[-/=,.]/, ".", "g", str);
}

function erg_erase_word_spaces(str)
{
  # Replaces all EVA word spaces (any of "-", "/", "=", ",", ".") by "!"s.
  return gensub(/[-/=,.]/, "!", "g", str);
}

# ========================================================================
# Functions for discarding words that contain invalid characters:

function erg_crush_invalid_words(str,    res)
{
  # Replaces any word that contains an invalid character ("?", "*" or "%")
  # by a "?" followed by "!"s, so that the word keeps its original length.
  # Words are delimited by the characters [-.,/=]; valid words and the
  # delimiters themselves are copied unchanged.  "res" is a local variable.
  res = "";
  while (str != "")
    {
      if (match(str, /^[^*?%]*([-.,/=]+|$)/))
        {
          # A run of clean words, up to and including trailing delimiters
          # (or the end of the string): copy verbatim.
          res = (res substr(str, 1, RLENGTH));
          str = substr(str,RLENGTH+1);
        }
      else if (match(str, /^[^-.,/=]*[?*%][^-.,/=]*/))
        {
          # A single bad word of RLENGTH characters: emit "?" plus
          # exactly RLENGTH-1 "!"s (same effect as erg_erase_string on
          # the rest of the word).  The substr must be bounded, or the
          # filler would be sized by the whole remaining string.
          res = (res "?" gensub(/./, "!", "g", substr(str, 2, RLENGTH - 1)));
          str = substr(str,RLENGTH+1);
        }
      else
        {
          # Should not happen: one of the two patterns above always matches
          # a non-empty string.  "abort" is a global flag, presumably
          # inspected by an END rule elsewhere -- verify against callers.
          printf "line %d, erg_crush_invalid_words lost\n", NR > "/dev/stderr";
          abort = 1;
          exit 1;
        }
    }
  return res
}

# ========================================================================
# Functions for squeezing out fillers (and remembering them):

function erg_pack(old)
{
  # Removes all fillers ("!", "%" and " ") from "old".
  return gensub(/[!% ]/, "", "g", old);
}

function erg_char_offsets(txt, iof, fof,    i, k, m, c, goon)
{
  # Expects iof and fof to be empty arrays.  Stores in iof[i] the byte
  # offset of the substring of txt that apparently gave rise to the
  # ith character of the string pck = erg_pack(txt).  Also stores in
  # fof[i] the byte offset for the end of that substring.
  #
  # Assumes that the substring of a non-filler character includes
  # that character and any immediately succeeding " "s (a "!" or "%"
  # ends the substring).
  #
  # E.g. suppose txt = "!!!ab !c !" so that pck = "abc"; then
  # iof[1..3] will be {3,4,7} and fof[1..3] will be {4,6,9}.
  # Also sets fof[0] to 0 and iof[length(pck)+1] to length(txt)
  # (iof[4] = 10 in the example).
  #
  # i, k, m, c and goon are local variables.
  i = 0;
  m = length(txt);
  fof[i] = 0;
  goon = 0;   # True while trailing " "s still belong to character i.
  for (k = 1; k <= m; k++)
    {
      c = substr(txt, k, 1);
      if ((c == "!") || (c == "%"))
        {
          # Hard filler: closes the current character's substring.
          goon = 0;
        }
      else if (c == " ")
        {
          # Soft filler: extends the current character, if still open.
          if (goon) fof[i] = k;
        }
      else
        {
          # A real character: opens a new substring [k-1, k].
          i++;
          iof[i] = k-1;
          fof[i] = k;
          goon = 1;
        }
    }
  iof[i + 1] = m;
}