#! /usr/bin/gawk -f

# Must also specify -f eva2erg.gawk
#
# Usage:
#
#   cat INFILE \
#     | remove-variant-words -f eva2erg.gawk \
#         [ -v provide_ligatures=1 ] \
#         [ -v erase_ligatures=1 ] \
#         [ -v erase_plumes=1 ] \
#         [ -v ignore_gallows_eyes=1 ] \
#         [ -v join_ei=1 ] \
#         [ -v equate_aoy=1 ] \
#         [ -v collapse_ii=1 ] \
#         [ -v equate_eights=1 ] \
#         [ -v erase_q=1 ] \
#         [ -v erase_word_spaces=1 ] \
#     > OUTFILE
#
# Reads one entry per line from stdin and drops every entry that is
# considered a variant of an entry seen earlier.  Two entries are
# variants of each other when they reduce to the same comparison key.
#
# The options select which transformations are used to build that key.
# When given, they are applied in the order listed above.
# See eva2erg.gawk for explanations of their effect.
#
# The first entry for each key is copied to stdout unchanged.  Each
# later entry with the same key is reported on stderr as
#   ENTRY = KEY = FIRST-ENTRY-WITH-THAT-KEY

function recode_text(str)
{
    # Reduces a text string (expected to contain no embedded "{}"
    # comments) to its comparison key.  Each optional transformation is
    # applied only when the corresponding -v option is set; the order
    # below matches the order in the usage summary.  The erg_* helpers
    # come from eva2erg.gawk.
    if (provide_ligatures)   { str = erg_provide_ligatures(str); }
    if (erase_ligatures)     { str = erg_erase_ligatures(str); }
    if (erase_plumes)        { str = erg_erase_plumes(str); }
    if (ignore_gallows_eyes) { str = erg_ignore_gallows_eyes(str); }
    if (join_ei)             { str = erg_join_ei(str); }
    if (equate_aoy)          { str = erg_equate_aoy(str); }
    if (collapse_ii)         { str = erg_collapse_ii(str); }
    if (equate_eights)       { str = erg_equate_eights(str); }
    if (erase_q)             { str = erg_erase_q(str); }

    # Word spaces are always processed: erased when requested,
    # otherwise unified.
    if (erase_word_spaces) {
        str = erg_erase_word_spaces(str);
    } else {
        str = erg_unify_word_spaces(str);
    }

    return erg_pack(str);
}

BEGIN {
    # Make sure "words" is an (empty) array before any lookup.
    # It maps each comparison key to the first input line that produced it.
    split("", words);
}

# Every non-empty input line.
/./ {
    # Strip comments, then reduce the line to its comparison key.
    w = recode_text(erg_erase_comments($0));

    if (!(w in words)) {
        # First entry with this key: keep it and remember it.
        print;
        words[w] = $0;
    } else {
        # Variant of an earlier entry: drop it, but log the match.
        printf "%s = %s = %s\n", $0, w, words[w] > "/dev/stderr";
    }
}