#! /usr/bin/gawk -f # Last edited on 1998-09-20 03:50:52 by stolfi BEGIN { abort = -1; usage = ( \ "perturb-words \\\n" \ " -v stringField=NUM \\\n" \ " -v distField=NUM \\\n" \ " [ -v increment=NUM ] \\\n" \ " [ -v replaceChars=CHARS ] \\\n" \ " [ -v deleteChars=CHARS ] \\\n" \ " [ -v insertChars=CHARS ] \\\n" \ " [ -v dupOnly=BOOL ] \\\n" \ " < INFILE > OUTFILE" \ ); # # Field number "stringField" of each input record must contain a # non-empty string over the given alphabet plus "_". Field number # "distField" must be a non-negative integer. # # For each input record, this script enumerates all strings that # differ from field "stringField" by one edit step (insertion, # deletion, or replacement of a letter by another from the same # class). Note that the same perturbed string may be generated # more than once. # # Each input record is copied verbatim to the output. # One additional record, in the same format, is written for each of the # "perturbed" strings, with field "distField" incremented by the # given "increment" (default 1). # # The "_" character is used as a null, to avoid empty strings. # It is deleted from the input string, before the perturbation step, # and substituted for any perturbed string that happens to be empty. # # The "replaceChars" string specifies one or more sets of characters # that can be substituted for each other, separated by "/"s. The # classes need not be disjoint. E.g., "aeiouyw/td/pb/kcg/sz/yjx/mn/lrs/wvf" # says that any of the letters { a, e, i, o, u, y w } can be replaced by # any other letter of that same set; but { w v f }. If this # parameter is not specified, no substitutions will be performed. # # The "insertChars" string specifies a list of characters that can # be inserted. Any of these characters can be inserted at any # position. If this parameter is not specified, no insertions will # be performed. # # The "deleteChars" string specifies a list of characters that can # be deleted. Any of these characters can be deleted from any # position. If this parameter is not specified, no deletions will # be performed. # # If "dupOnly" is set, deletion is restricted to adjacent duplicated # letters (e.g. "address" -> { "adress", "addres" }), and insertion # is confined to duplication of existing letters # (e.g. "fall" -> { "ffall", "faall", "falll", "falll" }) # Thus general insertions and deletions will become 2-step operations. # In any case, the "insertChars" and "deleteChars" strings will # limit the characters that can be inserted or deleted. # if (stringField == "") { arg_error("must define \"stringField\""); } if (distField == "") { arg_error("must define \"distField\""); } if (increment == "") { increment = 1; } if (dupOnly == "") { dupOnly = 0; } # # Build the table "simil[ch]" with the charactes that can be # substituted for character "ch". # nReplace = split(replaceChars, class, "/"); split("", simil); for (icl in class) { cl = class[icl]; nc = length(cl); for(i=1;i<=nc;i++) { x = substr(cl,i,1); for(j=1;j= 0) { exit abort; } if ((NF < distField) || (NF < stringField)) { error(("line " NR ": bad format")); } # Copy the input record: printf "%s\n", $0; # Generate the perturbed versions: w = $(stringField); gsub(/[_]/, "", w); d = $(distField); n = length(w); # "p[0..m-1]" are the mutated words: split("", p); m = 0; # Letter substitution: if (nReplace > 0) { for(i=1;i<=n;i++) { pre = substr(w,1,i-1); let = substr(w,i,1); pos = substr(w,i+1,n-i); if (let in simil) { reps = simil[let]; nreps = length(reps); for(j=1; j<=nreps; j++) { rep = substr(reps, j, 1); if (rep == let) { error(("line " NR ": inconsistent simil[]")); } p[m] = (pre rep pos ); m++; } } } } if (nDelete > 0) { if (dupOnly) { # Deletion of duplicate letters: for(i=1;i 0) { if(dupOnly) { # Duplication of letters: for(i=0;i "/dev/stderr"; abort = 1; exit 1 } function arg_error(msg) { printf "*** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", msg > "/dev/stderr"; abort = 1; exit 1 }