#! /usr/bin/gawk -f # Usage: "$0 [-v skip=m] [-v take=n] < INFILE > OUTFILE" # # Reads a bunch of strings from a file, sorts them by similarity. # When comparing, skips m characters and then only looks at n. function error(msg) { printf "line %d: %s\n", NR, msg > "/dev/stderr" abort = 1 exit } function char_dist(xc, yc) { return (xc == yc ? 0 : 1 ) } function dist(x, y, skip, take, i, xc, yc, s) { # String distance from x to y s = 0; for(i=1;i<=take;i++) { xc = substr(x, i+skip, 1); yc = substr(y, i+skip, 1); s = s + char_dist(xc, yc) } return s; } function strsort(str, out, skip, take, i, w, k, dw, d, da, db, dk, n) { # sorts "str" by similarity and put the result in out[0..nout-1]. # Skips 'skip' bytes and then compares "take" bytes. # Assumes "str" is a vector with the words as arguments. # Returns number of strings. n = 0; split("", dw); # printf "skip=%d take=%d\n", skip, take > "/dev/stderr"; for (w in str) { if (n == 0) { out[0] = w; n++; } else if(n == 1) { out[1] = w; n++; dw[0] = dist(out[0], out[1], skip, take); } else { # find two consecutive elems most similar to w db = dist(out[0], w, skip, take); dk = db; k = -1; for (i=0; i k; i--) { out[i+1] = out[i]; } n++; out[k+1] = w; if (k >= 0) { dw[k] = dist(out[k], out[k+1], skip, take); # printf "%s", out[k] > "/dev/stderr"; } # printf "(%s)", out[k+1] > "/dev/stderr"; if (k < n-1) { dw[k+1] = dist(out[k+1], out[k+2], skip, take); # printf "%s\n", out[k+2] > "/dev/stderr"; } } } return n; } BEGIN { minlen=99999999; split("", str); } /./ { w = $0; str[w] = 1; n=length(w); minlen = (n