#! /usr/bin/gawk -f # Last edited on 2004-09-13 19:18:37 by stolfi BEGIN { abort = -1; usage = ( ARGV[0] " [-v wsep=CHAR] [-v maxlen=NUM] < INFILE > OUTFILE" ); # Reads a text, one word per line. # Outputs all repeated maximal substrings. if (maxlen == "") { maxlen = 30; } if (wsep == "") { wsep = "."; } nstates = 0; # Number of states # Indexed by state: split("", count); # Number of input substrings leading to this state. split("", chars); # Characters leading out from this state. # Indexed by state and char: split("", nextst); # Next state. curlen = 0; # Number of currently active states. # Indexed [0..curlen-1] split("", active); # {active[k]} is state spelled by last {k} characters. # Set up root state: root = nstates; nstates++; count[root] = 0; lpath[root] = 0; # Append the root as an active state: active[curlen] = root; curlen++; count[root]++; # Statistics nwords = 0; # Words read nchars = 0; # Characters processed (incl. word seps) } /./ { if (nwords > 0) { process(wsep); } wd = $0; nwd = length(wd) for (i = 1; i <= nwd; i++) { process(substr(wd,i,1)); } nwords++; } END { printf "# %d words read \n", nwords; printf "# %d chars processed \n", nchars; printf "# %d states created \n", nstates; # Spit out states with count higher than 1: printreps(root, ""); } function process(ch, len,st,stnew) { nchars++; # Append {ch} to all current states: for (len = curlen-1; len >= 0; len--) { st = active[len]; if (! ((st,ch) in nextst)) { stnew = nstates; nextst[st,ch] = stnew; count[stnew] = 0; chars[stnew] = ""; chars[st] = (chars[st] ch); nstates++; } else { stnew = nextst[st,ch]; } active[len+1] = stnew; count[stnew]++; } # Increment {curlen} until it reaches {maxlen}: if (curlen < maxlen) { curlen++; } # Set the root as an active state: count[root]++; } function printreps(st,pre, chs,nchs,i,ch,stnew) { if (count[st] > 1) { chs = chars[st]; nchs = length(chs); if (nchs != 1) { printf "%9d %3d %s\n", count[st], length(pre), pre; } for (i = 1; i <= nchs; i++) { ch = substr(chs,i,1); stnew = nextst[st,ch]; printreps(stnew, (pre ch)); } } }