#! /usr/bin/gawk -f # Last edited on 2002-03-05 00:16:15 by stolfi # Reads a file containing lines of the form LOC WORD where LOC is a # locator string, and WORD is a word. Builds the equivalence # relation where two consecutive WORDs with same LOC are equivalent. # Outputs one line WORD ROOT for each distinct input WORD, where # ROOT is a canonical representative of its equivalence class. BEGIN { abort = -1; split("", eqv); # `eqv[w]' is the parent of `w' in the union-find tree. split("", esz); # `eqv[w] == w' means `w' is root, and `esz[w]' is the tree size. } (abort >= 0) {exit abort;} (NF == 2) { loc = $1; w = $2; if (! (w in eqv)) { eqv[w] = w; esz[w] = 1; } else { w = find_root(w); } if (loc == oloc) { if (ow < w) { eqv[w] = ow; esz[ow] += esz[w]; esz[w] = "0"; } else { eqv[ow] = w; esz[w] += esz[ow]; esz[ow] = "0"; } } ow = w; oloc = loc; next; } /./{ data_error("bad line type"); } END { if (abort >= 0) {exit abort;} for (w in eqv) { ow = find_root(w); print w, ow; } } function find_root(w, t,p,r) { t = w; while ((p = eqv[t]) != t) { t = p; } r = t; t = w; while ((p = eqv[t]) != t) { eqv[t] = r; t = p; } return r; } function data_error(msg) { printf "*** line %d: %s\n", FNR, msg > "/dev/stderr"; abort = 1; exit abort; }