#! /bin/gawk -f
# Last edited on 2004-07-24 03:04:32 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "list-duplicate-words \\\n" \
    " [ -v sep=SEPSTRING ] \\\n" \
    " [ -v context=NUM ] \\\n" \
    " [ -v lineWidth=NUM ] \\\n" \
    " < INFILE.wds > OUTFILE.txt" \
  );

  # Reads a list of words, one per line, in the format
  #
  #   LOC TYPE WORD
  #
  # where LOC is a line locator, WORD is any non-empty string, and
  # TYPE is an integer indicating its type (see evt-to-wds).
  #
  # Each WORD is considered to be `delim' if TYPE = 1, `plain'
  # otherwise.  Assumes an empty delim word between any two adjacent
  # plain words; and joins together any string of two or more
  # consecutive delim words.
  #
  # Prints out a file with lines of the form
  #
  #   LOC LWDn LSPn ... LWD1 LSP1 WORD SEP WORD RSP1 RWD1 ... RSPn RWDn
  #
  # where WORD is a plain word that occurs repeated in the file,
  # separated by SEP; and LWDi, LSPi, RSPj, RWDj are the adjacent
  # words and separators.  LOC is the locator (input field 1) of the
  # first of the two WORD occurrences, or "_" if that is empty.  The
  # number "n" of context words is defined by the "context" parameter
  # (default 3).
  #
  # If "lineWidth" is not given or zero, the fields are printed in
  # free format; if any token (in practice a delim field "LSPi", SEP,
  # or "RSPj", or a missing context word at either end of the file) is
  # empty, it is replaced by "sep" (default "_").  If "lineWidth" is
  # nonzero, the output is spaced and padded so that the locator
  # fields are aligned at the left margin, and the SEPs are aligned
  # and centered in the remaining columns; and empty tokens are
  # omitted.
  #
  # Words that contain "?" are never reported as duplicates.

  if (sep == "") { sep = "_"; }
  if (context == "") { context = 3; }
  if (lineWidth == "") { lineWidth = 0; }

  # The buffer geometry below (size "4*context+3", center "2*context")
  # only makes sense for whole, non-negative values, so reject
  # anything else up front.  Note that "exit" inside "arg_error" still
  # runs the END rule, which re-issues the exit status from "abort".
  if (context !~ /^[0-9]+$/)
    { arg_error("context must be a non-negative integer"); }
  if (lineWidth !~ /^[0-9]+$/)
    { arg_error("lineWidth must be a non-negative integer"); }
  context += 0;
  lineWidth += 0;

  # The words are in "buf[0..nbuf-1]".
  # Their original positions are "pos[0..nbuf-1]".
  # Adjacent delims are already concatenated.
  # Synthetic delims are "" for now (turned "sep" on printout).
  #
  # Right after a plain word is pushed, the tokens "buf[ibuf+k]"
  # (modulo "nbuf") are plain words for even "k" and delims for odd "k".

  split("", buf);
  nbuf = 4*context+3;  # Size of token buffer (plain and delim).
  ibuf = 0;            # Buffered tokens are "buf[ibuf+k]", "k=0..nbuf-1", modulo.
  ctr = 2*context;     # First token of repeat pair is "buf[ibuf+ctr]", modulo.
  split("", pos);      # Location of "buf[i]" is "pos[i]".
  for (i = 0; i < nbuf; i++) { buf[i] = ""; pos[i] = ""; }
  last_was_delim = 1;
}

# An error was flagged earlier: stop processing input.
(abort >= 0) { exit abort; }

# Skip blank lines and "#" comment lines.
/^[ ]*([#]|$)/ { next; }

# Main rule: one "LOC TYPE WORD" record per line.
/./ {
  if (NF != 3) { data_error("bad line format"); }
  # In formatted mode the locator is padded to a fixed 19-column field.
  wpos = (lineWidth == 0 ? $1 : sprintf("%-19s", $1) );
  wtype = $2;
  wd = $3;
  is_delim = (wtype == 1);
  if (is_delim)
    { if (last_was_delim)
        { # Consecutive delims are merged into a single token.
          join_token(wd, wpos);
        }
      else
        { push_token(wd, wpos); last_was_delim = 1; }
    }
  else
    { # Two adjacent plain words get an empty synthetic delim between
      # them, so that plain and delim tokens strictly alternate.
      if (! last_was_delim) { push_token("", wpos); }
      push_token(wd, wpos);
      check_for_dup();
      last_was_delim = 0;
    }
  next;
}

END {
  if (abort >= 0) { exit abort; }
  # Flush: the pair under test lags "context" plain words behind the
  # newest token, so feed "context" empty plain words to bring the
  # remaining pairs to the center.  Each empty word is preceded by a
  # synthetic delim when needed, so that "check_for_dup" always looks
  # at plain-word slots; checking after every single push would
  # compare delimiters and report equal delimiters as duplicate words.
  for (i = 0; i < context; i++)
    { if (! last_was_delim) { push_token("", ""); }
      push_token("", "");
      check_for_dup();
      last_was_delim = 0;
    }
}

function join_token(wd,wpos,   j)
{
  # Appends "wd" to the last buffered token.
  # Also sets its location to "wpos".
  j = (ibuf + nbuf-1) % nbuf;
  buf[j] = ( buf[j] wd );
  pos[j] = wpos;
}

function push_token(wd,wpos,   j)
{
  # Appends a token (plain or delim) to the buffer, overwriting the
  # oldest one, and records its position "wpos".
  j = ibuf;
  buf[j] = wd;
  pos[j] = wpos;
  ibuf = (ibuf + 1) % nbuf;
}

function check_for_dup(   j1,j2)
{
  # Prints the current buffer if the middle two plain tokens are
  # equal, non-empty, and contain no "?".  Must be called only when
  # the center slots hold plain words (i.e. right after a plain push).
  j1 = (ibuf + ctr) % nbuf;
  j2 = (j1 + 2) % nbuf;
  if ((buf[j1] != "") && (buf[j1] !~ /[?]/) && (buf[j1] == buf[j2]))
    { if (lineWidth > 0)
        { print_buf_formatted(); }
      else
        { print_buf_plain(); }
    }
}

function print_buf_formatted(   k,j,wd,ps,lwd,pad)
{
  # Prints the locator of the first repeated word in a 19-column
  # field, then the buffered tokens (empty ones omitted), padded on
  # the left so that the central separator falls near the middle of
  # the columns that remain out of "lineWidth".
  ps = pos[(ibuf + ctr) % nbuf];
  if (ps == "") { ps = "_"; }
  printf("%-19s ", ps);
  # Compute width of left half: every token up to and including the
  # first repeated word counts in full (plus its leading blank); the
  # central separator counts for half.
  lwd = 0;
  for (k = 0; k <= ctr+1; k++)
    { j = (ibuf + k) % nbuf;
      wd = buf[j];
      if (k <= ctr)
        { lwd += ( wd != "" ? length(wd) + 1 : 0 ); }
      else
        { lwd += ( wd != "" ? 1 + length(wd)/2 : 1/2); }
    }
  # Pad line to align central separators:
  pad = int((lineWidth - 19)/2 - lwd);
  if (pad < 0) { pad = 0; }
  printf "%*s", pad, "";
  # Print line
  for (k = 0; k < nbuf; k++)
    { j = (ibuf + k) % nbuf;
      wd = buf[j];
      if (wd != "") { printf(" %s", wd); }
    }
  printf("\n");
}

function print_buf_plain(   k,j,wd,ps)
{
  # Prints the locator of the first repeated word followed by all
  # buffered tokens in free format; empty tokens are shown as "sep".
  ps = pos[(ibuf + ctr) % nbuf];
  if (ps == "") { ps = "_"; }
  printf("%s", ps);
  for (k = 0; k < nbuf; k++)
    { j = (ibuf + k) % nbuf;
      wd = buf[j];
      if (wd == "") { wd = sep; }
      printf(" %s", wd);
    }
  printf("\n");
}

function arg_error(msg)
{
  # Reports a bad command-line parameter, prints the usage, and aborts.
  printf "%s\n", msg >> "/dev/stderr";
  printf "usage: %s\n", usage >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports a malformed input line and aborts.
  printf "line %d: %s\n", FNR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function prog_error(msg)
{
  # Reports an internal inconsistency and aborts.
  printf "line %d: prog error - %s\n", FNR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}