#! /usr/bin/gawk -f
# Last edited on 1999-01-06 14:43:36 by stolfi

# Reads a concordance file with 10 fields (see Note-037.txt)
# including STRING (field 6) and PATT (field 8).
#
# Appends an 11th field to every record.  The field is "+" for all
# entries of a given PATT that has at least two entries with distinct
# locations and positions, or whose STRING is a single word; it is "-"
# for all other entries.
#
# Entries with the same PATT are grouped only when they are adjacent in
# the input: a new group starts whenever field 8 differs from that of
# the previous record.
#
# On exit, the number of records given each mark is written to
# "/dev/stderr".

BEGIN {
  # Invariants:
  #
  #   "cpat" is the pattern of the last entry seen, and "cpos" is
  #   the position (location plus offset and length) of the first entry
  #   with that pattern.
  #
  #   "s" is "+" or "-" depending on whether the lines with that
  #   pattern constitute an interesting pattern.  That is, if "s" is
  #   "-" then all those lines have the same position "cpos" in the
  #   text, and none is a single word.
  #
  #   "n" is the number of lines seen so far with pattern "cpat".
  #
  #   If "flushed" is 0, then "clin[0..n-1]" contains all previous lines
  #   with that pattern.  Otherwise those lines have been written out,
  #   and each further line is written as soon as it is read.
  #
  #   "ct[m]" is the number of records written so far by the main rule
  #   with mark "m".

  # A non-negative "abort" is used as the exit status by the rules below.
  # NOTE(review): nothing in the visible code sets "abort" to a
  # non-negative value -- confirm whether this hook is still needed.
  abort = -1;
  start_pattern("", "");
}

# Stops processing as soon as an abort status has been requested.
(abort >= 0) { exit abort; }

# Main rule: handles every non-empty input record.
/./{
  loc = $1;
  beg = $3;
  len = $4;
  pat = $8;
  str = $6;
  # The position key combines fields 1, 3 and 4.
  pos = (loc ":" beg ":" len);
  # A change of pattern closes the current group and opens a new one
  # whose reference position is that of this record.
  if (pat != cpat) { flush_pattern(); start_pattern(pat,pos) }
  # The group becomes interesting as soon as one record has a position
  # different from the first one, or a STRING containing none of the
  # characters "-", "/", "=", ".", "," and blank (i.e. a single word).
  # NOTE(review): the "/" inside the bracket expression is unescaped;
  # gawk accepts this, other awks may not -- confirm before porting.
  if ((pos != cpos) || (str !~ /[-/=., ]/)) { s = "+"; flush_pattern(); }
  # Once the group has been flushed its mark is final, so the record can
  # be written at once; otherwise it is buffered until the mark is known.
  if (flushed) { print $0, s; ct[s]++; } else { clin[n] = $0; }
  n++;
}

END {
  if (abort >= 0) { exit abort; }
  # Writes out whatever is still buffered for the last pattern.
  # NOTE(review): the argument binds to the parameter "i" of
  # flush_pattern, which the visible part of that function overwrites
  # as a loop index; passing "s" looks unnecessary -- confirm.
  flush_pattern(s);
  printf "%7d records marked \"+\"\n", ct["+"] > "/dev/stderr";
  printf "%7d records marked \"-\"\n", ct["-"] > "/dev/stderr";
}

function start_pattern(pat, pos)
{
  # Prepares to accumulate a new pattern "pat" whose first entry is at
  # position "pos".  Assumes the current one has been flushed.
  #
  # Resets the mark "s" to "-", clears the line buffer "clin" and the
  # line count "n", and sets "flushed" to 0 so that the following lines
  # are buffered.
  cpos = pos;
  cpat = pat;
  s = "-";
  flushed = 0;
  split("", clin);
  n = 0;
}

function flush_pattern(i)
{
  # If there are any buffered lines, writes them out.
  # The parameter "i" serves as a local loop index.
  # NOTE(review): the text of this function is cut off after "for (i=0;i"
  # in the reviewed copy; the rest of its body must be restored from the
  # original file.
  if (! flushed) { for (i=0;i