#! /usr/bin/gawk -f
# Last edited on 1999-01-06 06:42:36 by stolfi

# Usage: $0 WORDFILE < INFILE > OUTFILE
#
# Reads a list of words WORDFILE and a file in EVT format
# (location in columns 1-19, text starting on column 20).
# For each occurrence of a word W from WORDFILE in each line of
# INFILE, writes out a line with four fields: L, the number of text
# bytes preceding the occurrence within that line, N, and W; where
# L is the location of the line in EVT format (a locator of the
# form "<...>") and N is the total number of text bytes in the
# INFILE before that occurrence.
#
# Ignores "#"-comments, "{}"-comments, and the EVT fillers "!" and "%"
# in the INFILE.  Lines of INFILE that do not begin with "#" or "<"
# are assumed to be entirely text, and are assigned a synthetic
# location containing k, the count of such lines seen so far.

BEGIN {
  usage = (ARGV[0] " WORDFILE < TEXT");
  if (ARGC != 2) { error(("usage: " usage)); }
  wfile = ARGV[1];
  # Read the word list into the table "dic":
  split("", dic);
  res = 1;
  while ((res = (getline w < wfile)) > 0) { dic[w] = 1 }
  # getline returns -1 and sets ERRNO on a read error:
  if (res < 0) { error((wfile ": " ERRNO)); }
  close(wfile);
  ARGC = 1;
  nAnon = 0;
  nBytes = 0;
}

/^#/ {
  if (abort) exit;
  next;
}

function cleanup(txt)
{
  # Removes crud from the text.
  # We discard the fillers "%" and "!", and blanks:
  gsub(/[% !]/, "", txt);
  # We discard "{}"-comments:
  gsub(/\{[^}]*\}/, "", txt);
  # We choose arbitrarily the first of alternative transcriptions:
  gsub(/\[/, "", txt);
  gsub(/\|[^\]]*\]/, "", txt);
  gsub(/\]/, "", txt);
  return txt;
}

/./ {
  if (abort) exit;
  if (substr($0,1,1) == "<") {
    skip = 19;
    loc = substr($0,1,19);
    gsub(/ *$/, "", loc);
    # Warn about location fields that do not look like "<...>":
    if (loc !~ /^<.*>$/) {
      printf "line %d, bad location \"%s\"\n", NR, loc > "/dev/stderr";
    }
  } else {
    # Lines with no explicit locator: assign a synthetic location
    # built from the count of such lines seen so far
    # (the "<anon.k>" form is an assumption):
    skip = 0; nAnon++; loc = ("<anon." nAnon ">");
  }
  if (skip >= length($0)) next;
  txt = cleanup(substr($0,1+skip));
  # Report every occurrence of every dictionary word in the cleaned text:
  for (w in dic) {
    i = index(txt, w);
    while (i != 0) {
      printf "%s %d %d %s\n", loc, i - 1, nBytes + i - 1, w;
      k = index(substr(txt, i+1), w);
      i = (k == 0 ? 0 : i + k);
    }
  }
  nBytes += length(txt);
  next;
}
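
# The rules above call error() and test the global "abort", but no
# definition of that helper appears in this excerpt.  The sketch below
# is an assumed minimal version, added only so the script can run
# stand-alone; the original helper may have behaved differently.
function error(msg)
{
  # Report the message on stderr, flag the main rules to stop, and quit:
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}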