#! /usr/bin/gawk -f
# Last edited on 2002-03-05 00:09:58 by stolfi

# Filters a word-occurrence table, keeping only the "special" words.
#
# Each input record must have exactly 11 whitespace-separated fields:
#
#   SEC USEQ FNUM UNIT LINE TRAN FPOS RPOS PFRST PLAST WORD
#   1   2    3    4    5    6    7    8    9     10    11
#
# where:
#   WORD   is a word;
#   SEC USEQ FNUM UNIT LINE TRAN
#          give the location of one occurrence of that word in the text;
#   FPOS   is the sequential number of the word within its line;
#   RPOS   is the same, counted backwards from the end of the line;
#   PFRST  is a boolean (0 or 1) marking the first token of a paragraph;
#   PLAST  is the analogous flag for the last token of a paragraph.
#
# Output: the selected records, copied verbatim (same format as the input).
#
# A record is selected when all of the following hold:
#   - FPOS is 1;
#   - WORD contains at least one of the letters k, t, p, f;
#   - WORD does not contain two or more of the characters "?" and "*";
#   - WORD is at least 4 characters long;
#   - and at least one of:
#       * PFRST is true and WORD begins with an optional "c" followed
#         by "k" or "t";
#       * WORD contains "p" or "f";
#       * WORD contains two or more of the letters k, t, p, f.
#
# Empty lines are skipped silently.  Any other record whose field count
# is not 11 is reported on stderr and terminates the run with status 1.

BEGIN {
  # Negative means "no error so far"; data_error() sets it to the exit status.
  abort = -1;
}

# Stop processing further records once an error status has been recorded.
abort >= 0 {
  exit abort;
}

NF == 11 {
  tok = $11;

  # Guard clauses: discard every record that cannot possibly qualify.
  if ($7 != 1) { next; }                 # FPOS: only the first word of a line
  if (tok !~ /[ktpf]/) { next; }         # needs at least one of k, t, p, f
  if (tok ~ /[?*].*[?*]/) { next; }      # two or more "?"/"*" characters
  if (length(tok) < 4) { next; }         # too short

  # Any one of these three conditions makes the record "special".
  par_head   = ($9 && (tok ~ /^[c]?[kt]/));   # PFRST set, word starts [c]k / [c]t
  has_pf     = (tok ~ /[pf]/);
  two_of_set = (tok ~ /[ktpf].*[ktpf]/);

  if (par_head || has_pf || two_of_set) {
    print;
  }
  next;
}

# Any remaining non-empty record has the wrong number of fields.
/./ {
  data_error("bad line type");
}

# Reports `msg` on stderr, prefixed with the current record number (FNR),
# then records exit status 1 in `abort` and exits with it.
function data_error(msg) {
  printf("*** line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit abort;
}