#! /usr/bin/gawk -f
# Last edited on 2002-03-04 22:06:10 by stolfi

# Reads a file containing lines of the form
#
#   LOC DFRST WORDS DLAST
#
# where LOC is an EVMT locator without the "<>", made of four parts
# separated by "." or ";" (page f-number, text unit, line number,
# transcriber code); DFRST and DLAST are the line-start and line-end
# delimiters, each either "=" (paragraph boundary) or "-"; and WORDS is
# one or more EVA words separated by spaces, without comments and such.
# Empty lines are skipped; any other line is reported as an error.
#
# Writes a file with one record per token (word occurrence) in the input
# text, in the format
#
#   FNUM UNIT LINE TRAN FPOS RPOS PFRST PLAST WORD
#
# where WORD is the word in question; FNUM, UNIT, LINE and TRAN are
# fields from LOC (the page f-number; the text unit; the line number;
# and the transcriber code); FPOS is the sequential number of the word
# in the line, starting at 1; RPOS is the same, counting backwards from
# the end of line; PFRST is a boolean (0 or 1) that is 1 when the line
# starts a paragraph (DFRST is "="); and PLAST is analogous for a line
# that ends a paragraph (DLAST is "=").
#
# NOTE: PFRST and PLAST are per-line flags, repeated on every token of
# the line.  To single out the first (last) token of a paragraph,
# combine PFRST with FPOS == 1 (PLAST with RPOS == 1).
#
# On the first malformed line, prints a message to standard error and
# exits with status 1.

BEGIN {
  # Negative means "no error so far"; data_error() stores the exit status here.
  abort = -1;
}

# Stop reading as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Text line: locator, start delimiter, at least one word, end delimiter.
(($1 ~ /^[f][0-9]/) && (NF >= 4)) {
  loc = $1;
  dfrst = $2;
  dlast = $(NF);

  if (dfrst !~ /^[-=]$/) {
    data_error(("bad line-start delimiter \"" dfrst "\""));
  }
  if (dlast !~ /^[-=]$/) {
    data_error(("bad line-end delimiter \"" dlast "\""));
  }

  # The locator must split into exactly four components.
  nlocf = split(loc, locf, /[.;]/);
  if (nlocf != 4) {
    data_error(("bad locator format \"" loc "\""));
  }
  fnum = locf[1];
  unit = locf[2];
  nlin = locf[3];
  tran = locf[4];

  # Words occupy fields 3 .. NF-1, so FPOS = i-2 and RPOS = NF-i
  # both run over 1 .. (NF-3).
  for (i = 3; i < NF; i++) {
    print fnum, unit, nlin, tran, i-2, NF-i, (dfrst == "="), (dlast == "="), $(i);
  }
  next;
}

# Any other non-empty line is malformed.
/./ { data_error("bad line format"); }

# Reports `msg` on standard error, tagged with the current input line
# number, records the failure in `abort`, and terminates with status 1.
function data_error(msg)
{
  printf "*** line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}