#! /usr/bin/gawk -f
# Last edited on 2002-03-04 22:06:10 by stolfi

# Reads a file containing lines of the form LOC DFRST WORDS DLAST
# where LOC is an EVMT locator without the "<>"; DFRST and DLAST
# are each either "=" or "-" (DFRST is "=" when the line starts a
# paragraph, and DLAST is the analogous marker at the end of the line);
# WORDS is zero or more EVA words separated by spaces, without comments and such.
# 
# Writes a file with one record per token (word occurrence) in the input
# text, in the format
# 
#   FNUM UNIT LINE TRAN FPOS RPOS PFRST PLAST WORD 
# 
# where WORD is the word in question; FNUM, UNIT, LINE and TRAN are
# fields from LOC (the page f-number; the text unit; the line number;
# and the transcriber code); FPOS is the sequential number of the word
# in the line; RPOS is the same, counting backwards from the end of
# line; PFRST is a boolean (0 or 1) identifying the first token of a
# paragraph; and PLAST is analogous for the last token.

BEGIN {
  # Exit status requested by data_error(); a negative value means
  # that no error has been reported so far.
  abort = -1
}

# If an exit status has already been recorded in `abort`, stop
# processing input and terminate with that status.
abort >= 0 { exit abort }

# Data line: LOC DFRST WORD ... WORD DLAST.
#   LOC must begin with "f" followed by a digit and must split on "." / ";"
#   into exactly four parts (FNUM, UNIT, LINE, TRAN).
#   DFRST ($2) and DLAST (last field) must each be "=" or "-".
# A line with no words between the two delimiters (NF == 3) is valid,
# matching the documented "zero or more words", and produces no output.
# Emits one record per word: FNUM UNIT LINE TRAN FPOS RPOS PFRST PLAST WORD.
(($1 ~ /^[f][0-9]/) && (NF >= 3)){
  loc = $1; dfrst = $2; dlast = $(NF);
  if (dfrst !~ /^[-=]$/) { data_error(("bad line-start delimiter \"" dfrst "\"")); }
  if (dlast !~ /^[-=]$/) { data_error(("bad line-end delimiter \"" dlast "\"")); }
  nlocf = split(loc, locf, /[.;]/);
  if (nlocf != 4) { data_error(("bad locator format \"" loc "\"")); }
  fnum = locf[1];
  unit = locf[2];
  nlin = locf[3];
  tran = locf[4];
  # The paragraph flags are the same for every word on the line.
  pfrst = (dfrst == "=");
  plast = (dlast == "=");
  # Words occupy fields 3 .. NF-1.  FPOS (i-2) counts from 1 at the first
  # word; RPOS (NF-i) counts from 1 at the last word.
  for (i = 3; i < NF; i++)
    { print fnum, unit, nlin, tran, i-2, NF-i, pfrst, plast, $(i); }
  next;
}

# Any line containing at least one character that was not consumed by the
# data-line rule above (which ends with `next`) is reported as malformed.
# Completely empty lines match nothing and are skipped.
$0 ~ /./ {
  data_error("bad line format")
}

# data_error(msg): report a fatal input error and terminate.
#   msg - text describing the problem.
# Writes "*** line N: msg" to /dev/stderr, where N is the current FNR,
# records status 1 in the global `abort`, and exits with that status.
function data_error(msg)
{
  abort = 1;
  printf("*** line %d: %s\n", FNR, msg) > "/dev/stderr";
  exit abort;
}