#! /usr/bin/gawk -f
# Last edited on 1998-12-03 04:06:05 by stolfi

# Reads an EVMT-formatted interlinear transcription INFILE and
# merges into it one more version SRCFILE, for transcriber codes
# in LETTERS. Writes the result to the OUTFILE. Each text line from
# SRCFILE is inserted before the first line from INFILE with the
# same location code (f-number, unit, and line number). Also writes
# to TSHFILE all records from SRCFILE that could not be merged,
# including all comments.
#
# Global state:
#   abort      -1 while everything is fine; otherwise the exit status.
#              Every rule (and END) checks it so that an "exit" issued
#              in BEGIN or in a helper function is honoured everywhere.
#   src        maps location code (minus transcriber) to the whole
#              source line still waiting to be inserted.
#   old_loc    location code of the previous INFILE text line.
#   sawBlank   1 if the last line written was a "#" separator or a
#              "##" header, so no extra "#" separator is needed.

BEGIN {
  abort = -1;
  usage = ( \
    "merge-version-into-interlin \\\n" \
    " -v sourceFile=SRCFILE \\\n" \
    " -v trashFile=TSHFILE \\\n" \
    " -v transCodes=LETTERS \\\n" \
    " < INFILE > OUTFILE" \
  );

  if (sourceFile == "") { arg_error("must specify \"-v sourceFile=FILE\"\n"); }
  if (trashFile == "") { arg_error("must specify \"-v trashFile=FILE\"\n"); }
  if (transCodes == "") { arg_error("must specify \"-v transCodes=LETTERS\"\n"); }

  # table "src" maps location code (minus transcriber) to
  # the whole line.
  split("", src);

  sourceN = 0; # counts source records read.
  trashN = 0;  # counts rejected source records (incl comments).
  badN = 0;    # counts erroneous source records.
  goodN = 0;   # counts good source records.

  printf "reading source file...\n" > "/dev/stderr";
  while ((status = (getline lin < sourceFile)) > 0) {
    sourceN++;
    bad = 0;
    trash = 0;
    if (match(lin, /^ *$/)) {
      # Blank line: not an error, but nothing to merge.
      trash = 1;
    } else if (substr(lin, 1, 1) == "#") {
      # Comment line: goes to the trash file, not counted as an error.
      trash = 1;
    } else if (substr(lin, 1, 1) == "<") {
      if (! parse_locator(lin)) {
        source_error("bad location code");
        bad = 1;
      } else {
        # parse_locator has set the globals "loc" and "cod".
        if (index(transCodes, cod) == 0) {
          source_error("wrong transcriber code");
          bad = 1;
        }
        if (loc in src) {
          source_error("duplicate location");
          bad = 1;
        }
      }
    } else {
      source_error("bad line type");
      bad = 1;
    }

    if (bad || trash) {
      print lin > trashFile;
      trashN++;
      if (bad) { badN++; }
    } else {
      src[loc] = lin;
      goodN++;
    }
  }
  if (status < 0) {
    # getline returns -1 when the file cannot be opened or read.
    # Without this check an unreadable source file would be treated
    # as an empty one and the merge would silently do nothing.
    printf "file %s: cannot read source file\n", sourceFile > "/dev/stderr";
    abort = 1;
    exit abort;
  }
  close(sourceFile);

  printf "read %6d source lines\n", sourceN > "/dev/stderr";
  printf "rejected %6d of them (including %d errors)\n", trashN, badN > "/dev/stderr";
  if (badN != 0) { abort = 1; exit abort; }

  printf "merging files...\n" > "/dev/stderr";
  old_loc = "";
  insertN = 0;
  unmatchedN = 0;
  sawBlank = 1;
}

# Blank input lines are dropped from the output.
/^ *$/ {
  if (abort >= 0) { exit abort; }
  next;
}

# "##" header lines: always preceded by a "#" separator line.
/^[#][#]/ {
  if (abort >= 0) { exit abort; }
  print "#";
  sawBlank = 1;
  print;
  next;
}

# A lone "#" acts as a separator; remember that we just wrote one.
/^[#] *$/ {
  if (abort >= 0) { exit abort; }
  sawBlank = 1;
  print;
  next;
}

# Any other comment is copied through unchanged.
/^[#]/ {
  if (abort >= 0) { exit abort; }
  print;
  next;
}

# Text line: when its location differs from the previous text line's,
# emit a "#" separator (if needed) and then the pending source line
# for that location, if any, before the input line itself.
/^[<]/ {
  if (abort >= 0) { exit abort; }
  lin = $0;
  if (! parse_locator(lin)) {
    input_error("bad location code");
    loc = "";
  } else {
    # The input is not expected to already contain a version with
    # one of the codes being merged; report it but keep going.
    if (index(transCodes, cod) != 0) {
      input_error("wrong transcriber code");
    }
    if ((loc != old_loc) && (loc != "")) {
      if (! sawBlank) { print "#"; }
      if (loc in src) {
        print src[loc];
        delete src[loc];
        insertN++;
      }
      old_loc = loc;
    }
  }
  print lin;
  sawBlank = 0;
  next;
}

# Anything else is reported and dropped from the output.
// {
  if (abort >= 0) { exit abort; }
  input_error("unrecognized line type");
  next;
}

END {
  if (abort >= 0) { exit abort; }
  # Source lines whose location never showed up in INFILE.
  for (loc in src) {
    print src[loc] > trashFile;
    trashN++;
    unmatchedN++;
  }
  close(trashFile);
  printf "%7d inserted source lines\n", insertN > "/dev/stderr";
  printf "%7d unmatched source lines\n", unmatchedN > "/dev/stderr";
}

function parse_locator(lin,    body, m) {
  # Parses the locator at the start of "lin". On success sets the
  # globals "cod" (last character of the locator: the transcriber
  # code) and "loc" (the locator minus its last two characters, i.e.
  # minus separator and transcriber code) and returns 1. Returns 0,
  # leaving "loc" and "cod" untouched, if "lin" has no valid locator.
  #
  # NOTE(review): the locator pattern was missing from the file as
  # received (it read /^/, which matches the empty string and makes
  # every record fail). It is reconstructed here from how the match
  # is consumed below: angle-bracket delimiters, and a one-character
  # transcriber code preceded by a ";" separator. Confirm against the
  # EVMT locator syntax and tighten if needed.
  if (! match(lin, /^<[^<>;]+;[^<>;]>/)) { return 0; }
  body = substr(lin, RSTART+1, RLENGTH-2);  # drop "<" and ">"
  m = length(body);
  cod = substr(body, m, 1);
  loc = substr(body, 1, m-2);
  return 1;
}

function arg_error(msg) {
  # Reports a command-line error, prints the usage text, and exits
  # with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function source_error(msg) {
  # Reports an error at the current record of the source file.
  printf "file %s, line %d: %s\n", sourceFile, sourceN, msg > "/dev/stderr";
}

function input_error(msg) {
  # Reports an error at the current record of the main input.
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
}