#! /usr/bin/gawk -f
# Last edited on 1998-12-30 11:39:59 by stolfi

# NOTE(review): this copy of the script was recovered from a damaged
# extraction in which all line breaks were collapsed and some spans
# between "<" and ">" were deleted. The line structure has been
# restored; places where code had to be reconstructed or could not be
# recovered are marked with further NOTE(review) comments below.
#
# The "tup_*" routines called here are not defined in this file; they
# must be supplied separately (presumably by another "-f" program file
# -- confirm). Those routines call back "process_batch_texts" and
# "process_batch_lines", defined below.

BEGIN {
  # "abort" stays negative while all is well; the error routines set it
  # to the desired exit status.
  abort = -1;
  usage = ( \
    "combine-versions \\\n" \
    " -v ignore=CHARS \\\n" \
    " -v code=CHAR \\\n" \
    " -v position=[first|last] \\\n" \
    " -v table=TBLFILE \\\n" \
    " < INFILE.evt > OUTFILE.evt " \
  );

  # Reads an interlinear file in EVMT format (EVA encoding) and
  # writes the same with an extra version, with transcriber code
  # "code".
  #
  # The new version is computed by table lookup, using a table TBLFILE
  # that maps "reading tuples" to a single character. A reading tuple
  # for a given character position consists of the readings of that
  # position by all 26 (potential) transcribers, "A" thru "Z", encoded
  # as a string of 26 EVA letters. The "%" reading is assumed whenever
  # a character position is not covered by a particular transcription,
  # or for transcribers listed in the "ignore" list.
  #
  # The spaces, line breaks, para breaks, and the fillers "!" and "%"
  # are viewed as readings, too. In-line comments are replaced by "!"
  # fillers, preserving alignment, before extracting the reading tuples.

  # Global constants. None of them is used in the visible part of this
  # file; presumably they are consumed by the "tup_*" routines -- confirm.
  trset = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  # Strings of "!" and " " built by repeated doubling (they end up with
  # 512 and 32 characters, respectively).
  bangs = "!";
  while (length(bangs) < 500) { bangs = (bangs bangs); }
  blanks = " ";
  while (length(blanks) < 19) { blanks = (blanks blanks); }

  # Validate the command line variables:
  if (code !~ /^[A-Z]$/) { arg_error("bad or missing \"code\" argument"); }
  if ((position != "first") && (position != "last")) {
    arg_error("bad or missing \"position\" argument");
  }
  if (ignore !~ /^[A-Z]*$/) { arg_error("bad \"ignore\" argument"); }
  if (table == "") { arg_error("must specify the \"table\" argument"); }

  tup_clear_current_batch();

  # Read the lookup table "out[tuple]", which maps each 26-character
  # reading tuple to its single output character. Lines that start
  # with "#" and lines that are empty or all spaces are skipped.
  #
  # NOTE(review): "table_error" is not defined in the visible part of
  # this script; confirm that it is provided elsewhere.
  nentries = 0;
  # "getline" returns -1 on a read error, which is "true"; the result
  # must be compared with 0 or an unreadable table would loop forever.
  while ((tbl_status = (getline lin < table)) > 0) {
    if ((lin !~ /^[#]/) && (lin !~ /^ *$/)) {
      n = split(lin, fld);
      if (n != 2) { table_error("bad number of fields"); }
      if (length(fld[1]) != 26) { table_error("bad tuple"); }
      if (length(fld[2]) != 1) { table_error("bad output char"); }
      out[fld[1]] = fld[2];
      nentries++;
    }
  }
  close(table);
  if (tbl_status < 0) {
    arg_error(("cannot read table file \"" table "\""));
  }
  if (nentries == 0) { arg_error("no entries in weight table"); }
}

# Stop reading input as soon as an error has been flagged:
// {
  if (abort >= 0) { exit abort; }
}

# Blank line
/^ *$/ {
  next;
}

# `##'-comment (page/unit header)
/^[#][#]/ {
  tup_process_current_batch(ignore);
  print;
  next;
}

# Other `#'-comment
/^[#]/ {
  tup_append_line_to_batch($0);
  next;
}

# Uncommented page/unit header
/^<[^<>;]*>/ {
  tup_process_current_batch(ignore);
  next;
}

# Text line
#
# NOTE(review): the pattern and action of the text-line rule were
# destroyed in this copy (only the fragment "/^" survives) and must be
# restored from a pristine copy of the script. "format_error" and
# "print_line" below are not called anywhere else in the visible code,
# so they were presumably used by the lost rule. Until the rule is
# restored, the guard below stops the run instead of silently dropping
# every text line.
/^</ {
  fatal_error("text-line rule is missing from this copy of the script");
}

# NOTE(review): the "END {" header and the "if (abort >" prefix were
# reconstructed from the surviving fragment "=0) { exit abort; }".
END {
  if (abort >= 0) { exit abort; }
  # Flush the last pending batch:
  tup_process_current_batch(ignore);
}

# Client functions called by tup_process_current_batch:

function process_batch_texts(loc,txt,trn,nv,nc,   tuple)
{
  # Called with the cleaned-up texts, without comments.
  # Extracts the reading tuples and builds the combined version.
  # The arguments are passed straight through to "tup_extract_tuples"
  # and "build_combined_version"; "tuple" is a local scratch array.
  split("", tuple);
  tup_extract_tuples(txt,trn,nv,nc,tuple);
  build_combined_version(loc,tuple,nc);
  split("", tuple);
}

function build_combined_version(loc,tuple,nc,   j,tj,txt,lin)
{
  # Builds the combined version: maps each of the tuples
  # "tuple[1..nc]" to one character through the table "out" and
  # concatenates the results. It is a fatal error if some tuple is
  # not in the table.
  txt = "";
  for (j=1; j<=nc; j++) {
    tj = tuple[j];
    if (! (tj in out)) { fatal_error(("tuple \"" tj "\" not in table")); }
    txt = (txt out[tj]);
  }
  # The locator "<loc;code>" is padded or truncated to 18 columns,
  # followed by one space and the text:
  lin = sprintf("%-18.18s %s", ("<" loc ";" code ">"), txt);
  # Insert the new version line in the batch, as requested by "position":
  if (position == "first") {
    tup_prepend_version_to_batch(lin);
  } else {
    tup_append_version_to_batch(lin);
  }
}

function process_batch_lines(batch, nb,   i)
{
  # Called with the original (and new) lines, including comments.
  # Output them:
  #
  # NOTE(review): only "for (i=0; i" survives of this loop; the bound
  # "i < nb" and the "print" body are reconstructed -- confirm that
  # the batch is indexed 0..nb-1.
  for (i=0; i<nb; i++) { print batch[i]; }
}

function arg_error(msg)
{
  # Reports a command line error "msg" and the usage text to the
  # standard error output, then terminates with status 1.
  #
  # NOTE(review): the function header and the first "printf" format
  # were lost in this copy and are reconstructed -- confirm the exact
  # original wording.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function fatal_error(msg)
{
  # Reports the error "msg", tagged with the current input file name
  # and line number, and terminates with status 1.
  printf "file %s, line %d: *** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function format_error(msg)
{
  # Reports the message "msg", tagged with the current input file name
  # and line number, without terminating.
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
}

function print_line(lin)
{
  # Writes the line "lin" to the standard error output, tagged with
  # the current input file name and line number, followed by a blank
  # line.
  printf "file %s, line %d: %s\n", FILENAME, FNR, lin > "/dev/stderr";
  printf "\n" > "/dev/stderr";
}