#! /usr/bin/gawk -f # Last edited on 1998-12-29 05:36:48 by stolfi BEGIN { abort = -1; usage = ( \ "extract-reading-tuples \\\n" \ " < INFILE.evt > OUTFILE.tup " \ ); # Reads an interlinear file in EVMT format (EVA encoding) and writes # a list of 26-character tuples, one for each character position # present in the interlinear. # # The tuple for a given character position consists of the readings # of that position by all 26 (potential) transcribers, "A" thru "Z". # The "%" reading is assumed whenever a character position is not # covered by a particular transcription. # # The spaces, line breaks, para breaks, and the fillers "!" and "%" # are viewed as readings, too. In-line comments are replaced by "!" # fillers, preserving alignment. trset = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; bangs = "!"; while (length(bangs) < 500) { bangs = (bangs bangs); } blanks = " "; while (length(blanks) < 19) { blanks = (blanks blanks); } # Location fields of current line: cur_fn = ""; cur_un = ""; cur_ln = ""; cur_nc = -1; # Current line length. cur_ok = 1; # means that all versions of current line have been processed. # Versions of the current line cur_nt = 0; # number of versions read so far. split("", cur_tx); # "cur_tx[i]", "i=0..cur_nt-1" are the versions, in reading order. split("", cur_tk); # "cur_tk[i]" are the corresponding transcriber codes, 1..26. split("", cur_ts); # "cur_ts[tr] = 1" means transcriber "tr" has occurred. # Various counts: n_lines = 0; # VMS text lines. n_read = 0; # Interlinear text lines read. n_used = 0; # Interlinear text lines used in tuples. # tuple for output split("", tup); } //{ if (abort >= 0) { exit abort; } } # Blank line /^ *$/ { next; } # `##'-comment (page/unit header) /^[#][#]/ { process_current_line(); next; } # Other `#'-comment /^[#]/ { process_current_line(); next; } # Uncommented page/unit header /^<[^<>;]*>/ { process_current_line(); next; } # Text line /^=0) { exit abort; } process_current_line(); printf "%7d interlinear text lines read\n", n_read > "/dev/stderr"; printf "%7d interlinear text lines used\n", n_used > "/dev/stderr"; printf "%7d VMS lines processed\n", n_lines > "/dev/stderr"; } function process_variant(lin, txt,loc,tmp,fn,un,tc,tk,nf,res) { res = 1; n_read++; if (length(lin) <= 19) { format_error("missing text"); res = 0; } # Check general format, and extract location code and text proper. # Note that line number must start with digit, # while the location code must start with letter: match(lin, /^/); if (RSTART != 1) { format_error("bad location format"); res = 0; } else { loc = substr(lin,RSTART+1,RLENGTH-2); txt = substr(lin,RLENGTH+1); if (substr(lin,RLENGTH+1, 19-RLENGTH) != substr(blanks, 1, 19-RLENGTH)) { format_error("too few blanks"); res = 0; } if (substr(lin,20,1) == " ") { format_error("too many blanks"); res = 0; } gsub(/^[ ]+/, "", txt); gsub(/[ ]+$/, "", txt); # Validate location code # Split location into fields: tmp = length(loc); tc = substr(loc, tmp,1); if (substr(loc, tmp-1, 1) != ";") { fatal_error("program error: semicolon"); } loc = substr(loc, i, tmp-2); nf = split(loc, tmp, /[.]/); if (nf == 3) { fn = tmp[1]; un = tmp[2]; ln = tmp[3]; } else if (nf == 2) { fn = tmp[1]; un = "-"; ln = tmp[2]; } else { fatal_error("program error: nf"); } # Remove trailing comments, if any, and save line length: while (gsub(/{[^{}]*}$/, "", txt)) { } nc = length(txt); # Convert any remaining comments to "!"s: txt = remove_comments(txt); # Check for non-decreasing lines if ((fn == cur_fn) && (un == cur_un) && (cur_ok == 1)) { if (numeric_line(ln) <= numeric_line(cur_ln)) { format_error("lines out of order, or interloping comments"); } } # If new line, flush previous one: if ((fn != cur_fn) || (un != cur_un) || (ln != cur_ln)) { process_current_line(); } else { if ((cur_nc != -1) && (nc != cur_nc)) { format_error(("inconsistent line lengths (" cur_nc ":" nc ")")); res = 0; } } # Save this version: if (tc in cur_ts) { format_error("repeated transcription code"); res = 0; } else { cur_ts[tc] = 1; if (res == 1) { # Save for later tuple extraction: cur_tx[cur_nt] = txt; tk = index(trset, tc); if (tk == 0) { fatal_error("program error: tr"); } cur_tk[cur_nt] = tk; cur_nt++; n_used++; } } # Update current location and line length: cur_fn = fn; cur_un = un; cur_ln = ln; cur_nc = nc; } cur_ok = 0; return res; } function remove_comments(txt, chunk,i,res) { # Replaces {}-comments by an equal length of "!"s res = ""; while (txt != "") { i = index(txt, "{"); if (i == 0) { res = (res txt); txt = ""; } else { res = (res substr(txt, 1, i-1)); txt = substr(txt, i); i = index(txt, "}"); if (i == 0) { format-error("mismatched `{'"); res = (res txt); txt = ""; } res = (res substr(bangs, 1,i)); txt = substr(txt, i+1); } } return res; } function process_current_line( i,loc,lin) { if (cur_ok != 1) { extract_tuples(cur_tx, cur_tk, cur_nt); n_lines++; } split("", cur_tx); split("", cur_tk); split("", cur_ts); cur_nt = 0; cur_ok = 1; } function extract_tuples(cur_tx, cur_tk, cur_nt, i,j,new,nc,m,c,d,and,tup) { # For each character position, creates a tuple from "cur_tx[i]", "i=0..cur_nt-1", # and writes it to standard output. # Check lengths, just to be sure: nc = length(cur_tx[0]); for (i=1; inum"); } n = 10*substr(n,1,m-1) + i; } else { fatal_error("program error: line num"); } return n; } function arg_error(msg) { printf "*** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; abort = 1; exit abort; } function fatal_error(msg) { printf "file %s, line %d: *** %s\n", FILENAME, FNR, msg > "/dev/stderr"; abort = 1; exit abort; } function format_error(msg) { printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr"; } function print_line(lin) { printf "file %s, line %d: %s\n", FILENAME, FNR, lin > "/dev/stderr"; printf "\n" > "/dev/stderr"; }