#! /usr/bin/gawk -f
# Last edited on 2005-01-31 05:40:17 by stolfi

# Validates the format of the 20e1 interlinear file.
#
# Assumes that plumes are encoded by '~', not apostrophe.
# Assumes that the synch filler is '!' not '%'.
#
# Unacceptable items:
#   Alternatives that were not unfolded ('[]' groups).
#   Nested or unpaired braces.
#   Nested or unpaired parens.
#   Leading or duplicated word-separators [-=,.].
#   Parag marker anywhere except at end.
#   Missing final word-separator [-=,.].
#   Malformed weirdo codes '&NNN;'.
#
# All diagnostics go to "/dev/stderr"; each offending input line is
# echoed after its messages, and a count of flagged lines is printed
# at the end.

BEGIN {
  # {abort} is -1 while running normally; the error handlers set it to
  # the desired exit status before calling "exit".
  abort = -1;

  usage = ( \
    "validate-new-evt-format \\\n" \
    " [ -v validChars=CHARS ] \\\n" \
    " [ -v checkTerminators=BOOL ] \\\n" \
    " [ -v checkLineLengths=BOOL ] \\\n" \
    " [ -v requirePageHeaders=BOOL ] \\\n" \
    " [ -v requireUnitHeaders=BOOL ] \\\n" \
    " [ -v requireLocusHeaders=BOOL ] \\\n" \
    " INFILE " \
  );

  # where CHARS are the allowed non-space letters, and each BOOL
  # is 0 or 1 (all of them default to 1).

  if (validChars == "") { validChars = "~abcdefghijklmnopqrstuvxyz"; }
  # Punctuation that has a structural meaning may not be a "letter":
  if (validChars ~ /[-=,.{}()*%!&;]/)
    { arg_error("invalid characters in \"validChars\" list\n"); }

  if (checkTerminators == "") { checkTerminators = 1; }
  if (checkLineLengths == "") { checkLineLengths = 1; }
  if (requireUnitHeaders == "") { requireUnitHeaders = 1; }
  # This default used to be assigned to {requireUnitHeaders} by mistake,
  # which left {requireLocusHeaders} off unless given explicitly.
  if (requireLocusHeaders == "") { requireLocusHeaders = 1; }
  if (requirePageHeaders == "") { requirePageHeaders = 1; }

  # Location set by the most recent header line:
  fnum = ""; unit = ""; lnum = "";

  # Location fields of previous text line:
  old_fn = ""; old_un = ""; old_ln = "";

  # Text length and final word break of previous text line:
  old_nc = -1; old_cr = "";

  nerrors = 0;
}

# Stop reading input once an error handler has requested termination.
// {
  if (abort >= 0) { exit abort; }
}

# Blank line
/^ *$/ { next; }

# `##'-comment (page/unit header)
/^[#][#]/ {
  gsub(/^[#][#] */, "", $0);
  if (! check_header_line($0)) { print_line(); nerrors++; }
  next;
}

# Other `#'-comment
/^[#]/ { next; }

# VTT-style page/unit/locus header (locator without ';')
/^<[^;<>]*>/ {
  if (! check_header_line($0)) { print_line(); nerrors++; }
  next;
}

# Text line
/^[<]/ {
  if (! check_text_line($0)) { print_line(); nerrors++; }
  next;
}

# Other lines
/./ {
  format_error("bad line format");
  print_line();
  nerrors++;
  next;
}

END {
  if (abort >= 0) { printf "aborted\n" > "/dev/stderr"; exit abort; }
  printf "\n" > "/dev/stderr";
  printf "%d errors flagged\n", nerrors > "/dev/stderr";
}

function check_text_line(lin,   txt,loc,tmp,fn,un,tr,nf,res,ln,fld,pad,same_locus,nc,badch,cr)
{
  # Checks a text line {lin} of the form "<PAGE.UNIT.LINE;T>" followed
  # by blanks up to column 19 and the text proper from column 20.
  # Reports each problem with {format_error}; returns 1 if the line is
  # acceptable, 0 otherwise.
  #
  # Reads the globals {fnum,unit,lnum} (set by the last header line)
  # and, when the corresponding "require...Headers" flag is off, sets
  # them from the line itself.  Updates the globals
  # {old_fn,old_un,old_ln,old_nc,old_cr,tr_seen} that describe the
  # previous text line.

  res = 1;
  if (length(lin) <= 19) { format_error("missing text"); res = 0; }

  # Check general format, and extract location code and text proper.
  # Note that line number must start with digit,
  # while the location code must start with letter.
  # NOTE(review): the locator pattern was lost from the source as
  # received; it is reconstructed here from the field patterns used in
  # {check_header_line} plus a one-letter transcriber code.  Confirm
  # against real data.
  if (! match(lin, /^<f[0-9]+[vr]?[0-9]?[.][A-Z][0-9]*[.][0-9]+[a-e]?;[A-Z]>/))
    { format_error("missing or malformed locator"); return 0; }
  loc = substr(lin, RSTART+1, RLENGTH-2);
  txt = substr(lin, RLENGTH+1);

  # The locator must be padded with blanks up to column 19:
  pad = 19 - RLENGTH;
  if ((pad > 0) && (substr(txt, 1, pad) != sprintf(("%" pad "s"), "")))
    { format_error("too few leading blanks"); res = 0; }
  if (substr(lin, 20, 1) == " ")
    { format_error("too many leading blanks"); res = 0; }
  gsub(/^[ ]+/, "", txt);
  gsub(/[ ]+$/, "", txt);

  # Validate location code

  # Split location into fields.  The last character is the transcriber
  # code, preceded by ';'; the rest is "PAGE.UNIT.LINE":
  tmp = length(loc);
  tr = substr(loc, tmp, 1);
  if (substr(loc, tmp-1, 1) != ";") { fatal_error("program error"); }
  loc = substr(loc, 1, tmp-2);
  nf = split(loc, fld, /[.]/);
  if (nf != 3) { fatal_error("program error"); }
  fn = fld[1]; un = fld[2]; ln = fld[3];

  # Check page f-number:
  if (requirePageHeaders)
    {
      if (fnum == "")
        { format_error("missing page header line"); res = 0; fnum = fn; }
      else if (fn != fnum)
        { format_error(("wrong page f-number, should have been " fnum)); res = 0; }
    }
  else
    { fnum = fn; }

  # Check unit tag:
  if (requireUnitHeaders)
    {
      if (unit == "")
        { format_error("missing unit header line"); res = 0; unit = un; }
      else if (un != unit)
        { format_error(("wrong unit code, should have been " unit)); res = 0; }
    }
  else
    { unit = un; }

  # Check line number:
  if (requireLocusHeaders)
    {
      if (lnum == "")
        { format_error("missing locus header line"); res = 0; lnum = ln; }
      else if (ln != lnum)
        { format_error(("wrong line number, should have been " lnum)); res = 0; }
    }
  else
    { lnum = ln; }

  # Convert line number to a pure number, so that "12" < "12a" < "13":
  if (match(ln, /[0-9]$/))
    { # Append a "0"
      ln = (ln "0");
    }
  else
    { # Convert the final letter to a digit:
      sub(/[a]$/, "1", ln);
      sub(/[b]$/, "2", ln);
      sub(/[c]$/, "3", ln);
      sub(/[d]$/, "4", ln);
      sub(/[e]$/, "5", ln);
    }
  if (! match(ln, /^[0-9][0-9]*$/)) { fatal_error("program error"); }

  # printf "[%s][%s][%s] -> [%s][%s][%s][%s]", \
  #   old_fn, old_un, old_ln, fn, un, ln, tr > "/dev/stderr";

  # Check for non-decreasing line numbers within the same unit:
  if ((fn == old_fn) && (un == old_un))
    {
      if ((ln + 0) < (old_ln + 0))
        { format_error("lines out of order"); res = 0; }
    }

  same_locus = ((fn == old_fn) && (un == old_un) && (ln == old_ln));

  # Check for repeated transcription code:
  if (same_locus)
    { # Still same locus, transcriber must be new
      if (tr in tr_seen)
        { format_error("repeated transcription code"); res = 0; }
    }
  else
    { # Locus changed; clear all transcriber codes:
      split("", tr_seen);
      # Also forget the previous locus's terminator, so that it is
      # never compared against a reading of this locus:
      old_cr = "";
    }
  # Mark transcriber as seen:
  tr_seen[tr] = 1;

  # Validate line length
  if (checkLineLengths)
    { # Note: trailing {}-comments are included.
      nc = length(txt);
      if (same_locus)
        {
          if ((old_nc != -1) && (nc != old_nc))
            { format_error(("inconsistent line lengths (" old_nc "/" nc ")")); res = 0; }
        }
      old_nc = nc;
    }

  # Validate the text proper

  # Remove '{}' comments (any brace left over will be flagged below
  # as an invalid character):
  gsub(/[{][^{}]*[}]/, "", txt);

  # Assume the '[|]' groups have been unfolded,
  # otherwise we should do this:
  # gsub(/\[[-*%A-Z.24678]*[|][-*%A-Z.24678]*\]/, "", txt);

  # Remove non-significant fillers [!] but leave skip-markers [%]:
  gsub(/[!]/, "", txt);

  if (txt == "")
    { # Empty lines are OK.
    }
  else
    {
      # Replace each weirdo code by a single '*':
      gsub(/[&][0-9][0-9][0-9];/, "*", txt);

      # Remove parens (only paired, non-empty, and un-nested ones):
      txt = gensub(/[(]([^()]+)[)]/, "\\1", "g", txt);

      # Check for leading or double word breaks
      # (one trailing word break is OK, e.g. in circular text.)
      if (txt ~ /^[-=.,]./)
        { format_error("leading [-=.,]"); res = 0; }

      # All chars but last one must be {validChars} or [-.,%*]:
      if (txt !~ ("^[-.,%*" validChars "]*.$"))
        {
          # The first character past the longest valid prefix is the culprit:
          match(txt, ("^[-.,%*" validChars "]*"));
          badch = substr(txt, RLENGTH+1, 1);
          format_error("invalid char \"" badch "\" in text");
          res = 0;
        }

      # Check for consecutive word breaks:
      if (txt ~ /[-=.,][-=.,]/)
        { format_error("doubled [-=.,]"); res = 0; }

      # Require final word-break:
      if (txt !~ /[-=.,]$/)
        { format_error("text should end with [-=.,]"); res = 0; }
      else
        {
          # Check that all readings of the same locus end with the same word break:
          cr = substr(txt, length(txt), 1);
          if (same_locus)
            {
              if ((old_cr != "") && (cr != old_cr) && (checkTerminators))
                { format_error(("inconsistent line terminator (" old_cr "/" cr ")")); res = 0; }
            }
          old_cr = cr;
        }
    }

  old_fn = fn; old_un = un; old_ln = ln;
  return res;
}

function check_header_line(lin,   att,nf,fld,i,res,loc)
{
  # Checks a page/unit/locus header line {lin}, of the form
  # "<PAGE>", "<PAGE.UNIT>" or "<PAGE.UNIT.LINE>" followed by a
  # "{...}" attribute list.  Returns 1 if it is acceptable, 0 otherwise.
  # Sets the global variables {fnum,unit,lnum} from it.
  # If no {unit}, sets it to "".
  # If no {lnum}, sets it to "".

  res = 1;
  if (! match(lin, /^<[^<>{};]*> *[{][^{}<>]*[}] *$/))
    { format_error("bad page/unit/locus header"); return 0; }

  # Extract and check locator
  if (! match(lin, /<.*>/)) { fatal_error("program error"); }
  loc = substr(lin, RSTART+1, RLENGTH-2);

  # Decompose locator in {fnum}, {unit}, and {lnum}
  fnum = ""; unit = ""; lnum = "";
  nf = split(loc, fld, /[.]/);
  if ((nf < 1) || (nf > 3))
    { format_error("bad locator in header line"); res = 0; }

  # Check page f-number, save in {fnum}:
  fnum = fld[1];
  if (! match(fnum, /^f[0-9]+[vr]?[0-9]?$/))
    { format_error("bad f-number in header line"); res = 0; }

  # Check unit tag, if any, save in {unit}:
  if (nf >= 2)
    {
      unit = fld[2];
      if (! match(unit, /^[A-Z][0-9]*$/))
        { format_error("bad unit tag in header line"); res = 0; }
    }

  # Check line number, if any, save in {lnum}:
  if (nf >= 3)
    {
      lnum = fld[3];
      if (! match(lnum, /^[0-9]+[a-e]?$/))
        { format_error("bad line number in header line"); res = 0; }
    }

  # Check attribute list: blank-separated items of the form "$K=V",
  # where K is one capital letter and V one capital letter or digit.
  match(lin, /[{].*[}]/);
  if (RSTART == 0) { fatal_error("program error"); }
  att = substr(lin, RSTART+1, RLENGTH-2);
  nf = split(att, fld, " ");
  for (i = 1; i <= nf; i++)
    {
      if (! match(fld[i], /^ *[$][A-Z][=][A-Z0-9] *$/))
        { format_error("bad attribute"); res = 0; }
    }
  return res;
}

function arg_error(msg)
{
  # Reports a command-line error {msg} plus the usage text, and
  # terminates with exit status 1.
  printf "*** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function fatal_error(msg)
{
  # Reports an unrecoverable error {msg} at the current input line and
  # terminates with exit status 1.
  printf "%s:%d: *** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function format_error(msg)
{
  # Reports a format problem {msg} at the current input line;
  # processing continues.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
}

function print_line()
{
  # Echoes the current record to the error stream.
  printf " %s\n", $0 > "/dev/stderr";
}