#! /usr/bin/gawk -f
# Last edited on 2004-07-15 23:37:55 by stolfi

# Validates the format of the new split and unfolded interlinear file.
#
# Input lines are classified by the rules below as blank lines,
# `##' page/unit headers, other `#' comments, VTT-style `<...>' page/unit
# headers, or text lines.  Every line that fails a check is echoed to
# stderr together with a diagnostic, and a count of flagged lines is
# printed to stderr at the end.

BEGIN {
  # "abort" stays negative until arg_error/fatal_error request termination.
  abort = -1;

  usage = ( \
    "validate-new-evt-format \\\n" \
    " [ -v chars=CHARS ] \\\n" \
    " [ -v location=LOC ] \\\n" \
    " [ -v checkTerminators=1 ] \\\n" \
    " [ -v checkLineLengths=1 ] \\\n" \
    " [ -v requireUnitHeaders=0 ] \\\n" \
    " [ -v requirePageHeaders=0 ] \\\n" \
    " < INFILE > OUTFILE " \
  );

  # where CHARS are the allowed non-space letters, and LOC is a
  # location code without line number e.g. "f103v2.T1" or "f83r"

  if (chars == "") { chars = "\"'abcdefghijklmnopqrstuvxyzAEFIKOPSTY"; }

  # Word breaks and line terminators are handled separately from "chars".
  if (chars ~ /[-=,.]/) { arg_error("invalid characters in \"chars\" list\n"); }

  if (checkTerminators == "") { checkTerminators = 0; }
  if (checkLineLengths == "") { checkLineLengths = 0; }
  if (requireUnitHeaders == "") { requireUnitHeaders = 1; }
  if (requirePageHeaders == "") { requirePageHeaders = 1; }

  # The "location" parameter is used to check ## <..> lines
  # exclusively. The values of "fnum" and "unit" are extracted from
  # those lines and used to check text lines.

  # Location fields of previous line:
  old_fn = ""; old_un = ""; old_ln = "";

  # Text length and terminator of the previous text line:
  old_nc = -1; old_cr = "";

  nerrors = 0;
}

# Stop reading input as soon as an abort has been requested.
// {
  if (abort >= 0) { exit abort; }
}

# blank line
/^ *$/ { next; }

# `##'-comment (page/unit header)
/^##/ {
  # Strip the `##' prefix so that the header has the same shape as a
  # VTT-style header.  Note that this also changes what print_line echoes.
  gsub(/^## */, "", $0);
  if (! check_page_header($0)) { print_line(); nerrors++; }
  next;
}

# other `#'-comment
/^#/ { next; }

# VTT-style page/unit header
/^<[^;<>]*>/ {
  if (! check_page_header($0)) { print_line(); nerrors++; }
  next;
}

# Text line
# NOTE(review): the pattern of this rule and the opening of the END block
# below were lost from the reviewed copy of this file.  They are rebuilt
# here in the most permissive form (any other line starting with "<" is
# handed to check_text_line, which does the strict format check) --
# confirm against the original script.  Lines that match none of the
# rules are NOT flagged; whether the original had a catch-all rule could
# not be determined.
/^</ {
  if (! check_text_line($0)) { print_line(); nerrors++; }
  next;
}

END {
  if (abort >= 0) { printf "aborted\n" > "/dev/stderr"; exit abort; }
  printf "\n" > "/dev/stderr";
  printf "%d errors flagged\n", nerrors > "/dev/stderr";
}

function check_text_line(lin,   txt,loc,len,fld,fn,un,ln,tr,nf,res,want,gap,nc,cr)
{
  # Checks one text line "lin" (locator, blank padding, text).
  # Returns 1 if the line passed every check, 0 otherwise; each failed
  # check is reported through format_error.  Internal inconsistencies
  # abort the run through fatal_error.
  #
  # Reads the globals "fnum", "unit" (set by check_page_header, or taken
  # from the line itself when the corresponding require* option is off),
  # "chars", "checkLineLengths", "checkTerminators", and the state of the
  # previous text line ("old_fn", "old_un", "old_ln", "old_nc", "old_cr",
  # "tr_seen"); updates that state before returning.

  res = 1;

  if (length(lin) <= 19) { format_error("missing text"); res = 0; }

  # Check general format, and extract location code and text proper.
  # Note that line number must start with digit,
  # while the location code must start with letter:
  # NOTE(review): the locator regex was lost from the reviewed copy; this
  # one is rebuilt from the invariants enforced below (three "."-separated
  # fields, numeric line number with optional a-e suffix, ";" plus a
  # one-character transcriber code) -- confirm against the original.
  match(lin, /^<f[0-9]+[vr]?[0-9]?[.][A-Za-z][A-Za-z0-9]*[.][0-9]+[a-e]?;[A-Z]>/);
  if (RSTART != 1) { format_error("bad location format"); return 0; }
  loc = substr(lin, RSTART+1, RLENGTH-2);
  txt = substr(lin, RLENGTH+1);

  # The locator must be followed by blanks up to column 19, and the text
  # must start exactly at column 20.
  want = 19 - RLENGTH;
  if (want < 0) { want = 0; }
  gap = substr(lin, RLENGTH+1, want);
  if ((length(gap) != want) || (gap ~ /[^ ]/)) { format_error("too few blanks"); res = 0; }
  if (substr(lin,20,1) == " ") { format_error("too many blanks"); res = 0; }

  gsub(/^[ ]+/, "", txt);
  gsub(/[ ]+$/, "", txt);

  # Validate location code

  # Split location into fields:
  len = length(loc);
  tr = substr(loc, len, 1);
  if (substr(loc, len-1, 1) != ";") { fatal_error("program error"); }
  loc = substr(loc, 1, len-2);
  nf = split(loc, fld, /[.]/);
  if (nf != 3) { fatal_error("program error"); }
  fn = fld[1];
  un = fld[2];
  ln = fld[3];

  # Check page f-number:
  if (requirePageHeaders) {
    if (fnum == "") {
      format_error("missing page header line"); res = 0;
      # Adopt this line's f-number so the error is reported only once.
      fnum = fn;
    } else if (fn != fnum) {
      format_error(("wrong page f-number, should have been " fnum)); res = 0;
    }
  } else {
    fnum = fn;
  }

  if (requireUnitHeaders) {
    # Check unit tag:
    if (unit == "") {
      format_error("missing unit header line"); res = 0;
      # Adopt this line's unit so the error is reported only once.
      unit = un;
    } else if (un != unit) {
      format_error(("wrong unit code, should have been " unit)); res = 0;
    }
  } else {
    unit = un;
  }

  # Convert line numbers to pure number
  if (match(ln, /[0-9]$/)) {
    # Append a "0"
    ln = (ln "0");
  } else {
    # Convert the final letter to a digit:
    gsub(/[a]$/, "1", ln);
    gsub(/[b]$/, "2", ln);
    gsub(/[c]$/, "3", ln);
    gsub(/[d]$/, "4", ln);
    gsub(/[e]$/, "5", ln);
  }
  if (! match(ln, /^[0-9][0-9]*$/)) { fatal_error("program error"); }

  # printf "[%s][%s][%s] -> [%s][%s][%s][%s]", \
  #   old_fn, old_un, old_ln, fn, un, ln, tr > "/dev/stderr";

  # Check for non-decreasing line numbers:
  if ((fn == old_fn) && (un == old_un)) {
    if ((ln + 0) < (old_ln + 0)) { format_error("lines out of order"); res = 0; }
  }

  # Check for repeated transcription code:
  if ((fn == old_fn) && (un == old_un) && (ln == old_ln)) {
    if (tr in tr_seen) { format_error("repeated transcription code"); res = 0; }
  } else {
    # New locus: forget the transcription codes seen so far.
    split("", tr_seen);
  }
  tr_seen[tr] = 1;

  # Validate line length
  if (checkLineLengths) {
    # Remove trailing comments and fillers, if any
    while (gsub(/[{][^{}]*[}]$/, "", txt)) { }
    nc = length(txt);
    if ((fn == old_fn) && (un == old_un) && (ln == old_ln)) {
      if ((old_nc != -1) && (nc != old_nc)) {
        format_error(("inconsistent line lengths (" old_nc ":" nc ")")); res = 0;
      }
    }
    old_nc = nc;
  }

  # Validate text proper

  # Remove '{}' comments
  gsub(/[{][^{}]*[}]/, "", txt);

  # Ignore trailing blanks:
  gsub(/ *$/, "", txt);

  # Assume the '[|]' groups have been unfolded,
  # otherwise we should do this:
  # gsub(/\[[-*%A-Z.24678]*[|][-*%A-Z.24678]*\]/, "", txt);

  # Remove non-significant fillers [!] but leave skip-markers [%]:
  gsub(/[!]/, "", txt);

  if (txt == "") {
    # Empty lines are OK.
  } else {
    # Remove weirdoes
    gsub(/[&][0-9][0-9][0-9];/, "*", txt);

    # Check for leading or double word breaks
    # (one trailing word break is OK, e.g. in circular text.)
    if (txt ~ /^[-.,]./) { format_error("leading [-.,]"); res = 0; }
    # The final "." lets any terminator through; it is checked below.
    if (txt !~ ("^[-.,*" chars "]*.$")) { format_error("invalid char in text"); res = 0; }
    if (txt ~ /[-.,][-.,]/) { format_error("doubled [-.,]"); res = 0; }
    if (txt !~ /[-=.,]$/) {
      format_error("text should end with [-=.,]"); res = 0;
    } else {
      cr = substr(txt, length(txt), 1);
      if ((fn == old_fn) && (un == old_un) && (ln == old_ln)) {
        if ((old_cr != "") && (cr != old_cr) && (checkTerminators)) {
          format_error(("inconsistent line terminator (" old_cr ":" cr ")")); res = 0;
        }
      }
      old_cr = cr;
    }
  }

  old_fn = fn; old_un = un; old_ln = ln;
  return res;
}

function check_page_header(lin,   loc,req_fnum,att,fld,n,i,res)
{
  # Checks a page/unit header line "lin" of the form "<LOC> {ATTRS}"
  # (any `##' prefix must already have been removed by the caller).
  # Returns 1 if the header passed every check, 0 otherwise.
  #
  # Sets the globals "fnum" and "unit" from the header's location code;
  # "unit" becomes empty when the header names only a page.  They are
  # left untouched when the line does not have the header shape at all.

  res = 1;
  if (! match(lin, /^<[^<>{}]*> *[{][^{}<>]*[}] *$/)) {
    format_error("bad page locator line"); return 0;
  }

  # Extract location code
  if (! match(lin, /<.*>/)) { fatal_error("program error"); }
  loc = substr(lin, RSTART+1, RLENGTH-2);

  # decompose location code in "fnum" and "unit"
  if (match(loc, /[.][A-Za-z0-9]+$/)) {
    unit = substr(loc, RSTART+1, RLENGTH-1);
    fnum = substr(loc, 1, RSTART-1);
  } else {
    unit = "";
    fnum = loc;
  }

  # Check page f-number against the "location" option, if given.
  # Only the page part of "location" is compared; its unit part (if
  # any) is not checked here.
  if (location != "") {
    req_fnum = location;
    sub(/[.][A-Za-z0-9]+$/, "", req_fnum);
    if (fnum != req_fnum) {
      format_error(("wrong page f-number, should have been " req_fnum)); res = 0;
    }
  }

  # Check syntax of page f-number:
  if (! match(fnum, /^f[0-9]+[vr]?[0-9]?$/)) {
    format_error("bad f-number in ##-header"); res = 0;
  }

  # Check attribute list: blank-separated items of the form "$X=Y",
  # where X is one upper-case letter and Y one upper-case letter or digit.
  match(lin, /[{].*[}]/);
  if (RSTART == 0) { fatal_error("program error"); }
  att = substr(lin, RSTART+1, RLENGTH-2);
  n = split(att, fld, " ");
  for (i = 1; i <= n; i++) {
    if (! match(fld[i], /^ *[$][A-Z][=][A-Z0-9] *$/)) {
      format_error("bad page attribute"); res = 0;
    }
  }
  return res;
}

function arg_error(msg)
{
  # Reports a bad command-line setting, prints the usage text, and
  # requests termination with status 1.
  printf "*** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function fatal_error(msg)
{
  # Reports an unrecoverable error at the current input line and
  # requests termination with status 1.
  printf "file %s, line %d: *** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function format_error(msg)
{
  # Reports a format problem at the current input line; does not stop
  # the run and does not touch the error count.
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
}

function print_line()
{
  # Echoes the current record ($0) to stderr, followed by a blank line.
  printf "file %s, line %d: %s\n", FILENAME, FNR, $0 > "/dev/stderr";
  printf "\n" > "/dev/stderr";
}