#! /usr/bin/gawk -f

# usage: split-pages [-v outdir=DIR] [-v trcodes=STRING] < FILE > PAGELIST
#
# Splits a multipage EVT-format file into one file per page,
# discarding #-comments and selecting the "best" transliteration
# of each text line.
#
# Each page "fNNN" is written to "DIR/fNNN.evt" (DIR defaults to ".").
# The page names are also written to stdout, one per line, in the
# order in which their files are started.
#
# Input lines are expected to begin with a location code of the form
# "<PAGE.UNIT.LINE;T>" (the ";T" transcriber code is optional and
# defaults to "X"), with the text starting at column 20.  Lines that
# do not start with "<" are treated as text of page "f0", unit "P",
# with the input line number as their line number.
#
# The "trcodes" string must have 26 characters; a transcriber code
# that occurs earlier in the string takes precedence over one that
# occurs later.
#
# Page files are opened in append mode, so a page that reappears
# later in the input is appended to its file; a file left over from
# a previous run is appended to as well.

BEGIN {
  ofnum = "";   # Page of the pending output line ("" = none yet).
  otrcd = "";   # Transcriber code of the pending candidate.
  ofile = "";   # Name of the page file currently being written.
  ounln = "";   # "UNIT.LINE" key of the pending output line.
  abort = 0;    # Set to 1 when a fatal error has been reported.

  if (outdir == "") { outdir = "."; }

  # NOTE(review): the default string lists "R" twice and has no "H";
  # one of the two "R"s may have been meant as "H" -- confirm before
  # changing.  As written, "R" gets the priority of its LAST position.
  if (trcodes == "") { trcodes = "UVZABENOPRSWXYKQLMRJITFGCD"; }

  len = length(trcodes);
  if ( len != 26 ) {
    # Diagnostics go to stderr so that they do not end up in PAGELIST.
    printf "line %d: bad trcodes\n", NR > "/dev/stderr";
    abort = 1;
    exit 1;
  }

  # trpri[c] = priority of transcriber code "c" (smaller is better).
  for (i = 1; i <= len; i++) {
    c = substr(trcodes, i, 1);
    trpri[c] = i;
  }
}

function error(msg)
{
  # Reports a fatal input error on stderr, tagged with the current
  # input line number, and terminates the program with status 1.
  # The END rule still runs, but it sees "abort" and exits at once
  # without flushing the pending line.
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function oout()
{
  # Appends the pending line "(oloc, otxt)" to file "ofile"
  # and clears it.  Does nothing before the first page is seen.
  if (ofnum != "") {
    printf "%-19s%s\n", oloc, otxt >> ofile;
    oloc = "";
    otxt = "";
  }
}

# Comment lines are discarded.
/^#/ {
  if (abort) exit 1;
  next;
}

# Non-blank, non-comment lines.
/./ {
  if (abort) exit 1;

  # NOTE(review): the test below was reconstructed from a damaged
  # original (only the trailing '")-2);' of the statement survived);
  # confirm the intended pattern against a pristine copy.
  if (match($0, /^<[^<>]*>/)) {
    # Contents of the locator, without the enclosing "<" and ">".
    tmp = substr($0, 2, index($0, ">") - 2);
    # The text starts at column 20 (cf. the "%-19s" used on output).
    skip = 19;

    # Analyze and regularize location code:
    gsub(/[.;]/, " ", tmp);
    split(tmp, locf);
    # Exactly three or four fields are acceptable.
    if ((!(3 in locf)) || (5 in locf)) error("bad location format");
    fnum = locf[1];
    unit = locf[2];
    line = locf[3];
    if (4 in locf) { trcd = locf[4]; } else { trcd = "X"; }
  } else if (substr($0, 1, 1) == "<") {
    error("bad location code");
  } else {
    # Plain text without locator: file it under a dummy location.
    skip = 0;
    fnum = "f0";
    unit = "P";
    line = NR;
    # Use the default code instead of one left over from an earlier line.
    trcd = "X";
  }

  # Ignore lines that have no text after the locator field.
  if (skip >= length($0)) next;

  txt = substr($0, 1 + skip);
  loc = sprintf("<%s.%s.%s;%s>", fnum, unit, line, trcd);
  unln = ( unit "." line );

  if ( fnum != ofnum ) {
    # New page: flush the pending line and switch output files.
    oout();
    if (ofile != "") { close( ofile ); }
    ofile = ( outdir "/" fnum ".evt" );
    ofnum = fnum;
    printf "%s\n", fnum;
    ounln = unln;
    otrcd = "";
  } else if ( ounln != unln ) {
    # New text line on the same page: flush the pending line.
    oout();
    ounln = unln;
    otrcd = "";
  }

  # Keep this version if it is the first one for the current text line
  # or if its transcriber has strictly better (smaller) priority.
  # NOTE(review): a code absent from "trcodes" has an uninitialized
  # priority, which compares as less than 1 and therefore wins.
  if (( otrcd == "" ) || ( trpri[trcd] < trpri[otrcd] )) {
    oloc = loc;
    otxt = txt;
    otrcd = trcd;
  }
}

END {
  if (abort) exit 1;
  # Flush the last pending line, if any.
  oout();
  if (ofile != "") { close( ofile ); }
}