#! /usr/bin/gawk -f

# usage: best-pick [-v trcodes=STRING] < INPUT.evt > OUTPUT.evt
#
# Reads an EVT-format file and selects the "best" transliteration
# for each line.
#
# Input lines either start with a locator of the form
#   <FNUM.UNIT.LINE;TRCD>   or   <FNUM.UNIT.LINE>
# occupying the first 19 columns and followed by the text, or carry no
# locator at all.  Consecutive lines that share FNUM, UNIT and LINE are
# treated as alternative transliterations of one text line; only the one
# whose transcriber code TRCD comes earliest in `trcodes' is written out.
# Lines starting with "#" and empty lines are dropped.
#
# Variables:
#   trcodes  (settable with -v) 26 transcriber codes, best first.
#   trpri    maps a transcriber code to its 1-based position in trcodes.
#   ofnum, ounln, otrcd, oloc, otxt
#            FNUM, "UNIT.LINE", TRCD, locator and text of the best
#            candidate seen so far for the current text line.
#   abort    set when a fatal error was reported; makes END exit with 1.

BEGIN {
  ofnum = ""; ounln = ""; otrcd = "";
  oloc = ""; otxt = "";
  abort = 0;

  # NOTE(review): this default lists "R" twice and has no "H".  The later
  # "R" overwrites the earlier one in trpri, and an "H" line has an unset
  # priority (see the comparison in the main rule).  Confirm whether one
  # of the two "R"s was meant to be "H".
  if (trcodes == "") { trcodes = "UVZABENOPRSWXYKQLMRJITFGCD"; }

  len = length(trcodes);
  if (len != 26) { pick_fail("bad trcodes"); }

  for (i = 1; i <= len; i++) {
    c = substr(trcodes, i, 1);
    trpri[c] = i;
  }
}

function pick_fail(msg) {
  # Reports `msg' with the current input line number on stderr (so the
  # diagnostic never lands in the output EVT stream), marks the run as
  # aborted and stops processing; the END rule then exits with status 1.
  printf "line %d: %s\n", NR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function oout() {
  # Writes the current best line "(oloc, otxt)", if there is one,
  # and clears it.
  if (ofnum != "") {
    printf "%-19s%s\n", oloc, otxt;
    oloc = ""; otxt = "";
  }
}

# Comment lines are not copied to the output.
/^#/ {
  if (abort) { exit 1; }
  next;
}

# Every non-empty, non-comment line is a candidate transliteration.
/./ {
  if (abort) { exit 1; }

  # NOTE(review): the locator test was damaged in the copy this rewrite
  # was made from (the text between "/^" and "\")-2)" was missing).  It is
  # reconstructed here as "line starts with <...>", taking the text between
  # the angle brackets as the location code.  Confirm against a pristine
  # copy of the script.
  if (match($0, /^<[^<>]*>/)) {
    tmp = substr($0, 2, RLENGTH - 2);
    skip = 19;   # the text starts after the fixed 19-column locator field

    # Analyze and regularize location code:
    gsub(/[.;]/, " ", tmp);
    nlocf = split(tmp, locf);
    if ((nlocf < 3) || (nlocf > 4)) { pick_fail("bad location format"); }
    fnum = locf[1];
    unit = locf[2];
    line = locf[3];
    trcd = ((nlocf == 4) ? locf[4] : "X");
  } else if (substr($0, 1, 1) == "<") {
    # Starts like a locator but is not a well-formed one.
    pick_fail("bad location code");
  } else {
    # No locator: synthesize one from the input line number.
    skip = 0;
    fnum = "f0";
    unit = "P";
    line = NR;
    # Same default code as a locator without ";TRCD"; without this the
    # code of the previous input line would leak into this locator.
    trcd = "X";
  }

  # Nothing after the locator field: ignore the line.
  if (skip >= length($0)) { next; }

  txt = substr($0, 1 + skip);
  loc = sprintf("<%s.%s.%s;%s>", fnum, unit, line, trcd);
  unln = (unit "." line);

  # A change of FNUM or of UNIT.LINE starts a new text line: flush the
  # winner of the previous one and start over with no candidate.
  if ((fnum != ofnum) || (unln != ounln)) {
    oout();
    ofnum = fnum;
    ounln = unln;
    otrcd = "";
  }

  # Keep this line if it is the first candidate or has a strictly better
  # (smaller) priority; ties keep the earlier line.
  # NOTE(review): a code that is not in trcodes has an unset priority,
  # which compares as 0 and therefore beats every listed code.
  if ((otrcd == "") || (trpri[trcd] < trpri[otrcd])) {
    oloc = loc;
    otxt = txt;
    otrcd = trcd;
  }
}

END {
  if (abort) { exit 1; }
  oout();
}