#! /usr/bin/gawk -f
# Last edited on 1998-07-23 04:11:37 by stolfi

# usage: pick-best-labels [-v trcodes=STRING] < INFILE.idx > OUTFILE.idx
#
# Reads a label/title index file and selects the "best" transliteration
# for each label.
#
# Input records have 11 fields separated by "|":
#
#   posn|sect|fnum|unit|line|trcd|labl|grov|kind|what|note
#
# Consecutive records that share the same location <fnum.unit.line> are
# treated as alternative transliterations of one label.  Exactly one of
# them is written out: the one whose transcriber code (trcd) comes
# earliest in the trcodes string.  On a tie the earliest record is kept.
#
# Empty lines and lines starting with "#" are skipped and are not copied
# to the output.
#
# On a malformed record (or a bad trcodes string) a message is written
# to /dev/stderr and the program exits with status 1, without flushing
# the record that is currently buffered.

BEGIN {
  # abort < 0 means "no error so far"; error() sets it to the exit status.
  abort = -1;
  FS = "|";
  OFS = "|";

  # Transcriber codes in decreasing order of preference.
  # NOTE(review): this default lists "R" twice and omits "H".  A letter
  # listed twice takes the priority of its LAST position, and a letter
  # that is not listed has no trpri[] entry, which compares as 0 and so
  # beats every listed code.  Confirm whether that is intended.
  if (trcodes == "") { trcodes = "UVZABENOPRSWXYKQLMRJITFGCD"; }
  len = length(trcodes);
  if (len != 26) { error(("bad trcodes")); }

  # trpri[c] = 1-based position of letter c in trcodes (smaller is better).
  for (i = 1; i <= len; i++) {
    c = substr(trcodes, i, 1);
    trpri[c] = i;
  }

  oclr();
}

function oclr() {
  # Clears the line buffer.
  oposn = "";  # Textual order
  osect = "";  # Section name
  ofnum = "";  # Page f-number
  ounit = "";  # Textual unit tag
  oline = "";  # Line number
  otrcd = "";  # Transcriber's code
  olabl = "";  # The label
  ogrov = "";  # Grove's alternate encoding
  okind = "";  # One-letter subject code
  owhat = "";  # Object labeled
  onote = "";  # Comments or "-"
  oloc = "";   # Location key "<fnum.unit.line>"; empty means buffer is empty
}

function oget() {
  # Copies the current line to the line buffer.
  oposn = $1;
  osect = $2;
  ofnum = $3;
  ounit = $4;
  oline = $5;
  otrcd = $6;
  olabl = $7;
  ogrov = $8;
  okind = $9;
  owhat = $10;
  onote = $11;
  oloc = sprintf("<%s.%s.%s>", ofnum, ounit, oline);
}

function oout() {
  # Writes the line buffer (if it holds a record) and clears it.
  # The test is on oloc rather than oposn: oget() always leaves oloc
  # non-empty, whereas a record whose first field happens to be empty
  # would otherwise be dropped silently.
  if (oloc != "") {
    print oposn, osect, ofnum, ounit, oline, otrcd, olabl, ogrov, okind, owhat, onote;
    oclr();
  }
}

# Comment lines: skipped, never copied to the output.
/^#/ {
  # abort starts at -1, which is "true" in awk, so it must be compared
  # against 0 explicitly; a plain "if (abort)" exits on the first comment.
  if (abort >= 0) { exit abort; }
  next;
}

# Data lines (any non-empty line that is not a comment).
/./ {
  if (abort >= 0) { exit abort; }
  if (NF != 11) { error(("line " NR ": bad field count")); }

  fnum = $3;
  unit = $4;
  line = $5;
  trcd = $6;

  if (! match(fnum, /^f[0-9][0-9]*[vr][0-9]*$/)) {
    error(("line " NR ": bad page f-number"));
  }
  if (! match(trcd, /^[A-Z]$/)) {
    error(("line " NR ": bad transcriber code"));
  }

  loc = sprintf("<%s.%s.%s>", fnum, unit, line);
  if (loc != oloc) {
    # New label: flush the previous winner and start a new candidate.
    oout();
    oget();
  } else if (trpri[trcd] < trpri[otrcd]) {
    # Same label, strictly better transcriber: replace the candidate.
    oget();
  }
}

END {
  if (abort >= 0) { exit abort; }
  # Flush the winner of the last group.
  oout();
}

function error(msg) {
  # Prints msg to the error output and terminates with exit status 1.
  # The END rule still runs after "exit"; it sees abort >= 0 and exits
  # at once, so the buffered record is not written.
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit abort;
}