#! /usr/bin/gawk -f
# Last edited on 2001-12-29 01:29:49 by stolfi

BEGIN {
  # `abort' holds the pending exit status; -1 means "no error so far".
  abort = -1;
  usage = ( "count-elem-pairs \\\n" \
    " { -v rowList='a,o,...' | -v rowTable=FILE } \\\n" \
    " { -v colList='a,o,...' | -v colTable=FILE } \\\n" \
    " [ -v endMarker=STRING ] \\\n" \
    " [ -v showBadWords=BOOL ] \\\n" \
    " < INFILE.wct > OUTFILE.tex" \
  );

  # Assumes the input records have fields
  #
  #   COUNT WORD
  #
  # where WORD is plain EVA, with capitalized ligatures and elements
  # marked off by {}; and COUNT is its token count.  Outputs the
  # element pair counts in those tokens, in the format
  #
  #   COUNT ELEM1:ELEM2
  #
  # The lists of valid elements for ELEM1 and ELEM2
  # are specified either directly, through the
  # `rowList' and `colList' parameters, or through files named by
  # the `rowTable' and `colTable'
  # parameters.  In the first case the elements should be separated by
  # commas.  In the second case, each element must be the first field
  # in a separate line ("#" lines excluded).  In either case, the
  # elements must be capitalized as in the input, without braces.
  #
  # The "endMarker" string (default "_") is implicitly prefixed and postfixed to
  # every word, and must appear in the element list.
  #
  # The special elements "/" "~" "+" and "-" are ignored; they
  # may appear multiple times in the element list.
  #
  # The output will contain only the pairs where both elements
  # are in the list, in the specified sequence.

  if (endMarker == "") { endMarker = "_"; }
  if (showBadWords == "") { showBadWords = 0; }

  # indexed with the capitalized element itself:
  split("", ect);

  # indexed with the capitalized element pair:
  split("", eect);

  # Row (first-of-pair) elements.  Exactly one of the two sources must be given.
  # NOTE(review): parse_explicit_elems and load_elems_from_file are not
  # defined in this part of the file; here their result is used as the
  # number of entries in rowElem[1..n], and rowIndex is only tested for
  # membership.  Confirm the rest of their contract at their definition.
  if ((rowList == "") == (rowTable == ""))
    { arg_error("must define exactly one of \"rowList\" and \"rowTable\""); }
  split("", rowElem);
  split("", rowIndex);
  split("", rowClass);
  if (rowList != "")
    { nrowElems = parse_explicit_elems(rowList,rowElem,rowIndex,rowClass); }
  else
    { nrowElems = load_elems_from_file(rowTable,rowElem,rowIndex,rowClass); }

  # Column (second-of-pair) elements, handled the same way.
  if ((colList == "") == (colTable == ""))
    { arg_error("must define exactly one of \"colList\" and \"colTable\""); }
  split("", colElem);
  split("", colIndex);
  split("", colClass);
  if (colList != "")
    { ncolElems = parse_explicit_elems(colList,colElem,colIndex,colClass); }
  else
    { ncolElems = load_elems_from_file(colTable,colElem,colIndex,colClass); }

  # pairCt[ei,ej]: total count of the pair; rowCt[ei] / colCt[ej]: total
  # count of each element seen in first / second position of a pair.
  split("", pairCt);
  split("", rowCt);
  split("", colCt);

  # Token counts of pair positions whose element is (Gud) or is not (Bad)
  # in the corresponding element list.
  nrowGud = 0; ncolGud = 0;
  nrowBad = 0; ncolBad = 0;
}

# Stop reading input as soon as an error status is pending.
(abort >= 0) { exit abort; }

# Skip blank lines and "#" comment lines.
/^ *([#]|$)/ { next; }

# Main rule: tally every adjacent element pair of one "COUNT WORD" record.
/./ {
  if (NF != 2) { data_error("bad line format"); }
  ct = $1;
  w = $2;
  # A non-numeric count would otherwise be tallied silently as zero.
  if (ct !~ /^[0-9]+([.][0-9]*)?$/) { data_error(("bad count \"" ct "\"")); }
  if (w !~ /^[{}a-zA-Z?]+$/) { data_error(("bad word \"" w "\"")); }

  # split word into elements:
  gsub(/[}][{]/, "} {", w);
  ne = split(w, welem, " ");

  # The word is implicitly bracketed by endMarker on both sides.
  prev = endMarker;
  for (i = 1; i <= ne; i++)
    { e = welem[i];
      if (e !~ /^[{][a-zA-Z?]+[}]$/) { data_error(("bad elem \"" e "\"")); }
      gsub(/[{}]/, "", e);
      tally_pair(prev, e, ct, $2);
      prev = e;
    }
  tally_pair(prev,endMarker,ct,$2);
  next;
}

function tally_pair(ei,ej,ct,worig)
{
  # Adds `ct' occurrences of the element pair (ei,ej) to the global tables.
  #   ei, ej  first and second element of the pair (braces removed).
  #   ct      token count of the word the pair came from.
  #   worig   the original word, used only in the "bad element" diagnostics.
  # Updates rowCt, colCt, pairCt and the nrow/ncol Gud/Bad totals.  When
  # `showBadWords' is true, each element missing from its list is reported
  # on stderr together with the word, unless the element contains "?".

  if (! (ei in rowCt)) { rowCt[ei] = 0; }
  rowCt[ei] += ct;
  if (ei in rowIndex)
    { nrowGud += ct; }
  else
    { nrowBad += ct;
      if (showBadWords && (ei !~ /[?]/))
        { printf " %5d %-5s %s\n", ct, (ei ":"), worig > "/dev/stderr"; }
    }

  if (! (ej in colCt)) { colCt[ej] = 0; }
  colCt[ej] += ct;
  if (ej in colIndex)
    { ncolGud += ct; }
  else
    { ncolBad += ct;
      if (showBadWords && (ej !~ /[?]/))
        { printf " %5d %-5s %s\n", ct, (":" ej), worig > "/dev/stderr"; }
    }

  pairCt[ei,ej] += ct;
}

END {
  # An `exit' in BEGIN or in a rule still runs END; re-issue the pending
  # status here so that no table is written after an error.
  if (abort >= 0) { exit abort; }

  # Summarize the elements that were seen but are not in the lists.
  if (nrowBad > 0)
    { printf "extraneous row elems found:\n" > "/dev/stderr";
      for (ei in rowCt)
        { if (! (ei in rowIndex))
            { printf " %-5s %7d\n", ei, rowCt[ei] > "/dev/stderr"; }
        }
    }
  if (ncolBad > 0)
    { printf "extraneous col elems found:\n" > "/dev/stderr";
      for (ej in colCt)
        { if (! (ej in colIndex))
            { printf " %-5s %7d\n", ej, colCt[ej] > "/dev/stderr"; }
        }
    }

  # Write one line per (row,col) pair, in list order, including pairs
  # that never occurred; the special elements "/" "~" "+" "-" are skipped.
  for (i = 1; i <= nrowElems; i++)
    { ei = rowElem[i];
      if (ei !~ /^[-+~\/]$/)
        { for (j = 1; j <= ncolElems; j++)
            { ej = colElem[j];
              if (ej !~ /^[-+~\/]$/)
                { ct = pairCt[ei,ej];
                  printf "%7d %s:%s\n", ct, ei,ej;
                }
            }
        }
    }
}

function arg_error(msg)
{
  # Reports a command-line parameter error plus the usage text on stderr,
  # then terminates with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an error in the current input record (by line number) on
  # stderr, then terminates with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function table_error(msg)
{
  # Reports an error in an element table on stderr, then terminates with
  # status 1.  Not called in the visible part of the file.
  printf "error in elemsTable: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}