#! /usr/bin/gawk -f
# Last edited on 2001-12-28 21:02:27 by stolfi

# count-elems: tallies element occurrences in a word-count file.
#
# Assumes the input records have fields
#
#   COUNT WORD
#
# where WORD is plain EVA, with capitalized ligatures and elements
# marked off by {}; and COUNT is its token count.
#
# If "joinRepeats" is false, outputs a set of lines in the format
#
#   TOTCOUNT ELEM
#
# where ELEM is an element (without the braces), and TOTCOUNT is the
# total number of occurrences of ELEM in all input words, multiplied
# by the respective COUNTs.
#
# If "joinRepeats" is true, counts each maximal element repeat
# as a distinct element: so "{a}{b}{b}{a}{c}{c}{c}" has 4 maximal
# repeats, "{a}", "{b}{b}", "{a}", and "{c}{c}{c}".
# In that case, the repeat count, in braces, is appended
# to the ELEM field.
#
# The list of elements is specified either directly, through the
# `elemList' parameter, or through a file named in the `elemTable'
# parameter.  In the first case the elements should be separated by
# commas.  In the second case, each element must be the first field
# in a separate line ("#" lines excluded).  In either case, the
# elements must be capitalized as in the input, without braces.
#
# The special elements "~", "/", "+", "-" may occur multiple
# times in the list, and are ignored.
#
# The output will contain only the specified elements, in the
# specified sequence.
#
# NOTE(review): parse_explicit_elems() and load_elems_from_file() are
# called below but are not defined in this part of the file; presumably
# they are supplied by another source loaded alongside it -- confirm.

BEGIN {
  # "abort" stays negative until one of the *_error functions sets it;
  # the rules below and the END block test it to stop early.
  abort = -1;
  usage = ( "count-elems" " \\\n" \
    " { -v elemList='a,o,...' | -v elemTable=FILE } \\\n" \
    " [ -v showBadWords=BOOL ] \\\n" \
    " [ -v joinRepeats=BOOL ] \\\n" \
    " < INFILE.wct > OUTFILE.tex" \
  );

  # Optional boolean parameters default to false.
  if (showBadWords == "") { showBadWords = 0; }
  if (joinRepeats == "") { joinRepeats = 0; }

  # Exactly one of the two element sources must be given.
  if ((elemList == "") == (elemTable == "")) {
    arg_error("must define exactly one of \"elemList\" and \"elemTable\"");
  }

  # Start from empty arrays: elem[1..nelems] is the requested sequence,
  # eindex is tested with "in" to recognize requested elements, and
  # eclass is only handed to the loader (it is not read in this file).
  split("", elem);
  split("", eindex);
  split("", eclass);
  if (elemList != "") {
    nelems = parse_explicit_elems(elemList,elem,eindex,eclass);
  } else {
    nelems = load_elems_from_file(elemTable,elem,eindex,eclass);
  }

  # indexed with the capitalized element itself (plus repeat count):
  split("", ect);

  # maxrep[e] = longest run of element e seen so far (joinRepeats only).
  if (joinRepeats) { split("", maxrep); }

  # Weighted tallies of listed (ngud) and unlisted (nbad) elements.
  ngud = 0;
  nbad = 0;
}

# Stop reading input as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Skip blank lines and "#" comment lines.
/^ *([#]|$)/ { next; }

# Data record: COUNT WORD.
/./ {
  if (NF != 2) { data_error("bad line format"); }
  ct = $1;
  w = $2;
  if (w !~ /^[{}a-zA-Z?]+$/) { data_error("bad word"); }

  # split word into elements:
  gsub(/[}][{]/, "} {", w);
  ne = split(w, welem, " ");

  rep = 0;  # length of the current run of identical elements
  for (i = 1; i <= ne; i++) {
    e = welem[i];
    if (e !~ /^[{][a-zA-Z?]+[}]$/) { data_error(("bad elem \"" e "\"")); }
    gsub(/[{}]/, "", e);

    if (e in eindex) {
      ngud += ct;
    } else {
      nbad += ct;
      # Elements containing "?" are counted as bad but not echoed.
      if (showBadWords && (e !~ /[?]/)) {
        printf " %5d %-5s %s\n", ct, e, $2 > "/dev/stderr";
      }
    }

    if (joinRepeats) {
      if (! (e in maxrep)) { maxrep[e] = 0; }
      rep++;
      # A run ends at the last element or when the next one differs
      # (welem[] still holds the braced forms, so this compares those).
      if ((i == ne) || (welem[i+1] != welem[i])) {
        er = ( e "{" rep "}" );
        if (! (er in ect)) { ect[er] = 0; }
        ect[er] += ct;
        if (rep > maxrep[e]) { maxrep[e] = rep; }
        rep = 0;
      }
    } else {
      if (! (e in ect)) { ect[e] = 0; }
      ect[e] += ct;
    }
  }
  next;
}

END {
  if (abort >= 0) { exit abort; }

  printf "%d valid elems found.\n", ngud > "/dev/stderr";

  if (nbad > 0) {
    printf "extraneous elems found:\n" > "/dev/stderr";
    for (e in ect) {
      # With joinRepeats the keys of ect look like "ELEM{REP}", while
      # eindex is tested with bare element names; strip the repeat
      # suffix before the lookup so that listed elements are not
      # reported as extraneous.
      base = e;
      if (joinRepeats) { sub(/[{][0-9]+[}]$/, "", base); }
      if (! (base in eindex)) {
        printf " %-5s %5d\n", e, ect[e] > "/dev/stderr";
      }
    }
  }

  # Emit the counts in the order given by the element list, skipping
  # the special separator elements "-", "~", "+", "/".
  for (i = 1; i <= nelems; i++) {
    e = elem[i];
    if (e !~ /^[-~+\/]$/) {
      if (joinRepeats) {
        # One line per run length from 1 up to the longest run seen;
        # run lengths that never occurred print a count of 0.
        for (rep = 1; rep <= maxrep[e]; rep++) {
          er = ( e "{" rep "}" );
          ct = ect[er];
          printf "%7d %s\n", ct, er;
        }
      } else {
        ct = ect[e];
        printf "%7d %s\n", ct, e;
      }
    }
  }
}

# Reports a command-line parameter error plus the usage text on stderr,
# flags the abort state, and exits with status 1.
function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports an input-data error, tagged with the current record number
# (FNR), on stderr; flags the abort state and exits with status 1.
function data_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports an element-table error on stderr, flags the abort state, and
# exits with status 1.  Not called in this file; presumably used by the
# table loader -- confirm.
function table_error(msg)
{
  printf "error in elemsTable: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}