#! /usr/bin/gawk -f
# Last edited on 2002-01-03 22:33:00 by stolfi

BEGIN {
  # "abort" is negative while no error has occurred; the error helpers
  # set it to the desired exit status so that the END rule can skip output.
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    " { -v elemList='a,o,...' | -v elemTable=FILE } \\\n" \
    " [ -v maxRepeat=NUM ] \\\n" \
    " [ -v minFreq=FRQ ] \\\n" \
    " [ -v freqDigits=NUM ] \\\n" \
    " [ -v showCounts=BOOL ] \\\n" \
    " [ -v showFreqs=BOOL ] \\\n" \
    " < INFILE.jfr > OUTFILE.tex" \
  );

  # Tabulates given counts and/or frequencies of symbols
  # and formats the output as a LaTeX table.
  # Assumes the input records have fields
  #
  #   TCOUNT TFREQ WCOUNT WFREQ GLYPH{REPEAT}
  #
  # where GLYPH is an EVA string, already capitalized,
  # REPEAT is an integer, TCOUNT and TFREQ are the
  # statistics of maximal GLYPH^REPEAT in the text (token sequence),
  # and WCOUNT WFREQ are the same for the lexicon (word set).
  # The output is formatted as a TeX table with GLYPH on the left,
  # then "maxRepeat" columns for token statistics, then
  # another "maxRepeat" columns for lexicon statistics.
  #
  # The output entries correspond to the glyphs listed in the
  # "elemList" string or in the "elemTable" file.  In these lists,
  # a "-" glyph inserts a horizontal line.
  # This special glyph may occur multiple times.

  # Defaults for options that were not given with "-v".
  if (maxRepeat == "") { maxRepeat = 4; }
  if (showCounts == "") { showCounts = 1; }
  if (showFreqs == "") { showFreqs = 1; }
  if (minFreq == "") { minFreq = 0.00005; }
  if (freqDigits == "") { freqDigits = 4; }

  if ((elemList == "") == (elemTable == "")) {
    arg_error("must define exactly one of \"elemList\" and \"elemTable\"");
  }

  # The table header emits "\multicolumn{N}" with N derived from these
  # settings; N = 0 would not be valid LaTeX, so reject such settings here.
  if (maxRepeat + 0 < 1) {
    arg_error("\"maxRepeat\" must be at least 1");
  }
  if ((! showCounts) && (! showFreqs)) {
    arg_error("must enable at least one of \"showCounts\" and \"showFreqs\"");
  }

  # Element list: elem[1..nelems] is read by the table printer; "eindex"
  # and "eclass" are only handed to the loader functions below.
  split("", elem);
  split("", eindex);
  split("", eclass);
  # NOTE(review): "parse_explicit_elems" and "load_elems_from_file" are not
  # defined in this file; presumably they come from another source file
  # loaded together with this one -- TODO confirm.
  if (elemList != "") {
    nelems = parse_explicit_elems(elemList,elem,eindex,eclass);
  } else {
    nelems = load_elems_from_file(elemTable,elem,eindex,eclass);
  }

  # indexed with the capitalized element and the repeat count:
  split("", etct);    # token counts
  split("", etfr);    # token frequencies
  split("", ewct);    # word (lexicon) counts
  split("", ewfr);    # word (lexicon) frequencies
  # indexed with the element only: largest repeat count seen in the input.
  split("", maxrep);
}

# Stop reading input as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Skip blank lines and "#" comment lines.
/^ *([#]|$)/ { next; }

# Data record: TCOUNT TFREQ WCOUNT WFREQ GLYPH{REPEAT}
/./ {
  if (NF != 5) { data_error("bad line format"); }
  tct = $1; tfr = $2; wct = $3; wfr = $4; er = $5;
  nread++;

  # Split the last field into the element "e" and the "{REPEAT}" suffix;
  # the match ends at the "{", so RLENGTH is the position of that brace.
  if (! match(er, /^[A-Za-z?]+[{]/)) {
    data_error(("bad elem/repeat format \"" er "\""));
  }
  e = substr(er, 1, RLENGTH-1);
  rep = substr(er, RLENGTH);
  if (rep !~ /^[{][0-9]+[}]$/) {
    data_error(("bad repeat format \"" rep "\""));
  }
  gsub(/[{}]/, "", rep);
  rep = rep + 0;
  if ((rep < 1) || (rep > maxRepeat)) {
    data_error(("bad repeat value \"" rep "\""));
  }
  if ((e,rep) in etct) {
    data_error(("repeated elem/repeat \"" er "\""));
  }

  # Track the largest repeat count seen for this element.
  if (! (e in maxrep)) { maxrep[e] = 0; }
  if (rep > maxrep[e]) { maxrep[e] = rep; }

  # Token (text) statistics.
  if (tct !~ /^[0-9]+$/) {
    data_error(("bad count format \"" tct "\""));
  }
  if (tfr !~ /^[0-9]*[.][0-9]*$/) {
    data_error(("bad freq format \"" tfr "\""));
  }
  etct[e,rep] = tct;
  etfr[e,rep] = tfr;

  # Word (lexicon) statistics.
  if (wct !~ /^[0-9]+$/) {
    data_error(("bad count format \"" wct "\""));
  }
  if (wfr !~ /^[0-9]*[.][0-9]*$/) {
    data_error(("bad freq format \"" wfr "\""));
  }
  ewct[e,rep] = wct;
  ewfr[e,rep] = wfr;
  next;
}

END {
  # "exit" in BEGIN or in a rule still runs END; do not print a table then.
  if (abort >= 0) { exit abort; }
  print_elem_freqs_table();
}

function print_elem_freqs_table(   i,e)
{
  # Writes the complete LaTeX "tabular" to standard output, with one row
  # per entry of elem[1..nelems]: a "-" entry becomes a horizontal rule,
  # and any other element gets a row only if its largest repeat count in
  # the input exceeds 1.  A progress trace of the elements written goes
  # to "/dev/stderr".
  printf "%% Created by tex-format-elem-rep-tw-freqs\n";
  output_table_preamble();
  output_table_header();
  printf "nelems = %d:", nelems > "/dev/stderr";
  for (i = 1; i <= nelems; i++) {
    e = elem[i];
    if (e == "-") {
      output_hline();
    } else if (maxrep[e] > 1) {
      printf " %s", e > "/dev/stderr";
      output_elem_name(format_elem(e));
      output_elem_counts(e, etct, etfr);
      output_elem_counts(e, ewct, ewfr);
      printf " \\str\\\\\n";
    }
  }
  printf "\n" > "/dev/stderr";
  output_hline();
  printf "\\end{tabular}%%\n";
}

function output_table_preamble(   col,rep)
{
  # Writes "\begin{tabular}{...}" and a top rule.  The column spec is one
  # "c" column for the glyph, then two groups (text, lexicon) of
  # "maxRepeat" cells, each cell having one "r" column per enabled
  # statistic (count and/or frequency).
  printf "\\begin{tabular}{";
  printf "|c";
  for (col = 1; col <= 2; col++) {
    printf "|";
    for (rep = 1; rep <= maxRepeat; rep++) {
      if (showCounts) { printf "r"; }
      if (showFreqs) { printf "r"; }
      printf "|";
    }
  }
  printf "}\n";
  output_hline();
}

function output_table_header(   nc,col,rep,vb)
{
  # Writes the two header rows: the first labels the "text" and "lexicon"
  # groups, the second labels each cell with its repeat count.
  # "nc" is the number of TeX columns per cell.
  nc = (showCounts ? 1 : 0) + (showFreqs ? 1 : 0);

  # First header line
  printf " ~\n";
  printf " & \\multicolumn{%d}{r||}{\\hd{text}} \n", nc*maxRepeat;
  printf " & \\multicolumn{%d}{r|}{\\hd{lexicon}} \n", nc*maxRepeat;
  printf " \\\\\n";
  printf " \\cline{2-%d}\n", 2*nc*maxRepeat+1;

  # Second header line
  printf " \\hd{glyph}\n";
  for (col = 1; col <= 2; col++) {
    for (rep = 1; rep <= maxRepeat; rep++) {
      # The last cell of the text group gets a double bar on its right.
      vb = ((col == 1) && (rep == maxRepeat) ? "|" : "");
      printf " & \\multicolumn{%d}{r|%s}{\\hd{%d}} \n", nc, vb, rep;
    }
  }
  printf " \\\\\n";
  output_hline();
}

function output_hline()
{
  # Writes a horizontal rule.
  printf " \\hline\n";
}

function output_elem_name(xe)
{
  # Writes the first cell of a row; "xe" is the already formatted name.
  printf " ";
  printf "%-10s ", xe;
}

function output_elem_counts(e,ect,efr,   rep,ct,fr,xct,xfr)
{
  # Writes the cells of element "e" for repeat counts 1..maxRepeat, taking
  # counts from "ect" and frequencies from "efr" (both indexed [e,rep]).
  # Pairs missing from the tables read as empty and format as zero.
  for (rep = 1; rep <= maxRepeat; rep++) {
    ct = ect[e,rep]; xct = format_count(ct);
    fr = efr[e,rep]; xfr = format_freq(fr);
    output_entry_count(xct, xfr);
  }
}

function output_entry_count(xct,xfr)
{
  # Writes one cell: the formatted count and/or frequency, as enabled.
  if (showCounts) { printf "& %10s ", xct; }
  if (showFreqs) { printf "& %10s ", xfr; }
}

function format_elem(e)
{
  # Returns the TeX markup for element "e"; "+" maps to "\tot".
  if (e == "+") {
    return ("\\tot");
  } else {
    return ("\\ev{" e "}");
  }
}

function format_count(ct)
{
  # Returns the TeX markup for count "ct"; zero (or empty) maps to "\zeroct".
  if (ct + 0 == 0) {
    return "\\zeroct";
  } else {
    return ("\\ct{" sprintf("%d", ct) "}");
  }
}

function format_freq(fr,   xfr)
{
  # Returns the TeX markup for frequency "fr".  Values below "minFreq" map
  # to "\zerofr"; other values are rounded to "freqDigits" decimals and cut
  # to "freqDigits+1" characters: ".dddd" below 1, "1.ddd" otherwise
  # (shown for freqDigits = 4).
  if (fr + 0 < minFreq) {
    return "\\zerofr";
  } else {
    xfr = sprintf("%*.*f", freqDigits+2, freqDigits, fr);
    # Test the rounded text, so that a value that rounds up to 1 keeps its
    # integer digit.  The "+ 0" forces a numeric comparison; the result of
    # sprintf is a string and would otherwise be compared as text.
    if (xfr + 0 >= 1.0) {
      xfr = substr(xfr, 1, freqDigits+1);
    } else {
      xfr = substr(xfr, 2, freqDigits+1);
    }
    return ("\\fr{" xfr "}");
  }
}

function arg_error(msg)
{
  # Reports a command line error and the usage text on "/dev/stderr",
  # then terminates with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input error, tagged with the current record number, on
  # "/dev/stderr", then terminates with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}