#! /usr/bin/gawk -f
# Last edited on 2002-01-03 22:22:38 by stolfi

BEGIN {
  # "abort" is negative while no error has occurred; arg_error and
  # data_error set it to the exit status before exiting.
  abort = -1;
  usage = ( "tex-format-elem-tw-freqs \\\n" \
    " { -v elemList='a,o,...' | -v elemTable=FILE } \\\n" \
    " [ -v ncols=NUM ] \\\n" \
    " [ -v minFreq=FRQ ] \\\n" \
    " [ -v freqDigits=NUM ] \\\n" \
    " [ -v showClasses=BOOL ] \\\n" \
    " [ -v showCounts=BOOL ] \\\n" \
    " [ -v showFreqs=BOOL ] \\\n" \
    " < INFILE.jfr > OUTFILE.tex" \
  );

  # Tabulates given counts and/or frequencies of symbols
  # and formats the output as a LaTeX table.
  # Assumes the input records have fields
  #
  #   TCOUNT TFREQ WCOUNT WFREQ GLYPH
  #
  # where GLYPH is an EVA string, already capitalized;
  # TCOUNT and TFREQ are the statistics of GLYPH among the tokens;
  # and WCOUNT and WFREQ are the statistics among the words.
  # The output is formatted as `ncols' columns, filled row-wise.
  #
  # The output entries correspond to the glyphs listed in the
  # "elemList" string or in the "elemTable" file.  In these lists,
  #   if GLYPH = "~", the entry is left blank.
  #   if GLYPH = "/", the current row is padded with blanks.
  #   if GLYPH = "-", does the same, then inserts a horizontal line
  #     under all the entry columns (but not under the class column).
  # These special glyphs may occur multiple times.
  #
  # If showClasses is TRUE, also prints the element's class
  # at the leftmost column.  This option is effective only
  # when the elements are read from a file.

  # Defaults for the options that were not given with "-v":
  if (ncols == "") { ncols = 2; }
  if (showCounts == "") { showCounts = 1; }
  if (showFreqs == "") { showFreqs = 1; }
  if (showClasses == "") { showClasses = ( elemTable != ""); }
  if (minFreq == "") { minFreq = 0.00005; }
  if (freqDigits == "") { freqDigits = 4; }

  # Exactly one of the two element sources must be specified:
  if ((elemList == "") == (elemTable == "")) {
    arg_error("must define exactly one of \"elemList\" and \"elemTable\"");
  }

  # Element list: elem[1..nelems] are the elements, in output order.
  # NOTE(review): parse_explicit_elems and load_elems_from_file are not
  # defined in this file; presumably they come from a library loaded with
  # another "-f" option and also set the global "hasclass" - TODO confirm.
  split("", elem);
  split("", eindex);
  split("", eclass);
  if (elemList != "") {
    nelems = parse_explicit_elems(elemList,elem,eindex,eclass);
  } else {
    nelems = load_elems_from_file(elemTable,elem,eindex,eclass);
  }
  if (showClasses && (! hasclass)) {
    arg_error("there are no classes to show");
  }

  # Element counts and freqs, for tokens and words.
  # Indexed with the capitalized element itself.
  split("", etCt);
  split("", ewCt);
  split("", etFr);
  split("", ewFr);
}

# Stop reading as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Skip blank lines and "#" comment lines.
/^ *([#]|$)/ { next; }

# Data record: TCOUNT TFREQ WCOUNT WFREQ GLYPH
/./ {
  if (NF != 5) { data_error("bad line format"); }
  tCt = $1; tFr = $2;
  wCt = $3; wFr = $4;
  e = $5;
  nread++;
  # A lone "+" is accepted too, because format_elem renders the
  # element "+" as the totals entry.
  if (e !~ /^([A-Za-z?]+|[+])$/) { data_error(("bad elem \"" e "\"")); }
  if (e in etCt) { data_error(("repeated elem \"" e "\"")); }
  etCt[e] = tCt; etFr[e] = tFr;
  ewCt[e] = wCt; ewFr[e] = wFr;
  next;
}

END {
  if (abort >= 0) { exit abort; }
  print_elem_freqs_table();
}

# Writes the whole LaTeX "tabular" to standard output: preamble,
# header row, and one entry per element of elem[1..nelems], filling
# rows of "ncols" entries each.  Also writes a trace of the elements
# processed to "/dev/stderr".  All parameters are local variables.
function print_elem_freqs_table( \
    i,col,row,oldrow,hline,cline,e,tCt,tFr,wCt,wFr,\
    cl,oldcl,xe,xtCt,xtFr,xwCt,xwFr,xcl \
  )
{
  printf "%% Created by %s\n", ARGV[0];

  # Table preamble
  output_table_preamble();
  row = 0;

  # Column headers
  end_row(row, 1, 0);
  output_table_header();
  row++;

  # Table entries:
  oldcl = "";
  col = ncols+1;   # Pretend the header row is full, to force a new row.
  hline = 1;
  cline = 0;
  printf "nelems = %d:", nelems > "/dev/stderr";
  # The loop continues past "nelems" with blank entries until the
  # last row is complete.
  for (i = 1; ((i <= nelems) || (col <= ncols)); i++) {
    # Assert: col > 1.
    e = (i <= nelems ? elem[i] : "~");
    printf " %s", e > "/dev/stderr";
    # Obtain element data:
    if (e ~ /^[-\/]$/) {
      # Row break: pad the current row with blank entries.
      while (col <= ncols) {
        output_entry(col, "", "", "", "", "");
        col++;
      }
      # "-" also requests a rule below the row; the count is in table
      # entries, and end_row converts it to LaTeX columns.
      if (e == "-") { cline = ncols; }
    } else {
      if (e == "~") {
        # Blank entry; it stays in the current class.
        cl = oldcl;
        tCt = 0; tFr = 0;
        wCt = 0; wFr = 0;
        xe = ""; xcl = "";
        xtCt = ""; xtFr = "";
        xwCt = ""; xwFr = "";
      } else {
        cl = eclass[i];
        tCt = etCt[e]; tFr = etFr[e];
        wCt = ewCt[e]; wFr = ewFr[e];
        # Format values
        if (showClasses && (cl != oldcl)) {
          # A new class starts on a fresh row, below a full rule.
          xcl = ("\\cl{" cl "}");
          while (col <= ncols) {
            output_entry(col, "", "", "", "", "");
            col++;
          }
          hline = 1;
          cline = 0;
        } else {
          xcl = "";
        }
        xe = format_elem(e);
        xtCt = format_count(tCt);
        xwCt = format_count(wCt);
        xtFr = format_freq(tFr);
        xwFr = format_freq(wFr);
      }
      # Print element entry
      if (col > ncols) {
        end_row(row, hline, cline);
        row++;
        col = 1;
        hline = 0;
        cline = 0;
      }
      # Assert: col <= ncols
      output_entry(col, xe, xtCt, xtFr, xwCt, xwFr, xcl);
      col++;
      oldcl = cl;
    }
  }
  printf "\n" > "/dev/stderr";
  end_row(row, 1, 0);
  printf "\\end{tabular}%%\n";
}

# Returns the number of LaTeX columns taken by one table entry:
# the glyph, plus the enabled statistics for the text and again
# for the lexicon.
function entry_width(   nc)
{
  nc = (showCounts ? 1 : 0) + (showFreqs ? 1 : 0);
  return 1 + 2*nc;
}

# Writes the "\begin{tabular}{...}" line, with one optional class
# column followed by "ncols" groups of entry columns.
function output_table_preamble(   col,nc)
{
  nc = (showCounts ? 1 : 0) + (showFreqs ? 1 : 0);
  printf "\\begin{tabular}{";
  if (showClasses) { printf "|c"; }
  for (col = 1; col <= ncols; col++) {
    printf "|c|";
    if (nc > 0) {
      # Text statistics, then lexicon statistics.
      if (showCounts) { printf "r"; }
      if (showFreqs) { printf "r"; }
      printf "|";
      if (showCounts) { printf "r"; }
      if (showFreqs) { printf "r"; }
      printf "|";
    }
  }
  printf "}\n";
}

# Writes the cells of the header row, without the row terminator
# (that is written by the next call to end_row).
function output_table_header(   col,nc,xb)
{
  nc = (showCounts ? 1 : 0) + (showFreqs ? 1 : 0);
  if (showClasses) { printf "\\hd{class} &\n "; }
  for (col = 1; col <= ncols; col++) {
    if (col != 1) { printf "&\n"; }
    printf " ";
    printf "\\hd{glyph} ";
    if (nc > 0) {
      # Entries other than the last get an extra "|", matching the
      # doubled bar that the preamble puts between entry groups.
      xb = ( col < ncols ? "|" : "");
      printf "& \\multicolumn{%d}{c|}{\\hd{text}}", nc;
      printf "& \\multicolumn{%d}{c|%s}{\\hd{lexicon}}", nc, xb;
    }
  }
}

# Returns the LaTeX text for element "e"; the element "+" becomes
# the "\tot" macro.
function format_elem(e)
{
  if (e == "+") {
    return ("\\tot");
  } else {
    return ("\\ev{" e "}");
  }
}

# Returns the LaTeX text for the count "ct"; a zero (or empty)
# count becomes the "\zeroct" macro.
function format_count(ct)
{
  if (ct + 0 == 0) {
    return "\\zeroct";
  } else {
    return ("\\ct{" sprintf("%d", ct) "}");
  }
}

# Returns the LaTeX text for the frequency "fr", using "freqDigits+1"
# characters; a frequency below "minFreq" becomes the "\zerofr" macro.
function format_freq(fr)
{
  if (fr + 0 < minFreq) {
    return "\\zerofr";
  } else {
    fr = sprintf("%*.*f", freqDigits+2, freqDigits, fr);
    # The test is made on the formatted text, so that a value that
    # rounds up to 1 is handled as 1.  Since "fr" is now a string,
    # this is a string comparison against "1".
    if (fr >= 1.0) {
      # Keep the integer digit and drop the last decimal digit.
      fr = substr(fr, 1, freqDigits+1);
    } else {
      # Drop the leading "0".
      fr = substr(fr,2, freqDigits+1);
    }
    return ("\\fr{" fr "}");
  }
}

# Terminates the current row (if "row" is positive) and writes the
# rule that goes below it: a full "\hline" if "hline" is true, else
# a "\cline" spanning the first "cline" table entries if "cline" is
# positive.  The "\cline" never includes the class column.
function end_row(row,hline,cline,   fcol,ncells)
{
  if (row > 0) { printf "\\str\\\\\n"; }
  if (hline) {
    printf " \\hline\n";
  } else if (cline > 0) {
    fcol = 1 + (showClasses ? 1 : 0 );
    # Convert from table entries to LaTeX columns.
    ncells = cline * entry_width();
    printf " \\cline{%d-%d}\n", fcol, fcol + ncells - 1;
  }
}

# Writes one table entry in entry column "col": the glyph text "xe"
# and the enabled statistics.  The class text "xcl" is written
# only before the first entry of a row, when classes are shown.
function output_entry(col,xe,xtCt,xtFr,xwCt,xwFr,xcl)
{
  if (col != 1) { printf "&\n"; }
  printf " ";
  if ((col == 1) && showClasses) { printf "%10s &\n ", xcl; }
  printf "%-10s ", xe;
  if (showCounts) { printf "& %10s ", xtCt; }
  if (showFreqs) { printf "& %10s ", xtFr; }
  if (showCounts) { printf "& %10s ", xwCt; }
  if (showFreqs) { printf "& %10s ", xwFr; }
}

# Reports a command line error and the usage message to
# "/dev/stderr", then exits with status 1.
function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

# Reports an error in the current input line to "/dev/stderr",
# then exits with status 1.
function data_error(msg)
{
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}