#! /usr/bin/gawk -f
# Last edited on 2002-01-03 22:25:32 by stolfi

# Tabulates given counts and/or frequencies of element pairs and
# writes them to standard output as a LaTeX "tabular" environment.
#
# Each non-blank, non-comment input record must have exactly five fields:
#
#   COUNT ROWFREQ COLFREQ TOTFREQ ELEM1:ELEM2
#
# where ELEM1 and ELEM2 are EVA strings, already capitalized.
# Either element (or both) may be "+", meaning a row, column,
# or table total.
#
# The output rows and columns correspond to the elements given in the
# "rowList" and "colList" strings, or in the "rowTable" and "colTable"
# files.
#
# In the element lists, the special elements "/" and "~" are ignored,
# and the special element "-" inserts a rule at the corresponding row
# or column.  Occurrences of "endMarker" inside an element are replaced
# by {\remk} or {\cemk}; the "+" element becomes \rtot or \ctot.  Every
# other element becomes \rev{ELEM1} or \cev{ELEM2}, with the optional
# "elemPrefix" prepended.  A count that is numerically zero becomes
# \zeroct, otherwise \ct{COUNT}.  A frequency below "minFreq" becomes
# \zerofr, otherwise \fr{FREQ}.

BEGIN {
  # "abort" stays negative until an error handler sets an exit status.
  abort = -1;
  usage = ( "tex-format-elem-pair-freqs \\\n" \
    " { -v rowList='a,o,...' | -v rowTable=FILE } \\\n" \
    " { -v colList='a,o,...' | -v colTable=FILE } \\\n" \
    " [ -v elemPrefix=STRING ] \\\n" \
    " [ -v endMarker=STRING ] \\\n" \
    " [ -v minFreq=FRQ ] \\\n" \
    " [ -v freqDigits=NUM ] \\\n" \
    " [ -v showCounts=BOOL ] \\\n" \
    " [ -v showRowFreqs=BOOL ] \\\n" \
    " [ -v showColFreqs=BOOL ] \\\n" \
    " < INFILE.gpf > OUTFILE.tex" \
  );

  # Defaults for options that were not supplied with "-v":
  if (freqDigits == "")   { freqDigits = 2; }
  if (minFreq == "")      { minFreq = 0.005; }
  if (showColFreqs == "") { showColFreqs = 1; }
  if (showRowFreqs == "") { showRowFreqs = 1; }
  if (showCounts == "")   { showCounts = 1; }
  if (endMarker == "")    { endMarker = "_"; }

  # Row elements: exactly one of the two sources must be given.
  if ((rowList != "") == (rowTable != "")) {
    arg_error("must define exactly one of \"rowList\" and \"rowTable\"");
  }
  split("", rowElem);
  split("", rowIndex);
  split("", rowClass);
  nrowElems = (rowList != "" \
    ? parse_explicit_elems(rowList,rowElem,rowIndex,rowClass) \
    : load_elems_from_file(rowTable,rowElem,rowIndex,rowClass));

  # Column elements: same rule as for the rows.
  if ((colList != "") == (colTable != "")) {
    arg_error("must define exactly one of \"colList\" and \"colTable\"");
  }
  split("", colElem);
  split("", colIndex);
  split("", colClass);
  nrowElems = nrowElems;
  ncolElems = (colList != "" \
    ? parse_explicit_elems(colList,colElem,colIndex,colClass) \
    : load_elems_from_file(colTable,colElem,colIndex,colClass));

  # Pair data tables, indexed with the capitalized element pair:
  split("", pairCt);
  split("", rowFreq);
  split("", colFreq);
  npairs = 0;
}

# Stop reading input as soon as an error status has been recorded.
(abort >= 0) {
  exit abort;
}

# Skip blank lines and "#" comment lines.
/^ *([#]|$)/ {
  next;
}

# Data record: validate it and store its count and frequencies.
/./ {
  if (NF != 5) { data_error("bad line format"); }
  ct = $1;
  rfr = $2;
  cfr = $3;
  totfr = $4;
  pr = $5;
  npairs++;

  nitems = split(pr, item, ":");
  if (nitems != 2) { data_error("bad item format"); }

  # Each element must be either a lone "+" or a run of the allowed characters.
  for (k = 1; k <= 2; k++) {
    it = item[k];
    if (! ((it ~ /^[+]$/) || (it ~ /^[_A-Za-z?]+$/))) { data_error("bad elem"); }
  }

  ei = item[1];
  ej = item[2];
  pairCt[ei,ej] = ct;
  rowFreq[ei,ej] = rfr;
  colFreq[ei,ej] = cfr;
  next;
}

END {
  # An "exit" in BEGIN or in a rule still reaches END; propagate its status.
  if (abort >= 0) { exit abort; }
  print_elem_pair_freqs_table();
}

function print_elem_pair_freqs_table(    r, c, rowKey, colKey, colSpec)
{
  # Writes the whole LaTeX table: preamble, header row, and one table
  # row per element of "rowElem".  Extra parameters are local variables.

  printf "%% Created by tex-format-elem-pair-freqs\n";

  # Column spec contributed by one data column: one "r" per shown value.
  colSpec = ((showCounts ? "r" : "") (showRowFreqs ? "r" : "") (showColFreqs ? "r" : ""));

  # Table preamble:
  printf "\\begin{tabular}{|c|";
  for (c = 1; c <= ncolElems; c++) {
    colKey = colElem[c];
    if (colKey == "-") { printf "|"; continue; }
    if (colKey ~ /^[~\/]$/) { continue; }
    printf "%s", colSpec;
  }
  printf "|}\n";
  printf " \\hline\n";

  print_table_header();

  # Table entries:
  for (r = 1; r <= nrowElems; r++) {
    rowKey = rowElem[r];
    if (rowKey == "-") { printf " \\hline\n"; continue; }
    if (rowKey ~ /^[~\/]$/) { continue; }

    # Row label, then one group of cells per data column.
    printf " %-10s\n ", format_elem(rowKey,"r");
    for (c = 1; c <= ncolElems; c++) {
      colKey = colElem[c];
      if (colKey ~ /^[-~\/]$/) { continue; }
      if (showCounts)   { printf "& %10s ", format_count(pairCt[rowKey,colKey]); }
      if (showRowFreqs) { printf "& %10s ", format_freq(rowFreq[rowKey,colKey]); }
      if (showColFreqs) { printf "& %10s ", format_freq(colFreq[rowKey,colKey]); }
      printf "\n ";
    }
    printf "\\rstr\n \\\\\n";
  }

  printf " \\hline\n";
  printf "\\end{tabular}%%\n";
}

function print_table_header(    c, colKey, span, align, leftBar, rightBar)
{
  # Writes the header row: a "~" corner cell followed by one
  # \multicolumn cell per data column.

  printf " %-10s", "~";

  # Number of table columns spanned by each header cell:
  span = (showCounts ? 1 : 0) + (showRowFreqs ? 1 : 0) + (showColFreqs ? 1 : 0);

  # Center the header if the column has a single frequency value:
  align = ((showCounts || (span > 1)) ? "r" : "c");

  # The first header cell, and each one following a "-", gets a left bar.
  leftBar = "|";
  for (c = 1; c <= ncolElems; c++) {
    colKey = colElem[c];
    if (colKey == "-") { leftBar = "|"; continue; }
    if (colKey ~ /^[~\/]$/) { continue; }
    # Only the very last list position gets a right bar.
    if (c == ncolElems) { rightBar = "|"; } else { rightBar = ""; }
    printf "\n & %10s\\multicolumn{%d}{%s%s%s}{%s}", "", span, leftBar, align, rightBar, format_elem(colKey, "c");
    leftBar = "";
  }
  printf "\\cstr\n \\\\\n \\hline\n";
}

function format_elem(e,dir,    done, pos, mlen)
{
  # Returns the TeX markup for element "e"; "dir" is the letter that
  # prefixes the macro names (the callers pass "r" or "c").

  if (e == "+") { return ("\\" dir "tot"); }

  # Replace end markers by the appropriate TeX macro, left to right;
  # "done" collects the converted prefix, "e" holds what is left.
  mlen = length(endMarker);
  done = "";
  pos = index(e,endMarker);
  while (pos > 0) {
    done = (done substr(e,1,pos-1) "{\\" dir "emk}");
    e = substr(e,pos+mlen);
    pos = index(e,endMarker);
  }
  return ("\\" dir "ev{" elemPrefix done e "}");
}

function format_count(ct)
{
  # Returns \ct{N} for a count that is not numerically zero, else \zeroct.
  if (ct + 0 != 0) { return ("\\ct{" sprintf("%d", ct) "}"); }
  return "\\zeroct";
}

function format_freq(fr,    txt)
{
  # Returns \zerofr for a frequency below "minFreq"; otherwise \fr{...}
  # holding the value printed with "freqDigits" decimals and then trimmed.
  if (fr + 0 < minFreq) { return "\\zerofr"; }

  txt = sprintf("%*.*f", freqDigits+2, freqDigits, fr);
  # NOTE: "txt" is a string produced by sprintf, so this comparison is
  # made on strings, not numbers.  When it holds, the first freqDigits+1
  # characters are kept; otherwise the first character is dropped.
  txt = (txt >= 1.0 ? substr(txt, 1, freqDigits+1) : substr(txt,2));
  return ("\\fr{" txt "}");
}

function arg_error(msg)
{
  # Reports a command-line error plus the usage text on stderr and
  # terminates with status 1.
  printf("%s\n", msg) > "/dev/stderr";
  printf("usage: %s\n", usage) > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input error, tagged with the current record number of
  # the input file, on stderr and terminates with status 1.
  printf("line %d: %s\n", FNR, msg) > "/dev/stderr";
  abort = 1;
  exit 1;
}