#! /usr/bin/gawk -f
# Last edited on 2012-05-05 14:36:07 by stolfilocal

BEGIN {
  abort = -1;
  usage = ( "tex-format-word-freqs-by-section \\\n" \
    " [ -v showCounts=BOOL ] \\\n" \
    " [ -v showFreqs=BOOL ] \\\n" \
    " < INFILE.wct > OUTFILE.tex" \
  );

  # Tabulates the counts and/or frequencies of words by section, as a
  # LaTeX table.  Assumes the input records have fields
  #
  #   SECTAG COUNT FREQ WORD
  #
  # where WORD is in EVA, and SECTAG is a section tag like "bio.1" or
  # "txt.n".  Lines that are blank or start with "#" are ignored.
  #
  # The output is a `tabular' environment with one group of columns per
  # section, in the order the section tags are first seen in the input.
  # Each group holds an optional count column (showCounts), an optional
  # frequency column (showFreqs), and the word column.  Both options
  # default to true when not set with "-v".

  if (showCounts == "") { showCounts = 1; }
  if (showFreqs == "") { showFreqs = 1; }

  # These arrays are indexed by s = [0..ns-1]:
  split("", nw);     # Number of words stored for section s.
  split("", tag);    # Tag of section s.

  # These arrays are indexed with the section tag:
  split("", sindex); # Section index s of each tag.

  # These arrays are indexed by s = [0..ns-1] and r = [0..nw[s]-1]:
  split("", wct);    # Count field of entry r of section s.
  split("", wfr);    # Frequency field of entry r of section s.
  split("", wrd);    # Word field of entry r of section s.

  ns = 0;    # Number of distinct sections seen so far.
  nrows = 0; # Max of nw[s] over all sections = number of table rows.
}

# Stop processing records once an error has been flagged:
(abort >= 0) { exit abort; }

# Skip comment lines and blank lines:
/^ *([\#]|$)/ { next; }

# Data record: append the entry to the list of its section.
/./ {
  if (NF != 4) { data_error(("bad line format = |" $0 "|")); }
  tg = $1; ct = $2; fr = $3; w = $4;

  # Reject words containing TeX-special characters that are not
  # protected by a backslash.  The "(^|...)" alternative also catches
  # a special character in the first position of the word.
  if (match(w, /(^|[^\\])[$&% _\^#]/)) {
    data_error(("bad word \"" w "\" - has \"" substr(w,RSTART,RLENGTH) "\""));
  }

  # Get the section index, allocating a new one for an unseen tag:
  if (! (tg in sindex)) {
    s = ns; ns++;
    tag[s] = tg; nw[s] = 0; sindex[tg] = s;
  } else {
    s = sindex[tg];
  }

  # Store the entry in the next free row of that section:
  r = nw[s]; nw[s]++;
  wct[s,r] = ct; wfr[s,r] = fr; wrd[s,r] = w;
  if (nrows < nw[s]) { nrows = nw[s]; }
  next;
}

END {
  if (abort >= 0) { exit abort; }
  print_word_table();
}

function print_word_table(    s,r,nspan,bar,w,ct,fr,xw,xct,xfr)
{
  # Writes to standard output the LaTeX table for the entries stored in
  # the global arrays {tag,nw,wct,wfr,wrd}, with {ns} column groups and
  # {nrows} rows.  The output uses the macros \ct, \fr, \ev and \str,
  # which must be defined by the enclosing LaTeX document.

  printf "%% Created by %s\n", ARGV[0];

  # Table header:
  printf "\\begin{tabular}{|";
  for (s = 0; s < ns; s++) {
    if (showCounts) { printf "r"; }
    if (showFreqs) { printf "r"; }
    printf "l|";
  }
  printf "} \\hline\n";

  # Column headers (one \multicolumn per section, spanning its group):
  nspan = (showCounts ? 1 : 0) + (showFreqs ? 1 : 0) + 1;
  printf " ";
  for (s = 0; s < ns; s++) {
    if (s > 0) { printf " &\n "; }
    # Only the last header cell needs an explicit right-hand rule:
    bar = (s == ns-1 ? "|" : "");
    printf "\\multicolumn{%d}{|c%s}{{\\tt %s}}", nspan, bar, tag[s];
  }
  printf " \\\\ \\hline\n";

  # Table entries:
  for (r = 0; r < nrows; r++) {
    printf " ";
    for (s = 0; s < ns; s++) {
      if (r < nw[s]) {
        w = wrd[s,r]; ct = wct[s,r]; fr = wfr[s,r];
        xct = ("\\ct{" sprintf("%d", ct) "}");
        # Drop the leading "0" of fractions ("0.123" -> ".123"), but
        # keep the integer part when it is not zero ("1.000"):
        xfr = sprintf("%5.3f", fr);
        sub(/^0\./, ".", xfr);
        xfr = ("\\fr{" xfr "}");
        xw = ("\\ev{" w "}");
      } else {
        # Section {s} has no entry for this row; pad with empty cells.
        # The first row past the end of the section gets "\dots".
        # NOTE(review): the original test was "i == nw+1", with {i}
        # undefined and {nw} an array; "r == nw[s]" is presumably the
        # intended condition - confirm.
        xct = ""; xfr = "";
        xw = (r == nw[s] ? "\\dots" : "");
      }
      printf " ";
      if (showCounts) { printf "%10s &", xct; }
      if (showFreqs) { printf "%10s &", xfr; }
      printf "%-10s ", xw;
      if (s == ns-1) {
        printf "\\str\\\\\n";
      } else {
        printf "&\n ";
      }
    }
  }
  printf " \\hline\n";
  printf "\\end{tabular}%%\n";
}

function arg_error(msg)
{
  # Reports a command line error {msg} plus the usage text to stderr,
  # and terminates the script with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input data error {msg}, tagged with the current line
  # number, to stderr, and terminates the script with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}