#! /usr/bin/gawk -f
# Last edited on 2005-02-02 05:39:36 by stolfi

# Counts the number of non-comment, non-filler, non-space
# bytes per text unit and transcriber, in an EVT-format file.
#
# Counters are kept at three levels {k}:
#   0 = whole input (printed as "TOTAL"),
#   1 = current page,
#   2 = current page.unit.

BEGIN {
  abort = -1;   # Negative means "no error"; otherwise the exit status.
  usage = ( ARGV[0] "\\\n" \
    " -v scribes=STRING \\\n" \
    " -v countChars=BOOL \\\n" \
    " -v countLines=BOOL \\\n" \
    " -v perPage=BOOL \\\n" \
    " -v perUnit=BOOL \\\n" \
    " [<] INFILE > TOTALS \\\n" \
  );

  if (scribes == "") { arg_error("must define \"scribes\""); }

  # Defaults are for backwards compatibility.
  if (perPage == "") { perPage = 0; }
  if (perUnit == "") { perUnit = 1; }
  if (countLines == "") { countLines = 0; }
  if (countChars == "") { countChars = 1; }

  # If splitting at units, also split at pages:
  if (perUnit) { perPage = 1; }

  # If counting per pages or units, use horizontal format:
  horFmt = (perUnit || perPage);

  split("", guy);   # {guy[0..nguys-1]} are the transcribers seen.
  split("", seen);  # {seen[t]=1} if {t} has been registered already.
  nguys = 0;

  # Register each character of {scribes} as a transcriber code, in order:
  len = length(scribes);
  for (i = 1; i <= len; i++) { cert_guy(substr(scribes, i, 1)); }

  cur_p = "*";    # Current page, or "*" if none yet.
  cur_u = "*.*";  # Current page.unit; "*.*" if no page yet, and
                  # page ".*" while the current locator has no unit field.

  split("", ct);  # {ct[k,t]} is the level-k char count for transcriber {t}.
  split("", rt);  # {rt[k,t]} is the level-k line count for transcriber {t}.
  split("", ng);  # {ng[k]} is the level-k total line count.
  clr(2); clr(1); clr(0);

  if (horFmt) { head_hor(); dash_hor(); }
}

# An error was flagged earlier: stop reading input.
(abort >= 0) { exit abort; }

/^[ ]*$/ {
  # Blank line, skip:
  next;
}

/^[\#]/ {
  # Comment line, skip:
  next;
}

/^[<][^<>; ]*[>] *[{][^{}]*[}] *$/ {
  # Page/unit/locus header line:
  check_locus($1);
  next;
}

/^<[^<>; ]*[;][A-Z][>]/ {
  # Split into locator {loc} and text {txt}:
  loc = $1;
  len = length(loc);
  txt = substr($0, len+1);   # NOTE(review): {txt} is not used below.

  # Remove transcriber code {t} from {loc}
  # (the locator ends with ";" {t} ">"):
  t = substr(loc, len-1, 1);
  loc = ( substr(loc, 1, len-3) ">" );

  check_locus(loc);
  cert_guy(t);

  # Count this line at all three levels.
  # NOTE(review): the text is taken from the fixed column 20 rather than
  # from {txt}; presumably the EVT layout guarantees that - confirm.
  nch = length(clean_text(substr($0,20)));
  for (k = 0; k < 3; k++) {
    ng[k]++;
    rt[k,t]++;
    ct[k,t] += nch;
  }
}

END {
  if (abort >= 0) { exit abort; }

  # {check_locus} only dumps a group when the NEXT group starts, so the
  # last unit and the last page are still pending here.  Flush them
  # (in the same unit-then-page order) before printing the grand total.
  if (perUnit) { dmp(2, cur_u); }
  if (perPage) { dmp(1, cur_p); }

  dmp(0, "TOTAL");
  if (horFmt) { dash_hor(); head_hor(); }
}

function check_locus(loc,   n,fld,p,u)
{
  # Takes a locator {loc} of the form "<PAGE>", "<PAGE.UNIT>" or
  # "<PAGE.UNIT.LINE>", splits it into page {p} and page.unit {u}, and,
  # if the summation group changed, dumps the statistics of the group
  # just finished and resets them.  Updates {cur_p} and {cur_u}.
  gsub(/[<>]/, "", loc);
  n = split(loc, fld, /[.]/);
  if ((n < 1) || (n > 3)) { data_error(("bad locator \"" loc "\"")); }
  p = (n >= 1 ? fld[1] : "*");
  u = (p "." (n >= 2 ? fld[2] : "*"));
  if ((u != cur_u) && perUnit) {
    # printf "[%s]", u > "/dev/stderr";
    dmp(2, cur_u);
    clr(2);
  }
  if ((p != cur_p) && perPage) {
    # printf "(%s)", p > "/dev/stderr";
    dmp(1, cur_p);
    clr(1);
  }
  cur_u = u;
  cur_p = p;
}

function cert_guy(t)
{
  # Adds {t} to the set of known transcribers, if not there already.
  if (! (t in seen)) {
    guy[nguys] = t;
    nguys++;
    seen[t] = 1;
  }
}

function clr(k,   t)
{
  # Clears the counters of level {k}, for every known transcriber.
  ng[k] = 0;
  for (t in seen) {
    ct[k,t] = 0;
    rt[k,t] = 0;
  }
}

function dmp(k,grp)
{
  # Prints the counters of level {k}, labelled {grp}, in the selected
  # format.  Groups with no lines are skipped, except the level-0 total.
  if ((k > 0) && (ng[k] == 0)) { return; }
  if (horFmt) {
    dmp_hor(k,grp);
  } else {
    dmp_ver(k,grp);
  }
}

# ----------------------------------------------------------------------
# FIXME(review): the text of this script is damaged at this point.
# Everything between "for(i=0; i" inside {dmp_hor} and the first
# '> "/dev/stderr"' of {arg_error} is missing (it looks as if it was
# swallowed as a "<...>" markup tag - unconfirmed).  The functions
#
#     dmp_hor  dmp_ver  head_hor  dash_hor  clean_text
#
# are called above but their definitions are not available here; they
# must be restored from a pristine copy before the script can run.
# The surviving fragment of {dmp_hor}, verbatim:
#
#   function dmp_hor(k,grp, i,t)
#   { # Dumps counters of level {k}, horizontal format:
#     if (countLines)
#       { printf "%-12s ", grp;
#         printf " %7d ", ng[k];
#         for(i=0; i            [... text lost from here ...]
# ----------------------------------------------------------------------

function arg_error(msg)
{
  # Reports a command-line error, prints the usage text, and aborts
  # with status 1.
  # NOTE(review): the function header and the first printf's format were
  # lost in the damage described above and are reconstructed; the
  # statements from the '"/dev/stderr"' redirection onward are original.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg)
{
  # Reports an input-data problem as "FILE:LINE: message" on stderr.
  # Unlike {arg_error}, this does not set {abort} or exit: processing
  # continues with the offending line.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
}