#! /usr/bin/gawk -f
# Last edited on 1999-01-02 11:38:45 by stolfi

# Reads a file whose lines have the form COUNT TUPLE, where COUNT is
# an integer and TUPLE is a string of 26 EVA characters representing the
# readings of one VMS character position by 26 potential
# transcribers ("A" thru "Z").  In this list "%" denotes "no
# information" and "*" denotes "unreadable".
#
# Prints tables that show how many times transcriber "i" agreed and
# disagreed with transcriber "j", in counts and percentages.  The
# disagreements are further broken down into substantive and
# blank-related, the latter being cases where the disagreement is
# over "!", ".", or ",".
#
# Readings of "%", "*", "-", or "=" are not counted.
#
# Certain pairs of transcriber codes, like "F" and "G", are
# actually two alternative readings by the same transcriber
# (originally denoted by "[...|...]" in Landini's file).  If
# non-"%" readings are present for both codes, each counts
# as half a reading.  I.e. if transcriber A reads "XXXX" while
# B reads "X[X|Y]ZZ", then A and B agree on 1.5 characters
# and disagree on 2.5.
#
# Global state:
#   abort     exit status requested by an error routine, or -1 if none.
#   alpha     the 26 transcriber codes; position in it is the transcriber number.
#   alter     alter[i] = j and alter[j] = i when i and j are alternate codes.
#   trseen    trseen[i] is set once transcriber i has a counted reading.
#   tt        tt[i]: number of input lines with a counted reading by i.
#   tot, agr, bdi, dis
#             pair tables indexed [i,j]: overlapping readings, agreements,
#             blank-related disagreements, substantive disagreements.
#
# NOTE(review): this file had been collapsed onto two physical lines and
# two spans of text (everything from a "j<" comparison up to the next ">")
# were missing.  Regions marked RECONSTRUCTED below were rebuilt from the
# description above and from the surviving code; compare them against an
# intact copy of the script if one is available.

BEGIN {
  abort = -1;
  usage = ( \
    "compute-transcriber correlations \\\n" \
    " [ -v alternates=ALTS ] \\\n" \
    " < TUPLECTS " \
  );

  alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  # Compute the alternate transcription pairing map: "alter[i] = j"
  # and "alter[j] = i" if transcriber number "i" is an alternate
  # reading for transcriber "j", where "i" and "j" range in 1..26.
  # ALTS is a comma-separated list of two-letter pairs, e.g. "FG,JK".
  npairs = split(alternates, trpair, ",");
  split("", seen);
  for (k = 1; k <= npairs; k++) {
    xy = trpair[k];
    if (! match(xy, /^[A-Z][A-Z]$/)) {
      arg_error(("bad alternative pair \"" xy "\""));
    }
    px = substr(xy, 1, 1);
    py = substr(xy, 2, 1);
    x = index(alpha, px);
    y = index(alpha, py);
    # A code paired with itself would make "alter[x] = x" and silently
    # halve every reading by that transcriber, so reject it.
    if (x == y) {
      arg_error(("bad/duplicated primary/alternate \"" xy "\""));
    }
    # Report the offending letter (the original interpolated the
    # never-assigned variables "p" and "a", which printed as empty).
    if ((x == 0) || (x in alter)) {
      arg_error(("bad/duplicated primary/alternate \"" px "\""));
    }
    if ((y == 0) || (y in alter)) {
      arg_error(("bad/duplicated primary/alternate \"" py "\""));
    }
    alter[x] = y;
    alter[y] = x;
    # printf "alter[%s] = %s\n", x, y > "/dev/stderr";
    # printf "alter[%s] = %s\n", y, x > "/dev/stderr";
  }
  split("", trseen);
  split("", tt);
}

# Stop reading input as soon as any error routine has requested an abort.
// { if (abort >= 0) { exit abort; } }

# Main rule: tally one "COUNT TUPLE" line.  Blank lines are skipped.
/./ {
  if (NF != 2) format_error("wrong num of fields");
  ct = $1;
  tp = $2;
  # gawk extension: an empty separator splits the string into characters.
  nr = split(tp, rd, "");
  if ((nr != 26) || (length(tp) != 26)) { format_error("bad tuple length"); }
  for (i = 1; i <= 26; i++) {
    ci = rd[i];
    if (ci !~ /^[-=%*]$/) {
      # NOTE(review): this counts input lines, not weighted by COUNT or by
      # the alternate-reading weight; kept exactly as found -- confirm intent.
      tt[i] ++;
      # A reading counts as half when the paired alternate code also has
      # a counted reading at this position.
      wi = ((i in alter) && (rd[alter[i]] !~ /^[-=%*]$/) ? 0.5 : 1);
      trseen[i] = 1;
      # NOTE(review): RECONSTRUCTED from here to the end of this rule.
      # Only "for (j=1; j" survived; the bound "j < i" is inferred from the
      # damage pattern.  Weighting by COUNT*wi*wj reproduces the 1.5 / 2.5
      # example in the header.  "Blank-related" is taken to mean that both
      # readings are among "!", ".", "," -- confirm.
      for (j = 1; j < i; j++) {
        cj = rd[j];
        if (cj !~ /^[-=%*]$/) {
          wj = ((j in alter) && (rd[alter[j]] !~ /^[-=%*]$/) ? 0.5 : 1);
          w = ct * wi * wj;
          tot[i,j] += w;
          if (ci == cj) {
            agr[i,j] += w;
          } else if ((ci ~ /^[!.,]$/) && (cj ~ /^[!.,]$/)) {
            bdi[i,j] += w;
          } else {
            dis[i,j] += w;
          }
        }
      }
    }
  }
}

# Print all tables, unless an error was reported earlier.
# NOTE(review): the "END { if (abort >" prefix was lost and is restored here;
# the statements after it survived intact.
END {
  if (abort >= 0) { exit abort; }
  print_simple_counts("readings per transcriber", tt);
  print_counts("overlapping readings", tot);
  print_counts("agreement", agr);
  print_ratios("agreement", agr, tot);
  print_counts("wordspace disagreement", bdi);
  print_ratios("wordspace disagreement", bdi, tot);
  print_counts("substantive disagreement", dis);
  print_ratios("substantive disagreement", dis, tot);
}

function print_simple_counts(title,tbl,   i,j,v,f) {
  # Prints single count table "tbl[i]", one row per transcriber that
  # appears in "trseen".  Zero entries are shown as ".".
  printf "\n%s (counts)\n", title;
  for (i = 1; i <= 26; i++) {
    if (i in trseen) {
      printf "%s |", substr(alpha,i,1);
      v = tbl[i];
      f = (v == 0 ? "." : sprintf("%.1f", v));
      printf " %8s", f;
      printf "\n";
    }
  }
}

function print_counts(title,tbl,   i,j,v,f) {
  # Prints count table "tbl[i,j]" for the pairs with j < i, one row per
  # transcriber in "trseen".  Zero entries are shown as ".".
  # NOTE(review): RECONSTRUCTED from "for (j=1; j" onward, modelled on
  # print_simple_counts; the original column layout is unknown.
  printf "\n%s (counts)\n", title;
  for (i = 1; i <= 26; i++) {
    if (i in trseen) {
      printf "%s |", substr(alpha,i,1);
      for (j = 1; j < i; j++) {
        if (j in trseen) {
          v = tbl[i,j];
          f = (v == 0 ? "." : sprintf("%.1f", v));
          printf " %8s", f;
        }
      }
      printf "\n";
    }
  }
}

function print_ratios(title,num,den,   i,j,d,f) {
  # Prints "num[i,j]" as a percentage of "den[i,j]" for the pairs with
  # j < i.  Entries whose denominator is zero are shown as ".".
  # NOTE(review): RECONSTRUCTED in full -- only the calls in END survived.
  # The title suffix and number format are guesses.
  printf "\n%s (percentages)\n", title;
  for (i = 1; i <= 26; i++) {
    if (i in trseen) {
      printf "%s |", substr(alpha,i,1);
      for (j = 1; j < i; j++) {
        if (j in trseen) {
          d = den[i,j];
          f = (d == 0 ? "." : sprintf("%.1f", 100 * num[i,j] / d));
          printf " %8s", f;
        }
      }
      printf "\n";
    }
  }
}

function format_error(msg) {
  # Reports a malformed input line on stderr and aborts with status 1.
  # NOTE(review): the printf format is RECONSTRUCTED; only the
  # '> "/dev/stderr"; abort = 1; exit abort;' tail survived.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function arg_error(msg) {
  # Reports a bad command-line argument plus the usage text on stderr
  # and aborts with status 1.
  printf "*** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}