#! /usr/bin/gawk -f
# Last edited on 1998-12-30 06:22:04 by stolfi

# Reads a file whose lines have the form COUNT TUPLE, where COUNT is
# an integer and TUPLE is a string of 26 EVA characters representing the
# readings of one VMS character position by 26 potential
# transcribers ("A" thru "Z").  In this list "%" denotes "no
# information" and "*" denotes "unreadable".
#
# Writes a file of the form COUNT TUPLE MAJR TOTWT where COUNT and
# TUPLE are as in the input, MAJR is the majority reading, and TOTWT is
# the total number of votes cast on that letter.
#
# A reading equal to some character "C" denotes a certain number of
# votes for "C", the number depending on "C" and on the
# transcriber's weight (see below).  A reading of "%" counts as no
# vote, and readings of "*" get an infinitesimal fraction of a vote
# (1/1000 of the transcriber's weight).
#
# If there are no votes on some character position, the majority
# reading is "%".  If the votes for some character C are more than
# half of the total votes cast, then the majority reading is C.
# Otherwise, if the readings "!", ".", and "," together have more
# than half of the total votes, the majority reading is ",".  In all
# other cases the majority reading is "*".
#
# The transcriber weights can be specified in the WTFILE table, which
# has entries of the form CODE WEIGHT where CODE is a letter "A"
# through "Z" and WEIGHT is the (non-negative, integer) weight to be
# given to readings by the transcriber with that CODE.  Lines starting
# with "#" and blank lines are ignored.  When no WTFILE is given, the
# weight is 1 for everybody.  When a WTFILE is given, a transcriber
# that is not listed in it has no "wt" entry, which evaluates to 0,
# so its readings carry no votes.
#
# Certain pairs of transcriber codes, like "F" and "G", are
# actually two alternative readings by the same transcriber
# (originally denoted by "[...|...]" in Landini's file).  If
# non-"%" readings are present for both codes, their weights
# are divided by two.
#
# These pairs of transcriber codes are specified by the string ALTS,
# a comma-separated list of letter pairs "XY,XY,...,XY".

BEGIN {
  abort = -1;
  usage = ( \
    "compute-majority-table \\\n" \
    " [ -v alternates=ALTS ] \\\n" \
    " [ -v weights=WTFILE ] \\\n" \
    " < TUPLECTS > TUPLEMAP" \
  );

  alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  # Compute the alternate transcription pairing map: "alter[i] = j"
  # and "alter[j] = i" if transcriber number "i" is an alternate
  # reading for transcriber "j", where "i" and "j" range in 1..26.
  npairs = split(alternates, trpair, ",");
  for (k = 1; k <= npairs; k++) {
    xy = trpair[k];
    if (! match(xy, /^[A-Z][A-Z]$/)) {
      arg_error(("bad alternative pair \"" xy "\""));
    }
    # "p" is the primary code and "a" its alternate; both are named in
    # the error messages below.
    p = substr(xy, 1, 1);
    a = substr(xy, 2, 1);
    x = index(alpha, p);
    y = index(alpha, a);
    if ((x == 0) || (x in alter)) {
      arg_error(("bad/duplicated primary/alternate \"" p "\""));
    }
    if ((y == 0) || (y in alter)) {
      arg_error(("bad/duplicated primary/alternate \"" a "\""));
    }
    # A code paired with itself would always halve its own weight.
    if (x == y) {
      arg_error(("bad alternative pair \"" xy "\""));
    }
    alter[x] = y;
    alter[y] = x;
    # printf "alter[%s] = %s\n", x, y > "/dev/stderr";
    # printf "alter[%s] = %s\n", y, x > "/dev/stderr";
  }

  # Read weight table "wt[i]" where "i" is 1..26:
  if (weights != "") {
    nentries = 0;
    wt_lineno = 0;  # Line counter for "table_error"; FNR is not updated here.
    # "getline" returns -1 on a read/open error, which is "true" in a
    # bare condition; test "> 0" explicitly so a missing file cannot
    # cause an endless loop.
    while ((wt_status = (getline lin < weights)) > 0) {
      wt_lineno++;
      if ((lin !~ /^[#]/) && (lin !~ /^ *$/)) {
        n = split(lin, fld);
        if (n != 2) { table_error("bad number of fields"); }
        if (fld[1] !~ /^[A-Z]$/) { table_error("bad letter"); }
        if (fld[2] !~ /^[0-9]+$/) { table_error("bad weight"); }
        i = index(alpha, fld[1]);
        if (i == 0) {
          # Diagnostics go to stderr so they cannot end up in TUPLEMAP.
          print lin > "/dev/stderr";
          print fld[1] > "/dev/stderr";
          program_error("letter conv");
        }
        wt[i] = fld[2];
        nentries++;
      }
    }
    if (wt_status < 0) {
      arg_error(("cannot read weight table \"" weights "\""));
    }
    close(weights);
    if (nentries == 0) { arg_error("no entries in weight table"); }
  } else {
    for (i = 1; i <= 26; i++) { wt[i] = 1; }
  }
}

# Every non-empty input line: COUNT TUPLE.
/./ {
  if (NF != 2) { format_error("wrong num of fields"); }
  ct = $1;
  tp = $2;
  # Splitting on the empty string (a gawk feature) yields one element
  # per character of the tuple.
  nr = split(tp, rd, "");
  if ((nr != 26) || (length(tp) != 26)) { format_error("bad tuple length"); }

  # Tally the votes: "vote[c]" is the weight cast for reading "c",
  # "wtot" the total weight cast by all transcribers.
  split("", vote);
  wtot = 0;
  for (i = 1; i <= 26; i++) {
    c = rd[i];
    if (c != "%") {
      w = wt[i];
      # Both alternatives of one transcriber present: each counts half.
      if ((i in alter) && (rd[alter[i]] != "%")) { w /= 2.0; }
      # "Unreadable" only gets a token fraction of a vote.
      if (c == "*") { w /= 1000.0; }
      vote[c] += w;
      wtot += w;
    }
  }

  # Compute majority:
  if (wtot == 0) {
    maj = "%";
  } else {
    maj = "*";
    # At most one reading can hold more than half of the votes, so the
    # traversal order of "vote" does not matter.
    for (c in vote) {
      if (2*vote[c] > wtot) { maj = c; break; }
    }
    # No single winner: the word-break readings may still win jointly.
    if ((maj == "*") && (2*(0 + vote["."] + vote["!"] + vote[","]) > wtot)) {
      maj = ",";
    }
  }
  printf "%7d %s %s %6.2f\n", ct, tp, maj, wtot + 0.00001;
  next;
}

# Anything else (i.e. empty lines) is silently skipped.
// { next; }

function arg_error(msg)
{
  # Reports a bad command line argument, prints the usage, and exits.
  printf "*** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function table_error(msg)
{
  # Reports a malformed line of the weight table and exits.  The table
  # is read with "getline lin < weights", which does not set FILENAME
  # or FNR, so the location comes from "weights" and "wt_lineno".
  printf "file %s, line %d: *** %s\n", weights, wt_lineno, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function format_error(msg)
{
  # Reports a malformed line of the main input and exits.
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function program_error(msg)
{
  # Reports an internal inconsistency and exits.
  printf "*** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit abort;
}