#! /usr/bin/gawk -f
# Last edited on 1998-12-20 11:46:44 by stolfi

# Usage: compute-tuple-stats < TUPLECTS > TUPLESTATS
#
# Reads a file whose lines have the form COUNT TUPLE CON MAJ,
# where
#
#   COUNT is an occurrence count for TUPLE.
#
#   TUPLE is a string of 26 characters representing the
#     readings of one VMS character position by 26 potential
#     transcribers ("A" thru "Z").  A "%" in a position is skipped
#     (no statistics are recorded for that transcriber).
#
#   CON is an arbitrary "consensus" reading for TUPLE
#     (required to be present, but not used by this script).
#
#   MAJ is an arbitrary "majority" reading for TUPLE.
#
# Writes a table of the form COUNT TRCODE CHR MAJ
# where TRCODE is a transcriber code ("A" through "Z"),
# CHR and MAJ are two EVA characters, COUNT is the number
# of times that the transcriber read CHR when the majority reading was MAJ.
#
# Empty input lines are ignored.  On a malformed line the script
# reports the problem on stderr and exits with status 1 without
# printing any statistics.

BEGIN {
  # abort < 0 means "no error so far"; the error functions set it to
  # the exit status before calling exit, and END checks it.
  abort = -1;
  usage = "compute-tuple-stats < TUPLECTS > TUPLESTATS";

  # Transcriber codes, indexed by position within TUPLE.
  alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  # tct[tc,ch,maj] = total COUNT for transcriber tc reading ch
  # when the majority reading was maj.  split("", x) clears/creates x.
  split("", tct);
}

# Process every non-empty record.
/./ {
  if (NF != 4) format_error("wrong num of fields");
  num = $1;
  tup = $2;
  maj = $4;
  if (length(tup) != 26) { format_error("bad tuple length"); }

  # Add statistics
  for (i = 1; i <= 26; i++) {
    ch = substr(tup, i, 1);
    if (ch != "%") {
      tc = substr(alpha, i, 1);
      # Remember every character that occurs as a reading or as a
      # majority value, so that END can enumerate the table keys.
      ch_seen[ch] = 1;
      ch_seen[maj] = 1;
      tct[tc, ch, maj] += num;
    }
  }
  next;
}

END {
  # An "exit" executed in a main rule still runs the END rule, so
  # bail out here to avoid printing partial statistics after an error.
  if (abort >= 0) exit abort;

  # Print statistics, grouped by transcriber code in alphabetical order.
  for (i = 1; i <= 26; i++) {
    tc = substr(alpha, i, 1);
    for (ch in ch_seen) {
      for (maj in ch_seen) {
        if ((tc, ch, maj) in tct) {
          printf "%7d %s %s %s\n", tct[tc, ch, maj], tc, ch, maj;
        }
      }
    }
  }
}

# Reports a command-line error plus the usage string on stderr
# and terminates with status 1.
function arg_error(msg)
{
  printf "*** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Reports an error in an auxiliary table file (with file name and
# line number) on stderr and terminates with status 1.
function table_error(msg)
{
  printf "file %s, line %d: *** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Reports a malformed input record (with file name and line number)
# on stderr and terminates with status 1.
function format_error(msg)
{
  printf "file %s, line %d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

# Reports an internal inconsistency on stderr and terminates with status 1.
function program_error(msg)
{
  printf "*** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit abort;
}