#! /usr/bin/gawk -f
# Last edited on 2004-02-17 15:12:52 by stolfi

# Selects "good" Voynichese words for statistical analysis.
#
# Looks at field number {field} (default 1) of each input record.
# If {writeGud} is 1, writes the entire record if that field is "good".
# If {writeBad} is 1, writes the entire record if the word is "bad".
# At least one of {writeGud}, {writeBad} must be 1.
# The {smp} and {sec} strings are mandatory; they are provided just in
# case the decision depends on them (the current rules do not use them).
#
# Blank records and records whose first non-blank character is "#" are
# copied to the output unchanged and are not counted.
#
# A summary (good/bad counts and fractions, records read and written)
# is printed to "/dev/stderr" at the end of a successful run.
# Exit status is 1 after an argument or data error.

BEGIN {
  abort = -1;   # Negative means "no error so far"; else the exit status.
  usage = ( ARGV[0] "\\\n" \
    " [ -v field=NUM ] \\\n" \
    " -v smp=LANG/BOOK -v sec=SEC \\\n" \
    " [ -v writeGud=NUM ] [ -v writeBad=NUM ] \\\n" \
    " < INFILE > OUTFILE" \
  );

  # Mandatory parameters:
  if (smp == "") { arg_error("must define \"smp\""); }
  if (sec == "") { arg_error("must define \"sec\""); }

  # Defaults for the optional parameters:
  if (field == "") { field = 1; }
  if (writeGud == "") { writeGud = 0; }
  if (writeBad == "") { writeBad = 0; }

  # {field} must be a positive integer.  The explicit pattern test is
  # needed because a non-numeric "-v field=..." value would be compared
  # as a string by "field < 1", slip through, and select "$0".
  if ((field !~ /^[+]?[0-9]+$/) || ((field + 0) < 1)) { arg_error("bad field"); }
  field = field + 0;   # Force numeric, so that "field > NF" compares numbers.

  if ((writeGud != 0) && (writeGud != 1)) { arg_error("bad writeGud"); }
  if ((writeBad != 0) && (writeBad != 1)) { arg_error("bad writeBad"); }
  if ((writeBad == 0) && (writeGud == 0)) { arg_error("no output"); }

  nread = 0;   # Number of words read
  nwrite = 0;  # Number of words written
  ngud = 0;    # Number of words classified as good
  nbad = 0;    # Number of words classified as bad
}

# Safety net: stop processing input once an error has been flagged.
(abort >= 0) { exit abort; }

# Blank lines and "#"-comments are passed through untouched.
/^[ ]*([#]|$)/ { print; next; }

# Every other record: classify the word in field {field}.
{
  nread++;
  if (field > NF) { data_error("too few fields"); }
  w = $(field);
  if (w ~ /[A-Z]/) {
    data_error("field contains capital letters");
  } else if (w ~ /[^a-z*?]/) {
    data_error("field contains special characters");
  } else if (w ~ /[^a-z]/) {
    # Only "*" or "?" can get here, given the previous test.
    bad = 1;
  } else {
    # Collapse the multi-letter groups into single capital codes, so
    # that any "c" or "h" left over falls outside the allowed set below.
    gsub(/ch/, "C", w);
    gsub(/sh/, "S", w);
    gsub(/ckh/, "K", w);
    gsub(/cth/, "T", w);
    gsub(/cfh/, "F", w);
    gsub(/cph/, "P", w);
    bad = (w ~ /[^eiaoqydlrsnmktfpCSKTPF]/);
  }

  if (bad) { nbad++; } else { ngud++; }

  if ((bad && writeBad) || ((! bad) && writeGud)) {
    nwrite++;
    print;
  }
}

END {
  if (abort >= 0) { exit abort; }
  ntot = nbad + ngud;
  den = (ntot > 0 ? ntot : 1);   # Avoids division by zero on empty input.
  printf("%7d good records (%5.3f)\n", ngud, ngud/den) > "/dev/stderr";
  printf("%7d bad records (%5.3f)\n", nbad, nbad/den) > "/dev/stderr";
  # {nread} equals {ntot} here: every record counted was also classified.
  printf("%7d words read, %7d written\n", nread, nwrite) > "/dev/stderr";
}

function arg_error(msg)
{
  # Reports a command-line parameter error plus the usage text on
  # "/dev/stderr", then terminates with status 1 (via the END rule).
  printf "*** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_error(msg,   fname)
{
  # Reports a bad input record (file name and line number) on
  # "/dev/stderr", then terminates with status 1 (via the END rule).
  # {fname} is a local variable, not a parameter for callers.
  # FILENAME may be empty when reading standard input; show "-" then.
  fname = (FILENAME == "" ? "-" : FILENAME);
  printf "file %s, line %d: %s\n", fname, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}