#! /bin/gawk -f
# Last edited on 2004-02-17 15:06:10 by stolfi

BEGIN {
  # {abort} is negative while everything is fine; the error helpers set it
  # to the desired exit status so that the END rule can propagate it.
  abort = -1;
  usage = ( \
    "cat RAW.wds \\\n" \
    " | select-gud-bad-words \\\n" \
    " -f AWKLIB.gawk \\\n" \
    " -v smp=SMP \\\n" \
    " -v sec=SEC \\\n" \
    " [ -v inField=NUM ] \\\n" \
    " [ -v maxGud=NUM ] \\\n" \
    " -v writeGud=BOOL \\\n" \
    " -v writeBad=BOOL \\\n" \
    " > GUD.wds" \
  );

  #
  # Selects "good" and/or "bad" words for statistical analysis;
  # generally that means discarding symbols, numerals,
  # unreadable words, etc..
  #
  # Looks at field number {inField} of each input record (default 1).
  #
  # If {writeGud} is TRUE (1), writes every record with a good word.
  # If {writeBad} is TRUE (1), writes every record with a bad word.
  # At least one of the two must be TRUE.
  #
  # If {maxGud} is specified (and non-negative), stops after seeing
  # that many good words.
  #
  # Lines that are blank or whose first non-blank character is "#"
  # are skipped and not counted.
  #
  # The library "{AWKLIB}.gawk" must define the procedure
  # define_patterns(smp, sec) (which will be called before the first
  # record) and the predicate is_good_word(smp, sec, wd). The latter
  # may depend on the {smp} and {sec}, which are arbitrary
  # client-given strings.
  #
  # Summary counts are written to "/dev/stderr" at the end.
  #

  # Defaults for the optional parameters.
  if (inField == "") { inField = 1; }
  if (writeGud == "") { writeGud = 0; }
  if (writeBad == "") { writeBad = 0; }
  if (maxGud == "") { maxGud = -1; }

  # {inField} and {maxGud} must be integers. Without this check a
  # non-numeric value would be compared as a *string* further down,
  # producing a misleading "too few fields" error (for {inField}) or
  # a limit that is silently never reached (for {maxGud}).
  if (inField !~ /^[+]?[0-9]+$/) { arg_error("bad inField"); }
  inField = inField + 0;   # Force numeric type for later comparisons.
  if (inField < 1) { arg_error("bad inField"); }

  if (maxGud !~ /^[-+]?[0-9]+$/) { arg_error("bad maxGud"); }
  maxGud = maxGud + 0;     # Force numeric type for later comparisons.

  if ((writeGud != 0) && (writeGud != 1)) { arg_error("bad writeGud"); }
  if ((writeBad != 0) && (writeBad != 1)) { arg_error("bad writeBad"); }
  if ((writeGud == 0) && (writeBad == 0)) { arg_error("no output"); }

  nread = 0;   # Number of words read
  nwrite = 0;  # Number of words written
  ngud = 0;    # Number of words classified as good
  nbad = 0;    # Number of words classified as bad

  define_patterns(smp, sec);
}

# Stop right away if an error was flagged.
(abort >= 0) { exit abort; }

# Stop (successfully) once the requested number of good words was seen.
((maxGud >= 0) && (ngud >= maxGud)) { exit 0; }

# Skip comment lines and blank lines.
/^[ ]*([\#]|$)/ { next; }

# Classify the word in field {inField} and echo the record if requested.
/./ {
  nread++;
  if (inField > NF) { data_error("too few fields"); }
  word = $(inField);
  # An empty field is counted as read but is neither good nor bad.
  if (word == "") { next; }
  gud = is_good_word(smp, sec, word);
  if (gud) { ngud++; } else { nbad++; }
  if ((gud && writeGud) || ((! gud) && writeBad)) {
    print;
    nwrite++;
  }
  next;
}

END {
  # An {exit} in BEGIN or in a rule still runs END; re-issue the
  # error status here and skip the summary.
  if (abort >= 0) { exit abort; }
  ntot = nbad + ngud;
  # Guard the fractions against division by zero when nothing was classified.
  printf "%7d gud records (%5.3f)\n", ngud, ngud/(ntot > 0 ? ntot:1) > "/dev/stderr";
  printf "%7d bad records (%5.3f)\n", nbad, nbad/(ntot > 0 ? ntot:1) > "/dev/stderr";
  printf "%7d words read, %7d written\n", nread, nwrite > "/dev/stderr";
}

function arg_error(msg)
{
  # Reports a bad command-line parameter: prints {msg} and the usage
  # text to "/dev/stderr", then terminates with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports a malformed input record: prints {msg} prefixed with the
  # current record number {FNR} to "/dev/stderr", then terminates
  # with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}