#! /bin/gawk -f
# Last edited on 2002-02-11 18:01:59 by stolfi

# Filters a stream of word records, remapping / fixing / deleting the
# word found in a selected field of each record.
#
# Variables expected from the command line ("-v NAME=VALUE"):
#   sample  (required) label printed in the final statistics line.
#   field   (optional) index of the field holding the word; when unset,
#           0 is used, i.e. the whole record is the word.
#   table   (optional) name of a word-remapping table file.
#
# The function fix_raw_word(word) is called but not defined in this file;
# per the usage text it is presumably supplied by the FIXFNS.gawk file
# given with "-f".

BEGIN {
  abort = -1;
  usage = ( \
    "cat RAW.evt \\\n" \
    " | evt-to-wds \\\n" \
    " | fix-raw-words \\\n" \
    " -f FIXFNS.gawk \\\n" \
    " -v sample=SAMPLE \\\n" \
    " [ -v field=FIELD ] \\\n" \
    " [ -v table=TABLE ] \\\n" \
    " > GUD.wds" \
  );

  # Performs any adjustments to words extracted from text sample
  # SAMPLE needed for proper statistical analysis.
  # This may include a change of encoding, removing capitalization,
  # elimination of selected words, etc.
  #
  # The output is the "raw" word list: it should still include all
  # pronounceable words, numerals, and symbols, but should exclude
  # punctuation and other silent marks.

  if (sample == "") { arg_error("must define \"sample\""); }

  if (table == "") {
    # No table given: start with an empty remapping.
    split("", wmap);
  } else {
    # Read word-remapping table, if present.
    # To discard a word from the "raw" subset,
    # map it to the string "*DELETE*".
    load_remapping_table(table);
  }

  if (field == "") { field = 0; }

  nread = 0;  # Number of words read
  nwrite = 0; # Number of words written
}

# An error was flagged earlier (e.g. in BEGIN): stop with that status.
(abort >= 0) { exit abort; }

# Skip blank records and "#" comment records.
/^[ ]*([#]|$)/ { next; }

# Every remaining non-empty record carries one word in field "field".
/./ {
  nread++;
  word = $(field);

  # Apply the remapping table first; a "*DELETE*" target drops the record.
  if (word in wmap) {
    word = wmap[word];
    if ((word == "*DELETE*") || (word == "*delete*")) { next; }
  }

  # Then apply the externally supplied fix-up; it may also request deletion.
  word = fix_raw_word(word);
  if ((word == "*DELETE*") || (word == "*delete*")) { next; }

  # The fixed word may contain several newline-separated words; each one
  # is written as its own copy of the record.
  nwds = split(word, wds, "\n");
  for (i = 1; i <= nwds; i++) {
    $(field) = wds[i];
    print;
    nwrite++;
  }
  next;
}

END {
  if (abort >= 0) { exit abort; }
  printf "%s: %7d words read, %7d written\n", sample, nread, nwrite > "/dev/stderr";
}

function load_remapping_table(file,   nMap,lin,fld,nfld,status,lnum)
{
  # Reads a word mapping table from "file", containing pairs
  # of the form ORIGINAL NEW, one pair per line.  Leading blanks,
  # blank lines, and "#" comments (whole-line or trailing) are ignored.
  # Stores the table in the global array "wmap[ORIGINAL] = NEW".
  #
  # Stops the program through tbl_error on a malformed or repeated entry,
  # and through arg_error if the file cannot be read.  An empty table
  # only produces a warning on stderr.

  nMap = 0;
  lnum = 0;
  split("", wmap);

  # Keep the getline status: it is the reliable success/failure indicator
  # (1 = record read, 0 = end of file, -1 = error).
  while ((status = (getline lin < file)) > 0) {
    # "getline var < file" does not update FNR, so count lines here
    # in order to report accurate positions in the table file.
    lnum++;
    sub(/^[ ]+/, "", lin);
    if (lin !~ /^([#]|$)/) {
      sub(/[ ]*[#].*$/, "", lin);
      nfld = split(lin, fld, " ");
      if (nfld != 2) tbl_error(file, ("bad table entry = \"" lin "\""), lnum);
      if (fld[1] in wmap) tbl_error(file, ("repeated key = \"" lin "\""), lnum);
      wmap[fld[1]] = fld[2];
      nMap++;
    }
  }

  # ERRNO is an error description string, not a status code, so it must
  # not be compared against "0"; only consult it after getline failed.
  if (status < 0) { arg_error((file ": " ERRNO)); }
  close(file);

  if (nMap == 0) {
    # The file name is passed as an argument so that a "%" in it cannot
    # be misread as a printf conversion.
    printf "warning: file \"%s\" empty or missing\n", file > "/dev/stderr";
  } else {
    printf "loaded %6d map pairs\n", nMap > "/dev/stderr";
  }
}

function arg_error(msg)
{
  # Reports a command-line / setup error plus the usage text on stderr,
  # records the failure in "abort", and exits with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an error at the current input record (FNR) on stderr,
  # records the failure in "abort", and exits with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function tbl_error(file, msg, lnum)
{
  # Reports an error in table file "file" on stderr, records the failure
  # in "abort", and exits with status 1.
  # "lnum" is the line number within "file"; it is optional, and when
  # omitted the current FNR is printed, as in the two-argument form.
  if (lnum == "") { lnum = FNR; }
  printf "file %s, line %s: %s\n", file, lnum, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}