#! /usr/bin/gawk -f
# Last edited on 2004-02-18 15:36:30 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "cat main.wds \\\n" \
    " | fix-tagged-words \\\n" \
    " -f AWKLIB.gawk \\\n" \
    " -v smp=SMP \\\n" \
    " -v sec=SEC \\\n" \
    " [ -v field=FIELD ] \\\n" \
    " [ -v table=TABLE ] \\\n" \
    " > GUD.wds" \
  );

  # Performs adjustments to words extracted from a text sample
  # needed for proper statistical analysis.  This may include change
  # of encoding, removing capitalization, elimination of some
  # words, etc.
  #
  # The word to be fixed is field number {field} of each input
  # record.  When {field} is not given it is set to 0, so the whole
  # record is taken as the word.
  #
  # The {TABLE}, if specified, must contain pairs of words {OLD NEW}.
  # If word {OLD} appears in the input, it is replaced by {NEW}
  # before {fix_word} is applied.
  #
  # The library "{AWKLIB}.gawk" must define the following functions:
  #
  #   fix_word(smp, sec, wd)
  #
  #     where {wd} is a word and {smp} and {sec} are
  #     client-specified strings.  The procedure should return a
  #     cleaned copy of {wd}, e.g. without capitalization or undesired
  #     markings.  It may split {wd} by inserting blanks.
  #     Note that this function may adjust its behavior
  #     based on the {smp} and {sec} arguments; this script itself
  #     only uses {smp} to label the final statistics line.
  #
  #   define_patterns(smp, sec)
  #
  #     A function that will be called before the first record,
  #     to set up any tables and patterns that may be needed by
  #     {fix_word}.
  #
  # A word that gets remapped to "*DELETE*" or "*delete*" (by the
  # table or by {fix_word}) is discarded.  The result of {fix_word}
  # is split at blanks and newlines, and one output record is
  # written for each non-empty piece.

  if (smp == "") { arg_error("must define \"smp\""); }
  if (sec == "") { arg_error("must define \"sec\""); }

  split("", wmap);
  if (table != "") {
    # Read word-remapping table, if present.
    load_remapping_table(table);
  }

  if (field == "") { field = 0; }

  nread = 0;   # Number of words read
  nwrite = 0;  # Number of words written

  define_patterns(smp, sec);
}

# An error was flagged earlier: stop processing input.
(abort >= 0) { exit abort; }

# Skip blank lines and "#" comment lines.
/^[ ]*([#]|$)/ { next; }

/./ {
  nread++;
  word = $(field);

  # Apply the remapping table first; a deletion marker drops the record.
  if (word in wmap) {
    word = wmap[word];
    if ((word == "*DELETE*") || (word == "*delete*")) { next; }
  }

  word = fix_word(smp, sec, word);
  if ((word == "*DELETE*") || (word == "*delete*")) { next; }

  # {fix_word} may have split the word; emit one record per piece,
  # each time overwriting field {field} of the current record.
  nwds = split(word, wds, /[ \012]/);
  for (i = 1; i <= nwds; i++) {
    if (wds[i] != "") {
      $(field) = wds[i];
      print;
      nwrite++;
    }
  }
  next;
}

END {
  if (abort >= 0) { exit abort; }
  printf "%s: %7d words read, %7d written\n", smp, nread, nwrite > "/dev/stderr";
}

function load_remapping_table(file,   nMap,lin,fld,nfld,rc,lnum)
{
  # Reads a word mapping table from "file", containing pairs
  # of the form ORIGINAL NEW, one pair per line.  Blank lines and
  # "#" comments (whole-line or trailing) are ignored.
  # Stores the table in the global "wmap[ORIGINAL] = NEW".
  # Aborts through {tbl_error} on a malformed or repeated entry and
  # through {arg_error} when the file cannot be read.

  nMap = 0;
  lnum = 0;
  split("", wmap);

  # Keep the getline status in {rc} so that a read failure (negative)
  # can be told apart from end-of-file (zero).  Comparing ERRNO with
  # "0" is not a reliable no-error test: current gawk leaves ERRNO
  # empty when nothing has failed.
  while ((rc = (getline lin < file)) > 0) {
    lnum++;  # getline from a file does not update FNR; count here.
    sub(/^[ ]*/, "", lin);
    if (! match(lin, /^([#]|$)/)) {
      gsub(/[ ]*[#].*$/, "", lin);
      nfld = split(lin, fld, " ");
      if (nfld != 2) tbl_error(file, ("bad table entry = \"" lin "\""), lnum);
      if (fld[1] in wmap) tbl_error(file, ("repeated key = \"" lin "\""), lnum);
      wmap[fld[1]] = fld[2];
      nMap++;
    }
  }
  if (rc < 0) { arg_error((file ": " ERRNO)); }
  close(file);

  if (nMap == 0) {
    # The file name is passed as an argument, never as part of the
    # format, so a "%" in the name cannot corrupt the message.
    printf "warning: file \"%s\" empty or missing\n", file > "/dev/stderr";
  } else {
    printf "loaded %6d map pairs\n", nMap > "/dev/stderr";
  }
}

function arg_error(msg)
{
  # Reports a command-line/argument problem, prints the usage text,
  # and terminates with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports a problem on the current input line (FNR) and terminates
  # with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}

function tbl_error(file, msg, lnum)
{
  # Reports a problem in table file {file} and terminates with status 1.
  # {lnum} is the line number within {file}; it is optional, and FNR is
  # used when the caller omits it.
  if (lnum == "") { lnum = FNR; }
  printf "file %s, line %s: %s\n", file, lnum, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}