#! /usr/bin/gawk -f
# Last edited on 2004-02-25 23:41:29 by stolfi

# Selects words from "main.wds" based on section and type.
#
# Each input line must have the format "{TYPE} {WORD}", where {WORD}
# is a token from the text, and {TYPE} is its type ("#"=comment,
# "$"=section tag, "@"=line number, "a"=alpha, "b"=blank, "p"=punct,
# "s"=symbol, "n"=null).  For "#" records, {WORD} is the rest of the
# line starting at column 3 and may contain blanks; every other
# record must have exactly two fields.
#
# Records of type "#", "$" and "@" keep their input type.  For every
# other record the script recomputes the type by calling a procedure
# from the user-specified library "FUNCS.gawk":
#
#   smp_reclassify_word(smp, sec, cursec, curlin, type, wd)
#
# where {wd} is the word read from the input, {type} is its input
# type, {smp} and {sec} are the user-specified strings, and {cursec}
# and {curlin} are the section tag and line number containing this
# occurrence of {wd} (saved from the last "$" and "@" records
# preceding this one).  The procedure must return a new type for
# {wd}; or "x", meaning that the record should be excluded for being
# outside the region of interest.
#
# The script then disposes of the input record according to the
# resulting type:
#
#   "#" written only if {keepComments=1}.
#   "$" written only if {keepSections=1}.
#   "@" written only if {keepLineNums=1}.
#   "x" written only if {keepOutside=1}.
#   "s" written only if {keepSymbols=1}.
#   "p" written only if {keepPunct=1}.
#   "a" always written.
#   "n" always discarded.
#   "b" always discarded.
#
# Any other type (on input or returned by {smp_reclassify_word}) is
# a fatal data error.
#
# The script stops after {maxAlpha} "a"-type words have been
# written.  The default is to process the whole input file.
#
# The user must also provide a function
#
#   smp_initialize(smp, sec)
#
# that is called before the first record.  This procedure could, for
# instance, precompile any complicated patterns to be used by
# {smp_reclassify_word}.

BEGIN {
  # {abort} is negative while all is well; the error routines set it
  # to the desired exit status so that the END rule can propagate it.
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    " -f FUNCS.gawk \\\n" \
    " -v smp=SMP \\\n" \
    " -v sec=SEC \\\n" \
    " [ -v maxAlpha=NUM ] \\\n" \
    " -v keepComments=BOOL \\\n" \
    " -v keepSections=BOOL \\\n" \
    " -v keepLineNums=BOOL \\\n" \
    " -v keepOutside=BOOL \\\n" \
    " -v keepSymbols=BOOL \\\n" \
    " -v keepPunct=BOOL \\\n" \
    " < main.wds > select.wds " \
  );

  # Mandatory parameters:
  if (smp == "") { arg_error("must define \"smp\""); }
  if (sec == "") { arg_error("must define \"sec\""); }

  # Optional parameters; a negative {maxAlpha} means "no limit":
  if (maxAlpha == "") { maxAlpha = -1; }
  if (keepComments == "") { keepComments = 0; }
  if (keepSections == "") { keepSections = 0; }
  if (keepLineNums == "") { keepLineNums = 0; }
  if (keepOutside == "") { keepOutside = 0; }
  if (keepSymbols == "") { keepSymbols = 0; }
  if (keepPunct == "") { keepPunct = 0; }

  # Section tag and line number of the current position in the text:
  curSec = "";
  curLin = "";

  smp_initialize(smp, sec);

  # Output counters, updated by {output_word}:
  nAlpha = 0;
  nWritten = 0;
}

# An error was flagged earlier: stop reading input.
(abort >= 0) { exit abort; }

# The requested number of alpha words has been written: stop.
((maxAlpha >= 0) && (nAlpha >= maxAlpha)) { exit 0; }

# Main rule: a record with a valid one-character type tag.
($1 ~ /^[#$@anpbs]$/) {
  type = $1;
  if (type == "#") {
    # Comment: the word is everything after the "# " prefix.
    wd = substr($0, 3);
  } else {
    if (NF != 2) { data_error("bad input format"); }
    wd = $2;
    # Remember the current section and line for {smp_reclassify_word}:
    if (type == "$") {
      curSec = wd;
    } else if (type == "@") {
      curLin = wd;
    }
  }

  # Comments, section tags and line numbers keep their input type;
  # everything else is reclassified by the user's library.
  if (type !~ /^[#@$]/) {
    type = smp_reclassify_word(smp, sec, curSec, curLin, type, wd);
  }

  # Dispose of the record according to its (possibly new) type.
  # Each branch is a separate, fully closed block of the chain.
  if (type == "#") {
    if (keepComments) { output_word(type, wd); }
  } else if (type == "$") {
    if (keepSections) { output_word(type, wd); }
  } else if (type == "@") {
    if (keepLineNums) { output_word(type, wd); }
  } else if (type == "x") {
    if (keepOutside) { output_word(type, wd); }
  } else if (type == "s") {
    if (keepSymbols) { output_word(type, wd); }
  } else if (type == "p") {
    if (keepPunct) { output_word(type, wd); }
  } else if (type == "a") {
    output_word(type, wd);
  } else if ((type == "n") || (type == "b")) {
    # Nulls and blanks are always discarded.
  } else {
    data_error(("invalid type tag = \"" type "\""));
  }
  # printf "%s --> %s\n", $1, type > "/dev/stderr";
  next;
}

# Catch-all: any record not handled above has an invalid type tag.
// {
  data_error(("invalid input type tag \"" $1 "\""));
}

END {
  if (abort >= 0) { exit abort; }
  printf "%d records (%d alpha)\n", nWritten, nAlpha > "/dev/stderr";
}

function output_word(type, wd)
{
  # Outputs word {wd} of type {type} to standard output.
  # Updates {nWritten}, and also {nAlpha} when {type} is "a".
  printf "%s %s\n", type, wd;
  if (type == "a") { nAlpha++; }
  nWritten++;
}

function arg_error(msg)
{
  # Reports a command line error {msg} plus the usage text on
  # standard error, sets {abort} and exits with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an input data error {msg}, tagged with the current
  # record number {FNR}, on standard error; sets {abort} and exits
  # with status 1.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1;
}