#! /usr/bin/gawk -f
# Last edited on 2001-01-02 01:06:42 by stolfi

# Reads EVMT text from standard input, selects text units of given type(s).
#
#   cat INFILE \
#     | select-units \
#         -v types='TYPE1,TYPE2,...,TYPEN' \
#         -v table=TBLFILE \
#     > OUTFILE
#
# Each line of TBLFILE should have two words UNIT TYPE specifying
# the text type for each text unit that may occur in the INFILE.
# Lines of TBLFILE that start with "#" are ignored.
#
# Input lines are handled as follows:
#   - lines starting with "#" are dropped;
#   - lines of the form "<...> {...}" (no ";" inside the angle
#     brackets) are copied to the output unchanged;
#   - lines starting with a locator "<UNIT.LINE;X>" (X an uppercase
#     letter) are copied only if the type of UNIT is a selected type;
#   - any other non-empty line is a fatal error.

# === ACTIONS ===================================================

BEGIN {
  # "abort" stays negative until a fatal error is reported.
  abort = -1;

  # Parse "types" option, set up "good[t] = 1" for each selected type "t".
  if (types == "")
    arg_error("must specify \"-v types='TYPE1,TYPE2,...,TYPEN'\"\n");
  n = split(types, tp, /[ ,]+/);
  split("", good);
  for (i = 1; i <= n; i++) { good[tp[i]] = 1; }

  # Read the unit-to-type table, "utype[u]".
  if (table == "")
    arg_error("must specify \"-v table=FILE\"\n");
  split("", utype);
  nMap = 0;
  # Keep the getline status: it is -1 on a read/open error, 0 at end of file.
  while ((status = (getline lin < table)) > 0) {
    if (! match(lin, /^[#]/)) {
      nfld = split(lin, fld, " ");
      if (nfld != 2) arg_error(("bad table entry = \"" lin "\""));
      # A unit may be listed only once in the table.
      if (fld[1] in utype) arg_error(("repeated key = \"" lin "\""));
      utype[fld[1]] = fld[2];
      nMap++;
    }
  }
  # Test the getline status rather than the idle value of ERRNO,
  # which is not guaranteed to be "0" when no error has occurred.
  if (status < 0) { arg_error((table ": " ERRNO)); }
  close (table);
  if (nMap == 0) {
    arg_error(("file \"" table "\" empty or missing"));
  }
  # printf "loaded %6d table pairs\n", nMap > "/dev/stderr"
}

# Stop processing input once an error has been flagged.
(abort >= 0) { exit abort; }

# Comment lines are dropped.
/^#/ { next; }

# "<...> {...}" lines (no ";" in the brackets) are passed through as-is.
/^<[^<>;]*> *[{][^{}]*[}] *$/ {
  print; next;
}

# Text lines: "<UNIT.LINE;X>..." -- keep them if UNIT has a selected type.
# NOTE(review): this rule's pattern and the start of the regexp below were
# reconstructed from a damaged copy of the script -- confirm against the
# original.
/^</ {
  if (! match($0, /^<[^<>]*[;][A-Z]>/)) {
    error("bad locator format");
  }
  # Strip the leading "<" and the trailing ";X>" from the matched locator.
  un = substr($0, 2, RLENGTH-4);
  # Remove the trailing line number (".NNN" plus optional letter) to get the unit.
  gsub(/[.][0-9]+[a-z]?$/, "", un);
  if (! (un in utype)) {
    error(("unit \"" un "\" not in table"));
  }
  if (good[utype[un]]) { print; }
  next;
}

# Anything else that is not an empty line is malformed.
/./ { error("bad line format"); }

# Reports a command-line/table error "msg" on stderr and exits with status 1.
function arg_error(msg)
{
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1
}

# Reports an input error "msg" (prefixed with the current record number)
# on stderr and exits with status 1.
function error(msg)
{
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  abort = 1;
  exit 1
}