#! /usr/bin/gawk -f
# Last edited on 2002-02-11 21:57:03 by stolfi

# Reads text lines of the form "<LOCATOR> TEXT" from standard input and
# writes the words of TEXT to standard output, one word per line.
# Words are maximal runs of characters other than the delimiters
# "-", "/", "=", ".", "," and blank.

BEGIN {
  # "abort" is -1 while everything is fine; the error functions set it
  # to the desired exit status so that the END rule can honor it.
  abort = -1;
  usage = ( "words-from-evt \\\n" \
    " [ -v showLines=BOOL ] [ -v showParags=BOOL ] \\\n" \
    " [ -v omitInitial=BOOL ] [ -v omitFinal=BOOL ] \\\n" \
    " [ -v omitMedial=BOOL ] \\\n" \
    " [ -v showLocation=BOOL ] \\\n" \
    " < INFILE > OUTFILE" \
  );

  # By default, input line breaks and paragraph breaks are treated
  # just like any other wordspace.

  # If "showLines" is true, input line breaks are translated into a blank line,
  # input paragraph breaks are translated into two blank lines.

  # If "showLines" is false but "showParags" is true, input line breaks are
  # treated like word spaces, but paragraph breaks are translated into an
  # empty line.

  # If "omitInitial" is true, words immediately following a line or figure
  # break are omitted from the output. The option "omitFinal" is symmetric,
  # and "omitMedial" discards any words that are *not* adjacent to a line
  # or figure break.

  # If "showLocation" is true, prints the location code before every word
  # (but not in the blank lines). The location printed is that of the
  # input line where the word itself was found.

  if (showLines == "") { showLines = 0; }
  if (showParags == "") { showParags = 0; }

  # It simplifies things if "showLines" implies "showParags":
  if (showLines) { showParags = 1; }

  if (omitInitial == "") { omitInitial = 0; }
  if (omitMedial == "") { omitMedial = 0; }
  if (omitFinal == "") { omitFinal = 0; }
  if (omitInitial && omitMedial && omitFinal) { arg_error("omitting everything!"); }

  if (showLocation == "") { showLocation = 0; }

  # We first reduce the input file to a stream of words alternating
  # with delimiter strings. This stream is fed through the filter
  # "output_word", which converts break delimiters to blank lines,
  # as requested, and omits line-initial, -medial, or -final tokens,
  # as requested.

  leftover = "=";     # Leftover separators from previous line.
  curToken = "";      # Last token parsed but not printed, or "" if none.
  curLoc = "";        # Locator of the input line where curToken was found.
  curInitial = 0;     # TRUE if curToken was line-initial.
  nBlanks = 2;        # Number of blank lines printed since last non-blank.
  nBlanksNeeded = 0;  # Number of blank lines needed before next non-blank.
}

# An error was reported earlier: stop reading input.
(abort >= 0) { exit abort; }

# Skip blank lines and "#" comment lines.
/^[ ]*([#]|$)/ { next; }

# Skip lines that begin with a locator containing no ";".
/^<[^<>;]*>/ { next; }

# Every remaining line: split it into (delimiter, word) pairs.
// {
  lin = $0;

  # Extract line locator (the text between the leading "<" and ">"):
  if (match(lin, /^<[^>]*>/)) {
    loc = substr(lin, RSTART+1, RLENGTH-2);
  } else {
    loc = "f0r.P0.0;X";
  }

  # Remove line locator:
  gsub(/^<[^>]*> */, "", lin);

  # Remove embedded comments (twice in case of nested "{}"s)
  gsub(/{[^{}]*}/, "", lin);
  gsub(/{[^{}]*}/, "", lin);

  # Remove fillers:
  gsub(/[!]/, "", lin);

  # Reduce all bad char codes to "?"
  gsub(/[?*%]/, "?", lin);

  if (lin == "") { next; }

  # Prepend leftover delimiter:
  lin = ( leftover lin );

  while (match(lin, /^[-\/=., ]+[^-\/=., ]+/)) {
    # Isolate the next word with its preceding delimiter
    w = substr(lin, 1, RLENGTH);
    lin = substr(lin, RLENGTH + 1);

    # Split them apart:
    if (! match(w, /^[-\/=., ]+/)) { prog_error(("missing delimiter")); }
    delim = substr(w, 1, RLENGTH);
    w = substr(w, RLENGTH + 1);

    # Output delimiter and word:
    output_word(delim, w);
  }

  # Only trailing delimiters may remain at this point.
  if (lin !~ /^[-\/=., ]*$/) { prog_error(("left food on plate")); }

  # Newlines in file are implicit word breaks:
  leftover = ( lin ".");
  next;
}

function output_word(delim,w,   isBreak,isParBreak,omit,nB,curFinal)
{
  # Process another delimiter and another word.
  #
  #   delim  non-empty delimiter string that preceded "w".
  #   w      non-empty word.
  #
  # Output is delayed by one token: the previously saved "curToken" is
  # printed (or omitted) here, because only now do we know whether it
  # was followed by a break. Then "w" becomes the pending token.
  # Reads the global "loc" as the locator of "w".

  if (delim == "") { prog_error(("empty delim")); }
  if (w == "") { prog_error(("empty w")); }

  # A delimiter containing "-", "/" or "=" is a break; one containing
  # "=" is also a paragraph break.
  isBreak = (delim ~ /[-\/=]/);
  isParBreak = (delim ~ /[=]/);
  # Note that "isParBreak" implies "isBreak".

  # Outputs "curToken", if appropriate:
  if (curToken != "") {
    curFinal = isBreak;
    omit = 0;
    omit += (omitInitial && curInitial);
    omit += (omitFinal && curFinal);
    omit += (omitMedial && (!curInitial) && (!curFinal));
    if (omit == 0) {
      while (nBlanks < nBlanksNeeded) { print ""; nBlanks++; }
      # Use the locator saved with the token, not the global "loc":
      # by now "loc" may already refer to a later input line.
      if (showLocation) { printf "%s ", curLoc; }
      print curToken;
      nBlanks = 0;
      nBlanksNeeded = 0;
    }
  }

  # Update nBlanksNeeded according to "delim" and options.
  nB = (isBreak && showLines) + (isParBreak && showParags);
  if (nBlanksNeeded < nB) { nBlanksNeeded = nB; }

  # Save "w" as the current token, together with its own locator:
  curToken = w;
  curLoc = loc;
  curInitial = isBreak;
}

END {
  if (abort >= 0) { exit abort; }
  # Flush out last token (the dummy word "EOF" itself is never printed):
  output_word("=", "EOF");
}

function arg_error(msg)
{
  # Reports a bad command line option, prints the usage, and exits with status 1.
  printf "%s\n", msg >> "/dev/stderr";
  printf "usage: %s\n", usage >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function data_error(msg)
{
  # Reports an error in the current input line and exits with status 1.
  printf "line %d: %s\n", FNR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function prog_error(msg)
{
  # Reports an internal inconsistency and exits with status 1.
  printf "line %d: prog error - %s\n", FNR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}