#! /usr/bin/gawk -f
# Last edited on 2023-05-14 13:23:45 by stolfi

# ** MUST BE SAVED IN ISO-LATIN-1 **

BEGIN {
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    "  -f FUNCS.gawk \\\n" \
    "  -v smp=SMP \\\n" \
    "  -v sec=SEC \\\n" \
    "  [ -v table=TABLE.tbl ] \\\n" \
    "  [ -v maxAlpha=NUM ] \\\n" \
    "  < INFILE > OUTFILE " \
  );

  # Converts a Langbank token file ("main.wds") into a "raw.alw" or "raw.tlw"
  # language sample file for VMS comparative analysis studies.
  #
  # Each input record must have the format "{type} {word}", where
  # {type} is one of [#@$aspbn] and defines the token type, as follows:
  #
  #   "#" = {word} is a #-comment (may include blanks).
  #   "$" = start of the section whose full ID is {word}, e.g. "{GEN}{c3}{v14}".
  #   "@" = start of line number {word} in the original text.
  #   "a" = {word} is an alpha token.
  #   "s" = {word} is a symbol-like token (numeral, math symbol, etc.).
  #   "p" = {word} is a punctuation-like token.
  #   "b" = {word} is a blank-like token (should not occur).
  #   "n" = {word} is a null token (should not occur).
  #
  # Output records will have the format "{type} {loc} {word}", where
  # {word} is a text token, {type} is "a" or "s", and {loc} is the
  # token's location in the original book. The location {loc} consists
  # of the full book section ID {curSec}, concatenated with the book
  # line number {curLin} enclosed in "{" and "}".
  #
  # This script first recomputes the type of each input record of type
  # "a", "p", or "s" by calling a procedure from the user-specified
  # library "FUNCS.gawk":
  #
  #   smp_reclassify_word(smp, sec, curSec, curLin, type, word)
  #
  # where {word} is the body of the input record; {type} is its
  # single-character type tag, as above; {smp} and {sec} are the
  # desired sample and sub-sample tags, specified by the user; and
  # {curSec} and {curLin} are the full section ID and line number
  # containing this occurrence of {word} in the original book. The
  # procedure must return a new type for {word}, according to the
  # table above; or "x", meaning that the record is in an unwanted
  # subsection of the selected section and behaves like a symbol
  # (foreign phrase, table, poem, etc.).
  #
  # The values of {curSec} and {curLin} are obtained from the last "$"
  # and "@" records, respectively, before the current record. Note
  # that the format of {sec} may be very different from that of
  # {curSec}; e.g. {sec="gen.1"} could mean "take the whole text from
  # Genesis", i.e. {curSec ~ /{GEN}{c[0-9]+}{v[0-9]+}/}.
  #
  # After calling {smp_reclassify_word}, the script discards the
  # record if its new type is "#", "$", "@", "n", or "b".
  #
  # The remaining "a", "p", "s", or "x" records have their {word}
  # adjusted to be suitable for statistical analysis. This may include
  # a change of encoding, de-capitalization, compound splitting,
  # suppression of some letters or words, etc.
  #
  # The script then applies the function
  #
  #   smp_fix_word(smp, sec, type, word)
  #
  # where {smp} and {sec} are the desired sample and section, {type}
  # is the new type ("a", "p", "s", or "x"), and {word} is the input
  # word. The procedure should return a cleaned copy of {word}, e.g.
  # without capitalization or undesired markings. It may split {word}
  # by inserting blanks and/or newlines.
  #
  # A word that gets remapped to "*DELETE*", "*delete*", or all blanks
  # is discarded. The result of {smp_fix_word} is split at blanks or
  # newlines, and each field is processed as if it were a separate
  # input record, with the same {type} and {loc}.
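  #
  # For illustration only (this script does not define these
  # procedures, and the sample-specific logic shown is hypothetical),
  # a minimal "FUNCS.gawk" compatible with the calls above and with
  # {smp_is_good_word} and {smp_define_patterns} (described further
  # below) might look like this sketch:
  #
  #   function smp_define_patterns(smp, sec)
  #     { # Precompile the pattern that recognizes the wanted section:
  #       sec_pat = "^[{]GEN[}][{]c[0-9]+[}][{]v[0-9]+[}]";
  #     }
  #   function smp_reclassify_word(smp, sec, curSec, curLin, type, word)
  #     { # Drop ("n") anything outside the wanted section; treat
  #       # bracketed editorial matter as an intrusion ("x"); else
  #       # keep the original {type}:
  #       if (curSec !~ sec_pat) { return "n"; }
  #       if (word ~ /^\[.*\]$/) { return "x"; }
  #       return type;
  #     }
  #   function smp_fix_word(smp, sec, type, word)
  #     { # De-capitalize and apply the {wmap} table, if loaded:
  #       word = tolower(word);
  #       if (word in wmap) { word = wmap[word]; }
  #       return word;
  #     }
  #   function smp_is_good_word(smp, sec, type, word)
  #     { # Accept only purely alphabetic words:
  #       return (word ~ /^[a-z]+$/);
  #     }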
  #
  # The individual "a" records are then tested with
  # {smp_is_good_word(smp, sec, type, word)}, and the {type} is
  # changed to "s" if the result is FALSE. The "a", "p", and "s"
  # records are then written to the output.
  #
  # As for "x"-records, runs of three or more are squeezed, leaving
  # only the first and last records of the run. These have their word
  # enclosed in braces and marked with "*", e.g. "finis" becomes
  # "*{finis}", and their type gets replaced by "s".
  #
  # The resulting records are written to standard output. The output
  # is truncated after {maxAlpha} "a"-type tokens have been written;
  # by default the whole input file is processed.
  #
  # INITIALIZATION
  #
  # If the {table} argument is specified, it must name a file that
  # contains pairs of words "{OLD} {NEW}". This script will read that
  # file and create an array {wmap} with {wmap[OLD] = NEW}. This table
  # may then be used by {smp_reclassify_word} and/or {smp_fix_word}.
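  #
  # For illustration only, a {table} file might look like this
  # (the entries are hypothetical; "#" starts a comment):
  #
  #   iesvs    iesus
  #   xps      christus   # scribal abbreviation expanded
  #   michi    mihi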
  #
  # The "FUNCS.gawk" library must also define a function
  #
  #   smp_define_patterns(smp, sec)
  #
  # that will be called by this script after loading the {wmap} table
  # (if any) but before processing the first input record. This
  # procedure could, for instance, precompile any complicated patterns
  # to be used by {smp_reclassify_word}.

  if (smp == "") { arg_error("must define \"smp\""); }
  if (sec == "") { arg_error("must define \"sec\""); }
  if (maxAlpha == "") { maxAlpha = -1; }
  if (maxRead == "") { maxRead = -1; }

  debug = 0;
  # debug = 1; maxRead = 200;

  s = "???"; n = "???";

  curSec = "";
  curLin = "";

  nExRun = 0; # Number of consecutive "x"-type records generated so far.
  # Data of last "x"-type output record (not yet written), if {nExRun >= 2}:
  lastExSec = "";   # Location (section ID).
  lastExLin = "";   # Original line number.
  lastExWord = "";  # Word.

  nRead = 0;     # Total records read.
  nWritten = 0;  # Total records written to output.
  nAlpha = 0;    # Total "a"-type records written.
  nSymbol = 0;   # Total "s"-type records written.
  nPunct = 0;    # Total "p"-type records written.
  nIntrude = 0;  # Total "x"-type records written (as "s"-type).

  split("", wmap);
  if (table != "") {
    # Read word-remapping table, if present:
    load_remapping_table(table);
  }
  if (field == "") { field = 0; }

  smp_define_patterns(smp, sec);
}

(abort >= 0) { exit abort; }

/^ *$/ { next; }

# Stop if enough records have been read:
((maxRead >= 0) && (nRead >= maxRead)) { exit 0; }

($1 ~ /^.$/) {
  nRead++;
  # Get type tag {type} and body {word}:
  type = $1;
  if (type == "#") {
    word = substr($0, 3);
  } else {
    if (NF != 2) { data_error("bad input format"); }
    word = $2;
  }
  # Dispose of the record according to its type:
  if (type == "$") {
    curSec = word;
  } else if (type == "@") {
    curLin = word;
  } else if (type ~ /^[#nb]/) {
    # Discard comments, null tokens, and blank-like tokens.
  } else if (type ~ /^[aspx]/) {
    if (debug) { printf "@@ %s:[%s]\n", type, word > "/dev/stderr"; }
    # Reclassify the word and check whether it is inside the desired section:
    otype = type;
    type = smp_reclassify_word(smp, sec, curSec, curLin, type, word);
    if (debug) { printf " -r-> %s:[%s]\n", type, word > "/dev/stderr"; }
    if (type ~ /[apsx]/) {
      # Apply sample-specific adjustments:
      oword = word;
      word = smp_fix_word(smp, sec, type, word);
      if (debug) { printf " -f-> %s:[%s] -> %s:%s\n", type, oword, type, word > "/dev/stderr"; }
      if ((word == "*DELETE*") || (word == "*delete*")) { word = ""; }
      # Split into separate words at blanks or newlines:
      nwds = split(word, wds, /[ \012]+/);
      # Write each word separately:
      for (i = 1; i <= nwds; i++) {
        wdi = wds[i]; tpi = type;
        if (wdi != "") {
          if (tpi == "a") {
            # Reclassify each piece:
            tpi = ( smp_is_good_word(smp, sec, tpi, wdi) ? type : "s" );
          } else {
            tpi = type;
          }
          if (debug) { printf " -g-> %s:[%s]\n", tpi, wdi > "/dev/stderr"; }
          output_word(tpi, curSec, curLin, wdi);
        }
      }
      next;
    } else if (type !~ /[#nb]/) {
      data_error(("invalid new type tag \"" type "\""));
    }
  } else {
    data_error(("unknown input type tag \"" type "\""));
  }
  next;
}

# Any other record is malformed:
// { data_error(("invalid input type tag \"" $1 "\"")); }

END {
  if (abort >= 0) { exit abort; }
  flush_x_record();
  printf " %d records read, %d written (%d alpha, %d symbol, %d punct, %d intrusions)\n", \
    nRead, nWritten, nAlpha, nSymbol, nPunct, nIntrude > "/dev/stderr";
}

function output_word(type,aSec,aLin,word,   gud) {
  # Outputs word {word} of type {type}, squeezing long runs of "x" tokens.
  # Updates {nAlpha}, {nSymbol}, {nPunct}, {nIntrude}, {nWritten}.
  # Manages {nExRun}, {lastExSec}, {lastExLin}, {lastExWord}.
  # Exits the script (with status 0) after writing {maxAlpha} "a"-records.
  if (type == "x") {
    # Squeeze runs of "x" records, mark them as bad (type "s"):
    word = ("*{" word "}");
    type = "s";
    if (nExRun == 0) {
      # First in a run of "x"-records, print it:
      print fmt_word(type,aSec,aLin,word);
      nWritten++; nIntrude++;
      nExRun = 1;
    } else {
      # Non-first in a run of "x"-records, save it for now:
      lastExSec = aSec; lastExLin = aLin; lastExWord = word;
      nExRun++;
    }
  } else {
    # Do we have any "x"-record waiting to be printed?
    flush_x_record();
    # Print the current record:
    print fmt_word(type,aSec,aLin,word);
    nWritten++;
    if (type == "a") {
      nAlpha++;
      # Have we written enough "a"-records?
      if ((maxAlpha >= 0) && (nAlpha >= maxAlpha)) { exit 0; }
    } else if (type == "s") {
      nSymbol++;
    } else if (type == "p") {
      nPunct++;
    }
  }
}

function flush_x_record() {
  # If there is any "x"-record suspended, print it:
  if (nExRun >= 2) {
    if (nExRun >= 3) { lastExWord = ( ".." lastExWord ); }
    # Flush the last "x"-record:
    print fmt_word("s",lastExSec,lastExLin,lastExWord);
    nWritten++; nIntrude++;
  }
  nExRun = 0;
}

function fmt_word(type,aSec,aLin,word) {
  # Formats a word {word} of type {type} for output,
  # including its location {aSec} and {aLin}:
  return sprintf("%s %s{%s} %s", type, aSec, aLin, word);
}

function load_remapping_table(file,   nMap,lin,fld,nfld) {
  # Reads a word-mapping table from {file}, containing pairs
  # of the form "ORIGINAL NEW".
  # Stores the table as {wmap[ORIGINAL] = NEW}.
  nMap = 0;
  split("", wmap);
  ERRNO = "";
  while ((getline lin < file) > 0) {
    gsub(/^[ ]*/, "", lin);
    if (! match(lin, /^([#]|$)/)) {
      gsub(/[ ]*[#].*$/, "", lin);
      nfld = split(lin, fld, " ");
      if (nfld != 2) { tbl_error(file, ("bad table entry = \"" lin "\"")); }
      if (fld[1] in wmap) { tbl_error(file, ("repeated key = \"" lin "\"")); }
      wmap[fld[1]] = fld[2];
      nMap++;
    }
  }
  if (ERRNO != "") { arg_error((file ": ERRNO = " ERRNO)); }
  close(file);
  printf " loaded %6d map pairs\n", nMap > "/dev/stderr";
}

function arg_error(msg) {
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1; exit 1;
}

function data_error(msg) {
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}

function tbl_error(file, msg) {
  printf "file %s, line %s: %s\n", file, FNR, msg > "/dev/stderr";
  abort = 1; exit 1;
}
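
# For illustration only: the file and sample names below are
# hypothetical, and the output assumes an {smp_fix_word} that merely
# de-capitalizes and an {smp_reclassify_word} that keeps the types.
# Under those assumptions the script could be invoked as
#
#   gawk -f THIS_SCRIPT.gawk -f FUNCS.gawk \
#     -v smp=vul -v sec=gen.1 -v table=remap.tbl \
#     < main.wds > raw.alw
#
# turning input records such as
#
#   $ {GEN}{c1}{v1}
#   @ 12
#   a In
#   a principio
#
# into output records such as
#
#   a {GEN}{c1}{v1}{12} in
#   a {GEN}{c1}{v1}{12} principio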