#! /usr/bin/gawk -f
# Last edited on 1999-01-04 17:59:07 by stolfi

# Creates a raw concordance for all words and short phrases in an
# interlinear file.
#
# Usage:
#
#   cat TEXT \
#     | enum-text-phrases -f eva2erg.gawk \
#         [-v maxLength=MAXLEN] \
#         [-v leftContext=LCONT] \
#         [-v rightContext=RCONT] \
#     > OCCS
#
# This script reads a single-version text from stdin (in plain or
# ".evt" interlinear format), and, for every occurrence of every word
# or sufficiently short phrase in it, writes to stdout a record of the
# form
#
#   LOC TRANS START LENGTH LCTX PHRASE RCTX
#    1    2     3      4     5     6     7
#
# where
#
#   LOC is a line locator, e.g. "f86v5.L5" or "f86v5.C2.12a"
#
#   TRANS is a letter identifying a transcriber, e.g. "F" for Friedman
#
#   START is the index of the phrase in the line (START=1 means col. 20).
#
#   LENGTH is the original length of the phrase in the text,
#     including fillers, {}-comments, spaces, etc.
#
#   LCTX is a word or word sequence, the left context of PHRASE,
#
#   PHRASE is the word or word sequence in question.
#
#   RCTX is a word or word sequence, the right context of PHRASE
#
#
# The PHRASE does not contain any EVA fillers ([!% ]), newlines, or
# {}-comments.
#
# A word is a string of characters delimited by any of the EVA
# word-spaces [-=,.].  (Note that ASCII blanks are treated as fillers
# and not word breaks.)
#
# The PHRASE may extend across gaps and ordinary line breaks
# ("-") but not paragraph breaks ("=") or changes of textual unit.  In
# that case the LENGTH field does not count the newline characters,
# columns 1-19, or any intervening "#"-comments.
#
# In both the phrase and the context strings, the uncertain spaces ","
# are replaced by ordinary spaces ".".  The line and paragraph breaks
# are preserved, except that line-final "-" is changed to "/" to
# distinguish it from embedded "-" denoting gaps, vellum defects, or
# intruding figures.
# A multi-word PHRASE is considered sufficiently short if it contains
# at most MAXLEN EVA characters, not counting fillers and word spaces.
#
# The context fields LCTX and RCTX are written only if the parameters
# "leftContext" and "rightContext" are specified.  If they are
# present, each of these two fields includes the word separator
# characters that delimit it on both sides.  The left context includes
# as many whole words as needed to make "leftContext" characters
# (including all delims after the first); and symmetrically for
# "rightContext".
#
# If the file is in ".evt" format, this script will get the LOC
# and TRANS fields from columns 1-19.  The TRANS code is
# optional, but the unit must be explicit (i.e. there should be no
# "anonymous" units).  If the input file is not in ".evt" format, the
# script assumes the location code is f0.P.NNN;X where NNN is the
# input record number.

function gather_words(str, wp, wl, ws,   i, k, kb, m, n, b, c, bSep, cSep)
{
  # Stores in wp[i] the index of the first char of the i-th non-empty
  # word of "str"; in wl[i] its length; and in ws[i] the following
  # separator (or "." if for some reason the separator is not found).
  # Returns the number of words found.
  # Assumes "str" has been cleaned of comments.
  # (Fix: "n" was missing from the local list and leaked as a global.)

  # Turn line-final "-" into "/" (a true line break, not a gap).
  # (Fix: the gensub "how" argument was " "; the pattern is anchored
  # at "$", so "g" is equivalent and correct usage.)
  str = gensub(/[-]([!%]*)$/, "/\\1", "g", str);
  # Replace uncertain spaces by ".":
  gsub(/[,]/, ".", str);
  # Pad with "." on both sides, just as sentinels:
  str = ("." str ".");
  m = length(str);
  n = 0;
  b = substr(str, 1, 1);
  if (b != ".") { error("internal padding error"); exit; }
  bSep = 1;
  for (k = 2; k <= m; k++) {
    c = substr(str, k, 1);
    # (Fix: the "/" inside the bracket expression is now escaped for
    # POSIX ERE portability; behavior under gawk is unchanged.)
    cSep = (match(c, /[-\/.,=]/) != 0);
    # A word starts on a separator -> non-separator transition...
    if (bSep && (! cSep)) { kb = k; }
    # ...and ends on the opposite transition; indices are shifted by 1
    # to undo the leading sentinel:
    if ((! bSep) && cSep) { n++; wp[n] = kb - 1; wl[n] = k - kb; ws[n] = c; }
    b = c; bSep = cSep;
  }
  if (c != ".") { error("internal padding error"); exit; }
  return n;
}

# === THE WORD BUFFER ===========================================

function clear_words()
{
  # Clears the word buffer:
  nbuf = 0;
  split("", wbuf); # wbuf[1..nbuf] are the saved words, squeezed.
  split("", wlen); # wlen[i] is length(wbuf[i]).
  split("", wloc); # wloc[i] is the loc and trcode of the line containing wbuf[i] (sep " ").
  split("", wiof); # wiof[i] is the index of wbuf[i]'s first char in the line.
  split("", wfof); # wfof[i] is the index of the first char after wbuf[i].
  split("", wsep); # wsep[i] is the word separator after wbuf[i].
  split("", wskp); # wskp[i] is the number of non-word bytes skipped after wbuf[i].
  wskp[0] = 0;     # Sentinel, so append_filler works on an empty buffer.
}

function append_word(wd, sep, loc, iindex, findex)
{
  # Appends to the word buffer the word "wd" and following separator
  # "sep", with location "loc" and original indices [iindex..findex-1].
  if (iindex < 1) { error("append_word: iindex error"); exit; }
  if (findex <= iindex) { error("append_word: findex error"); exit; }
  nbuf++;
  wbuf[nbuf] = wd;
  wlen[nbuf] = length(wd);
  wloc[nbuf] = loc;
  wiof[nbuf] = iindex;
  wfof[nbuf] = findex;
  wsep[nbuf] = sep;
  wskp[nbuf] = 0;
}

function append_filler(len)
{
  # Records that "len" filler bytes were skipped after the
  # last word in the buffer.
  if (len < 0) { error("append_filler: internal error"); exit; }
  wskp[nbuf] += len
}

function dump_phrases(   i, j, k, len, off, locw, olen)
{
  # Writes to stdout one record for every word and every sufficiently
  # short phrase currently in the word buffer, then clears the buffer.
  # Also increments nWords and nPhrases.
  # (Fix: "off" was missing from the local list and leaked as a
  # global; the unused locals "ctx" and "wtmp" were dropped.)
  for (i = 1; i <= nbuf; i++) {
    nWords++;
    j = i; len = 0; off = wiof[i]; olen = 0; locw = wloc[i];
    # Grow the phrase wbuf[i..j] one word at a time while its squeezed
    # length stays within maxLength (a single word is always printed,
    # whatever its length).  LOC and START are those of the first word.
    while ((j <= nbuf) && ((j == i) || (len + wlen[j] <= maxLength))) {
      # Output one record for phrase "wbuf[i..j]":
      olen += wfof[j] - wiof[j];
      len += wlen[j];
      printf "%s %d %d", locw, off, olen;
      if (leftContext >= 0) { print_left_context(i, leftContext); }
      printf " %s", wbuf[i];
      for (k = i+1; k <= j; k++) { printf "%s%s", wsep[k-1], wbuf[k]; }
      if (rightContext >= 0) { print_right_context(j, rightContext); }
      printf "\n";
      olen += wskp[j];
      j++;
      nPhrases++;
    }
  }
  clear_words();
}

function print_left_context(i, minlen,   k, len)
{
  # Prints the words preceding "wbuf[i]", and their delimiters,
  # with at least "minlen" total word characters (or up to the start
  # of the buffer, whichever comes first).
  len = 0;
  k = i - 1;
  while ((len < minlen) && (k >= 1)) { len += 1 + wlen[k]; k--; }
  # "=" marks a context that ran into the start of the buffer:
  printf " %s", (k < 1 ? "=" : wsep[k]);
  k++;
  while (k < i) { printf "%s%s", wbuf[k], wsep[k]; k++; }
}

function print_right_context(j, minlen,   k, len)
{
  # Prints the words following "wbuf[j]", and their delimiters,
  # with at least "minlen" total word characters (or up to the end
  # of the buffer, whichever comes first).
  len = 0;
  printf " %s", wsep[j];
  k = j + 1;
  while ((len < minlen) && (k <= nbuf)) {
    printf "%s%s", wbuf[k], wsep[k];
    len += wlen[k] + 1;
    k++;
  }
}

# === ACTIONS ===================================================

BEGIN {
  abort = 0;
  # Command-line parameters, with defaults (-1 disables a context field):
  if (maxLength == "") { maxLength = 0; }
  if (leftContext == "") { leftContext = -1; }
  if (rightContext == "") { rightContext = -1; }
  # Clears the word buffer:
  clear_words();
  # Count of words seen so far (including those already dumped):
  nWords = 0;
  # Count of phrases written to stdout:
  nPhrases = 0;
  # Current page+unit, line, and transcriber code:
  cur_unit = ""; cur_line = ""; cur_trans = "";
}

/^#/ {
  # "#"-comment lines are ignored.
  if (abort) exit;
  next;
}

/./ {
  if (abort) exit;
  # Parse an ".evt" location code like "<f86v5.L5;F>" in columns 1-19:
  if (match($0, /^<[fc][0-9]+[vr][0-9]?[.]([A-Za-z][A-Za-z0-9]?[.]|[A-Z]|)[0-9]+[a-z]?([;][A-Z]|)>/)) {
    loc = substr($0, 2, RLENGTH-2); skip = 19;
  } else if (substr($0, 1, 1) == "<") {
    error("bad location code");
  } else {
    # Plain (non-".evt") input: fabricate a location from the record number.
    loc = ("f0.P." NR ";X"); skip = 0;
  }
  if (skip >= length($0)) next;

  # Analyze and regularize location code:
  len = length(loc);
  if (substr(loc, len-1, 1) != ";") {
    trans = "X";
    # error("semicolon?");
  } else {
    trans = substr(loc, len, 1);
    loc = substr(loc, 1, len-2);
  }
  if (! match(loc, /[0-9]+[a-z]?$/)) {
    error("prog error: no line num");
  } else {
    line = substr(loc, RSTART);
    unit = substr(loc, 1, RSTART-1);
  }

  # A single-version text must have a single transcriber code.
  # (Fix: this check used the never-initialized global "cur_trn" while
  # BEGIN initialized "cur_trans"; the name is now consistent.)
  if (trans == cur_trans) {
  } else if (cur_trans == "") {
    cur_trans = trans;
  } else {
    error("wrong transcriber code");
  }
  loc = (loc " " trans);

  # Do not consider phrases that span more than one text unit:
  if (unit != cur_unit) {
    dump_phrases(); cur_unit = unit;
  } else if (line == cur_line) {
    error("repeated line");
  }
  cur_line = line;

  # Get the text proper "txt", omitting the skipped part,
  # and replace comments and fillers by "!" (erg_erase_comments is
  # supplied by the companion library eva2erg.gawk):
  txt = erg_erase_comments(substr($0, 1+skip));

  # Replace four or more "*"s in the same word by "***"
  # (first bubble every "*" leftward past "!" fillers):
  while (gsub(/[!][*]/, "*!", txt) > 0) { }
  while (sub(/[*][*][*][*]/, "!***", txt) > 0) { }

  # Extract the non-empty words from "txt":
  split("", wp); split("", wl); split("", ws);
  nw = gather_words(txt, wp, wl, ws);

  # Append the words to the word buffer, tracking skipped filler bytes:
  findex = 1;
  for (i = 1; i <= nw; i++) {
    wd = erg_pack(substr(txt, wp[i], wl[i]));
    if (wd != "") {
      iindex = wp[i];
      append_filler(iindex - findex);
      findex = wp[i] + wl[i];
      # print i, wp[i], wl[i], iindex, findex > "/dev/stderr";
      append_word(wd, ws[i], loc, iindex, findex);
    }
  }
  append_filler(length(txt) - findex);

  # Phrases should not span paragraph boundaries ("=" must be at the
  # end of the line, modulo trailing fillers):
  if (match(txt, /[=]/)) {
    if (! match(substr(txt, RSTART+1), /[!% ]*$/)) { error("embedded paragraph"); }
    dump_phrases();
  }
  next;
}

END {
  if (abort) exit;
  # Flush any words still buffered, then report totals on stderr:
  dump_phrases();
  printf "read %6d words\n", nWords > "/dev/stderr";
  printf "wrote %6d phrases\n", nPhrases > "/dev/stderr";
}

function error(msg)
{
  # Reports "msg" on stderr with the current input line number and
  # aborts the run (the pattern-action rules check "abort" on entry).
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  exit
}