#! /usr/bin/gawk -f

# Must specify -f eva2erg.gawk

# Creates a raw concordance for all words and short phrases in
# a label/title dictionary
#
# Usage:
#
#    cat TEXT \
#      | enum-label-phrases -f eva2erg.gawk \
#          [-v maxlen=MAXLEN] \
#      > OCCS
#
# This script reads from stdin a database of labels and titles,
# in the ".idx" format (see Note-010.html).  Each input record should
# have the form
#
#    LOCATION|LABEL|CLASS|MEANING|SECTION|COMMENTS
#    1        2     3     4       5       6
#
# and, for each occurrence of every word or sufficiently short
# phrase in the LABEL, writes to stdout a record of the form
#
#    FNUM UNIT LINE TRANS START LENGTH POS STRING OBS
#    1    2    3    4     5     6      7   8      9
#
# where FNUM, UNIT, LINE and TRANS are the components of the LOCATION
# (obtained by replacing every "." and ";" in it with a blank),
# and the OBS field is "SECTION:MEANING", or just SECTION when the
# MEANING is "?"; an empty OBS is written as "-".  Blanks in MEANING
# are replaced by "_".  The POS field is set to zero.  The other
# fields are as defined in enum-text-phrases, except that the output
# STRINGs will always be confined to a single input line.
#
# A phrase is emitted when the summed lengths of its words do not
# exceed "maxlen"; single words are always emitted.  If "maxlen" is
# not given it defaults to 0, so only single words are written.
#
# Input lines starting with "#" and empty lines are ignored.

function error(msg)
{
  # Reports "msg" on stderr, tagged with the current input record
  # number, flags the run as aborted and terminates the script.
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  exit
}

function gather_words(str, wd, wo, wl, \
  k, kb, m, n, b, c, ww)
{
  # Stores in wd[i] the ith non-empty word of "str",
  # sets wo[i] to its 1-based index in the cleaned-up text
  # and wl[i] to its length there (before packing).
  # Returns the number of words found.
  #
  # Words are maximal runs of non-"." characters.  The erg_* helpers
  # are not defined in this file; presumably they come from
  # eva2erg.gawk -- verify their exact semantics there.

  str = erg_unify_word_spaces(erg_erase_comments(str));

  # Pad with "." on both sides so every word has a delimiter
  # before and after it; this shifts all indices by one.
  str = ("." str ".");
  m = length(str);
  n = 0;
  b = substr(str, 1, 1);
  for (k = 2; k <= m; k++)
    {
      c = substr(str, k, 1);
      if ((b == ".") && (c != "."))
        {
          # Transition delimiter -> word: remember where it starts.
          kb = k;
        }
      if ((b != ".") && (c == "."))
        {
          # Transition word -> delimiter: the word is str[kb..k-1].
          ww = erg_pack(substr(str, kb, k - kb));
          if (ww != "")
            {
              n++;
              wd[n] = ww;
              wo[n] = kb - 1;   # undo the shift from the leading "."
              wl[n] = k - kb;
            }
        }
      b = c;
    }
  return n;
}

function dump_phrases(lc, wd, wo, wl, nw, ob, \
  i, j, k, len, olen, pos)
{
  # Writes to stdout all words and short phrases
  # formed from words "wd[1..nw]", given their
  # original indices "wo[1..nw]" and lengths "wl[1..nw]".
  # Prints them with location "lc", and observations "ob"
  # ("-" when "ob" is empty).
  #
  # Uses the global "maxlen" as the limit on the summed word lengths
  # of a phrase, and increments the global counters "nWords" (once
  # per starting word) and "nPhrases" (once per output record).

  # Split the location into blank-separated components.
  gsub(/[.;]/, " ", lc);
  pos = 0;
  for (i = 1; i <= nw; i++)
    {
      nWords++;
      j = i;
      len = 0;
      olen = 0;
      # Extend the phrase wd[i..j] one word at a time; the single-word
      # phrase (j == i) is written regardless of "maxlen".
      while ((j <= nw) && ((j == i) || (len + length(wd[j]) <= maxlen)))
        {
          len += length(wd[j]);
          # Span of the phrase in the text, delimiters included.
          olen = (wo[j] - wo[i]) + wl[j];
          printf "%s %d %d %d %s", lc, wo[i], olen, pos, wd[i];
          for (k = i + 1; k <= j; k++)
            {
              printf ".%s", wd[k];
            }
          printf " %s\n", (ob == "" ? "-" : ob);
          j++;
          nPhrases++;
        }
    }
}

# === ACTIONS ===================================================

BEGIN {
  abort = 0;
  FS = "|";
  nPhrases = 0;
  nWords = 0;
  # Make the documented "optional" status of maxlen explicit: an
  # unset maxlen already behaved as 0 in the length comparison.
  if (maxlen == "") { maxlen = 0; }
}

# Comment lines of the input are skipped.
/^#/ {
  if (abort) exit;
  next;
}

# Any other non-empty line is a LOCATION|LABEL|CLASS|MEANING|SECTION|...
# record.
/./ {
  if (abort) exit;
  location = $1;
  text = $2;
  class = $3;
  # Keep MEANING a single blank-free token in the output.
  meaning = gensub(/ /, "_", "g", $4);
  section = $5;
  # Clear the per-record word tables.
  split("", words);
  split("", indices);
  split("", lengths);
  obs = (meaning == "?" ? section : (section ":" meaning));
  nw = gather_words(text, words, indices, lengths);
  dump_phrases(location, words, indices, lengths, nw, obs);
  next;
}