#! /n/gnu/bin/gawk -f
# Must specify -f eva2erg.gawk

# BUGGY AND OBSOLETE NOW. THE PART THAT SHOWS OCCURRENCES MAY STILL BE USEFUL -
# READ A CONCORDANCE FILE AND A TEXT FILE, AND PRINT THEM

# Finds exact or approximate word matches between two files.
#
# Usage:
#
#   cat INPUT \
#     | find-matches \
#         [ -v field=FLDNUM ] \
#         -v wordfile=WORDFILE \
#     > OCCS
#
# This script reads a list of words from WORDFILE, and
# then reports on stdout every place in stdin that
# contains one of those words.
#
# Each record of the input files should begin with one blank-delimited
# word.  File FILEA must be sorted by that word.
#
# Two records are said to match if they begin with the same word.
# (NOTE: FILEA and FILEB are not defined anywhere in this script; the two
# paragraphs above appear to be left over from another tool.)
#
# Whenever it finds an occurrence, this script writes a record of the form
#
#   LOC OFFSET POS WORD MATCH
#
# where
#
#   WORD is a word as read from the list.
#
#   MATCH is the string from stdin that matched WORD.
#
# Fillers ([!% ]) and comments in the text are always ignored in
# the comparisons.  Normally the non-ignored characters are
# compared for equality.  If "ignoreq=1" is specified, the script also
# ignores the EVA "q" characters.  If "forgiving=1" is specified,
# the words are compared by a looser, error-tolerant criterion.
# In any case the OFFSET is the index of the first non-space
# character of the line that matched the first character of WORD.
#
# If the file is in ".evt" format, the transcriber code (";S") is optional.
# If the input file is not in ".evt" format, the output location code
# is simply NNN, where NNN is the input record number.
# (NOTE: the code as written leaves the location field empty in this case.)
#
# If "aswords=1" is given then EVA word spaces [-=,.] are
# treated as significant characters; also the target words will
# match only whole text words.  Otherwise the spaces are ignored
# and the patterns will match anywhere within a single line.
#
# The POS field is the number of "matching opportunities" preceding
# the occurrence, excluding comments and fillers: If aswords=0, it is
# the number of text characters compared: EVA characters if
# forgiving=ignoreq=0, reduced characters if forgiving=1 or ignoreq=1.
# If aswords=1, it is the number of words.
#
# The "show=1" option causes the script to print the input text
# and all occurrences aligned with the text.
#

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current record number,
  # raises the global "abort" flag (tested by every rule and by END),
  # and terminates with a non-zero status so that callers can detect
  # the failure.
  printf "line %d: %s\n", NR, msg > "/dev/stderr"
  abort = 1
  exit 1
}

function recode_text(str)
{
  # Converts a textstring with no embedded "{}"s to the
  # requested encoding, preserving length.
  # The encoding is selected by the global options "forgiving",
  # "ignoreq" and "aswords"; the erg_* helpers come from eva2erg.gawk.
  str = (forgiving ? erg_blur_details(str) : str);
  str = (ignoreq ? erg_erase_q(str) : str);
  return (aswords ? erg_unify_word_spaces(str) : erg_erase_word_spaces(str));
}

function word_counts(txt, wct,    i, k, m, b, c)
{
  # Stores in wct[i] the number of non-empty words
  # that begin before character "i" of "txt".
  # Assumes all word breaks have been reduced to "."
  # and "txt" has been padded with one "." on each side.
  # Calls error() (which exits) if the padding is missing.
  m = length(txt);
  k = 0;
  b = substr(txt, 1, 1);
  if (b != ".") error("internal padding error");
  wct[1] = 0;
  c = b;  # so that the final check is well defined when m == 1
  for (i = 2; i <= m; i++) {
    c = substr(txt, i, 1);
    # A word is counted at the "." that terminates it.
    if ((b != ".") && (c == ".")) k++;
    wct[i] = k;
    b = c;
  }
  if (c != ".") error("internal padding error");
}

function packed_positions(txt, org,    i, m, n)
{
  # Stores in org[j] the 1-based position in "txt" of its j-th
  # character other than "!", and returns the number of such characters.
  # This mirrors the packing described in the main rule ("pck = txt with
  # all "!"s squeezed out"), so org[] maps indices of "pck" back to "txt".
  split("", org);
  m = length(txt);
  n = 0;
  for (i = 1; i <= m; i++) {
    if (substr(txt, i, 1) != "!") org[++n] = i;
  }
  return n;
}

# === ACTIONS ===================================================

BEGIN {
  # Loads the word list: dic[k] is the word as read, pat[k] its packed
  # search pattern, ctr[k] the number of occurrences found so far.
  abort = 0;
  if (wordfile == "") error("must specify \"-v wordfile=FILE\"\n");
  nWords = 0;
  split("", dic);
  split("", pat);
  split("", ctr);
  while ((status = (getline w < wordfile)) > 0) {
    p = erg_pack(recode_text(w));
    dic[nWords] = w;
    # In word mode the pattern is delimited by "." so that it can
    # only match whole words of the (equally padded) text.
    pat[nWords] = (aswords ? ("." p ".") : p);
    ctr[nWords] = 0;
    nWords++;
  }
  # getline yields -1 when the file cannot be read; do not mistake
  # that for an empty word list.
  if (status < 0) error("cannot read word file \"" wordfile "\"");
  close(wordfile);
  printf "loaded %6d words\n", nWords > "/dev/stderr"
  nOccs = 0;   # Occurrences found so far
  nSites = 0;  # Matching opportunities found so far.
}

# Comment lines: echoed in "show" mode, otherwise ignored.
/^#/ {
  if (abort) exit;
  if (show) print;
  next
}

# Text lines: search every pattern in the packed text.
/./ {
  if (abort) exit;
  if (show) print;

  # Lines starting with "<" carry a location code; the text proper is
  # taken to start after a fixed 19-character prefix.
  if (substr($0, 1, 1) == "<") {
    loc = substr($0, 1, index($0, ">"));
    skip = 19;
  } else {
    loc = ("");
    skip = 0;
  }
  if (skip >= length($0)) next;

  # lin = original line without skipped part and with comments mapped to "!"
  lin = erg_erase_comments(substr($0, 1 + skip));

  # txt = same as lin, with all ignorable details mapped to "!"
  txt = recode_text(lin);
  if (show && (forgiving || ignoreq)) print (substr($0, 1, skip) txt);

  # If matching words, we must pad "txt" with "." to get correct
  # matches at line extremities.  (Note that each pattern has been padded
  # with "."s in this case)
  pad = (aswords ? 1 : 0);
  if (aswords) {
    txt = ("." txt ".");
    split("", wct);
    word_counts(txt, wct);
  }

  # pck = txt with all "!"s squeezed out
  pck = erg_pack(txt);
  if (show) print (substr($0, 1, skip) pck);

  # Position in txt of each character of pck.
  # NOTE(review): the original called erg_char_offsets(txt, iof, fof) here,
  # but the code that consumed iof/fof was lost and the helper's contract
  # is not visible, so the mapping is computed locally instead.
  split("", org);
  if (packed_positions(txt, org) != length(pck)) error("internal packing error");

  # NOTE(review): in the file as received, everything between
  # "for(k=0;k" and a trailing "> /dev/stderr" diagnostic had been
  # stripped.  The loop below is a reconstruction based on the surviving
  # statements (the print of loc/offset/pos/dic[k]/occ and the "next
  # occurrence" search) and on the header comments; the content of the
  # lost stderr diagnostic is unknown and is not reproduced.
  for (k = 0; k < nWords; k++) {
    p = pat[k];
    if (p == "") continue;
    i = index(pck, p);
    while (i != 0) {
      # First and last packed characters of WORD proper, leaving out
      # the "." delimiters of the pattern in word mode.
      ini = i + pad;
      fin = i + length(p) - 1 - pad;
      if (fin >= ini) {
        # Same characters as 1-based positions in lin (undoing the pad).
        a = org[ini] - pad;
        b = org[fin] - pad;
        # assumes OFFSET is a 1-based index into the whole input record
        # - TODO confirm against consumers of the output
        offset = skip + a;
        pos = nSites + (aswords ? wct[org[ini]] : ini - 1);
        occ = substr(lin, a, b - a + 1);
        nOccs++;
        ctr[k]++;
        # Occurrence aligned under the echoed text, per the header notes.
        if (show) printf "%*s%s\n", offset - 1, "", occ;
        print loc, offset, pos, dic[k], occ;
      }
      # Look for the next (possibly overlapping) occurrence.
      j = index(substr(pck, i + 1), p);
      i = (j == 0 ? 0 : i + j)
    }
  }

  # wct[] is indexed by positions of txt, so the word total of the line
  # is found at the closing "." of txt, not at length(pck).
  nSites += (aswords ? wct[length(txt)] : length(pck));
  next
}

END {
  # Summary on stderr: totals, then the words that never matched.
  if (abort) exit;
  printf "tested %6d potential matching sites\n", nSites > "/dev/stderr";
  printf "found %6d occurrences\n", nOccs > "/dev/stderr";
  printf "words not found:\n" > "/dev/stderr";
  printf "\n" > "/dev/stderr";
  printf " " > "/dev/stderr";
  # NOTE(review): the loop header and its format string were lost in the
  # file as received; reconstructed from the ctr[] counters.
  for (k = 0; k < nWords; k++) {
    if (ctr[k] == 0) {
      printf "%s ", dic[k] > "/dev/stderr";
    }
  }
  printf "\n" > "/dev/stderr";
}