#! /usr/bin/gawk -f # Makes a consensus interlinear transcription from # an EVT multi-version transcription. BEGIN { oldloc = "" nmtc = 10; # Lookahead for dynamic programming matching padding = ""; for (i=0; i "/dev/stderr" print msg > "/dev/stderr" } function putout() { # print out consensus pattern, if any if (oldloc != "") { printf "%-18s %s%s\n\n", ("<" oldloc ";S>"), oldtxt, oldeol oldloc = "" oldeol = "" } } function dynmatch(a,na,b,nb,mtca,mtcb, \ i,j,k,m,mscore,cbest,c10,c01,c11,clink,cskip \ ) { # Receives two arrays of characters a[1..na], b[1..b]. Fills the # arrays mtca[1..m], mtcb[1..m] with a matching between a and b. # where m (the length of the match) is returned as the function's # value. Each element mtca[k] is either an index into a, or 0; the nonzero # elements are strictly increasing and include all indices of a # Likewise for mtcb. At least one of mtca[k] and mtcb[k] is nonzero. # If both are nonzero, the index k is called a "link", else it # is a "skip". A link k such that a[mtca[k]] = b[mtcb[k]] is a # "match", else a "mismatch". The contents of mtca and mtcb are # chosen so as to maximize a numerical score that rewards # coincidences and penalizes skips and mismatches. split("", mtca); split("", mtcb); # quick check for perfect match: if (na == nb) { for(i=1;(i<=na)&&(a[i]==b[i]);i++) { } if (i > na) { m = na for (k=1;k<=m;k++) { mtca[k]=k; mtcb[k] = k; } return m; } } # Builds mscore[i,j] = best score of any matching of a[1..i] to b[1..j]: split("", mscore) mscore[0,0] = 0; for(i=1;i<=na;i++) mscore[i,0] = mscore[i-1,0] - skippenalty for(j=1;j<=nb;j++) mscore[0,j] = mscore[0,j-1] - skippenalty for(i=1;i<=na;i++) for(j=1;j<=nb;j++) { # A matching of a[1..i] to b[1..j] either # matches a[1..i-1] with b[1..j-1] and links a[i] with b[j], or # matches a[1..i-1] with b[1..j] and skips a[i], or # matches a[1..i] with b[1..j-1] and skips b[j]. c10 = mscore[i-1,j] - skippenalty; c01 = mscore[i,j-1] - skippenalty; c11 = mscore[i-1,j-1] + (a[i] == b[j] ? +1 : -mismatchpenalty); cskip = (c01 > c10 ? c01 : c10) clink = c11 mscore[i,j] = (clink > cskip ? clink : cskip) } # Find end [i,j] of best match that matches the whole of either string: i = na j = nb cbest = mscore[i,j] for(k=1;k cbest) { cbest = mscore[k,nb]; i = k; j = nb } } for(k=1;k cbest) { cbest = mscore[na,k]; i = na; j = k } } # Extract best match in reverse order: k = 0 while((i > 0 ) && (j > 0)) { c11 = mscore[i-1,j-1] + (a[i] == b[j] ? +1 : -mismatchpenalty) c10 = mscore[i-1,j] - skippenalty; c01 = mscore[i,j-1] - skippenalty; if (mscore[i,j] == c11) { mtca[k] = i; i--; mtcb[k] = j; j-- } else if (mscore[i,j] == c10) { mtca[k] = i; i--; mtcb[k] = 0 } else if (mscore[i,j] == c01) { mtca[k] = 0; mtcb[k] = j; j-- } else { print "dynprog error 1" > "/dev/stderr"; exit 1 } k-- } while (i > 0) { mtca[k] = i; mtcb[k] = 0; i--; k-- } while (j > 0) { mtca[k] = 0; mtcb[k] = j; j--; k-- } m = -k; if ((i != 0) || (j != 0)) { print "dynprog error 2" > "/dev/stderr"; exit 1 } # Reverse match for(k=1;k<=m;k++) { mtca[k] = mtca[k-m]; delete mtca[k-m] mtcb[k] = mtcb[k-m]; delete mtcb[k-m] } return m } function printmatch(a,mtca,b,mtcb,m, i,j,k,astr,bstr) { astr = "" bstr = "" for(k=1;k<=m;k++) { i = mtca[k]; j = mtcb[k]; astr = (astr (i == 0 ? "!" : a[i])) bstr = (bstr (j == 0 ? "!" : b[j])) } printf "%s\n", astr; printf "%s\n", bstr; printf "\n"; } function consensuschar(ac, bc) { # Given two character that have to be paired, chooses the # output character. Has special treatment for ".", "-", "=": if ((ac == "." || ac == "-" || ac == "=") && (bc == "." || bc == "-" || bc == "=")) { if (ac == "=" || bc == "=") { return "="; } else if (ac == "-" || bc == "-") { return "-"; } else { return "."; } } else { return "?"; } } function mergestrings(a,b, nstr,m,ach,bch,mtca,mtcb,i,j,k,dif,och,score) { # Merges two nonblank strings by poorman's "dynamic programming" # We scan both strings from left to right, outputting the result. # At each step, we pair up the next nmtc characters of both strings by # dynamic programming, allowing for insertions. If the first pair of # the match is a coincidence, we output the common char, otherwise we # output a "?" an skip over the pair. a = (a padding) b = (b padding) nstr = "" while ((a != padding) && (b != padding)) { # Find best local match: split(substr(a,1,nmtc), ach, "") split(substr(b,1,nmtc), bch, "") m = dynmatch(ach,nmtc,bch,nmtc,mtca,mtcb) # printmatch(ach,mtca,bch,mtcb,m) if (m < nmtc) { print "matching error 1" > "/dev/stderr"; exit 1 } # Output next character: i = mtca[1] j = mtcb[1] if ((i > 1) || (j > 1) || (i < 0) || (j < 0)) { print "matching error 2" > "/dev/stderr"; exit 1 } else if ((i == 0) && (j == 0)) { print "matching error 3" > "/dev/stderr"; exit 1 } else if ((i == 1) && (j == 1)) { # Characters should be paired if (ach[1] == bch[1]) { nstr = (nstr ach[1]) a = substr(a,2) b = substr(b,2) } else { nstr = (nstr consensuschar(ach[1], bch[1])) # Decide which strings to advance score = 0 for(k=2;k<=m;k++) { if((mtca[k]!=0)&&(mtcb[k]!=0)&&(ach[mtca[k]]==bch[mtcb[2]])) { score++ } } if (3*score >= 2*nmtc) { # at least 2/3 of the match is links; advance both. a = substr(a,2) b = substr(b,2) } else { # Advance the longer string # unless one is significantly shorter that the other dif2 = 2*(length(a) - length(b)) if (dif2 > -nmtc) a = substr(a,2) if (dif2 < nmtc) b = substr(b,2) } } } else { # One of them must be ignored if (i != 0) { a = substr(a,2); och = (bch[1] == "." ? "." : "?") } else if (j != 0) { b = substr(b,2); och = (ach[1] == "." ? "." : "?") } nstr = (nstr och) } } while (a != padding) { nstr = (nstr "?"); a = substr(a,2); } while (b != padding) { nstr = (nstr "?"); b = substr(b,2); } return nstr } /^ *$/ { putout(); print; next } /^ *#/ { putout(); print; next } /^<[^>;]*> *$/ { putout(); print; next } /^<[^>]*\.[^>]*;[A-Z]> / { split($1, tag, /[;<>]/) curloc = tag[2] curver = tag[3] if ((substr($0,19,1) != " ") || (substr($0,20,1) == " ")) { flagrec("text does not start on col. 20", $0); next; } curtxt = substr($0,20) # Clean up garbage. # We discard the "%"s since they are recreated if needed # (as "?"s) by the dynamic programming matcher. curtxt = gensub(/[*_]/, "?", "g", curtxt); curtxt = gensub(/\{[^}]*\}/, "", "g", curtxt); curtxt = gensub(/[% !]/, "", "g", curtxt); curtxt = gensub(/^\.[.]*/, "", "g", curtxt); curtxt = gensub(/\.[.]*$/, "", "g", curtxt); # Special handling for the end-of-line markings: curlen = length(curtxt); cureol = (curlen > 0 ? substr(curtxt,curlen,1) : "") if ((cureol == "-") || (cureol == "=")) { curtxt = substr(curtxt,1,curlen-1) curtxt = gensub(/\.[.]*$/, "", "g", curtxt) } else { cureol = "" } if (curloc != oldloc) { putout(); oldloc = curloc oldtxt = curtxt oldeol = cureol } else { # Update consensus text: if (oldtxt == "") { oldtxt = curtxt } else if (curtxt != "") { # print oldtxt > "/dev/stderr" # print curtxt > "/dev/stderr" oldtxt = mergestrings(oldtxt, curtxt) } # Update consensus end-of-line: if (oldeol == "") { oldeol = cureol } else if ((oldeol == "-") && (cureol == "=")) { oldeol = cureol } } print; next } /./ { flagrec("unrecognized record type", $0); next } END { putout(); }