#! /usr/bin/gawk -f

# Makes a consensus interlinear transcription from 
# an EVT multi-version transcription.

BEGIN { 
  oldloc = ""
  nmtc = 10;  # Lookahead for dynamic programming matching
  padding = ""; for (i=0; i<nmtc;i++) padding = (padding " ")
  skippenalty = 1
  mismatchpenalty = 1
  print "# Output of make-consensus-interlin"
}

function flagrec(msg, rec)
{
  print rec > "/dev/stderr"
  print msg > "/dev/stderr"
}

function putout()
{
  # print out consensus pattern, if any
  if (oldloc != "")
    { printf "%-18s %s%s\n\n", ("<" oldloc ";S>"), oldtxt, oldeol
      oldloc = ""
      oldeol = ""
    }
}

function dynmatch(a,na,b,nb,mtca,mtcb, \
  i,j,k,m,mscore,cbest,c10,c01,c11,clink,cskip \
)
{
  # Receives two arrays of characters a[1..na], b[1..b].  Fills the
  # arrays mtca[1..m], mtcb[1..m] with a matching between a and b.
  # where m (the length of the match) is returned as the function's
  # value.  Each element mtca[k] is either an index into a, or 0; the nonzero
  # elements are strictly increasing and include all indices of a
  # Likewise for mtcb. At least one of mtca[k] and mtcb[k] is nonzero.
  # If both are nonzero, the index k is called a "link", else it
  # is a "skip". A link k such that a[mtca[k]] = b[mtcb[k]] is a
  # "match", else a "mismatch".  The contents of mtca and mtcb are
  # chosen so as to maximize a numerical score that rewards
  # coincidences and penalizes skips and mismatches.
  
  split("", mtca);
  split("", mtcb);
  # quick check for perfect match:
  if (na == nb)
    { for(i=1;(i<=na)&&(a[i]==b[i]);i++) { }
      if (i > na)
        { m = na
          for (k=1;k<=m;k++) { mtca[k]=k; mtcb[k] = k; }
          return m;
        }
    }
  
  # Builds mscore[i,j] = best score of any matching of a[1..i] to b[1..j]:
  split("", mscore)
  mscore[0,0] = 0;
  for(i=1;i<=na;i++) mscore[i,0] = mscore[i-1,0] - skippenalty 
  for(j=1;j<=nb;j++) mscore[0,j] = mscore[0,j-1] - skippenalty 
  
  for(i=1;i<=na;i++)
    for(j=1;j<=nb;j++)
      { # A matching of  a[1..i] to b[1..j] either 
        #   matches a[1..i-1] with b[1..j-1] and links a[i] with b[j], or
        #   matches a[1..i-1] with b[1..j] and skips a[i], or
        #   matches a[1..i] with b[1..j-1] and skips b[j].
        c10 = mscore[i-1,j] - skippenalty;
        c01 = mscore[i,j-1] - skippenalty;
        c11 = mscore[i-1,j-1] + (a[i] == b[j] ? +1 : -mismatchpenalty);
        cskip = (c01 > c10 ? c01 : c10)
        clink = c11
        mscore[i,j] = (clink > cskip ? clink : cskip)
      } 
  # Find end [i,j] of best match that matches the whole of either string:
  i = na
  j = nb
  cbest = mscore[i,j]
  for(k=1;k<na;k++)
    { if (mscore[k,nb] > cbest) { cbest = mscore[k,nb]; i = k; j = nb } }
  for(k=1;k<nb;k++)
    { if (mscore[na,k] > cbest) { cbest = mscore[na,k]; i = na; j = k } }
  # Extract best match in reverse order:
  k = 0
  while((i > 0 ) && (j > 0))
    { c11 = mscore[i-1,j-1] + (a[i] == b[j] ? +1 : -mismatchpenalty)
      c10 = mscore[i-1,j] - skippenalty;
      c01 = mscore[i,j-1] - skippenalty;
      if (mscore[i,j] == c11) { mtca[k] = i; i--; mtcb[k] = j; j-- }
      else if (mscore[i,j] == c10) { mtca[k] = i; i--; mtcb[k] = 0 }
      else if (mscore[i,j] == c01) { mtca[k] = 0; mtcb[k] = j; j-- }
      else { print "dynprog error 1" > "/dev/stderr"; exit 1 }
      k--
    }
  while (i > 0) { mtca[k] = i; mtcb[k] = 0; i--; k-- } 
  while (j > 0) { mtca[k] = 0; mtcb[k] = j; j--; k-- } 
  m = -k;
  if ((i != 0) || (j != 0)) { print "dynprog error 2" > "/dev/stderr"; exit 1 }
  # Reverse match
  for(k=1;k<=m;k++)
    { mtca[k] = mtca[k-m]; delete mtca[k-m]
      mtcb[k] = mtcb[k-m]; delete mtcb[k-m]
    }
  return m
}

function printmatch(a,mtca,b,mtcb,m,  i,j,k,astr,bstr)
{
  astr = ""
  bstr = ""
  for(k=1;k<=m;k++)
    { i = mtca[k]; j = mtcb[k];
      astr = (astr (i == 0 ? "!" : a[i]))
      bstr = (bstr (j == 0 ? "!" : b[j]))
    }
  printf "%s\n", astr;
  printf "%s\n", bstr;
  printf "\n";
}

function consensuschar(ac, bc)
{
  # Given two character that have to be paired, chooses the 
  # output character.  Has special treatment for ".", "-", "=":
  if ((ac == "." || ac == "-" || ac == "=") && (bc == "." || bc == "-" || bc == "="))
    { if      (ac == "=" || bc == "=") { return "="; }
      else if (ac == "-" || bc == "-") { return "-"; }
      else { return "."; }
    }
  else
    { return "?"; }
}

function mergestrings(a,b,   nstr,m,ach,bch,mtca,mtcb,i,j,k,dif,och,score)
{
  # Merges two nonblank strings by poorman's "dynamic programming"
  # We scan both strings from left to right, outputting the result.
  #   At each step, we pair up the next nmtc characters of both strings by 
  #   dynamic programming, allowing for insertions.  If the first pair of
  #   the match is a coincidence, we output the common char, otherwise we
  #   output a "?" an skip over the pair.
  a = (a padding)
  b = (b padding)
  nstr = ""
  while ((a != padding) && (b != padding))
    { 
      # Find best local match:
      split(substr(a,1,nmtc), ach, "")
      split(substr(b,1,nmtc), bch, "")
      m = dynmatch(ach,nmtc,bch,nmtc,mtca,mtcb)
      # printmatch(ach,mtca,bch,mtcb,m)
      if (m < nmtc) { print "matching error 1" > "/dev/stderr"; exit 1 }
      # Output next character:
      i = mtca[1]
      j = mtcb[1]
      if ((i > 1) || (j > 1)  || (i < 0) || (j < 0))
        { print "matching error 2" > "/dev/stderr"; exit 1 }
      else if ((i == 0) && (j == 0))
        { print "matching error 3" > "/dev/stderr"; exit 1 }
      else if ((i == 1) && (j == 1))
        { 
          # Characters should be paired
          if (ach[1] == bch[1])
            { nstr = (nstr ach[1])
              a = substr(a,2)
              b = substr(b,2)
            }
          else
            { 
              nstr = (nstr consensuschar(ach[1], bch[1]))
              # Decide which strings to advance
              score = 0
              for(k=2;k<=m;k++)
                { if((mtca[k]!=0)&&(mtcb[k]!=0)&&(ach[mtca[k]]==bch[mtcb[2]])) { score++ } }
              if (3*score >= 2*nmtc)
                { # at least 2/3 of the match is links; advance both. 
                  a = substr(a,2)
                  b = substr(b,2)
                }
              else
                { # Advance the longer string 
                  # unless one is significantly shorter that the other
                  dif2 = 2*(length(a) - length(b))
                  if (dif2 > -nmtc) a = substr(a,2)
                  if (dif2 <  nmtc) b = substr(b,2)
                }
            }
        }
      else
        { 
          # One of them must be ignored
          if (i != 0) 
            { a = substr(a,2);
              och = (bch[1] == "." ? "." : "?") 
            }
          else if (j != 0) 
            { b = substr(b,2);
              och = (ach[1] == "." ? "." : "?") 
            }
          nstr = (nstr och)
        }
    }
  while (a != padding) { nstr = (nstr "?"); a = substr(a,2); }
  while (b != padding) { nstr = (nstr "?"); b = substr(b,2); }
  return nstr
}

/^ *$/ { putout(); print; next }
/^ *#/ { putout(); print; next }
/^<[^>;]*> *$/ { putout(); print; next }

/^<[^>]*\.[^>]*;[A-Z]> / {
  split($1, tag, /[;<>]/)
  curloc = tag[2]
  curver = tag[3]
  if ((substr($0,19,1) != " ") || (substr($0,20,1) == " "))
    { flagrec("text does not start on col. 20", $0); next; }
  curtxt = substr($0,20)
  # Clean up garbage.  
  # We discard the "%"s since they are recreated if needed 
  # (as "?"s) by the dynamic programming matcher.
  curtxt = gensub(/[*_]/, "?", "g", curtxt);
  curtxt = gensub(/\{[^}]*\}/, "", "g", curtxt);
  curtxt = gensub(/[% !]/, "", "g", curtxt);
  curtxt = gensub(/^\.[.]*/, "", "g", curtxt);
  curtxt = gensub(/\.[.]*$/, "", "g", curtxt);
  # Special handling for the end-of-line markings:
  curlen = length(curtxt);
  cureol = (curlen > 0 ? substr(curtxt,curlen,1) : "")
  if ((cureol == "-") || (cureol == "="))
    { curtxt = substr(curtxt,1,curlen-1)
      curtxt = gensub(/\.[.]*$/, "", "g", curtxt)
    }
  else
    { cureol = "" }
    
  if (curloc != oldloc) 
    { putout(); 
      oldloc = curloc 
      oldtxt = curtxt
      oldeol = cureol
    }
  else 
    { # Update consensus text:
      if (oldtxt == "")
        { oldtxt = curtxt }
      else if (curtxt != "")
        { # print oldtxt > "/dev/stderr"
          # print curtxt > "/dev/stderr"
          oldtxt = mergestrings(oldtxt, curtxt)
        }
      # Update consensus end-of-line:
      if (oldeol == "")
        { oldeol = cureol }
      else if ((oldeol == "-") && (cureol == "="))
        { oldeol = cureol }
    }
  print; next
}

/./ { flagrec("unrecognized record type", $0); next }

END { putout(); }