#! /usr/bin/gawk -f
# Last edited on 2008-06-06 23:51:05 by stolfi

BEGIN {

  usage = ( \
    "dbd_enum_valid_events \\\n" \
    "   -v winSize=NUM \\\n" \
    "   -v tMin=NUM -v tK=NUM -v tN=NUM \\\n" \
    " > OUTFILE.pre" \
  );
  
  # Enumerates all {k}-events compatible with our DNA model, and their
  # "a priori" probabilities, where {k=winSize}.  
  # 
  # The output file starts with one line "{k} {nE}" where {nE} is the
  # number of valid {k}-events in the model. Then come {nE} lines in
  # the format "{EVENT} {PROB} {PARTITION}" where the {EVENT} is a
  # valid {k}-event (a string of {k} symbols in [DEFN]), {PROB} is the
  # a priori probability of {EVENT} according to our model, and
  # {PARTITION} is the same {EVENT} with '.'s inserted between its
  # elementary factors.
  # 
  # Our model assumes that the DNA is a strictly alternating sequence 
  # of coding (K) and non-coding (N) regions. Each region has an exponential
  # distributon of lengths, with minimum {tMin} and average {tK}
  # and {tN}, respectively.  The bases in an N region are all labeled 'N'.
  # The bases in a K region are labeled with the repeating string "DEF".
  # A K region may start with any of the three letters, and its length 
  # need not be a multiple of 3; the starting phase is independent of the
  # ending phase of the previous K region.
  
  if (winSize == "") { arg_error(("must specify \"winSize\"")); }
  if (tMin == "") { arg_error(("must specify \"tMin\"")); }
  if (tK == "") { arg_error(("must specify \"tK\"")); }
  if (tN == "") { arg_error(("must specify \"tN\"")); }

  if (tK < tMin) { arg_error(("\"tK\" too small")); }
  if (tN < tMin) { arg_error(("\"tN\" too small")); }
  
  k = winSize; # A more convenient name.
  printf "%d\n", k;
    
  # Check whether the single-transition hypothesis is true:
  if (tMin <= k-2)
    { arg_error("multiple transitions not implemented yet"); }

  # There are 4 events with no transitions, 
  # 3*(k-1) events with an N-K transition,  
  # and the same number of events with a K-N transition.
  
  nvev = 4 + 6*(k-1);  # Number of valid events.
  printf "%d\n", nvev;
  
  # Suppose we take all {k}-tuples of a very long DNA string. Since
  # the minimum length {tMin} of a K or N region is at least {k-1}, no
  # {k} tuple contains more than one K-N or N-K transition. Since the
  # average length of K and N regions is {tK} and {tN}, there will be
  # one K-N transition and one N-K transition every {tK+tN} bases. So,
  # the fraction of {k}-tuples of each type is
  # 
  #   mixed K-N: {(k-1)/(tK+tN)}
  #   mixed N-K: {(k-1)/(tK+tN)}
  #   pure N: {(tN-k+1)/(tK+tN)}
  #   pure K: {(tK-k+1)/(tK+tN)}
  #
  # The mixed N-K events can be divided into {k-1} patterns, all
  # equally likely, based on the number of 'N's (1 through {k-1}).
  # For every pure-K pattern there are three events, corresponding 
  # to the three possible start phases.  Ditto for each N-K pattern
  # and for each K-N pattern.
  
  den = 1.0/(tK+tN);
  totpr = 0.0;
  
  # All-'N' event:
  ev = ""; 
  for (i = 0; i < k; i++) { ev = (ev "N"); }
  output_event(ev, den*(tN-k+1));

  printf "\n";
  
  # Loop on initial codon phase:
  for (ph = 0; ph < 3; ph++)
    { 
      # All-codon event starting with phase {ph}:
      ev = "";
      for (i = 0; i < k; i++)
        { j = 1 + (i + ph) % 3;
          b = substr("DEF", j, 1)
          ev = (ev b); 
        }
      output_event(ev, den*(tK-k+1)/3);

      if (k > 1)
        { printf "\n";

          # Loop on length of 'N' prefix/suffix:
          eva = ev;
          evb = revevent(ev);
          ii = "";
          for (sh = 1; sh < k; sh++)
            { eva = substr(("N" eva), 1, k);
              output_event(eva, den/3);
              evb = substr((evb "N"), 2, k);
              output_event(evb, den/3);
            }
        }

      printf "\n";
    }
  close("/dev/stdout");
  printf "%d events total prob = %8.6f\n", nvev, totpr > "/dev/stderr";
}

function output_event(ev,pr )
{
  # Output one line of the valid event file.
  # Also adds {pr} to the global variable {totpr}.
  printf "%s %14.7e %s\n", ev, pr, punctuate(ev);
  totpr += pr;
}

function revevent(e,   r,b)
{
  r = "";
  while (e != "")
    { 
      b = substr(e, 1, 1);
      e = substr(e, 2);
      if (b == "D") { b = "F"; } else if (b == "F") { b = "D"; }
      r = (b r);
    }
  return r;
}
  
function punctuate(e)
{
  # Factor the N part, if any, and the N-K and K-N transitions.
  e = punctuate_N_singlets(e)
  # now insert "." at every codon-codon boundary:
  gsub(/FD/, "F.D", e);
  
  # done:
  return e;
}

function punctuate_N_singlets(e, n,h,pre,mid,suf)
{
  # Factors the N part of event {e} into isolated 'N' labels,
  # and inserts '.' between those factors.
  
  # Insert '.' around every 'N' label:
  gsub(/N/, ".N.", e);
  
  # Remove redundant '.'s:
  gsub(/^[.]+/, "", e);
  gsub(/[.]+$/, "", e);
  gsub(/[.][.]+/, ".", e);
  
  return e;
}
 
function arg_error(msg)
{
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  exit(1);
}