#! /usr/bin/gawk -f # Last edited on 2008-06-06 23:51:05 by stolfi BEGIN { usage = ( \ "dbd_enum_valid_events \\\n" \ " -v winSize=NUM \\\n" \ " -v tMin=NUM -v tK=NUM -v tN=NUM \\\n" \ " > OUTFILE.pre" \ ); # Enumerates all {k}-events compatible with our DNA model, and their # "a priori" probabilities, where {k=winSize}. # # The output file starts with one line "{k} {nE}" where {nE} is the # number of valid {k}-events in the model. Then come {nE} lines in # the format "{EVENT} {PROB} {PARTITION}" where the {EVENT} is a # valid {k}-event (a string of {k} symbols in [DEFN]), {PROB} is the # a priori probability of {EVENT} according to our model, and # {PARTITION} is the same {EVENT} with '.'s inserted between its # elementary factors. # # Our model assumes that the DNA is a strictly alternating sequence # of coding (K) and non-coding (N) regions. Each region has an exponential # distributon of lengths, with minimum {tMin} and average {tK} # and {tN}, respectively. The bases in an N region are all labeled 'N'. # The bases in a K region are labeled with the repeating string "DEF". # A K region may start with any of the three letters, and its length # need not be a multiple of 3; the starting phase is independent of the # ending phase of the previous K region. if (winSize == "") { arg_error(("must specify \"winSize\"")); } if (tMin == "") { arg_error(("must specify \"tMin\"")); } if (tK == "") { arg_error(("must specify \"tK\"")); } if (tN == "") { arg_error(("must specify \"tN\"")); } if (tK < tMin) { arg_error(("\"tK\" too small")); } if (tN < tMin) { arg_error(("\"tN\" too small")); } k = winSize; # A more convenient name. printf "%d\n", k; # Check whether the single-transition hypothesis is true: if (tMin <= k-2) { arg_error("multiple transitions not implemented yet"); } # There are 4 events with no transitions, # 3*(k-1) events with an N-K transition, # and the same number of events with a K-N transition. nvev = 4 + 6*(k-1); # Number of valid events. printf "%d\n", nvev; # Suppose we take all {k}-tuples of a very long DNA string. Since # the minimum length {tMin} of a K or N region is at least {k-1}, no # {k} tuple contains more than one K-N or N-K transition. Since the # average length of K and N regions is {tK} and {tN}, there will be # one K-N transition and one N-K transition every {tK+tN} bases. So, # the fraction of {k}-tuples of each type is # # mixed K-N: {(k-1)/(tK+tN)} # mixed N-K: {(k-1)/(tK+tN)} # pure N: {(tN-k+1)/(tK+tN)} # pure K: {(tK-k+1)/(tK+tN)} # # The mixed N-K events can be divided into {k-1} patterns, all # equally likely, based on the number of 'N's (1 through {k-1}). # For every pure-K pattern there are three events, corresponding # to the three possible start phases. Ditto for each N-K pattern # and for each K-N pattern. den = 1.0/(tK+tN); totpr = 0.0; # All-'N' event: ev = ""; for (i = 0; i < k; i++) { ev = (ev "N"); } output_event(ev, den*(tN-k+1)); printf "\n"; # Loop on initial codon phase: for (ph = 0; ph < 3; ph++) { # All-codon event starting with phase {ph}: ev = ""; for (i = 0; i < k; i++) { j = 1 + (i + ph) % 3; b = substr("DEF", j, 1) ev = (ev b); } output_event(ev, den*(tK-k+1)/3); if (k > 1) { printf "\n"; # Loop on length of 'N' prefix/suffix: eva = ev; evb = revevent(ev); ii = ""; for (sh = 1; sh < k; sh++) { eva = substr(("N" eva), 1, k); output_event(eva, den/3); evb = substr((evb "N"), 2, k); output_event(evb, den/3); } } printf "\n"; } close("/dev/stdout"); printf "%d events total prob = %8.6f\n", nvev, totpr > "/dev/stderr"; } function output_event(ev,pr ) { # Output one line of the valid event file. # Also adds {pr} to the global variable {totpr}. printf "%s %14.7e %s\n", ev, pr, punctuate(ev); totpr += pr; } function revevent(e, r,b) { r = ""; while (e != "") { b = substr(e, 1, 1); e = substr(e, 2); if (b == "D") { b = "F"; } else if (b == "F") { b = "D"; } r = (b r); } return r; } function punctuate(e) { # Factor the N part, if any, and the N-K and K-N transitions. e = punctuate_N_singlets(e) # now insert "." at every codon-codon boundary: gsub(/FD/, "F.D", e); # done: return e; } function punctuate_N_singlets(e, n,h,pre,mid,suf) { # Factors the N part of event {e} into isolated 'N' labels, # and inserts '.' between those factors. # Insert '.' around every 'N' label: gsub(/N/, ".N.", e); # Remove redundant '.'s: gsub(/^[.]+/, "", e); gsub(/[.]+$/, "", e); gsub(/[.][.]+/, ".", e); return e; } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; exit(1); }