#! /usr/bin/gawk -f
# Last edited on 2004-03-01 00:21:38 by stolfi

# A library of functions for generating synthetic words
# from a given sample.

function mky_make_word(wd,pr,n,ord,avglen,maxlen,   \
  word,mw,i,st,c,frac,alt,apr,debug \
)
{ 
  # Tries to make up a new string {word} by a (very slow) Shannon
  # monkey of order {ord}. Uses words {wd[0..n-1]} as the sample, with
  # probabilities proportional to {pr[0..n-1]}. The length of the word
  # should be biased towards {avglen}, and will not exceed {maxlen}.
  # Returns the string {word}, or "**FAIL**" if the trial failed.
  
  debug = 0;
  
  word = ""; mw = 0;
  st = "";  # State of Markov chain
  frac = 0; # Fraction of word that has been built so far
  
  split("", alt); # Alternative characters, or "" for end-of-word.
  split("", apr); # Probability weight of alternative chars.

  if (debug) { printf "monkey started....\n" > "/dev/stderr"; }
  c = mky_choose_next_char(wd,pr,n,st,ord,frac,alt,apr,debug);
  if (debug) { printf " -> {%s}\n", c > "/dev/stderr"; }

  while (c != "")
    { if ((c == "**FAIL**") || (mw >= maxlen)) 
        { if (debug) { printf "**FAIL**\n" > "/dev/stderr"; }
          return "**FAIL**";
        }
      
      # Append {c} to the word, update its length:
      word = (word c); mw = length(word);
      
      # Remember only the last {ord} characters:
      st = substr(word, mw-ord+1, ord); 

      # Re-estimate the fraction {frac}
      frac = (mw < avglen ? \
        0.5*mw/avglen : \
        0.5 + 0.5*(mw - avglen)/(maxlen - avglen) \
      );

      if (debug) { printf "word = {%s}\n", word > "/dev/stderr"; }
      c = mky_choose_next_char(wd,pr,n,st,ord,frac,alt,apr,debug);
      if (debug) { printf " -> {%s}\n", c > "/dev/stderr"; }
    }
  if (debug) { printf "word = {%s}\n", word > "/dev/stderr"; }
  return word;
}

function mky_choose_next_char(wd,pr,n,st,ord,frac,alt,apr,debug,   \
  ms,na,i,wdi,mi,pri,jmax,j,prj,tp,px,na1,k \
)
{ 
  # Select the next character to append, given the string {st} that
  # consists of the last {ord} characters already generated, and the
  # approximate relative displacement {frac} of the character within
  # the word. Returns "" to signal `terminate the word', and "**FAIL**" to
  # indicate that the {st} does not occur in the sample.
  
  # The alternatives for the next character are chosen by finding the
  # string {st} in one of the words {dic[0..n-1]}, and looking at the
  # next character. Arguments {alt} and {apr} are two working arrays.
  
  if (debug) { printf "  %s: ", st > "/dev/stderr"; }
  
  # If {length(st) < ord} then 
  ms = length(st);

  # Gather the alternatives and their probs in {alt,apr}: 
  na = 0; 
  for (i = 0; i < n; i++)
    { wdi = wd[i];
      mi = length(wdi);
      pri = pr[i];
      # If {ms < ord}, the search is anchored as word-start:
      jmax = (ms < ord ? 0 : mi-ms);
      # Scan the word for {st}:
      for (j = 0; j <= jmax; j++)
        { if (st == substr(wdi,j+1,ms)) 
            { prj = 1.0/(1 + mky_abs(j+ms - frac*mi));
              alt[na] = substr(wdi,1+j+ms,1); apr[na] = pri*prj;
              na++;
            }
        }
    }
  # If {st} doesn't match anywhere, abort:
  if (na == 0) { return "**FAIL**"; }
  
  if (debug)
    { for (k = 0; k < na; k++) 
        { printf "{%s}", alt[k] > "/dev/stderr"; }
    } 
  
  # Select an alternative {alt[k]} with probability proportional to {apr[k]}
  tp = 0; 
  for (k = 0; k < na; k++) { tp += apr[k]; }
  px = tp*rand();
  tp = 0; 
  for (k = 0; k < na; k++) 
    { tp += apr[k]; if (tp >= px) { return alt[k]; } }
  # Just in case...
  return alt[na-1];
}

function mky_abs(x)
{
  return (x < 0 ? -x : x);
}