#! /usr/bin/gawk -f
# Last edited on 2004-10-04 14:36:30 by stolfi

BEGIN {
  abort = -1;
  usage = ( ARGV[0] "\\\n" \
    "  -v order=NUM \\\n" \
    "  -v achfile=FILE \\\n" \
    "  -v bprfile=FILE \\\n" \
    "  < ATEXT.tks > BTEXT.tks" \
  );
  
  # Reads a text in a language A and maps it through an 
  # order-2 Markov chain for language B, trying to preserve 
  # the letter frequencies of the latter.
  
  # The {achfile} parameter is a file containing the letter 
  # frequencies of language A, one per line, in the format
  # "{COUNT} {FREQ} {CHAR}". Here "_" denotes a word break.
  # This file should be sorted by decreasing {FREQ}.
  
  # The {bprfile} parameter is a file containing a Markov model of the
  # given {order} for language language B, one transition per line, in
  # the format "{COUNT} {FREQ} {STATE} {CHAR}". The {STATE} should be
  # a string of {order} B-language chars, prefixed by "$". Here too
  # "_" denotes a word break. This file should be sorted by {STATE},
  # then by descreasing {FREQ}.
  
  # The input and output text files will have one token per line,
  # (i.e. word breaks are represented as newline chars).
  # Input blank lines are ignored. Output blank lines mean parag breaks.
  
  if (order == "") { arg_error("must define \"order\""); }
  if ((order < 0) || (order > 5)) { arg_error(("bad order = \"" order "\"")); }

  if (achfile == "") { arg_error("must define \"achfile\""); }
  if (bprfile == "") { arg_error("must define \"bprfile\""); }
  
  # Read character table for language A:
  #   {ach[0..nach-1]} are the A letters, and 
  #   {apr[0..nach-1]}  are the corresponding probs.
  read_char_table(achfile);
  
  # Read pair table for language B:
  #   {bst[0..nbst-1]} are the B states in any order,
  #   {nbtr[0..nbst-1]} are the corresponding out-degrees;
  #   {bch[0..nbch-1]} are the B characters in any order;
  #   {btrch[st,0..nbtr[st]-1]} are the letters 
  #     that can follow state {st} in language B; and
  #   {btrpr[st,0..nbtr[st]-1]} are the corresponding probs.
  read_pair_table(bprfile);
  
  # Print Markov chain:
  printf "\n" > "/dev/stderr";
  printf "Markov chain for language B\n" > "/dev/stderr";
  printf "\n" > "/dev/stderr";
  for (i = 0; i < nbst; i++) 
    { st = bst[i];
      printf "  $%s", st > "/dev/stderr";
      for (jb = 0; jb < nbch; jb++) 
        { bc = (jb < nbtr[st] ? btrch[st,jb] : " "); 
          printf " %s", bc > "/dev/stderr";
        }
      printf "\n" > "/dev/stderr";
    }
  printf "\n" > "/dev/stderr";
    
  # Build transducer table:
  split("", bmap);
  for (i = 0; i < nbst; i++)
    { st = bst[i];
      split("", sumb);
      # Map each A-lang chars to a B-lang transition from {st}, in order.
      # If there is an excess of A-lang chars, and the B-lang transition
      # has large probability, take two or more A-lang chars.
      ja = 0;
      for (jb = 0;  (ja < nach) && (jb < nbtr[st]); jb++)
        { bc = btrch[st,jb];
          bp = btrpr[st,jb];
          bs = 0;
          do { 
            ac = ach[ja]; 
            bmap[st,ac] = bc; 
            bs += apr[ja]; 
            ja++;
          } while (((nach - ja) >= (nbtr[st] - jb)) && (bs + apr[ja] <= bp));
          sumb[jb] = bs;
        }
      # Map to each B-lang transition {jb} out of state {st}
      # additional A-lang chars that add to about {btrpr[st,jb]}:
      for (jb = 0; jb < nbtr[st]; jb++)
        { bc = btrch[st,jb];
          bp = btrpr[st,jb];
          for (ja = 0; ja < nach; ja++) 
            { ac = ach[ja]; 
              if (! ((st,ac) in bmap))
                { bs = sumb[jb]; bs1 = sumb[jb] + apr[ja];
                  if ((bs1 < bp) || (bp/bs1 > bs/bp))
                    { bmap[st,ac] = bc; sumb[jb] = bs1; }
                }
            }
        }
      # Assign the still unassigned A-lang chars, if any, to 
      # deficitary B-lang transitions:
      for (ja = 0; ja < nach; ja++) 
        { ac = ach[ja];
          if (! ((st,ac) in bmap))
            { for (jb = 0; jb < nbtr[st]; jb++)
                { if (sumb[jb] <= btrpr[st,jb] + 0.000001)
                    { bc = btrch[st,jb];
                      bmap[st,ac] = bc; 
                      sumb[jb] += apr[ja];
                    }
                }
            }
        }
      # Debug - print original and faked B-lang probs
      for (jb = 0; jb < nbtr[st]; jb++)
        { bc = btrch[st,jb];
          bp = btrpr[st,jb];
          printf "    $%s -> %s %8.6f %8.6f", st, bc, bp, sumb[jb] > "/dev/stderr";
          for (ja = 0; ja < nach; ja++)
            { ac = ach[ja];
              if (bmap[st,ac] == bc) { printf " %s", ac > "/dev/stderr"; }
            }
          printf "\n" > "/dev/stderr";
        }
    }
    
  # Print transducer table:
  printf "\n" > "/dev/stderr";
  printf "Transducer from language A to pseudo-language B\n" > "/dev/stderr";
  printf "\n" > "/dev/stderr";
  blankstate = substr("             ",1,order+1);
  printf "  %s", blankstate > "/dev/stderr";
  for (ja = 0; ja < nach; ja++) { printf " %s", ach[ja] > "/dev/stderr"; }
  printf "\n" > "/dev/stderr";
  printf "  %s", blankstate > "/dev/stderr";
  for (ja = 0; ja < nach; ja++) { printf " -" > "/dev/stderr"; }
  printf "\n" > "/dev/stderr";
  for (i = 0; i < nbst; i++) 
    { st = bst[i];
      printf "  $%s", st > "/dev/stderr";
      for (ja = 0; ja < nach; ja++) 
        { ac = ach[ja]; 
          bc = bmap[st,ac]; 
          printf " %s", bc > "/dev/stderr";
        }
      printf "\n" > "/dev/stderr";
    }
  printf "\n" > "/dev/stderr";
    
  # Initial state: 
  inistate = substr("_______________________", 1,order);
  # Current state:
  st = inistate;
  # Number of newlines
  newlines = 2;
}

(abort >= 0) { exit abort; }

/./ {
  w = ( $1 "_" ); m = length(w);
  for (k = 1; k <= m; k++)
    { ac = substr(w,k,1);
      if (! ((st,ac) in bmap))
        { prog_error(("no translation for (" st "," ac ")")); }
      bc = bmap[st,ac];
      if (bc == "_")
        { if (newlines < 2) { printf "\n"; newlines++; } }
      else
        { printf "%s", bc; newlines = 0; }
      st = substr((st bc), 2);
    }
}

END {
  if (abort >= 0) { exit abort; }
  # Force file to end with newline, to keep tools happy:
  if (newlines == 0) { printf "\n"; newlines++; }
}

function read_char_table(fname,   nlin,lin,fld,nfld)
{
  # Global variables defined by this procedure:
  nach = 0;
  split("", ach); 
  split("", apr); 

  # Do it:
  nlin = 0;
  while((getline lin < fname) > 0) { 
    nlin++;
    if (! match(lin, /^[ \011]*([#]|$)/))
      { nfld = split(lin, fld);
        if ((nfld >= 4) && (fld[4] ~ /^[\#]/)) { nfld = 3; }
        if (nfld != 3) 
          tbl_error(fname, nlin, ("bad A table entry = \"" lin "\" nfld = " nfld));
        if (length(fld[3]) != 1) 
          tbl_error(fname, nlin, ("bad A char = \"" fld[3] "\""));
        ach[nach] = fld[3];
        if (fld[2] !~ /^[01][.][0-9]+$/) 
          tbl_error(fname, nlin, ("bad A freq = \"" fld[2] "\""));
        apr[nach] = fld[2];
        nach++;
      }
  }
  if (ERRNO != "0") { tbl_error(fname, nlin, ERRNO); }
  close (fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf "read %d characters\n", nach > "/dev/stderr"
}

function read_pair_table(fname,  npairs,nlin,lin,fld,nfld,ofld3,st,j,ch,bchseen)
{
  # Global variables defined by this procedure:
  nbst = 0;
  split("", bst); 
  
  nbch = 0;
  split("", bch);
  
  split("", nbtr);
  split("", btrch); 
  split("", btrpr); 
  
  # Do it:
  split("", bchseen);
  npairs = 0;
  nlin = 0;
  ofld3 = "";
  while((getline lin < fname) > 0) { 
    nlin++;
    if (! match(lin, /^[ \011]*([#]|$)/))
      { nfld = split(lin, fld);
        if ((nfld >= 5) && (fld[5] ~ /^[\#]/)) { nfld = 4; }
        if (nfld != 4) 
          tbl_error(fname, nlin, ("bad B table entry = \"" lin "\""));
        if (fld[3] != ofld3)
          { if ((fld[3] !~ /^[$]/) || (length(fld[3]) != order+1)) 
              tbl_error(fname, nlin, ("bad B state = \"" fld[3] "\""));
            ofld3 = fld[3];
            st = substr(fld[3],2);
            bst[nbst] = st;
            nbtr[st] = 0;
            nbst++;
          }
        if (length(fld[4]) != 1) 
          tbl_error(fname, nlin, ("bad B char = \"" fld[3] "\""));
        ch = fld[4];
        j = nbtr[st];
        btrch[st,j] = ch;
        if (! (ch in bchseen)) 
          { bch[nbch] = ch; nbch++; bchseen[ch] = 1; }
        if (fld[2] !~ /^[01][.][0-9]+$/) 
          tbl_error(fname, nlin, ("bad B freq = \"" fld[2] "\""));
        btrpr[st,j] = fld[2];
        nbtr[st]++;
        npairs++;
        # printf "* %s %s %s %d %d\n", st, ch, fld[2], nbtr[st], nbch > "/dev/stderr";
      }
  }
  if (ERRNO != "0") { tbl_error(fname, nlin, ERRNO); }
  close (fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf "read %d arcs, %d states, %d chars\n", npairs, nbst, nbch > "/dev/stderr"
}

function arg_error(msg)
{ 
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1
}

function prog_error(msg)
{ 
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1
}

function tbl_error(f,n,msg)
{ 
  printf "file %s, line %d: %s\n", f, n, msg > "/dev/stderr";
  abort = 1;
  exit 1
}

function data_error(msg)
{ 
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1
}