#! /usr/bin/gawk -f # Last edited on 2004-10-04 14:36:30 by stolfi BEGIN { abort = -1; usage = ( ARGV[0] "\\\n" \ " -v order=NUM \\\n" \ " -v achfile=FILE \\\n" \ " -v bprfile=FILE \\\n" \ " < ATEXT.tks > BTEXT.tks" \ ); # Reads a text in a language A and maps it through an # order-2 Markov chain for language B, trying to preserve # the letter frequencies of the latter. # The {achfile} parameter is a file containing the letter # frequencies of language A, one per line, in the format # "{COUNT} {FREQ} {CHAR}". Here "_" denotes a word break. # This file should be sorted by decreasing {FREQ}. # The {bprfile} parameter is a file containing a Markov model of the # given {order} for language language B, one transition per line, in # the format "{COUNT} {FREQ} {STATE} {CHAR}". The {STATE} should be # a string of {order} B-language chars, prefixed by "$". Here too # "_" denotes a word break. This file should be sorted by {STATE}, # then by descreasing {FREQ}. # The input and output text files will have one token per line, # (i.e. word breaks are represented as newline chars). # Input blank lines are ignored. Output blank lines mean parag breaks. if (order == "") { arg_error("must define \"order\""); } if ((order < 0) || (order > 5)) { arg_error(("bad order = \"" order "\"")); } if (achfile == "") { arg_error("must define \"achfile\""); } if (bprfile == "") { arg_error("must define \"bprfile\""); } # Read character table for language A: # {ach[0..nach-1]} are the A letters, and # {apr[0..nach-1]} are the corresponding probs. read_char_table(achfile); # Read pair table for language B: # {bst[0..nbst-1]} are the B states in any order, # {nbtr[0..nbst-1]} are the corresponding out-degrees; # {bch[0..nbch-1]} are the B characters in any order; # {btrch[st,0..nbtr[st]-1]} are the letters # that can follow state {st} in language B; and # {btrpr[st,0..nbtr[st]-1]} are the corresponding probs. read_pair_table(bprfile); # Print Markov chain: printf "\n" > "/dev/stderr"; printf "Markov chain for language B\n" > "/dev/stderr"; printf "\n" > "/dev/stderr"; for (i = 0; i < nbst; i++) { st = bst[i]; printf " $%s", st > "/dev/stderr"; for (jb = 0; jb < nbch; jb++) { bc = (jb < nbtr[st] ? btrch[st,jb] : " "); printf " %s", bc > "/dev/stderr"; } printf "\n" > "/dev/stderr"; } printf "\n" > "/dev/stderr"; # Build transducer table: split("", bmap); for (i = 0; i < nbst; i++) { st = bst[i]; split("", sumb); # Map each A-lang chars to a B-lang transition from {st}, in order. # If there is an excess of A-lang chars, and the B-lang transition # has large probability, take two or more A-lang chars. ja = 0; for (jb = 0; (ja < nach) && (jb < nbtr[st]); jb++) { bc = btrch[st,jb]; bp = btrpr[st,jb]; bs = 0; do { ac = ach[ja]; bmap[st,ac] = bc; bs += apr[ja]; ja++; } while (((nach - ja) >= (nbtr[st] - jb)) && (bs + apr[ja] <= bp)); sumb[jb] = bs; } # Map to each B-lang transition {jb} out of state {st} # additional A-lang chars that add to about {btrpr[st,jb]}: for (jb = 0; jb < nbtr[st]; jb++) { bc = btrch[st,jb]; bp = btrpr[st,jb]; for (ja = 0; ja < nach; ja++) { ac = ach[ja]; if (! ((st,ac) in bmap)) { bs = sumb[jb]; bs1 = sumb[jb] + apr[ja]; if ((bs1 < bp) || (bp/bs1 > bs/bp)) { bmap[st,ac] = bc; sumb[jb] = bs1; } } } } # Assign the still unassigned A-lang chars, if any, to # deficitary B-lang transitions: for (ja = 0; ja < nach; ja++) { ac = ach[ja]; if (! ((st,ac) in bmap)) { for (jb = 0; jb < nbtr[st]; jb++) { if (sumb[jb] <= btrpr[st,jb] + 0.000001) { bc = btrch[st,jb]; bmap[st,ac] = bc; sumb[jb] += apr[ja]; } } } } # Debug - print original and faked B-lang probs for (jb = 0; jb < nbtr[st]; jb++) { bc = btrch[st,jb]; bp = btrpr[st,jb]; printf " $%s -> %s %8.6f %8.6f", st, bc, bp, sumb[jb] > "/dev/stderr"; for (ja = 0; ja < nach; ja++) { ac = ach[ja]; if (bmap[st,ac] == bc) { printf " %s", ac > "/dev/stderr"; } } printf "\n" > "/dev/stderr"; } } # Print transducer table: printf "\n" > "/dev/stderr"; printf "Transducer from language A to pseudo-language B\n" > "/dev/stderr"; printf "\n" > "/dev/stderr"; blankstate = substr(" ",1,order+1); printf " %s", blankstate > "/dev/stderr"; for (ja = 0; ja < nach; ja++) { printf " %s", ach[ja] > "/dev/stderr"; } printf "\n" > "/dev/stderr"; printf " %s", blankstate > "/dev/stderr"; for (ja = 0; ja < nach; ja++) { printf " -" > "/dev/stderr"; } printf "\n" > "/dev/stderr"; for (i = 0; i < nbst; i++) { st = bst[i]; printf " $%s", st > "/dev/stderr"; for (ja = 0; ja < nach; ja++) { ac = ach[ja]; bc = bmap[st,ac]; printf " %s", bc > "/dev/stderr"; } printf "\n" > "/dev/stderr"; } printf "\n" > "/dev/stderr"; # Initial state: inistate = substr("_______________________", 1,order); # Current state: st = inistate; # Number of newlines newlines = 2; } (abort >= 0) { exit abort; } /./ { w = ( $1 "_" ); m = length(w); for (k = 1; k <= m; k++) { ac = substr(w,k,1); if (! ((st,ac) in bmap)) { prog_error(("no translation for (" st "," ac ")")); } bc = bmap[st,ac]; if (bc == "_") { if (newlines < 2) { printf "\n"; newlines++; } } else { printf "%s", bc; newlines = 0; } st = substr((st bc), 2); } } END { if (abort >= 0) { exit abort; } # Force file to end with newline, to keep tools happy: if (newlines == 0) { printf "\n"; newlines++; } } function read_char_table(fname, nlin,lin,fld,nfld) { # Global variables defined by this procedure: nach = 0; split("", ach); split("", apr); # Do it: nlin = 0; while((getline lin < fname) > 0) { nlin++; if (! match(lin, /^[ \011]*([#]|$)/)) { nfld = split(lin, fld); if ((nfld >= 4) && (fld[4] ~ /^[\#]/)) { nfld = 3; } if (nfld != 3) tbl_error(fname, nlin, ("bad A table entry = \"" lin "\" nfld = " nfld)); if (length(fld[3]) != 1) tbl_error(fname, nlin, ("bad A char = \"" fld[3] "\"")); ach[nach] = fld[3]; if (fld[2] !~ /^[01][.][0-9]+$/) tbl_error(fname, nlin, ("bad A freq = \"" fld[2] "\"")); apr[nach] = fld[2]; nach++; } } if (ERRNO != "0") { tbl_error(fname, nlin, ERRNO); } close (fname); if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); } printf "read %d characters\n", nach > "/dev/stderr" } function read_pair_table(fname, npairs,nlin,lin,fld,nfld,ofld3,st,j,ch,bchseen) { # Global variables defined by this procedure: nbst = 0; split("", bst); nbch = 0; split("", bch); split("", nbtr); split("", btrch); split("", btrpr); # Do it: split("", bchseen); npairs = 0; nlin = 0; ofld3 = ""; while((getline lin < fname) > 0) { nlin++; if (! match(lin, /^[ \011]*([#]|$)/)) { nfld = split(lin, fld); if ((nfld >= 5) && (fld[5] ~ /^[\#]/)) { nfld = 4; } if (nfld != 4) tbl_error(fname, nlin, ("bad B table entry = \"" lin "\"")); if (fld[3] != ofld3) { if ((fld[3] !~ /^[$]/) || (length(fld[3]) != order+1)) tbl_error(fname, nlin, ("bad B state = \"" fld[3] "\"")); ofld3 = fld[3]; st = substr(fld[3],2); bst[nbst] = st; nbtr[st] = 0; nbst++; } if (length(fld[4]) != 1) tbl_error(fname, nlin, ("bad B char = \"" fld[3] "\"")); ch = fld[4]; j = nbtr[st]; btrch[st,j] = ch; if (! (ch in bchseen)) { bch[nbch] = ch; nbch++; bchseen[ch] = 1; } if (fld[2] !~ /^[01][.][0-9]+$/) tbl_error(fname, nlin, ("bad B freq = \"" fld[2] "\"")); btrpr[st,j] = fld[2]; nbtr[st]++; npairs++; # printf "* %s %s %s %d %d\n", st, ch, fld[2], nbtr[st], nbch > "/dev/stderr"; } } if (ERRNO != "0") { tbl_error(fname, nlin, ERRNO); } close (fname); if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); } printf "read %d arcs, %d states, %d chars\n", npairs, nbst, nbch > "/dev/stderr" } function arg_error(msg) { printf "%s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; abort = 1; exit 1 } function prog_error(msg) { printf "%s\n", msg > "/dev/stderr"; abort = 1; exit 1 } function tbl_error(f,n,msg) { printf "file %s, line %d: %s\n", f, n, msg > "/dev/stderr"; abort = 1; exit 1 } function data_error(msg) { printf "line %d: %s\n", FNR, msg > "/dev/stderr"; abort = 1; exit 1 }