Hacking at the Voynich manuscript - Side notes
068 Language transducers with specified output freqs

Last edited on 2004-10-05 13:06:25 by stolfi

Note: the commands below are csh/tcsh fragments. Explanatory text is
kept outside the indented command blocks.

LINK AND DIRECTORY SETUP

  ln -s /home/staff/stolfi/voynich/work
  ln -s ../101/dat dat
  mkdir -p res

VOYNICHESE SAMPLE

Got some texts:

  set samples = ( \
      voyn/maj/tot.1 \
      engl/wow/tot.1 \
      ital/psp/tot.1 \
      chip/voa/tot.1 \
    )

  foreach smp ( ${samples} )
    mkdir -p res/${smp}
  end

GATHERING THE TEXTS

Gather good words from source files, write them one word per line.
Discard bad chars and "words" starting with digits (beware of pinyin
tones). Map parag breaks to blank lines.

In the gawk step, input lines starting with "a" contribute their third
field as a word; lines consisting of "#", optional spaces and "=", and
lines starting with "p" and ending with " =", each yield a blank line.
The grep steps then drop every line containing "?" or "*", and every
line starting with a digit.

  foreach smp ( ${samples} )
    set infile = dat/${smp}/raw.tlw
    set otfile = res/${smp}/gud.tks
    echo "=== ${infile} -> ${otfile} ==="
    cat ${infile} \
      | gawk \
          ' /^[a]/ { print $3; } \
            /^[#] *[=]$/ { print ""; } \
            /^[p].* [=]$/ { print ""; } \
          ' \
      | grep -E -v -e '[?*]' \
      | grep -E -v -e '^[0-9]' \
      > ${otfile}
    head -200 ${otfile} | fmt -w 72
  end

COMPUTING LETTER FREQUENCIES

Computing table of letter frequencies, sorted, for each language.
The table is sorted on field 1, numerically, in decreasing order
("-k1,1nr" is the POSIX spelling of the obsolete "+0 -1nr" key):

  foreach smp ( ${samples} )
    set infile = res/${smp}/gud.tks
    set otfile = res/${smp}/gud.lfr
    echo "=== ${infile} -> ${otfile} ==="
    cat ${infile} \
      | gather-letter-freqs \
      | sort -b -k1,1nr \
      > ${otfile}
    cat ${otfile}
  end

BUILDING MARKOV MODELS

Collecting the Markov transition probabilities. Each table is sorted
on field 3, and within equal field 3 on field 1, numerically, in
decreasing order ("-k3,3 -k1,1nr" is the POSIX spelling of the
obsolete "+2 -3 +0 -1nr" keys):

  set orders = ( 0 1 2 3 )

  foreach smp ( ${samples} )
    foreach ord ( ${orders} )
      set infile = res/${smp}/gud.tks
      set otfile = res/${smp}/gud-${ord}.pfr
      echo "=== ${infile} -> ${otfile} ==="
      cat ${infile} \
        | gather-transition-freqs \
            -v order=${ord} \
        | sort -b -k3,3 -k1,1nr \
        > ${otfile}
      head -100 ${otfile}
    end
  end

TRANSDUCTION

We read the letter frequency table of language A, the letter pair
frequency table of language B, combine them into a transducer, and
apply it to the sample of text A.
Each element of "pairs" is "A,B": the sample to be transduced (A) and
the sample whose order-${ord} transition table drives the transducer (B).
The output goes to res/A/syn/B/syn-${ord}.tks. The directory to create
is the head of that path ("${xtext:h}"); the ":r" modifier would strip
only the ".tks" extension and create a stray "syn-${ord}" directory.

  set pairs = ( \
      engl/wow/tot.1,ital/psp/tot.1 \
      ital/psp/tot.1,engl/wow/tot.1 \
      voyn/maj/tot.1,engl/wow/tot.1 \
      voyn/maj/tot.1,chip/voa/tot.1 \
      voyn/maj/tot.1,ital/psp/tot.1 \
    )

  foreach p ( ${pairs} )
    foreach ord ( ${orders} )
      set pp = ( `echo $p | tr ',' ' '` )
      set asmp = "${pp[1]}"; set bsmp = "${pp[2]}"
      set atext = res/${asmp}/gud.tks
      set alcts = res/${asmp}/gud.lfr
      set bpcts = res/${bsmp}/gud-${ord}.pfr
      set xtext = res/${asmp}/syn/${bsmp}/syn-${ord}.tks
      mkdir -p ${xtext:h}
      echo "=== ${atext} ( ${bpcts} ) -> ${xtext} ==="
      cat ${atext} \
        | transducer \
            -v order=${ord} \
            -v achfile=${alcts} \
            -v bprfile=${bpcts} \
        > ${xtext}
      head -200 ${xtext} | fmt -w 72
    end
  end