Hacking at the Voynich manuscript - Side notes 108 Computing and comparing entropy profiles Last edited on 2012-05-03 20:46:09 by stolfilocal INTRODUCTION In this note we compute the conditional glyph entropies and the token entropy profile, for Voynichese and other languages. The conditional symbol entropy h[k] of order k is the expected number of bits that are provided by a random symbol in the text, given the previous k-1 symbols. The token entropy profile t[k] is the expected number of bits contained in the kth symbol of a token, given the previous k-1 symbols. The sum of t[k] for all k should be equal to the token entropy. SETTING UP THE ENVIRONMENT Links: ln -s ../tr-stats/dat ln -s ../tr-stats/exp ln -s /home/staff/stolfi/voynich/work ln -s work/capitalize-ligatures ln -s work/compute-cum-cum-freqs ln -s work/compute-cum-freqs ln -s work/compute-freqs ln -s work/combine-counts ln -s work/compute-cond-entropy ln -s work/remove-freqs ln -s work/totalize-fields ln -s work/select-units ln -s work/words-from-evt ln -s work/format-counts-packed ln -s work/parse-elem-list.gawk ln -s work/factor-text-trivial.gawk ln -s work/factor-text-viqr-to-phon.gawk ln -s work/factor-text-pinyin-to-phon.gawk ln -s work/factor-text-pinyin-std.gawk ln -s work/factor-text-pinyin-fix.gawk ln -s work/factor-text-eva-to-basic.gawk ln -s work/factor-text-eva-to-oko.gawk TESTING THE CONDITIONAL ENTROPY CALCULATOR Generating a list of tuples with known entropies rm -f .test.wct foreach wc ( \ "{0}{0}.{a}" "{0}{1}.{b}" "{0}{2}.{c}" "{0}{3}.{d}" \ "{1}{0}.{b}" "{1}{1}.{c}" "{1}{2}.{d}" "{1}{3}.{a}" \ "{2}{0}.{c}" "{2}{1}.{d}" "{2}{2}.{a}" "{2}{3}.{b}" \ "{3}{0}.{d}" "{3}{1}.{a}" "{3}{2}.{b}" "{3}{3}.{c}" \ ) echo "40 ${wc:r} ${wc:e}" >> .test.wct end foreach w ( \ "{0}{a}" "{0}{b}" "{0}{c}" "{0}{d}" \ "{1}{a}" "{1}{b}" "{1}{c}" "{1}{d}" \ "{2}{a}" "{2}{b}" "{2}{c}" "{2}{d}" \ "{3}{a}" "{3}{b}" "{3}{c}" "{3}{d}" \ ) foreach c ( "{0}" "{1}" "{2}" "{3}" ) echo "10 $w $c" >> .test.wct end end 
foreach w ( \ "{a}{0}" "{b}{0}" "{c}{0}" "{d}{0}" \ "{a}{1}" "{b}{1}" "{c}{1}" "{d}{1}" \ "{a}{2}" "{b}{2}" "{c}{2}" "{d}{2}" \ "{a}{3}" "{b}{3}" "{c}{3}" "{d}{3}" \ ) foreach c ( "{0}" "{1}" "{2}" "{3}" ) echo "10 $w $c" >> .test.wct end end Testing the script (the result should be 1.333 bits): cat .test.wct \ | sort -b +1 -2 \ | compute-cond-entropy \ > .test.ents cat .test.ents cat .test.ents \ | gawk '//{s+= $2;n++;} END{print s/n;}' EXTRACTING THE N-GRAM DISTRIBUTIONS Selecting the samples and length-defining encodings: set sampelems = ( \ voyn/maj.bgly \ voyn/prs.bgly \ voyn/lab.bgly \ \ voyn/maj.qoko \ voyn/prs.qoko \ voyn/lab.qoko \ \ engl/wow.lets \ engl/cul.lets \ latn/ptt.lets \ grek/nwt.lets \ span/qvi.lets \ geez/gok.sera \ viet/ptt.viqr \ viet/ptt.phon \ tibe/vim.acip \ tibe/ccv.acip \ chin/ptt.stpy \ chin/ptt.fxpy \ chin/ptt.phon \ chin/red.stpy \ chin/red.fxpy \ chin/red.phon \ \ enrc/wow.lets \ chrc/red.lets \ \ engl/wnm.lets \ engl/cnp.lets \ ) ###################################################################### TO FIX AND REDO Let's extract the N-grams (strings of consecutive glyphs) of Voynichese text, considering word space a glyph. We start from the `bad' text because we must reject any n-gram that touches a `bad' word. We also consider only the main text since the labels are not ordered in the VMS, and not meaningful in the other languages. 
set basicglyphs = 'e,i,o,a,y,q,l,d,r,s,n,m,Ch,Sh,k,t,CKh,CTh,f,p,CFh,CPh' set okoglyphs = 'q,y,a,o,k,t,f,p,ke,te,fe,pe,CKh,CTh,CFh,CPh,CKhe,CThe,CFhe,CPhe,Ch,Sh,ee,Che,She,eee,d,l,r,s,n,m,in,im,ir,iin,iim,iir,iiin' set wkind = "text"; foreach ekind ( basic oko ) set ifile = "lang/voyn/${wkind}/raw.wds" set ofile = "lang/voyn/${wkind}/ngrams-${ekind}.nct" echo "${ifile} -> ${ofile}" set glyphs = "${basicglyphs}" if ( "/${ekind}" == "/oko" ) set glyphs = "${okoglyphs}" cat ${ifile} \ | capitalize-ligatures -v field=1 \ | factor-word-${ekind} -v inField=1 -v outField=2 \ | gawk '/./{ print $2; }' \ | extract-and-count-ngrams -f parse-elem-list.gawk \ -v maxOrder=10 \ -v elemList="${glyphs}" \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end Extracting the N-letter ngrams of English and Latin tokens, and their counts: set trivialglyphs = 'a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z' foreach lang ( engl latn ) set wkind = "text"; set ekind = "trivial" set ifile = "lang/${lang}/${wkind}/raw.wds" set ofile = "lang/${lang}/${wkind}/ngrams-${ekind}.nct" echo "${ifile} -> ${ofile}" cat ${ifile} \ | factor-word-${ekind} -v inField=1 -v outField=2 \ | gawk '/./{ print $2; }' \ | extract-and-count-ngrams -f parse-elem-list.gawk \ -v maxOrder=10 \ -v elemList="${trivialglyphs}" \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end dicio-wc lang/*/*/ngrams-*.nct lines words bytes file ------- ------- --------- ------------ 586922 1173844 19463037 lang/engl/text/ngrams-trivial.nct 720891 1441782 24036696 lang/latn/text/ngrams-trivial.nct 485787 971574 16889950 lang/voyn/text/ngrams-basic.nct 529010 1058020 18794978 lang/voyn/text/ngrams-oko.nct Computing the conditional entropies of a random element, given the preceding k-1 elements. We must add an extra "{}" at the beginning of each ngram to avoid empty prefixes when k = 1. 
foreach le ( voyn/{basic,oko} {engl,latn}/trivial ) set lang = "${le:h}"; set ekind = "${le:t}" set ifile = "lang/${lang}/text/ngrams-${ekind}.nct" set ofile = "lang/${lang}/text/ngram-entropies-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | gawk '/./{ \ ct = $1; w = ("{}" $2); \ w = gensub(/([{][^{}]*[}])$/, " \\1", "s", w); \ print ct, w; \ }' \ | sort -b +1 -2 \ | compute-cond-entropy \ | sort -b +1 -2gr \ > ${ofile} end dicio-wc lang/*/*/ngram-entropies-[a-z]*.tbl lines words bytes file ------- ------- --------- ------------ 440491 1321473 18597686 lang/engl/text/ngram-entropies-trivial.tbl 540140 1620420 22928920 lang/latn/text/ngram-entropies-trivial.tbl 349254 1047762 15270975 lang/voyn/text/ngram-entropies-basic.tbl 395069 1185207 17587988 lang/voyn/text/ngram-entropies-oko.tbl Extract the entropy after specific characters and character pairs: foreach le ( voyn/{basic,oko} {engl,latn}/trivial ) foreach order ( 01 02 03 ) set lang = "${le:h}"; set ekind = "${le:t}" set ifile = "lang/${lang}/text/ngram-entropies-${ekind}.tbl"; set ofile = "lang/${lang}/text/ngram-entropies-${order}-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | gawk -v order="${order}" \ '/./{w=$3; gsub(/[^{]/,"",w); if(length(w)==order) {print;}}' \ > ${ofile} end end Computing the k-order entropy, defined as h[k] = average entropy of the kth character (including word-stop) given the k-1 preceding characters. foreach le ( voyn/{basic,oko} {engl,latn}/trivial ) set lang = "${le:h}"; set ekind = "${le:t}" set ifile = "lang/${lang}/text/ngram-entropies-${ekind}.tbl"; set ofile = "lang/${lang}/text/hk-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | compute-hk-entropies \ > ${ofile} end Plots of the average k-order conditional entropy h[k] as a function of k. (These commands must be executed on a Sun.) 
foreach fmt ( eps gif ) foreach ekind ( basic oko ) set ofile = "hk-plots-${ekind}.${fmt}"; echo "${ofile}" compare-entropy-profiles \ -column 3 -size 0.75,0.75 -maxlen 10 -format ${fmt} \ lang/voyn/text/hk-${ekind}.tbl "Voynichese" \ lang/engl/text/hk-trivial.tbl "English" \ lang/latn/text/hk-trivial.tbl "Latin" \ > ${ofile} mv -b ${ofile} ${figdir}/ end end Compacting the garbage: gzip lang/*/*/ngram-*.tbl gzip lang/*/*/ngrams-*.nct EXTRACTING THE N-LETTER PREFIX DISTRIBUTIONS Let's extract the N-symbol prefixes of Voynichese tokens, in the basic glyphs and OKO elements, and their counts. foreach wkind ( text labs ) foreach ekind ( basic oko ) set lang = "voyn" set ifile = "lang/${lang}/${wkind}/gud.wfr" set ofile = "lang/${lang}/${wkind}/prefs-${ekind}.pct" echo "${ifile} -> ${ofile}" cat ${ifile} \ | capitalize-ligatures -v field=3 \ | factor-word-${ekind} -v inField=3 -v outField=4 \ | gawk '/./{ print $1, $4; }' \ | extract-and-count-prefixes \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end end Extracting the N-letter prefixes of English and Latin tokens, and their counts: foreach lang ( engl latn ) set wkind = "text"; set ekind = "trivial" set ifile = "lang/${lang}/${wkind}/gud.wfr" set ofile = "lang/${lang}/${wkind}/prefs-${ekind}.pct" echo "${ifile} -> ${ofile}" cat ${ifile} \ | factor-word-trivial -v inField=3 -v outField=4 \ | gawk '/./{ print $1, $4; }' \ | extract-and-count-prefixes \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end dicio-wc lang/{labs,text}/*-prefs.pct lines words bytes file ------- ------- --------- ------------ 2446 4892 69818 lang/labs/voyn-basic-prefs.pct 2304 4608 63756 lang/labs/voyn-oko-prefs.pct 20040 40080 634290 lang/text/engl-trivial-prefs.pct 32932 65864 1133625 lang/text/latn-trivial-prefs.pct 19179 38358 572713 lang/text/voyn-basic-prefs.pct 17746 35492 504665 lang/text/voyn-oko-prefs.pct COMPUTING THE LETTER ENTROPIES Computing the conditional entropies of the last 
element in the token prefix, given the preceding letters: foreach lwe ( voyn/{text,labs}.{basic,oko} {engl,latn}/text.trivial ) set lang = "${lwe:h}"; set we = "${lwe:t}" set wkind = "${we:r}"; set ekind = "${we:e}" set ifile = "lang/${wkind}/${lang}-${ekind}-prefs.pct" set ofile = "lang/${wkind}/${lang}-prefix-entropies-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | gawk '/./{ \ ct = $1; w = $2; w = gensub(/([{][^{}]*[}])$/, " \\1", "s", w); \ print ct, w; \ }' \ | sort -b +1 -2 \ | compute-cond-entropy \ | sort -b +1 -2gr \ > ${ofile} end Computing the entropy profile, defined as h[k] = average entropy of the kth character (including word-stop) given the k-1 preceding characters. foreach lwe ( voyn/{text,labs}.{basic,oko} {engl,latn}/text.trivial ) set lang = "${lwe:h}"; set we = "${lwe:t}" set wkind = "${we:r}"; set ekind = "${we:e}" set ifile = "lang/${wkind}/${lang}-prefix-entropies-${ekind}.tbl"; set ofile = "lang/${wkind}/${lang}-entropy-profile-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | compute-token-entropy-profile \ > ${ofile} end Checking whether the sum of the weighted conditional entropies is equal to the token entropy: foreach lwe ( voyn/{text,labs}.{basic,oko} {engl,latn}/text.trivial ) set lang = "${lwe:h}"; set we = "${lwe:t}" set wkind = "${we:r}"; set ekind = "${we:e}" set ifile = "lang/${wkind}/${lang}-entropy-profile-${ekind}.tbl"; echo " " printf "%s %s %-6s: " "${lang}" "${wkind}" "${ekind}" cat ${ifile} \ | gawk '/./{ s+= $5; } END { print s; }' end voyn text basic : 10.121 voyn text oko : 10.123 voyn labs basic : 9.204 voyn labs oko : 9.204 engl text trivial : 9.176 latn text trivial : 10.618 Plots of the conditional entropy for each character position: foreach fmt ( eps gif ) foreach ekind ( basic oko ) set ofile = "entropy-profiles-${ekind}.${fmt}"; echo "${ofile}" compare-entropy-profiles \ -column 4 -size 1.50,0.75 -maxlen 20 -format ${fmt} \ lang/text/voyn-entropy-profile-${ekind}.tbl "Voynichese (text)" 
\ lang/labs/voyn-entropy-profile-${ekind}.tbl "Voynichese (labels)" \ lang/text/engl-entropy-profile-trivial.tbl "English" \ lang/text/latn-entropy-profile-trivial.tbl "Latin" \ > ${ofile} mv -b ${ofile} ${figdir}/ end end