Hacking at the Voynich manuscript - Side notes 108 Computing and comparing entropy profiles Last edited on 2012-05-03 20:46:09 by stolfilocal INTRODUCTION In this note we compute the conditional glyph entropies and the token entropy profile, for Voynichese and other languages. The conditional symbol entropy h[k] of order k is the expected number of bits that are provided by a random symbol in the text, given the previous k-1 symbols. The token entropy profile t[k] is the expected number of bits contained in the kth symbol of a token, given the previous k-1 symbols. The sum of t[k] for all k should be equal to the token entropy. We start from the `bad' text because we must reject any n-gram that touches a `bad' word. We also consider only the main text since the labels are not ordered in the VMS, and not meaningful in the other languages. set basicglyphs = 'e,i,o,a,y,q,l,d,r,s,n,m,Ch,Sh,k,t,CKh,CTh,f,p,CFh,CPh' set okoglyphs = 'q,y,a,o,k,t,f,p,ke,te,fe,pe,CKh,CTh,CFh,CPh,CKhe,CThe,CFhe,CPhe,Ch,Sh,ee,Che,She,eee,d,l,r,s,n,m,in,im,ir,iin,iim,iir,iiin' set wkind = "text"; foreach ekind ( basic oko ) set ifile = "lang/voyn/${wkind}/raw.wds" set ofile = "lang/voyn/${wkind}/ngrams-${ekind}.nct" echo "${ifile} -> ${ofile}" set glyphs = "${basicglyphs}" if ( "/${ekind}" == "/oko" ) set glyphs = "${okoglyphs}" cat ${ifile} \ | capitalize-ligatures -v field=1 \ | factor-word-${ekind} -v inField=1 -v outField=2 \ | gawk '/./{ print $2; }' \ | extract-and-count-ngrams -f parse-elem-list.gawk \ -v maxOrder=10 \ -v elemList="${glyphs}" \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end Extracting the N-letter ngrams of English and Latin tokens, and their counts: set trivialglyphs = 'a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z' foreach lang ( engl latn ) set wkind = "text"; set ekind = "trivial" set ifile = "lang/${lang}/${wkind}/raw.wds" set ofile = "lang/${lang}/${wkind}/ngrams-${ekind}.nct" echo "${ifile} -> ${ofile}" cat ${ifile} \ | 
factor-word-${ekind} -v inField=1 -v outField=2 \ | gawk '/./{ print $2; }' \ | extract-and-count-ngrams -f parse-elem-list.gawk \ -v maxOrder=10 \ -v elemList="${trivialglyphs}" \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end dicio-wc lang/*/*/ngrams-*.nct lines words bytes file ------- ------- --------- ------------ 586922 1173844 19463037 lang/engl/text/ngrams-trivial.nct 720891 1441782 24036696 lang/latn/text/ngrams-trivial.nct 485787 971574 16889950 lang/voyn/text/ngrams-basic.nct 529010 1058020 18794978 lang/voyn/text/ngrams-oko.nct Computing the conditional entropies of a random element, given the preceding k-1 elements. We must add an extra "{}" at the beginning of each ngram to avoid empty prefixes when k = 1. foreach le ( voyn/{basic,oko} {engl,latn}/trivial ) set lang = "${le:h}"; set ekind = "${le:t}" set ifile = "lang/${lang}/text/ngrams-${ekind}.nct" set ofile = "lang/${lang}/text/ngram-entropies-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | gawk '/./{ \ ct = $1; w = ("{}" $2); \ w = gensub(/([{][^{}]*[}])$/, " \\1", "s", w); \ print ct, w; \ }' \ | sort -b +1 -2 \ | compute-cond-entropy \ | sort -b +1 -2gr \ > ${ofile} end dicio-wc lang/*/*/ngram-entropies-[a-z]*.tbl lines words bytes file ------- ------- --------- ------------ 440491 1321473 18597686 lang/engl/text/ngram-entropies-trivial.tbl 540140 1620420 22928920 lang/latn/text/ngram-entropies-trivial.tbl 349254 1047762 15270975 lang/voyn/text/ngram-entropies-basic.tbl 395069 1185207 17587988 lang/voyn/text/ngram-entropies-oko.tbl Extract the entropy after specific characters and character pairs: foreach le ( voyn/{basic,oko} {engl,latn}/trivial ) foreach order ( 01 02 03 ) set lang = "${le:h}"; set ekind = "${le:t}" set ifile = "lang/${lang}/text/ngram-entropies-${ekind}.tbl"; set ofile = "lang/${lang}/text/ngram-entropies-${order}-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | gawk -v order="${order}" \ '/./{w=$3; gsub(/[^{]/,"",w); 
if(length(w)==order) {print;}}' \ > ${ofile} end end Computing the k-order entropy, defined as h[k] = average entropy of the kth character (including word-stop) given the k-1 preceding characters. foreach le ( voyn/{basic,oko} {engl,latn}/trivial ) set lang = "${le:h}"; set ekind = "${le:t}" set ifile = "lang/${lang}/text/ngram-entropies-${ekind}.tbl"; set ofile = "lang/${lang}/text/hk-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | compute-hk-entropies \ > ${ofile} end Plots of the average k-order conditional entropy h[k] as a function of k. (These commands must be executed on a Sun.) foreach fmt ( eps gif ) foreach ekind ( basic oko ) set ofile = "hk-plots-${ekind}.${fmt}"; echo "${ofile}" compare-entropy-profiles \ -column 3 -size 0.75,0.75 -maxlen 10 -format ${fmt} \ lang/voyn/text/hk-${ekind}.tbl "Voynichese" \ lang/engl/text/hk-trivial.tbl "English" \ lang/latn/text/hk-trivial.tbl "Latin" \ > ${ofile} mv -b ${ofile} ${figdir}/ end end Compacting the garbage: gzip lang/*/*/ngram-*.tbl gzip lang/*/*/ngrams-*.nct EXTRACTING THE N-LETTER PREFIX DISTRIBUTIONS Let's extract the N-symbol prefixes of Voynichese tokens, in the basic glyphs and OKO elements, and their counts. 
foreach wkind ( text labs ) foreach ekind ( basic oko ) set lang = "voyn" set ifile = "lang/${lang}/${wkind}/gud.wfr" set ofile = "lang/${lang}/${wkind}/prefs-${ekind}.pct" echo "${ifile} -> ${ofile}" cat ${ifile} \ | capitalize-ligatures -v field=3 \ | factor-word-${ekind} -v inField=3 -v outField=4 \ | gawk '/./{ print $1, $4; }' \ | extract-and-count-prefixes \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end end Extracting the N-letter prefixes of English and Latin tokens, and their counts: foreach lang ( engl latn ) set wkind = "text"; set ekind = "trivial" set ifile = "lang/${lang}/${wkind}/gud.wfr" set ofile = "lang/${lang}/${wkind}/prefs-${ekind}.pct" echo "${ifile} -> ${ofile}" cat ${ifile} \ | factor-word-trivial -v inField=3 -v outField=4 \ | gawk '/./{ print $1, $4; }' \ | extract-and-count-prefixes \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end dicio-wc lang/{labs,text}/*-prefs.pct lines words bytes file ------- ------- --------- ------------ 2446 4892 69818 lang/labs/voyn-basic-prefs.pct 2304 4608 63756 lang/labs/voyn-oko-prefs.pct 20040 40080 634290 lang/text/engl-trivial-prefs.pct 32932 65864 1133625 lang/text/latn-trivial-prefs.pct 19179 38358 572713 lang/text/voyn-basic-prefs.pct 17746 35492 504665 lang/text/voyn-oko-prefs.pct COMPUTING THE LETTER ENTROPIES Computing the conditional entropies of the last element in the token prefix, given the preceding letters: foreach lwe ( voyn/{text,labs}.{basic,oko} {engl,latn}/text.trivial ) set lang = "${lwe:h}"; set we = "${lwe:t}" set wkind = "${we:r}"; set ekind = "${we:e}" set ifile = "lang/${wkind}/${lang}-${ekind}-prefs.pct" set ofile = "lang/${wkind}/${lang}-prefix-entropies-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | gawk '/./{ \ ct = $1; w = $2; w = gensub(/([{][^{}]*[}])$/, " \\1", "s", w); \ print ct, w; \ }' \ | sort -b +1 -2 \ | compute-cond-entropy \ | sort -b +1 -2gr \ > ${ofile} end Computing the entropy profile, 
defined as t[k] = average entropy of the kth character of a token (including word-stop) given the k-1 preceding characters. foreach lwe ( voyn/{text,labs}.{basic,oko} {engl,latn}/text.trivial ) set lang = "${lwe:h}"; set we = "${lwe:t}" set wkind = "${we:r}"; set ekind = "${we:e}" set ifile = "lang/${wkind}/${lang}-prefix-entropies-${ekind}.tbl"; set ofile = "lang/${wkind}/${lang}-entropy-profile-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | compute-token-entropy-profile \ > ${ofile} end Checking whether the sum of the weighted conditional entropies is equal to the token entropy: foreach lwe ( voyn/{text,labs}.{basic,oko} {engl,latn}/text.trivial ) set lang = "${lwe:h}"; set we = "${lwe:t}" set wkind = "${we:r}"; set ekind = "${we:e}" set ifile = "lang/${wkind}/${lang}-entropy-profile-${ekind}.tbl"; echo " " printf "%s %s %-6s: " "${lang}" "${wkind}" "${ekind}" cat ${ifile} \ | gawk '/./{ s+= $5; } END { print s; }' end voyn text basic : 10.121 voyn text oko : 10.123 voyn labs basic : 9.204 voyn labs oko : 9.204 engl text trivial : 9.176 latn text trivial : 10.618 Plots of the conditional entropy for each character position: foreach fmt ( eps gif ) foreach ekind ( basic oko ) set ofile = "entropy-profiles-${ekind}.${fmt}"; echo "${ofile}" compare-entropy-profiles \ -column 4 -size 1.50,0.75 -maxlen 20 -format ${fmt} \ lang/text/voyn-entropy-profile-${ekind}.tbl "Voynichese (text)" \ lang/labs/voyn-entropy-profile-${ekind}.tbl "Voynichese (labels)" \ lang/text/engl-entropy-profile-trivial.tbl "English" \ lang/text/latn-entropy-profile-trivial.tbl "Latin" \ > ${ofile} mv -b ${ofile} ${figdir}/ end end