Hacking at the Voynich manuscript - Side notes
201 Statistics of crescent and circle sequences

Last edited on 2002-01-04 00:50:45 by stolfi

INTRODUCTION

  In this note we compute some statistics about the crescent and
  circle glyphs with the purpose of deciding how to parse strings
  of those elements in the fine-structure model.

  The command blocks below are csh/tcsh transcripts (foreach/end,
  set, and the ${var:h} / ${var:t} modifiers); they are not
  Bourne-shell compatible.

SETTING UP THE ENVIRONMENT

  Commands:

    ln -s ../../capitalize-ligatures
    ln -s ../../compute-cum-cum-freqs
    ln -s ../../compute-cum-freqs
    ln -s ../../compute-freqs
    ln -s ../../combine-counts
    ln -s ../../remove-freqs
    ln -s ../../totalize-fields
    ln -s ../../select-units
    ln -s ../../words-from-evt
    ln -s ../../format-counts-packed

  [NOTE(review): the pipelines in this note also invoke
  extract-glyph-strings and revbytes, which are not linked here;
  presumably they are found on the command search path -- confirm.]

  Data:

    ln -s ../100/subsections.tags
    ln -s ../100/data
    ln -s ../101/lang
    ln -s ../103/factor-word-basic
    ln -s ../103/count-elems
    ln -s ../103/count-elem-pairs
    ln -s ../103/compute-row-col-freqs
    ln -s ../103/tex-format-elem-pair-freqs
    ln -s ../103/parse-elem-list.gawk

  Paper directories:

    set tbldir = "/home/staff/stolfi/papers/voynich-words/techrep/tables/auto"
    set figdir = "/home/staff/stolfi/papers/voynich-words/techrep/figures/auto"

  Result directories:

    mkdir -p stats/{text,labs}/{t,w}

  Section tags:

    set secs = ( `cat subsections.tags` )
    set secscm = `echo ${secs} | tr ' ' ','`
    echo ${secs}; echo ${secscm}

INVESTIGATING THE CONTEXT STATISTICS OF CIRCLE AND CRESCENT GLYPHS

  Collecting all the crescent and circle letter slots. When tw is
  "w" every input line is given count 1; otherwise the count in
  field 1 of the input is used:

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach glyphs ( aoy e aoye )
          set ifile = "lang/voyn/${wkind}/gud.wfr"
          set ofile = "stats/${wkind}/${tw}/slots-${glyphs}-in-ctx.frq"
          echo " "; echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | capitalize-ligatures -v field=3 \
            | factor-word-basic -v inField=3 -v outField=4 \
            | gawk -v tw="${tw}" '/./{print (tw == "w" ? 1 : $1), $4;}' \
            | extract-glyph-strings -v glyphs="${glyphs}" \
            | combine-counts \
            | sort -b -k1,1nr -k2,2 \
            | compute-freqs \
            > ${ofile}
        end
      end
    end

  Tabulating slots of each type (the {...} context parts are
  deleted, so that only the slot itself is counted):

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach glyphs ( aoy e aoye )
          set ifile = "stats/${wkind}/${tw}/slots-${glyphs}-in-ctx.frq"
          set ofile = "stats/${wkind}/${tw}/slots-${glyphs}.frq"
          echo " "; echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | gawk '/./{print $1, $3}' \
            | sed -e 's/{[^{}]*}//g' \
            | combine-counts \
            | sort -b -k1,1nr -k2,2 \
            | compute-freqs \
            > ${ofile}
        end
      end
    end

DOUBLE CONTEXTS OF SINGLE CRESCENT AND CIRCLE GLYPHS

  Computing and formatting the two-sided contexts of single <a>,
  <o>, <y>, <e>, and [aoy]. Each "fam/glyphs" item names the slot
  file family (${fg:h}) and the glyph set selected from it (${fg:t}):

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach fg ( aoy/a aoy/o aoy/y aoy/aoy e/e )
          set fam = "${fg:h}"; set glyphs = "${fg:t}"
          set ifile = "stats/${wkind}/${tw}/slots-${fam}-in-ctx.frq"
          set ofile = "stats/${wkind}/${tw}/single-${glyphs}-ctx.frq"
          echo " "; echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | gawk -v glyphs="${glyphs}" \
                ' ($3 ~ ("<[" glyphs "]>")){ \
                    wd = $3; gsub(/[<].[>]/, ":", wd); gsub(/[{}]/,"", wd); \
                    print $1, wd; \
                  } \
                ' \
            | combine-counts \
            | compute-row-col-freqs -v outputTotals=1 \
            | sort -b -k1,1nr -k5,5 \
            > ${ofile}
        end
      end
    end

  Format the results as TeX tables. Each "glyphs/elems" item names
  the statistics file (${ge:h}) and the extra elements spliced into
  the row/column list (${ge:t}):

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach ge ( e/a,o,-,y a/e o/e y/e aoy/e )
          set glyphs = "${ge:h}"; set elems = "${ge:t}"
          set ifile = "stats/${wkind}/${tw}/single-${glyphs}-ctx.frq"
          set ofile = "single-${glyphs}-ctx-${wkind}-${tw}.tex"
          echo " "; echo "${ifile} -> ${ofile}"
          set xelems = '_,-,q,-,'"${elems}"',-,l,d,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
          cat ${ifile} \
            | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
                -v rowList="${xelems}" -v colList="${xelems}" \
                -v endMarker='_' -v freqDigits=2 -v minFreq=0.005 \
                -v showCounts=1 -v showRowFreqs=0 -v showColFreqs=0 \
            > ${ofile}
          mv -bv ${ofile} ${tbldir}/
        end
      end
    end

ONE-SIDED CONTEXTS OF SINGLE CRESCENT AND CIRCLE
GLYPHS

  Computing and formatting the previous and next glyph distributions
  for single [e] and [aoy] strings. For each matching slot the text
  before the <.> marker is emitted as "prev:..." and the text after
  it as "next:...":

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach fg ( aoy/a aoy/o aoy/y aoy/aoy e/e )
          set fam = "${fg:h}"; set glyphs = "${fg:t}"
          set ifile = "stats/${wkind}/${tw}/slots-${fam}-in-ctx.frq"
          set ofile = "stats/${wkind}/${tw}/prev-next-single-${glyphs}.frq"
          echo " "; echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | gawk -v glyphs="${glyphs}" \
                ' ($3 ~ ("<[" glyphs "]>")){ \
                    a = $3; gsub(/[<].[>].*$/, "", a); gsub(/[{}]/,"",a); \
                    b = $3; gsub(/^.*[<].[>]/, "", b); gsub(/[{}]/,"",b); \
                    print $1,("prev:" a); print $1,("next:" b); \
                  } \
                ' \
            | combine-counts \
            | compute-row-col-freqs -v outputTotals=1 \
            | sort -b -k1,1nr -k5,5 \
            > ${ofile}
        end
      end
    end

  Format the results as TeX tables:

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach ge ( e/a,o,-,y a/e o/e y/e aoy/e )
          set glyphs = "${ge:h}"; set elems = "${ge:t}"
          set ifile = "stats/${wkind}/${tw}/prev-next-single-${glyphs}.frq"
          set ofile = "prev-next-single-${glyphs}-${wkind}-${tw}.tex"
          echo " "; echo "${ifile} -> ${ofile}"
          set xelems = '_,-,q,-,'"${elems}"',-,l,d,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
          cat ${ifile} \
            | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
                -v rowList="prev,next" -v colList="${xelems}" \
                -v endMarker='_' -v freqDigits=2 -v minFreq=0.005 \
                -v showCounts=0 -v showRowFreqs=1 -v showColFreqs=0 \
            > ${ofile}
          mv -bv ${ofile} ${tbldir}/
        end
      end
    end

COMPARING DISTRIBUTIONS OF MODIFIED AND UNMODIFIED GLYPHS

  Computing the basic symbol pair frequencies in the good text
  (for comparison with the modified-glyph frequencies):

    set bglyphs = '_,-,e,-,i,-,q,-,y,-,a,o,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set ifile = "lang/voyn/${wkind}/gud.wfr"
        set ofile = "stats/${wkind}/${tw}/basic-glyph-pairs.gfr"
        echo " "; echo "${ifile} -> ${ofile}"
        cat ${ifile} \
          | capitalize-ligatures -v field=3 \
          | factor-word-basic -v inField=3 -v outField=4 \
          | gawk -v tw="${tw}" '/./{print (tw == "w" ? 1 : $1), $4;}' \
          | count-elem-pairs -f parse-elem-list.gawk \
              -v endMarker='_' -v showBadWords=0 \
              -v rowList="${bglyphs}" -v colList="${bglyphs}" \
          | compute-row-col-freqs -v outputTotals=1 \
          > ${ofile}
      end
    end

  Computing next-glyph and prev-glyph distributions for <Xe> and
  <eX> pairs, respectively; and ditto for <Xo> and <oX>, etc.
  For dir = "next" the slot glyph is attached to the preceding
  context and ":" separates it from the following context; for
  dir = "prev" it is attached to the following context instead:

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach dir ( next prev )
          foreach fg ( aoy/a aoy/o aoy/y aoy/aoy e/e )
            set fam = "${fg:h}"; set glyphs = "${fg:t}"
            set ifile = "stats/${wkind}/${tw}/slots-${fam}-in-ctx.frq"
            set ofile = "stats/${wkind}/${tw}/elem-${glyphs}-${dir}.gfr"
            echo " "; echo "${ifile} -> ${ofile}"
            cat ${ifile} \
              | gawk -v dir="${dir}" -v glyphs="${glyphs}" \
                  ' BEGIN { \
                      gp = ("{" glyphs "}"); cp = ("[<][" glyphs "][>]"); \
                      rep = ("\\1" (dir == "next" ? (gp ":") : (":" gp)) "\\2"); \
                    } \
                    ($3 ~ cp){ a = $3; \
                      a = gensub(/^(.*)[<].[>](.*)$/, rep, "g", a); \
                      gsub(/[{}]/,"",a); print $1, a; \
                    } \
                  ' \
              | combine-counts \
              | compute-row-col-freqs -v outputTotals=1 \
              | sort -b -k1,1nr -k5,5 \
              > ${ofile}
          end
        end
      end
    end

  Formatting the data for <Xe> and <eX> as TeX tables:

    set nglyphs = '_,-,q,-,y,-,a,o,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    set xepairs = 'Ch,Che,-,Sh,She,-,k,ke,-,t,te,-,CKh,CKhe,-,CTh,CThe'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set pfile = "stats/${wkind}/${tw}/basic-glyph-pairs.gfr"
        set efile = "stats/${wkind}/${tw}/elem-e-next.gfr"
        set ofile = "e-elem-next-distr-${wkind}-${tw}.tex"
        echo " "; echo "${pfile}, ${efile} -> ${ofile}"
        cat ${pfile} ${efile} \
          | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
              -v rowList="${xepairs}" -v colList="${nglyphs},-,+" \
              -v endMarker='_' -v minFreq=0.015 \
              -v showCounts=0 -v showRowFreqs=1 -v showColFreqs=0 \
          > ${ofile}
        mv -bv ${ofile} ${tbldir}/
      end
    end

    set pglyphs = '_,-,q,-,y,-,a,o,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    set expairs = '_,e_,-,y,ey,-,a,ea,-,o,eo,-,d,ed,-,Ch,eCh,-,Sh,eSh,-,k,ek,-,t,et,-,p,ep,-,f,ef,-,CKh,eCKh,-,CTh,eCTh'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set pfile = "stats/${wkind}/${tw}/basic-glyph-pairs.gfr"
        set efile = "stats/${wkind}/${tw}/elem-e-prev.gfr"
        set ofile = "e-elem-prev-distr-${wkind}-${tw}.tex"
        echo " "; echo "${pfile}, ${efile} -> ${ofile}"
        cat ${pfile} ${efile} \
          | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
              -v rowList="${pglyphs},-,+" -v colList="${expairs}" \
              -v endMarker='_' -v minFreq=0.015 \
              -v showCounts=0 -v showRowFreqs=0 -v showColFreqs=1 \
          > ${ofile}
        mv -bv ${ofile} ${tbldir}/
      end
    end

  Formatting the data for <Xo> and <oX> as TeX tables:

    set nglyphs = '_,-,q,-,e,-,i,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    set xopairs = '_,_o,-,d,do,-,r,ro,-,s,so,-,l,lo,-,Ch,Cho,-,Sh,Sho,-,k,ko,-,t,to,-,CKh,CKho,-,CTh,CTho'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set pfile = "stats/${wkind}/${tw}/basic-glyph-pairs.gfr"
        set efile = "stats/${wkind}/${tw}/elem-o-next.gfr"
        set ofile = "o-elem-next-distr-${wkind}-${tw}.tex"
        echo " "; echo "${pfile}, ${efile} -> ${ofile}"
        cat ${pfile} ${efile} \
          | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
              -v rowList="${xopairs}" -v colList="${nglyphs},-,+" \
              -v endMarker='_' -v minFreq=0.015 \
              -v showCounts=0 -v showRowFreqs=1 -v showColFreqs=0 \
          > ${ofile}
        mv -bv ${ofile} ${tbldir}/
      end
    end

  [NOTE(review): in the list below the <l>/<r> groups were written
  as "l,ol,r,or,-,-"; they now use one "-" between groups, as in
  every other pair list in this note.]

    set pglyphs = '_,-,q,-,e,-,i,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    set oxpairs = '_,o_,-,d,od,-,l,ol,-,r,or,-,s,os,-,Ch,oCh,-,Sh,oSh,-,k,ok,-,t,ot,-,p,op,-,f,of,-,CKh,oCKh,-,CTh,oCTh'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set pfile = "stats/${wkind}/${tw}/basic-glyph-pairs.gfr"
        set efile = "stats/${wkind}/${tw}/elem-o-prev.gfr"
        set ofile = "o-elem-prev-distr-${wkind}-${tw}.tex"
        echo " "; echo "${pfile}, ${efile} -> ${ofile}"
        cat ${pfile} ${efile} \
          | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
              -v rowList="${pglyphs},-,+" -v colList="${oxpairs}" \
              -v endMarker='_' -v minFreq=0.015 \
              -v showCounts=0 -v showRowFreqs=0 -v showColFreqs=1 \
          > ${ofile}
        mv -bv ${ofile} ${tbldir}/
      end
    end

INVESTIGATING SINGLE "E"s

  Count single <e>s (not preceded by gallows or benches). The
  prev-next files are written under stats/${wkind}/${tw}/ above,
  so both the "t" and "w" variants are read here:

    foreach wkind ( text labs )
      foreach tw ( t w )
        set ifile = "stats/${wkind}/${tw}/prev-next-single-e.frq"
        echo " "; printf "%s/%s: " "${wkind}" "${tw}"
        cat ${ifile} \
          | grep -E 'prev:' \
          | grep -E -v 'prev:([+]|Ch|Sh|[Cc]?[KTPFktpf][Hh]?)$' \
          | gawk '/./{s+=$1;} END{print s;}'
      end
    end

  Output as originally recorded (labelled by wkind only, not split
  by t/w; NOTE(review): re-run to confirm which variant these
  figures correspond to):

      text: 287
      labs: 23

  Show the offending words, by section. Each single <e> that is not
  preceded by one of [ktpfhEe] is marked with ":", and the output
  is sorted by the text that follows the mark:

    foreach wkind ( text labs )
      foreach sec ( ${secs} tot.n )
        set ifile = "data/gud/${wkind}/${sec}.wfr"
        set ofile = "stats/${wkind}/${sec}.wfr"
        echo " "; echo "${ifile} -> ${ofile}"
        cat ${ifile} \
          | capitalize-ligatures -v field=3 \
          | gawk \
              ' /./{ \
                  ct = $1; wd = $3; \
                  gsub(/ee/, "Ee", wd); \
                  wd = gensub(/(^|[^ktpfhEe])[e]/, "\\1:e", "g", wd); \
                  if (wd ~ /[:]/) { print ct, wd; } \
                } ' \
          | revbytes | sort -t: -k2,2 | revbytes \
          > ${ofile}
      end
    end

JUSTIFYING THE PARSING OF EE-GROUPS AND EEE-GROUPS AS SINGLE ELEMS

  Compute the frequency of ambiguous groups.
echo "From text/tot.n.wfr:"; echo " "; \ cat data/gud/text/tot.n.wfr \ | gawk '($3 ~ /[ktpfhe]eee/){print $1, $3; }' \ | sort -b +0 -1nr +1 -2 \ | format-counts-packed \ | sed -e 's/^/ /' From text/tot.n.wfr: qokeeey(27) okeeey(26) keeey(10) okeeedy(9) cheeey(8) lkeeey(8) olkeeey(7) oteeey(7) ykeeey(6) yteeey(5) keeedy(4) lkeeedy(4) qokeeedy(4) teeedy(4) okeeeo(3) olkeeedy(3) oteeedy(3) qokeees(3) qoteeedy(3) qoteeey(3) sheeey(3) ykeeedy(3) chokeeey(2) cholkeeey(2) keeeol(2) olcheeey(2) oteees(2) qoteees(2) sheeeky(2) cheeedaiin(1) cheeen(1) cheeeo(1) cheees(1) cheeetchol(1) cheeety(1) cheykeeed(1) chkeeey(1) cholkeeedy(1) choteeen(1) choteeey(1) ckheeey(1) ctheees(1) dalcheeeky(1) dalkeeey(1) dcheeey(1) deeeese(1) feeedy(1) keeed(1) keeedal(1) keeees(1) keeeo(1) keeeody(1) keeeos(1) keees(1) lcheeey(1) lkeeed(1) lkeeedam(1) lkeeeody(1) lkeees(1) oeeees(1) okchoteees(1) okeeeody(1) okeeeol(1) okeees(1) olcheees(1) olkeeeary(1) olkeeed(1) olkeees(1) olteeedy(1) orkeeey(1) orokeeeey(1) oteee(1) oteeen(1) oteeeo(1) oteeeodar(1) oteeeor(1) otokeeey(1) qkeeey(1) qoeeeey(1) qofsheeey(1) qokaekeeey(1) qokeee(1) qokeeen(1) qokeeeody(1) qokeeeos(1) qolkeeey(1) qoteeeo(1) qoykeeey(1) sheees(1) sheeetchy(1) shefeeedy(1) shokeeey(1) teeey(1) tolkeeedy(1) ykeeeedaiir(1) ykeeeos(1) ypcheeey(1) ysheees(1) yteeed(1) yteeedy(1) yteeeor(1) Compare with frequency of and where K = { k t p f h e } echo "From text/tot.n.wfr:" ; echo " " ; \ cat data/gud/text/tot.n.wfr \ | gawk '($3 ~ /([ktpf]|[cs]h)e*([cs]he|e[cs]h)/){ \ w = gensub(/^.*(([ktpf]|[cs]h)e*([cs]he|e[cs]h)e*).*$/, "\\1", "g", $3); \ gsub(/[ktpf]/,"k",w); gsub(/[cs]h/,"ch",w); \ print $1, w; }' \ | combine-counts \ | sort -b +0 -1nr +1 -2 \ | format-counts-packed \ | sed -e 's/^/ /' From text/tot.n.wfr: kche(1192) kchee(118) kech(90) keche(39) keech(25) chech(15) chche(9) keeche(5) cheche(2) kcheee(2) kechee(2) chchee(1) cheech(1) As we can see, is about 12 times as common as , and plus is more common than .