# Hacking at the Voynich manuscript - Side notes
# 059 Spatial distribution of gallows and tables
#
# Last edited on 2002-01-05 23:52:22 by stolfi
#
# NOTE: the command blocks below are csh/tcsh ("set x = (...)",
# "foreach ... end"); they are not valid sh/bash.  Inside single-quoted
# gawk programs every line ends with "\" because csh requires an escaped
# newline inside a quoted string.
#
# NOTE(review): tabulate-attributes, tabulate-attribute-totals,
# make-attribute-string, extract-runs, compute-expected-run-stats and
# extract-word-attributes.gawk are used below but are not linked in the
# setup section -- presumably they live in this directory; confirm.

# INTRODUCTION
#
#   We map each word of the text to a simple numeric attribute,
#   and study the properties of that sequence.

# SETTING UP THE ENVIRONMENT
#
#   Scripts

ln -s ../../compute-cum-cum-freqs
ln -s ../../compute-cum-freqs
ln -s ../../combine-counts
ln -s ../../remove-freqs
ln -s ../../totalize-fields
ln -s ../../compute-freqs
ln -s ../../columnate
ln -s ../../compare-counts
ln -s ../../compare-freqs
ln -s ../../select-units
ln -s ../../words-from-evt
ln -s ../../map-field

#   Unit-tag-to-text-type map:

ln -s ../019/unit-to-type.tbl

#   Page-to-subsection map:

ln -s ../019/fnum-to-subsec.tbl

#   Text in subsections:

ln -s ../045/subsecs-m text-subsecs

# CREATING THE SUBSECTION-TO-FNUMS TABLE
#
#   Collapses each run of consecutive rows of fnum-to-subsec.tbl that
#   share the same subsection (field 2) into one row
#   "SUBSEC FIRSTFNUM-LASTFNUM".

cat fnum-to-subsec.tbl \
  | gawk \
      ' /./ { \
          f=$1; s=$2; \
          if(s != os){ \
            if(os != ""){ print os, (fi "-" ff); } \
            fi = f; os = s; \
          } \
          ff = f; \
        } \
        END { if(os != ""){ print os, (fi "-" ff); } } \
      ' \
  > subsec-to-fnums.tbl

#   (Had to edit the resulting table by hand.)

# OBTAINING THE WORD STREAMS

mkdir word-seqs

# List of all stream names: the subsections plus the two pooled
# streams "lab.n" (labels) and "txt.n" (running text).
( cat text-subsecs/all.names ; echo "lab.n"; echo "txt.n" ) \
  > word-seqs/all.names

set secs = ( `cat text-subsecs/all.names` )
echo ${secs}

# One word stream per subsection, text-like units only.
foreach sec ( ${secs} )
  echo ${sec}
  cat text-subsecs/${sec}.evt \
    | select-units \
        -v types='parags,starred-parags,circular-lines,circular-text,radial-lines,titles' \
        -v table=unit-to-type.tbl \
    | words-from-evt -v showLines=1 \
    > word-seqs/${sec}.wds
end

# Comma-separated section list, for use in csh brace expansion below.
set slist = `echo ${secs} | tr ' ' ','`

# Pooled text stream: concatenation of the per-section streams.
cat word-seqs/{${slist}}.wds \
  > word-seqs/txt.n.wds

# Pooled label stream: only units of type "labels", from all sections.
cat text-subsecs/{${slist}}.evt \
  | select-units \
      -v types='labels' \
      -v table=unit-to-type.tbl \
  | words-from-evt -v showLines=1 \
  > word-seqs/lab.n.wds

#   Word frequencies:

foreach sec ( ${secs} txt.n lab.n )
  echo "${sec}"
  # Drop empty lines, count distinct words, order by decreasing count
  # and then by word.
  cat word-seqs/${sec}.wds \
    | grep -E '.' \
    | sort | uniq -c | expand \
    | sort -k1,1nr -k2,2 \
    | compute-freqs \
    > word-seqs/${sec}.wfr
end

# TABULATING ATTRIBUTES
#
#   Tabulation by section:

mkdir stats

foreach sec ( txt.n lab.n ${secs} )
  echo "${sec}"
  cat word-seqs/${sec}.wds \
    | tabulate-attributes \
        -f extract-word-attributes.gawk \
    > stats/${sec}.tbl
end

#   Global summary of gallows counts:

foreach f ( g m )
  /bin/rm -f .tmp
  set ofile = "stats/all-${f}.tots"
  foreach sec ( ${secs} txt.n lab.n )
    echo "${sec} ->> .tmp"
    # Keep the "ct_<f>..." rows and replace the attribute tag by the
    # section name, so that .tmp holds one row per section.
    cat stats/${sec}.tbl \
      | grep -E 'ct_'"${f}" \
      | sed -e 's/^ *ct_[a-z]*/'"${sec}"' /' \
      >> .tmp
  end
  echo ".tmp -> ${ofile}"
  echo 'Totals for attribute "'"${f}"'"' > ${ofile}
  cat .tmp \
    | tabulate-attribute-totals \
    >> ${ofile}
end

# GENERATING THE BIT STREAMS
#
#   Generating digit streams that indicate words with gallows (-g.bst),
#   words with table characters (-m.bst), and the "ed" group (-ed.bst).
#   A fourth stream (-r.bst) is generated for attribute "r"; its
#   meaning is defined in extract-word-attributes.gawk, not shown here.

foreach sec ( txt.n lab.n ${secs} )
  foreach attr ( g m ed r )
    echo "${sec}-${attr}"
    cat word-seqs/${sec}.wds \
      | make-attribute-string \
          -v attr=${attr} \
          -f extract-word-attributes.gawk \
      > stats/${sec}-${attr}.bst
  end
end

#   Separating the "ed" words:

foreach sec ( txt.n lab.n ${secs} )
  echo "${sec}"
  # Words containing "ed" pass through; any other non-empty line becomes
  # "*"; empty lines are kept as they are.
  cat word-seqs/${sec}.wds \
    | gawk '/ed/{print;next;} /./{print "*";next;} //{print;}' \
    > word-seqs/${sec}-ed.wds
end

foreach sec ( txt.n lab.n ${secs} )
  echo "${sec}"
  cat word-seqs/${sec}-ed.wds \
    | grep -E '.' \
    | sort | uniq -c | expand \
    | sort -k1,1nr -k2,2 \
    | compute-freqs \
    > word-seqs/${sec}-ed.wfr
end

# TABULATING THE RUN LENGTHS IN BIT SEQUENCES
#
#   Consider only lines that have this minimum length:

set minlen = 1

#   Testing the run extraction scripts:

# Digits 2..9 are folded to 1 (making the stream binary), blanks are
# removed, and lines containing "?" or "*" are discarded.
cat stats/txt.n-g.bst \
  | tr '23456789' '11111111' \
  | tr -d ' ' \
  | grep -E -v '[?*]' \
  > .bar

cat .bar \
  | extract-runs \
      -v sticky_qmarks=0 \
      -v min_line_length=${minlen} \
  > .foo

#   Extracting and counting runs by value, length, and section:

foreach sec ( txt.n lab.n ${secs} )
  foreach attr ( g m ed r )
    echo "${sec}-${attr}.rct"
    # Same clean-up as in the test above, then count each distinct run
    # and order the result by the run string (field 2).
    cat stats/${sec}-${attr}.bst \
      | tr '23456789' '11111111' \
      | tr -d ' ' \
      | grep -E -v '[?*]' \
      | extract-runs \
          -v sticky_qmarks=0 \
          -v min_line_length=${minlen} \
      | sort | uniq -c | expand | sort -b -k2,2 \
      > stats/${sec}-${attr}.rct
  end
end

#   Computing the relative frequency of each 0/1 run:

foreach sec ( txt.n lab.n ${secs} )
  foreach attr ( g m ed r )
    echo "${sec}-${attr}.rfr"
    # Keep only rows whose run (field 2) is all 0s or all 1s.
    cat stats/${sec}-${attr}.rct \
      | gawk '($2 ~ /^([0]+|[1]+)$/){print;}' \
      | compute-freqs \
      > stats/${sec}-${attr}.rfr
  end
end

#   Computing the expected run frequencies:

foreach sec ( txt.n lab.n ${secs} )
  foreach attr ( g m ed r )
    echo "${sec}-${attr}.xfr"
    cat stats/${sec}-${attr}.rct \
      | compute-expected-run-stats \
      > stats/${sec}-${attr}.xfr
  end
end

#   Tabulating the relative frequencies of 0-words and 1-words
#   per section:

foreach attr ( g m ed r )
  set sfile = "stats/summary-${attr}.bct"
  echo ${sfile}
  /bin/rm -f .tmp
  foreach sec ( txt.n lab.n ${secs} )
    set xfile = stats/${sec}-${attr}.xfr
    # pr = blank-separated words of the "s0 = ..." and "s1 = ..." lines;
    # words 3, 6, 9 and 12 are extracted.
    # NOTE(review): presumably count and fraction for s0, then for s1 --
    # confirm against the output format of compute-expected-run-stats.
    set pr = ( `grep -E '^s[01] =' ${xfile}` )
    echo "${sec}" "${pr[3]}" "${pr[6]}" "${pr[9]}" "${pr[12]}" >> .tmp
  end
  # Append the folio range of each section as field 6 and pretty-print.
  cat .tmp \
    | map-field \
        -v table=subsec-to-fnums.tbl \
        -v inField=1 -v outField=6 \
    | gawk \
        ' /./{ \
            printf "%-5s %5d %5.3f %5d %5.3f %s\n", \
              $1,$2,$3,$4,$5,$6; \
          } \
        ' \
    > ${sfile}
end