Let's repeat the investigation of [czHP] strings, but including the `8' letter: cat bio-j-jsa-gut.wds \ | sed \ -e 's/^/_/g' \ -e 's/$/_/g' \ -e 's/[ql]j/H/' \ -e 's/[ql]g/P/' \ -e 's/cs/z/g' \ -e 's/ij/k/g' \ -e 's/ix/e/g' \ -e 's/is/r/g' \ -e 's/iiu/n/g' \ -e 's/y/i/g' \ -e 's/ci/a/g' \ -e 's/cg/8/g' \ | enum-contexts -vPAT='[czHP8][czHP8]*' -vCTX=0 \ | wfreq 793 0.19 H 382 0.09 8 374 0.09 Hc8 314 0.07 Hcc8 305 0.07 ccc8 277 0.06 zcc8 178 0.04 Hcc 163 0.04 ccc 152 0.04 z 140 0.03 zcc 102 0.02 Hc 74 0.02 cc 56 0.01 cccHc 49 0.01 zccc8 49 0.01 Pccc8 49 0.01 P 48 0.01 cc8 46 0.01 zc 41 0.01 ccccHc 40 0.01 cccc 39 0.01 zccHc 35 0.01 zcccHc 35 0.01 Hccc8 34 0.01 zccc 27 0.01 cccc8 25 0.01 Hccc 24 0.01 zccH 20 0.00 zc8 18 0.00 cccH 18 0.00 8zcc8 16 0.00 Hzcc 15 0.00 Hzcc8 14 0.00 cHc 14 0.00 Pccc 13 0.00 zcccH 12 0.00 ccccH 12 0.00 cHcc 11 0.00 cccz 11 0.00 ccH 11 0.00 8ccc8 9 0.00 zccHcc 9 0.00 cccHc8 8 0.00 cHcc8 7 0.00 cccHcc8 7 0.00 Pzcc8 7 0.00 Hzc8 6 0.00 zccHcc8 6 0.00 8cc 5 0.00 zcccHcc8 5 0.00 zcH 5 0.00 ccccz 5 0.00 cHc8 5 0.00 c 5 0.00 Pcc8 5 0.00 8cc8 4 0.00 zzcc8 4 0.00 ccccHcc 4 0.00 cccHcc 4 0.00 cH 4 0.00 Pcc 4 0.00 Hcccc 4 0.00 8ccc 3 0.00 zcccz 3 0.00 cccP 3 0.00 ccHc8 3 0.00 cPcc 3 0.00 cPc 3 0.00 P8 3 0.00 8zcc 3 0.00 8zc 2 0.00 zzcc 2 0.00 zcccPc 2 0.00 zcccHcc 2 0.00 zcccHc8 2 0.00 zccPcc 2 0.00 zccHc8 2 0.00 ccz 2 0.00 ccccHcc8 2 0.00 cccPcc8 2 0.00 cccPcc 2 0.00 cP 2 0.00 Pzc8 2 0.00 Pzc 2 0.00 Pcccc 2 0.00 Hzc 2 0.00 Hczcc 2 0.00 Hczc 2 0.00 Hccz 2 0.00 Hcccc8 2 0.00 H8 2 0.00 8zccc8 2 0.00 8zccc 2 0.00 8cccc 2 0.00 8c8 2 0.00 8Hc8 1 0.00 zzcccHc 1 0.00 zzccH 1 0.00 zzcHcc8 1 0.00 zcz8 1 0.00 zccz 1 0.00 zccccHcc 1 0.00 zcccc 1 0.00 zcccHcc8cc 1 0.00 zccPccc8 1 0.00 zccP 1 0.00 zccHccc 1 0.00 zcHcc 1 0.00 zcHc 1 0.00 zPcc 1 0.00 zHcc 1 0.00 zH 1 0.00 ccccc 1 0.00 ccccPcc8 1 0.00 cccPccc8 1 0.00 cccHccc8 1 0.00 ccc8cc 1 0.00 ccPzccc8 1 0.00 ccPzccc 1 0.00 ccPccc8 1 0.00 ccP 1 0.00 ccHcc8 1 0.00 ccHcc 1 0.00 cc8cc 1 0.00 cPccc8 1 0.00 cPccc 1 
0.00 cPcc8 1 0.00 cPc8 1 0.00 cHccz 1 0.00 cHccc8 1 0.00 Pzcc 1 0.00 Hczc8 1 0.00 Hcz8 1 0.00 Hcz 1 0.00 Hccz8 1 0.00 Hc8zcc8 1 0.00 Hc8cc 1 0.00 Hc8c8 1 0.00 Hc8c 1 0.00 8zcccz 1 0.00 8zc8 1 0.00 8cccc8 1 0.00 8cccHcc8 1 0.00 8Hzcc 1 0.00 8Hcc 1 0.00 88 ----- ---- ---- 4282 1.00 TOT Apparently the `8' (\cg/) does not tend to be surrounded by [czHP] strokes; it is either preceded or followed by them. Thus `8' seems quite unlike `P'. Let's look at some `P' strings and try to find similar words with the `P' replaced by something else: cat bio-j-jsa-gut.wds \ | sed \ -e 's/^/_/g' \ -e 's/$/_/g' \ -e 's/[ql]j/H/' \ -e 's/[ql]g/P/' \ -e 's/cs/z/g' \ -e 's/ij/k/g' \ -e 's/ix/e/g' \ -e 's/is/r/g' \ -e 's/iiu/n/g' \ -e 's/y/i/g' \ -e 's/ci/a/g' \ -e 's/cg/8/g' \ | egrep '[^czHP]P[^czHP]' \ | wfreq 8 0.15 _Poe_ 2 0.04 _oPoea_ 2 0.04 _oPar_ 2 0.04 _oPa_ 2 0.04 _Poeccc8a_ 1 0.02 _qoPor_ 1 0.02 _qoPoe_ 1 0.02 _qoPa_ 1 0.02 _qoP_ 1 0.02 _qoP8a_ 1 0.02 _qoHoPa_ 1 0.02 _oePocHca_ 1 0.02 _oPor_ 1 0.02 _oPoe_ 1 0.02 _oPaezcc8a_ 1 0.02 _oPaea_ 1 0.02 _oPae_ 1 0.02 _oPaeHain_ 1 0.02 _ePoe_ 1 0.02 _Poin_ 1 0.02 _Poezcca_ 1 0.02 _Poezca_ 1 0.02 _Poezc8ae_ 1 0.02 _Poezc8a_ 1 0.02 _Poeccc8_ 1 0.02 _Poecca_ 1 0.02 _Poecc8arcn_ 1 0.02 _Poecc8a_ 1 0.02 _Poearar_ 1 0.02 _Poeain_ 1 0.02 _Poeaecc8a_ 1 0.02 _PoeHczcoe_ 1 0.02 _PoeHcca_ 1 0.02 _Poe8zcc8a_ 1 0.02 _Poe8aHa_ 1 0.02 _PoHan_ 1 0.02 _Par_ 1 0.02 _PaHc8a_ 1 0.02 _P8oe_ 1 0.02 _P8aezcor_ 1 0.02 _HoePa_ ----- ---- ---- 52 1.00 TOT set noglob foreach f ( \ '_'.'oe_' \ '_o'.'oea_' \ '_o'.'ar_' \ '_o'.'a_' \ '_'.'oeccc8a_' \ ) echo " " echo "-----------------------------------------------------------------------" echo " " cat bio-j-jsa-gut.wds \ | sed \ -e 's/^/_/g' \ -e 's/$/_/g' \ -e 's/[ql]j/H/' \ -e 's/[ql]g/P/' \ -e 's/cs/z/g' \ -e 's/ij/k/g' \ -e 's/ix/e/g' \ -e 's/is/r/g' \ -e 's/iiu/n/g' \ -e 's/y/i/g' \ -e 's/ci/a/g' \ -e 's/cg/8/g' \ | compare-contexts -rctx 0 -lctx 0 -colw 24 \ "${f:r}P${f:e}" "${f:r}[^P]${f:e}" 
"${f:r}[^P][^P]${f:e}" end unset noglob ----------------------------------------------------------------------- 8 1.00 _Poe_ 81 0.52 _qoe_ 11 0.19 _zcoe_ ----- ---- ---- 25 0.16 _zoe_ 11 0.19 _oHoe_ 8 1.00 TOT 17 0.11 _eoe_ 11 0.19 _ccoe_ 17 0.11 _8oe_ 8 0.14 _oeoe_ 9 0.06 _roe_ 5 0.09 _oroe_ 6 0.04 _Hoe_ 3 0.05 _Hcoe_ ----- ---- ---- 2 0.04 _aroe_ 155 1.00 TOT 1 0.02 _qooe_ 1 0.02 _oqoe_ 1 0.02 _eHoe_ 1 0.02 _e8oe_ 1 0.02 _aeoe_ 1 0.02 _8roe_ ----- ---- ---- 57 1.00 TOT ----------------------------------------------------------------------- 2 1.00 _oPoea_ 2 1.00 _oroea_ ----- ---- ---- ----- ---- ---- ----- ---- ---- 0 1.00 TOT 2 1.00 TOT 2 1.00 TOT ----------------------------------------------------------------------- 2 1.00 _oPar_ 35 0.92 _oHar_ 7 1.00 _oeHar_ ----- ---- ---- 1 0.03 _orar_ ----- ---- ---- 2 1.00 TOT 1 0.03 _oear_ 7 1.00 TOT 1 0.03 _o8ar_ ----- ---- ---- 38 1.00 TOT ----------------------------------------------------------------------- 2 1.00 _oPa_ 25 0.45 _oHa_ 21 0.53 _oHca_ ----- ---- ---- 23 0.41 _oea_ 10 0.25 _oeHa_ 2 1.00 TOT 6 0.11 _ora_ 9 0.23 _oe8a_ 2 0.04 _o8a_ ----- ---- ---- ----- ---- ---- 40 1.00 TOT 56 1.00 TOT ----------------------------------------------------------------------- 2 1.00 _Poeccc8a_ 6 0.67 _qoeccc8a_ ----- ---- ---- ----- ---- ---- 2 0.22 _zoeccc8a_ 0 1.00 TOT 2 1.00 TOT 1 0.11 _8oeccc8a_ ----- ---- ---- 9 1.00 TOT ----------------------------------------------------------------------- It seems that isolated `P' = {\lg/,\qg/} is closely related to `r' = \is/, `q' = \q/, `z' = \cs/, `8' = \cg/, `e' = \ix/, `H' = {\lj/,\qj/}. 
cat bio-j-jsa-gut.wds \ | sed \ -e 's/^/_/g' \ -e 's/$/_/g' \ -e 's/[ql]j/H/' \ -e 's/[ql]g/P/' \ -e 's/cs/z/g' \ -e 's/ij/k/g' \ -e 's/ix/e/g' \ -e 's/is/r/g' \ -e 's/iiu/n/g' \ -e 's/y/i/g' \ -e 's/ci/a/g' \ -e 's/cg/8/g' \ | egrep '[^czHP]Pccc[^czHP]' \ | wfreq 14 0.22 _oPccc8a_ 14 0.22 _Pccc8a_ 8 0.13 _qoPccc8a_ 4 0.06 _oePccc8a_ 4 0.06 _oPccca_ 4 0.06 _Pcccoe_ 4 0.06 _Pccc8ar_ 3 0.05 _Pccca_ 2 0.03 _qoPccca_ 1 0.02 _oqoPccc8a_ 1 0.02 _ePccc8a_ 1 0.02 _aPccca_ 1 0.02 _aPccc8a_ 1 0.02 _Pccc8ae_ 1 0.02 _8oePccc8a_ ----- ---- ---- 63 1.00 TOT set noglob foreach f ( \ '_o'.'ccc8a_' \ '_'.'ccc8a_' \ '_qo'.'ccc8a_' \ '_oe'.'ccc8a_' \ '_o'.'ccca_' \ '_'.'cccoe_' \ '_'.'ccc8ar_' \ '_'.'ccca_' \ '_qo'.'ccca_' \ ) echo " " echo "-----------------------------------------------------------------------" echo " " cat bio-j-jsa-gut.wds \ | sed \ -e 's/^/_/g' \ -e 's/$/_/g' \ -e 's/[ql]j/H/' \ -e 's/[ql]g/P/' \ -e 's/cs/z/g' \ -e 's/ij/k/g' \ -e 's/ix/e/g' \ -e 's/is/r/g' \ -e 's/iiu/n/g' \ -e 's/y/i/g' \ -e 's/ci/a/g' \ -e 's/cg/8/g' \ | compare-contexts -rctx 0 -lctx 0 -colw 24 \ "${f:r}P${f:e}" "${f:r}[^P]${f:e}" "${f:r}[^P][^P]${f:e}" end unset noglob ----------------------------------------------------------------------- 14 1.00 _oPccc8a_ 23 0.79 _oeccc8a_ 1 0.50 _orzccc8a_ ----- ---- ---- 5 0.17 _oHccc8a_ 1 0.50 _oecccc8a_ 14 1.00 TOT 1 0.03 _o8ccc8a_ ----- ---- ---- ----- ---- ---- 2 1.00 TOT 29 1.00 TOT ----------------------------------------------------------------------- 14 1.00 _Pccc8a_ 52 0.37 _eccc8a_ 23 0.49 _oeccc8a_ ----- ---- ---- 36 0.26 _zccc8a_ 5 0.11 _oHccc8a_ 14 1.00 TOT 19 0.13 _cccc8a_ 4 0.09 _ezccc8a_ 14 0.10 _Hccc8a_ 2 0.04 _rzccc8a_ 9 0.06 _8ccc8a_ 2 0.04 _qoccc8a_ 5 0.04 _rccc8a_ 2 0.04 _ecccc8a_ 4 0.03 _accc8a_ 2 0.04 _eHccc8a_ 1 0.01 _qccc8a_ 1 0.02 _o8ccc8a_ 1 0.01 _occc8a_ 1 0.02 _eoccc8a_ ----- ---- ---- 1 0.02 _azccc8a_ 141 1.00 TOT 1 0.02 _aeccc8a_ 1 0.02 _acccc8a_ 1 0.02 _8zccc8a_ 1 0.02 _8cccc8a_ ----- ---- ---- 47 1.00 TOT 
----------------------------------------------------------------------- 8 1.00 _qoPccc8a_ 11 0.65 _qoHccc8a_ 2 0.40 _qoezccc8a_ ----- ---- ---- 6 0.35 _qoeccc8a_ 2 0.40 _qoHcccc8a_ 8 1.00 TOT ----- ---- ---- 1 0.20 _qoecccc8a_ 17 1.00 TOT ----- ---- ---- 5 1.00 TOT ----------------------------------------------------------------------- 4 1.00 _oePccc8a_ 1 1.00 _oecccc8a_ 1 1.00 _oeoHccc8a_ ----- ---- ---- ----- ---- ---- ----- ---- ---- 4 1.00 TOT 1 1.00 TOT 1 1.00 TOT ----------------------------------------------------------------------- 4 1.00 _oPccca_ 12 0.63 _oeccca_ 2 0.40 _oezccca_ ----- ---- ---- 6 0.32 _oHccca_ 2 0.40 _oecccca_ 4 1.00 TOT 1 0.05 _orccca_ 1 0.20 _oeHccca_ ----- ---- ---- ----- ---- ---- 19 1.00 TOT 5 1.00 TOT ----------------------------------------------------------------------- 4 1.00 _Pcccoe_ 3 0.27 _zcccoe_ 1 0.33 _oecccoe_ ----- ---- ---- 3 0.27 _8cccoe_ 1 0.33 _cccccoe_ 4 1.00 TOT 2 0.18 _ecccoe_ 1 0.33 _8ccccoe_ 2 0.18 _Hcccoe_ ----- ---- ---- 1 0.09 _acccoe_ 3 1.00 TOT ----- ---- ---- 11 1.00 TOT ----------------------------------------------------------------------- 4 1.00 _Pccc8ar_ 1 0.25 _eccc8ar_ ----- ---- ---- ----- ---- ---- 1 0.25 _cccc8ar_ 0 1.00 TOT 4 1.00 TOT 1 0.25 _accc8ar_ 1 0.25 _Hccc8ar_ ----- ---- ---- 4 1.00 TOT ----------------------------------------------------------------------- 3 1.00 _Pccca_ 31 0.39 _cccca_ 12 0.35 _oeccca_ ----- ---- ---- 23 0.29 _zccca_ 6 0.18 _oHccca_ 3 1.00 TOT 15 0.19 _eccca_ 3 0.09 _ezccca_ 5 0.06 _Hccca_ 3 0.09 _azccca_ 4 0.05 _rccca_ 2 0.06 _ecccca_ 1 0.01 _accca_ 2 0.06 _8zccca_ 1 0.01 _8ccca_ 1 0.03 _rcccca_ ----- ---- ---- 1 0.03 _qoccca_ 80 1.00 TOT 1 0.03 _orccca_ 1 0.03 _acccca_ 1 0.03 _Hcccca_ 1 0.03 _8cccca_ ----- ---- ---- 34 1.00 TOT ----------------------------------------------------------------------- 2 1.00 _qoPccca_ 8 0.50 _qoeccca_ 1 0.50 _qoeHccca_ ----- ---- ---- 8 0.50 _qoHccca_ 1 0.50 _qoHcccca_ 2 1.00 TOT ----- ---- ---- ----- ---- ---- 16 1.00 TOT 2 1.00 TOT It 
seems that `Pccc' is closely related to `Hccc' `eccc' `zccc' `8ccc' `cccc'. cat bio-j-jsa-gut.wds \ | sed \ -e 's/^/_/g' \ -e 's/$/_/g' \ -e 's/[ql]j/H/' \ -e 's/[ql]g/P/' \ -e 's/cs/z/g' \ -e 's/ij/k/g' \ -e 's/ix/e/g' \ -e 's/is/r/g' \ -e 's/iiu/n/g' \ -e 's/y/i/g' \ -e 's/ci/a/g' \ -e 's/cg/8/g' \ | egrep '[^czHP]Pzcc[^czHP]' \ | wfreq 3 0.38 _qoPzcc8a_ 2 0.25 _oPzcc8a_ 1 0.12 _oPzcc8ae_ 1 0.12 _oPzcc8_ 1 0.12 _Pzccoe8a_ ----- ---- ---- 8 1.00 TOT set noglob foreach f ( \ '_qo'.'zcc8a_' \ '_o'.'zcc8a_' \ '_o'.'zcc8ae_' \ '_o'.'zcc8_' \ '_'.'zccoe8a_' \ ) echo " " echo "-----------------------------------------------------------------------" echo " " cat bio-j-jsa-gut.wds \ | sed \ -e 's/^/_/g' \ -e 's/$/_/g' \ -e 's/[ql]j/H/' \ -e 's/[ql]g/P/' \ -e 's/cs/z/g' \ -e 's/ij/k/g' \ -e 's/ix/e/g' \ -e 's/is/r/g' \ -e 's/iiu/n/g' \ -e 's/y/i/g' \ -e 's/ci/a/g' \ -e 's/cg/8/g' \ | compare-contexts -rctx 0 -lctx 0 -colw 24 \ "${f:r}P${f:e}" "${f:r}[^P]${f:e}" "${f:r}[^P][^P]${f:e}" end unset noglob ----------------------------------------------------------------------- 3 1.00 _qoPzcc8a_ 7 0.88 _qoHzcc8a_ ----- ---- ---- ----- ---- ---- 1 0.12 _qoezcc8a_ 0 1.00 TOT 3 1.00 TOT ----- ---- ---- 8 1.00 TOT ----------------------------------------------------------------------- 2 1.00 _oPzcc8a_ 14 0.88 _oezcc8a_ 1 1.00 _oeHzcc8a_ ----- ---- ---- 2 0.12 _oHzcc8a_ ----- ---- ---- 2 1.00 TOT ----- ---- ---- 1 1.00 TOT 16 1.00 TOT ----------------------------------------------------------------------- 1 1.00 _oPzcc8ae_ ----- ---- ---- ----- ---- ---- ----- ---- ---- 0 1.00 TOT 0 1.00 TOT 1 1.00 TOT ----------------------------------------------------------------------- 1 1.00 _oPzcc8_ 2 1.00 _oezcc8_ ----- ---- ---- ----- ---- ---- ----- ---- ---- 0 1.00 TOT 1 1.00 TOT 2 1.00 TOT ----------------------------------------------------------------------- 1 1.00 _Pzccoe8a_ ----- ---- ---- ----- ---- ---- ----- ---- ---- 0 1.00 TOT 0 1.00 TOT 1 1.00 TOT Again the `P' seems to be similar 
to `H' and `e'. And now for something completely different. Let's look at how the words are distributed among the paragraphs: cat bio-j-jsa.wds \ | sed \ -e 's/[ql]j/H/g' \ -e 's/[ql]g/P/g' \ -e 's/ij/k/g' \ -e 's/ix/e/g' \ -e 's/is/r/g' \ -e 's/iiu/n/g' \ -e 's/cy/a/g' \ -e 's/ci/a/g' \ -e 's/in/m/g' \ -e 's/ir/w/g' \ -e 's/cs/z/g' \ -e 's/cg/8/g' \ | enum-words-in-blocks -vWPB=100 \ | egrep -v '[^a-zA-Z0-9_ ]' \ | sort +1 -2 +0 -1n \ | make-word-location-map -vNBLOCKS=71 \ > .foo The result has been posted as http://www.dcc.unicamp.br/~stolfi/voynich/word-distr-map.html Recomputing with fewer large blocks: cat bio-j-jsa.wds \ | sed \ -e 's/[ql]j/H/g' \ -e 's/[ql]g/P/g' \ -e 's/ij/k/g' \ -e 's/ix/e/g' \ -e 's/is/r/g' \ -e 's/iiu/n/g' \ -e 's/cy/a/g' \ -e 's/ci/a/g' \ -e 's/in/m/g' \ -e 's/ir/w/g' \ -e 's/cs/z/g' \ -e 's/cg/8/g' \ | enum-words-in-blocks -vWPB=1010 \ | egrep -v '[^a-zA-Z0-9_ ]' \ | sort +1 -2 +0 -1n \ | make-word-location-map -vCTWD=3 -vNBLOCKS=7 \ > .foo cat .foo \ | gawk '/./ { printf"%5d %-16s ", $1, $2; for (i=3; i<=NF; i++) printf " %2d", int(($(i)*99/$1)+0.5); printf "\n" }' \ > .bar Results posted in my Voynich page.