Let's now create a consensus version for each unit. It turns out that the transcription code ";J>" is already used and means Jim Reeds. I will use ";S" for my consensus. mkdir L16-ecc-x foreach f ( L16-ecc/f[0-9]* ) set g = "L16-ecc-x/${f:t}" echo "$f -> $g" cat ${f} \ | make-consensus-interlin \ > ${g}~~ cat ${g}~~ \ | egrep ';S>|^#' \ > ${g} end Let's concatenate all the paragraph locations into a single file: cat .units-parags.dir \ | sed \ -e 's/:.*$//g' \ -e 's:^:L16-ecc-x/:g' \ > .tmp cat `cat .tmp` \ > .parags-j-ecc.evt cat .parags-j-ecc.evt \ | egrep '^<' \ | sed \ -e 's/^<.*> *//g' \ -e 's/ *//g' \ | dicio-wc lines words bytes ------ ------- --------- 3918 3918 168489 Note that the count above includes newlines, so we actually have 164571 Voynich characters in the parags file. Now let's extract the good label text: cat .units-labels.dir \ | sed \ -e 's/:.*$//g' \ -e 's:^:L16-ecc-x/:g' \ > .tmp cat `cat .tmp` \ > .labels-j-ecc.evt extract-words-from-interlin \ -chars "8coqHPemrwkij" \ .labels-j-ecc.evt \ .labels-j-ecc lines words bytes file ------ ------- --------- ------------ 986 986 4243 .labels-j-ecc.wds 277 277 2234 .labels-j-ecc.dic 282 282 2201 .labels-j-ecc-gut.wds 225 225 1831 .labels-j-ecc-gut.dic 652 652 1630 .labels-j-ecc-fun.wds 2 2 5 .labels-j-ecc-fun.dic 52 52 412 .labels-j-ecc-bad.wds 50 50 398 .labels-j-ecc-bad.dic 2526 2526 12954 total Digraph counts: TT 8 c o q H P e m r k i j ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- 282 . 21 26 200 4 3 1 1 . 26 . . . 8 108 11 . 6 88 . . . . . 2 1 . . c 342 1 16 165 120 . 15 8 . . 8 . . 9 o 787 119 48 24 24 . 170 20 156 36 171 19 . . q 4 . . . 3 . 1 . . . . . . . H 194 . . 60 131 . . . 2 . 1 . . . P 31 2 . 14 15 . . . . . . . . . e 159 37 18 21 63 . 4 1 . 1 13 . 1 . m 38 23 3 1 10 . . . . . 1 . . . r 226 71 2 22 125 . 1 . . 1 4 . . . k 20 16 . 3 1 . . . . . . . . . i 1 . . . . . . 1 . . . . . . j 9 2 . . 7 . . . . . . . . . 
----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- TOT 2201 282 108 342 787 4 194 31 159 38 226 20 1 9 Next-symbol probability (× 99): TT 8 c o q H P e m r k i j -- -- -- -- -- -- -- -- -- -- -- -- -- -- 99 . 7 9 70 1 1 . . . 9 . . . 8 99 10 . 6 81 . . . . . 2 1 . . c 99 . 5 48 35 . 4 2 . . 2 . . 3 o 99 15 6 3 3 . 21 3 20 5 22 2 . . q 99 . . . 74 . 25 . . . . . . . H 99 . . 31 67 . . . 1 . 1 . . . P 99 6 . 45 48 . . . . . . . . . e 99 23 11 13 39 . 2 1 . 1 8 . 1 . m 99 60 8 3 26 . . . . . 3 . . . r 99 31 1 10 55 . . . . . 2 . . . k 99 79 . 15 5 . . . . . . . . . i 99 . . . . . . 99 . . . . . . j 99 22 . . 77 . . . . . . . . . -- -- -- -- -- -- -- -- -- -- -- -- -- -- TOT 99 13 5 15 35 0 9 1 7 2 10 1 0 0 Previous-symbol probability (× 99): TT 8 c o q H P e m r k i j -- -- -- -- -- -- -- -- -- -- -- -- -- -- 13 . 19 8 25 99 2 3 1 . 11 . . . 8 5 4 . 2 11 . . . . . 1 5 . . c 15 . 15 48 15 . 8 26 . . 4 . . 99 o 35 42 44 7 3 . 87 64 97 94 75 94 . . q 0 . . . . . 1 . . . . . . . H 9 . . 17 16 . . . 1 . . . . . P 1 1 . 4 2 . . . . . . . . . e 7 13 17 6 8 . 2 3 . 3 6 . 99 . m 2 8 3 . 1 . . . . . . . . . r 10 25 2 6 16 . 1 . . 3 2 . . . k 1 6 . 1 . . . . . . . . . . i 0 . . . . . . 3 . . . . . . j 0 1 . . 1 . . . . . . . . . -- -- -- -- -- -- -- -- -- -- -- -- -- -- TOT 99 99 99 99 99 99 99 99 99 99 99 99 99 99 Symbol entropy: 2.764 Next-symbol entropy: 2.020 Now, let's make a list of all labels therein. Multiword labels (where words are separated by "-") will be entered as a single word, as well as separate words. 
/bin/rm -f .labels.def First, the labels without word breaks: cat .labels-j-ecc.evt \ | remove-comments-from-evt \ | sed \ -e 's/ *//g' \ -e 's/;[A-Z]>/>/g' \ -e 's/[-=]//g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/>/> /g' \ > .labels1.def Second, the labels split at word boundaries: cat .labels-j-ecc.evt \ | remove-comments-from-evt \ | /n/gnu/bin/sed \ -e 's/ *//g' \ -e 's/;[A-Z]>/>/g' \ -e 's/[=-]$//g' \ -e 's/^/@/g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/>\(.*\)[.]/>\1/g' \ -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \ -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \ -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \ -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \ -e 's/@\(<[^>]*>\)\([^ @-][^ @-]*\)[ -][ -]*/@\1\2@\1/g' \ -e 's/>/> /g' \ | tr '@' '\012' \ | egrep '.' \ > .labels2.def Now merge the two files, and insert sequential page numbers: cat .labels1.def .labels2.def \ | sort | uniq \ | sed \ -e 's/<\(.*\)> \(.*\)/<\1> \2 {\1}/g' \ -e 's/\.[^>]*> */> /g' \ | panel-to-page \ | tr '{}' '<>' \ > .labels.def Keep only the first definition of each label as its "official position": cat .labels.def \ | sort +1 -2 +0 -1n \ | gawk 'BEGIN{b=""} /./ {if(b!=$2) {print; b=$2; next}}' \ | sort \ > .labels-first.def Collect the labels proper: cat .labels.def \ | gawk '/./ {print $2}' \ | sort | uniq \ | egrep -v '\?' 
\ > .labels.dic dicio-wc .labels.dic lines words bytes file ------ ------- --------- ------------ 231 231 1906 .labels.dic cat .labels.dic\ | gawk 'BEGIN{m=0} /./{s=length($0);m=(s>m?s:m);next} END{print ("max len " m)}' max len 18 Now let's find all occurrences of the labels in the parags text: cat .parags-j-ecc.evt \ | enum-word-locations .labels.dic \ | sort -b +2 -3n \ > .label-occurrences.idx Let's tabulate the reference frequencies per label: cat .label-occurrences.idx \ | gawk '/./ { print $4 }' \ | sort | uniq -c | expand \ | compute-freqs \ | sort +0 -1nr \ > .label-refs-by-label.frq --- .label-refs-by-label.frq ------------------------ 4774 0.173 oHo 2151 0.078 ccoe 1540 0.056 oHom 1162 0.042 oHoe 976 0.035 oHor 944 0.034 rom 896 0.033 oHcc8o 748 0.027 oroe 737 0.027 cccco 716 0.026 cccoe 646 0.023 oror 612 0.022 8oro ... ..... ........ 4 0.000 oHoeoror 4 0.000 oHoroe8o 3 0.000 ccPoeo 3 0.000 ccoccro 3 0.000 oPcco8oo 3 0.000 oPccorom 3 0.000 roPoe 3 0.000 roeoer 2 0.000 oHcc8occcHoe 2 0.000 oHco8oer 2 0.000 oHcooe8o 2 0.000 occc8oe8om 2 0.000 qHoe 2 0.000 qoHcro 2 0.000 roromr 1 0.000 8o8orm 1 0.000 Hoccorom 1 0.000 oHcco8oer 1 0.000 oHco8occcHco 1 0.000 oHcoeroe 1 0.000 oHcooeo 1 0.000 oHcororor 1 0.000 oHorccok 1 0.000 oHorco 1 0.000 oHroe 1 0.000 oPoeror 1 0.000 oecccccco 1 0.000 ororcco8om 1 0.000 ro8ororo 1 0.000 roeccror ----------------------------------------------------- Obviously "oHo", "ccoe", "oHom", "oHoe", etc. are not really labels; they are either common words (function words? "Star"? "Plant"? "Day"?), or common letter groups that got split off by accident. 
Let's list the unreferenced labels: cat .label-occurrences.idx \ | gawk '/./ { print $4 }' \ | sort | uniq \ | bool 2-1 - .labels.dic \ > .labels-unref.dic 8ccccorocccPoeom 8cccoe8o 8oHocHc 8oHoecj 8orokcjoe cc8or8omo ccoHcco8orr ccorcHcoroe ccoroeiP eHcccPoe eHcccPoeooPcco o8orcc8oom oHcc8ccor oHccoroeokcco oHcoHcororo oHcooeoeroroeo oHcoorororoeo oHcororoo8o oHe8ok oHeorcjo oHo8rco8occPooeo oHoHokom oHoecHo oHoecPoromok oHoeccHco oHoeccoHorokcjo oHoecjor oHoeoPccoroe oHoeoorok oHooccooHoeoHcoeor oHorccorrr oHorcjo oHorcororo oHorcr oHoroecHoeHoo oHoroecjo oPcc8ocjo oPcccoroe oPccoe8k oPccorok oPocPcor oPoeo8om oPoeo8omrr oPoeoro oPoeoror oPorHo8oe oPoroeor occcHoroHoem8 occcrororcco occoeccorok ocoeoecroror omoHoeoccor oo8cco ooeccccj oomororo qoHoomocHcco roeomcccoe roeooHoro8 room8oeo8 rrcHcrcccHo Note that these "labels" are distinctly longer than those that do occur, and are almost certainly multiword phrases. Now let's prepare a map showing for each label its occurrences in the running text. First, a block-based map: setenv BLOCKSZ 1646 cat .label-occurrences.idx \ | sort -b +3 -4 \ | gawk '/./ { print int($3 / '"$BLOCKSZ"') + 1, $4 }' \ | make-word-location-map \ -v MAXLEN=18 \ -v CTWD=1 \ -v NBLOCKS=100 \ > .label-by-block.map Add a column with the panel where each label was first defined: cat .labels-first.def \ | sed -e 's/\.[^ >]*>/>/g' \ | tr -d '<>' \ | sort +1 -2 \ > .foo join \ -a 1 -e '000' \ -j1 4 -j2 2 \ -o0,2.3,2.1,1.1,1.2,1.3,1.5 \ .label-by-block.map .foo \ | gawk \ '/./ {printf "%-18s %-6s %-3s %5d %5.1f %5.1f %s\n", $1,$2,$3,$4,$5,$6,$7}' \ > .label-by-block-def.map Some comments: * The occurrences of a rarer label in the text usually form one or more tight clusters. * The labels occurring in some pages are relatively common in the text, while those in other pages do not occur at all in the text.
I computed the number of text characters in each panel: cat .units-parags.dir \ | sed -e 's/:.*$//g' \ > .ups /bin/rm -f .tmp foreach f ( `cat .panels.dir` ) set pp = ( `egrep $f'[.]' .ups` ) echo "$f $pp" > /dev/stderr echo "<$f> `( cd L16-ecc-x && cat $f $pp ) | count-text-chars`" >> .tmp end cat .tmp \ | sort \ > .panels.nchars Let's make a table that gives the range of byte offsets for each panel. Each line has the sequential panel number, the physical panel number, the first offset, and the last offset plus one. cat .panels.nchars \ | sed -e 's/<\(.*\)>/<\1> {\1}/g' \ | panel-to-page \ | tr '{}' '<>' \ | sort +0 -1n \ | gawk 'BEGIN {a=0} /./ {b = a+$3; print $1, $2, a, b; a=b}' \ > .panels.chrange Let's make tables that map block index to panel number and vice-versa: echo 'block size = '$BLOCKSZ cat .panels.chrange \ | tr -d '<>' \ | gawk '/./ {printf "s/<%s>/<%03d>/g\n", $2, 1+int($3/'"$BLOCKSZ"')}' \ > panel-to-block chmod a+x panel-to-block echo 'block size = '$BLOCKSZ cat .panels.chrange \ | grep -v '' \ | tr -d '<>' \ | gawk '/./ {printf "%03d %s\n", 1+int($3/'"$BLOCKSZ"'), $2}' \ | gawk 'BEGIN {n=0} /./ {while($1>n){n++;printf "s/<%03d>/<%s>/g\n", n,$2}}' \ > block-to-panel chmod a+x block-to-panel Formatting block-to-panel as a header: cat block-to-panel \ | tr '<>/' ' ' \ | gawk '/./ {print $2, $3}' \ | sed -e 's/ f\([0-9][0-9]*\)/ f\1 /g' \ | format-block-map-header \ > .block-map-header Computing frequencies of references per panel: cat .label-occurrences.idx \ | gawk '/./ { print $1 }' \ | sed -e 's/\..*>/>/g' \ | sort | uniq -c | expand \ | compute-freqs \ | sort +0 -1nr \ > .label-refs-by-panel.frq --- .label-refs-by-panel.frq ------------------------ 636 0.023 563 0.020 546 0.020 511 0.019 498 0.018 494 0.018 492 0.018 491 0.018 482 0.017 475 0.017 465 0.017 ... ..... ........ 
30 0.001 30 0.001 29 0.001 28 0.001 28 0.001 27 0.001 27 0.001 22 0.001 20 0.001 19 0.001 19 0.001 15 0.001 12 0.000 11 0.000 ----------------------------------------------------- There are a few pages that are particularly rich in label references. The panels from f103 on are "starred paragraphs", and so is f58r. Panel f86v6 is on the back of the big fold-out.