Hacking at the Voynich manuscript - Side notes
024 Creating a new interim release of Landini's interlinear in EVA
Last edited on 2004-07-15 23:12:49 by stolfi

GENERAL RECIPE

Link setup:

  ln -s ../../L16+H-eva
  ln -s ../.. work
  ln -s work/basify-weirdos
  ln -s work/unbasify-weirdos
  ln -s work/remove-needless-capitalization
  ln -s work/validate-new-evt-format

Parameters:

  set vnew = "16e7"
  set vold = "16e6"

List of units:

  /bin/rm -f .all.units
  cat L16+H-eva/INDEX \
    | gawk -v FS=':' '/^[^#]/{printf "L16+H-eva/%s\n", $2;}' \
    > .all.units

Checking for missing units in index:

  ls -d L16+H-eva/f* \
    | egrep '^L16\+H-eva/f[0-9]+[rv]?[0-6]?(|[.][A-Za-z0-9]+)$' \
    | sort \
    > .all.files

  cat .all.units \
    | sort \
    > .all.srtun

  diff .all.{files,srtun}

Validating format:

  foreach file ( `cat .all.units` )
    set fu = "${file:t}"
    echo '=== '$file' ==='
    validate-new-evt-format \
      -v fnum="${fu:r}" \
      -v unit="${fu:e}" \
      ${file}
  end

List of units that contain only comments:

  cat L16+H-eva/INDEX \
    | gawk -v FS=':' '($6=="-"){printf "L16+H-eva/%s\n", $2;}' \
    > .cmt.units

Checking for mislabeled comment files:

  ls -d L16+H-eva/f* \
    | egrep '^L16\+H-eva/f[0-9]+[rv][0-6]?$' \
    | sort \
    > .cmt.files

  cat .cmt.units \
    | sort \
    > .cmt.srtun

  diff .cmt.{files,srtun}

Unit index file:

  cp -p L16+H-eva/INDEX L16+H-eva/unit${vnew}.txt
  cp -p L16+H-eva/unit${vnew}.txt ${exnew}/

Archives with separate file for each unit:

  tar cvf - L16+H-eva/INDEX `cat .all.units` \
    | gzip \
    > ${exnew}/arch${vnew}.tgz

  rm -f ${exnew}/arch${vnew}.zip
  zip -klv ${exnew}/arch${vnew} L16+H-eva/INDEX `cat .all.units`

All the text in a single file:

  cat `cat .all.units` \
    | sed -e '/^## *<[^<>.]*>/s/^## *//' \
    | egrep -v '^ *$' \
    > L16+H-eva/text${vnew}.evt

  cp -p L16+H-eva/text${vnew}.evt ${exnew}/

  rm -f ${exnew}/text${vnew}.evt.gz
  gzip ${exnew}/text${vnew}.evt

  rm -f ${exnew}/text${vnew}.zip
  zip -klv ${exnew}/text${vnew} L16+H-eva/text${vnew}.evt

Edits made to Landini's original version:

  cat L16+H-eva/f0.{M,S,U,V,W} \
    > ${exnew}/edit${vnew}.txt

Comments only, separate files:

  tar cvf - L16+H-eva/INDEX `cat .cmt.units` \
    | gzip \
    > ${exnew}/acmt${vnew}.tgz

  rm -f ${exnew}/acmt${vnew}.zip
  zip -klv ${exnew}/acmt${vnew} L16+H-eva/INDEX `cat .cmt.units`

Comments only, single file:

  cat `cat .cmt.units` \
    | sed -e '/^## *<[^<>.]*>/s/^## *//' \
    | egrep -v '^ *$' \
    > L16+H-eva/tcmt${vnew}.evt

  cp -p L16+H-eva/tcmt${vnew}.evt ${exnew}/

  rm -f ${exnew}/tcmt${vnew}.evt.gz
  gzip ${exnew}/tcmt${vnew}.evt

  rm -f ${exnew}/tcmt${vnew}.zip
  zip -klv ${exnew}/tcmt${vnew} L16+H-eva/tcmt${vnew}.evt

Listing differences:

  cat ${exnew}/text${vnew}.evt.gz \
    | gunzip \
    > .new.evt

  cat ${exold}/text${vold}.evt.gz \
    | gunzip \
    > .old.evt

  diff .old.evt .new.evt \
    | prettify-diff-output \
    > .diffs.evt

Comparing INDEX files:

  cat L16+H-eva/INDEX \
    | gawk -v FS=':' -v OFS=':' \
        '//{$1=""; print;}' \
    > .new.units

  cat ${exold}/unit${vold}.txt \
    | gawk -v FS=':' -v OFS=':' \
        '//{$1=""; print;}' \
    > .old.units

  diff .old.units .new.units \
    | prettify-diff-output \
    > .diffs.units

Checking new transcriptions per page:

  foreach f ( new old )
    echo $f
    cat .${f}.evt \
      | egrep '^<[^<>]*;' \
      | egrep -v ';H>' \
      | sed \
          -e 's/<\(f[0-9][0-9]*[rv][0-6]*\)[.][^<>]*[;]/<\1 /' \
          -e 's/[-\!%=.,]//g' \
          -e 's/{[^{}]*}//g' \
          -e 's/[ ][ ][ ]*/ /g' \
      | sort +0 -1 +2 -3 \
      > .${f}.esr
  end

  diff .old.esr .new.esr \
    | prettify-diff-output \
    > .diffs.esr
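As an illustration of what the normalization above does, here is the same
sed program applied to a single made-up locus line (the page, locus, and
reading are placeholders, not an actual transcription):

  echo '<f1r.P1;C> fachys.ykal.ar.ataiin{plant}.shol.shory' \
    | sed \
        -e 's/<\(f[0-9][0-9]*[rv][0-6]*\)[.][^<>]*[;]/<\1 /' \
        -e 's/[-\!%=.,]//g' \
        -e 's/{[^{}]*}//g' \
        -e 's/[ ][ ][ ]*/ /g'

    <f1r C> fachysykalarataiinsholshory

The locus is reduced to the bare page number, word separators and fillers
are stripped, and inline comments are dropped; the pipeline above then
sorts by page and by the text itself, so lines that differ only in the
transcriber code end up adjacent.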
"${f:r}" set unit = "${f:e}" if ( "x${unit}" != "x" ) then if ( -r ${f}~ ) mv ${f}~ ${f}~~ cat ${f} \ | sed -e '1s/^/## <'"${f}"'> {} /' \ | tr '\015' '\012' \ > "${f}@" mv -i "${f}" "${f}~" && mv -i "${f}@" "${f}" endif end foreach f ( `ls | sed -e 's/~$//' | egrep '^f[0-9]+[rv]?[0-6]?(|[.][A-Za-z0-9]+)$'` ) echo '=== '${f}' ===' mv -i "${f}~" "$f" end popd ADDING THE MAJORITY AND CONSENSUS VERSIONS The goal of this note is to condense the various transcriptions present in the EVA interlinear into some sort of "consensus" or "majority" version. The majority version is also chopped into per-section and per-page files for statistical analysis. JOINING THE INTERLINEAR INTO A SINGLE FILE Making a list of all text units: ln -s ../../L16+H-eva cat L16+H-eva/INDEX \ | gawk -v FS=':' '/./{print $2;}' \ > .all.units set units = ( `cat .all.units` ) Safety check: ( cd L16+H-eva && ls f[0-9]* | egrep -v '[~]$' ) | sort > .foo cat .all.units | sort > .bar diff .foo .bar Concatenating all units, with basic uncapitalized EVA ( cd L16+H-eva && cat ${units} ) \ > inter.evt Checking validity and synchronism: cat inter.evt \ | validate-new-evt-format \ -v checkTerminators=1 \ -v checkLineLengths=1 \ >& inter.bugs EXTRACTING THE TUPLES OF VARIANT READINGS Next we extract from the interlinear a list of all "reading tuples", one for each character position in the text. Each tuple is a string of 26 characters, the readings of that character position in each of the 26 potential variants. Whenever a particular variant does not cover a particular character position, the corresponding tuple element is set to "%". (Note that "%" is presently used in the interlinear itself to mark lines or parts of lines that were skipped by a particular transcriber.) cat inter.evt \ | unbasify-weirdos \ | egrep -v ';[AY]>' \ | extract-reading-tuples \ -f tuple-procs.gawk \ | sort | uniq -c | expand \ | sort +0 -1nr +1 -2 \ > inter.tfr 5362 VMS text lines found 17357 interlinear text lines read 245136 tuples written 17357 interlinear text lines read dicio-wc inter.tfr lines words bytes file ------- ------- --------- ------------ 6126 12252 214410 inter.tfr cat inter.tfr \ | gawk '/./{s+=$1} END{print s;}' 245136 We then compute tables that map reading tuples to and consensus readings: cat inter.tfr \ | compute-consensus-table \ > tuple-to-consensus.stats cat tuple-to-consensus.stats \ | gawk '/./{print $2,$3;}' \ > tuple-to-consensus.tbl Similarly, we compute a table that maps reading tuples to majority readings (using equal weights on the first iteration): cat inter.tfr \ | compute-majority-table \ -v alternates="CD,FG,JI,KQ,LM" \ > tuple-to-majority.stats cat tuple-to-majority.stats \ | gawk '/./{print $2,$3;}' \ > tuple-to-majority.tbl Since the weights are unity, the total weight column is the number of transcribers in that reading tuple (ignoring alternates and "%" or "*" readings). 
Let's count how many recorded positions have been read by 0, 1, 2, ...,
26 transcribers, excluding also the positions whose tuples consist
entirely of [!%=-], since those positions are either fillers or were
provided by me:

  cat tuple-to-majority.stats \
    | gawk \
        ' ($2 \!~ /^[-=%\!]*$/){ \
            nt = int($4+0.0001); ct[nt] += $1; tot += $1; \
            if (nt>zt+0){zt=nt} if (26-nt>at+0){at=26-nt} \
          }; \
          END{ \
            for(i=26-at;i<=zt;i++){printf "%3d %7d\n", i, ct[i]} \
            printf "tot %7d\n", tot \
          } \
        '

      0     268
      1    2509
      2   59253
      3   95048
      4   68280
      5    4055
      6     341
    tot  229754

Next we can compute some statistics about the accuracy of each
transcriber:

  cat inter.tfr \
    | compute-transcriber-correlations \
        -v alternates="CD,FG,JI,KQ,LM" \
    > inter.trcorrs

CREATING THE CONSENSUS AND MAJORITY VERSIONS

Now we read the interlinear file and produce another file with two extra
variants: a "majority" version (first in each batch, transcriber code
"A") and a "consensus" version (last in each batch, transcriber code
"Y").  See the scripts "compute-consensus-table" and
"compute-majority-table" for the definitions of these terms.

  cat inter.evt \
    | egrep -v '^<.*;[AY]>' \
    | unbasify-weirdos \
    | combine-versions \
        -f tuple-procs.gawk \
        -v code=Y \
        -v position=last \
        -v table=tuple-to-consensus.tbl \
    > inter-c.evt

  cat inter-c.evt \
    | combine-versions \
        -f tuple-procs.gawk \
        -v ignore=Y \
        -v code=A \
        -v position=first \
        -v table=tuple-to-majority.tbl \
    | basify-weirdos \
    > inter-cm.evt
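A quick sanity check one might run at this point (not part of the
original recipe): count the A lines, the Y lines, and the remaining
transcriber lines in the combined file.  The A and Y counts should match
each other, since each text locus is expected to get exactly one
majority and one consensus line.

  cat inter-cm.evt \
    | gawk \
      ' \
        /^<[^<>]*;A>/ { na++; next; } \
        /^<[^<>]*;Y>/ { ny++; next; } \
        /^<[^<>]*;/   { no++; } \
        END { printf "A: %d  Y: %d  other: %d\n", na, ny, no; } \
      '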
Extracting the bare text of the majority and consensus versions:

  cat inter-cm.evt \
    | sed -e '/^## <[^<>.]*>/s/^## *//g' \
    | egrep -v '^#' \
    | egrep -v '^<.*;[^A]>' \
    | unbasify-weirdos \
    > only-m.evt

  cat inter-cm.evt \
    | sed -e '/^## <[^<>.]*>/s/^## *//g' \
    | egrep -v '^#' \
    | egrep -v '^<.*;[^Y]>' \
    | unbasify-weirdos \
    > only-c.evt

Publishing:

  foreach f ( only-c only-m )
    cat $f.evt | gzip > $f.evt.gz
    zip $f $f.evt
  end

DISPLAYING THE DIFFERENCES BETWEEN VERSIONS

Each file will show the majority line at the top and the variants below
it, in the format

  fNNN.UU LLL
    A EEEEEEEEEE...
    T E EE
    T E EE
    T E EE
    Y EEEEEEEEE..

where LLL is a line number, T is a transcriber code, and E an EVA
character.

  rm disc/*.html

  cat inter-cm.evt \
    | egrep -v '^## *<[^<>.]*[.][^<>.]*>' \
    | egrep -v '^#([ ]|$)' \
    | unbasify-weirdos \
    | show-discrepancies \
        -v title='EVA interlinear 1.6e6 - Discrepancies between versions' \
        -f tuple-procs.gawk \
        -v dir=disc

Publishing the concordance:

  ( cd disc && rm -f disc.zip && pkzip disc index.html legend.html f*.html )

CREATING PER-PAGE AND PER-SECTION FILES

Now let's produce the following files:

  pages-m/FNUM.evt     the majority version split into one file per page.

  pages-m/all.names    the f-numbers of all existing pages, in natural
                       reading order.

  subsecs-m/TAG.evt    the majority version split into one file per
                       subsection.

  subsecs-m/all.names  the tags of all existing sections, in some nice
                       order.

  subsecs-m/TAG.fnums  the f-numbers of all existing pages in section
                       TAG, in natural reading order.

Gathering the page lists:

  set pages = ( `cat .all.units | egrep -v 'f0' | egrep -v '[.]'` )

  mkdir pages-m
  /bin/rm -f pages-m/all.names pages-m/*.evt .foo

  cat only-m.evt \
    | basify-weirdos \
    | sed -e 's/[&][*]/**/g' \
    | egrep -v '^<[^<>.]*>' \
    | split-pages \
        -v outdir=pages-m \
    > pages-m/all.names

Collecting the list of pages in each section:

  mkdir subsecs-m

  set subsecs = ( \
    `cat fnum-to-subsec.tbl | gawk '($2 \!~ /xxx/){print $2}' | sort | uniq` \
  )
  echo "subsecs = ( ${subsecs} )"

  /bin/rm -f subsecs-m/all.names subsecs-m/*.fnums subsecs-m/.foo

  foreach tag ( ${subsecs} )
    echo "${tag}"
    cat fnum-to-subsec.tbl \
      | grep -w ${tag} \
      | gawk '/./{print $1;}' \
      > subsecs-m/${tag}.fnums
    cat `cat subsecs-m/${tag}.fnums | sed -e 's@^\(.*\)$@pages-m/\1.evt@g'` \
      > subsecs-m/${tag}.evt
    echo ${tag} >> subsecs-m/all.names
  end

  dicio-wc subsecs-m/*.evt

      lines   words     bytes file
    ------- ------- --------- ------------
        916    1832     62661 subsecs-m/bio.1.evt
         13      26      1132 subsecs-m/cos.1.evt
        399     798     19260 subsecs-m/cos.2.evt
        186     372      9994 subsecs-m/cos.3.evt
       1066    2132     64512 subsecs-m/hea.1.evt
        134     268      8660 subsecs-m/hea.2.evt
        316     632     24711 subsecs-m/heb.1.evt
         61     122      4644 subsecs-m/heb.2.evt
        174     348     10021 subsecs-m/pha.1.evt
        284     568     15718 subsecs-m/pha.2.evt
         80     160      6158 subsecs-m/str.1.evt
       1084    2168     90650 subsecs-m/str.2.evt
         53     106      2535 subsecs-m/unk.1.evt
         52     104      2476 subsecs-m/unk.2.evt
          7      14       461 subsecs-m/unk.3.evt
         82     164      3762 subsecs-m/unk.4.evt
         35      70      2844 subsecs-m/unk.5.evt
         45      90      3845 subsecs-m/unk.6.evt
         39      78      3002 subsecs-m/unk.7.evt
          1       2        67 subsecs-m/unk.8.evt
        335     670     15343 subsecs-m/zod.1.evt

Let's list the pages in each section:

  ( cd pages-m && ls f*.evt ) \
    | sed -e 's/\.evt/ +/' \
    > /tmp/present.tbl

  /bin/rm -f pages-summary.txt
  foreach sec ( `cat subsecs-m/all.names` )
    echo "${sec}"
    echo "subsection ${sec}" \
      >> pages-summary.txt
    cat subsecs-m/${sec}.fnums \
      | map-field \
          -v table=/tmp/present.tbl \
          -v default='-' \
      | sed -e 's/[+] //' -e 's/- \(f[0-9vr]*\)/(\1)/' \
      | fmt -w 50 \
      | sed -e 's/^/ /' \
      >> pages-summary.txt
    echo " " \
      >> pages-summary.txt
  end

EXPORTING THE FILES

  set dtnew = `date +%y-%m-%d`
  set exnew = "/n/ftp/pub/staff/stolfi/EXPORT/projects/voynich/${dtnew}-interln${vnew}"

  set dtold = "98-12-28"
  set exold = "/n/ftp/pub/staff/stolfi/EXPORT/projects/voynich/${dtold}-interln${vold}"

Creating the export directory:

  mkdir ${exnew}
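For concreteness, if this release had been built on the "last edited"
date above (2004-07-15), with vnew = "16e7" as set in the Parameters
section, the export path would expand to

  echo ${exnew}

    /n/ftp/pub/staff/stolfi/EXPORT/projects/voynich/04-07-15-interln16e7

Since dtnew is taken from the current date, the actual directory name
depends on the day the recipe is run.  Note that ${exnew} and ${exold}
are already referenced by the GENERAL RECIPE steps, so these definitions
must be in effect before running them.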