Hacking at the Voynich manuscript - Side notes
045 Computing consensus/majority editions of the EVA interlinear
Last edited on 1999-07-26 19:34:45 by stolfi

The goal of this note is to condense the various transcriptions
present in the EVA interlinear into some sort of "consensus" or
"majority" version. The majority version is also chopped into
per-section and per-page files for statistical analysis.

JOINING THE INTERLINEAR INTO A SINGLE FILE

Making a list of all text units:

    ln -s ../../L16+H-eva

    cat L16+H-eva/INDEX \
      | gawk -v FS=':' '/./{print $2;}' \
      > .all.units

    set units = ( `cat .all.units` )

Safety check:

    ( cd L16+H-eva && ls f[0-9]* | egrep -v '[~]$' ) | sort > .foo
    cat .all.units | sort > .bar
    diff .foo .bar

Concatenating all units, with basic uncapitalized EVA:

    ( cd L16+H-eva && cat ${units} ) \
      > inter.evt

Checking validity and synchronism:

    cat inter.evt \
      | validate-new-evt-format \
          -v checkTerminators=1 \
          -v checkLineLengths=1 \
      >& inter.bugs

EXTRACTING THE TUPLES OF VARIANT READINGS

Next we extract from the interlinear a list of all "reading tuples",
one for each character position in the text. Each tuple is a string of
26 characters, the readings of that character position in each of the
26 potential variants. Whenever a particular variant does not cover a
particular character position, the corresponding tuple element is set
to "%". (Note that "%" is presently used in the interlinear itself to
mark lines or parts of lines that were skipped by a particular
transcriber.)
    cat inter.evt \
      | unbasify-weirdos \
      | egrep -v ';[AY]>' \
      | extract-reading-tuples \
          -f tuple-procs.gawk \
      | sort | uniq -c | expand \
      | sort +0 -1nr +1 -2 \
      > inter.tfr

      5362 VMS text lines found
     17357 interlinear text lines read
    245136 tuples written

    dicio-wc inter.tfr

        lines   words     bytes  file
      ------- ------- --------- ------------
         6126   12252    214410  inter.tfr

    cat inter.tfr \
      | gawk '/./{s+=$1} END{print s;}'

    245136

We then compute a table that maps reading tuples to consensus
readings:

    cat inter.tfr \
      | compute-consensus-table \
      > tuple-to-consensus.stats

    cat tuple-to-consensus.stats \
      | gawk '/./{print $2,$3;}' \
      > tuple-to-consensus.tbl

Similarly, we compute a table that maps reading tuples to majority
readings (using equal weights on the first iteration):

    cat inter.tfr \
      | compute-majority-table \
          -v alternates="CD,FG,JI,KQ,LM" \
      > tuple-to-majority.stats

    cat tuple-to-majority.stats \
      | gawk '/./{print $2,$3;}' \
      > tuple-to-majority.tbl

Since the weights are unity, the total weight column is simply the
number of transcribers in that reading tuple (ignoring alternates and
"%" or "*" readings).
Let's count how many recorded positions have been read by 0, 1, 2,
..., 26 transcribers, excluding also positions that are all [!%=-],
since those positions are either fillers or were provided by me:

    cat tuple-to-majority.stats \
      | gawk \
          ' ($2 \!~ /^[-=%\!]*$/){ \
              nt = int($4+0.0001); ct[nt] += $1; tot += $1; \
              if (nt>zt+0){zt=nt} if (26-nt>at+0){at=26-nt} \
            }; \
            END{ \
              for(i=26-at;i<=zt;i++){printf "%3d %7d\n", i, ct[i]} \
              printf "tot %7d\n", tot \
            } \
          '

      0     268
      1    2509
      2   59253
      3   95048
      4   68280
      5    4055
      6     341
    tot  229754

Next we can compute some statistics about the accuracy of each
transcriber:

    cat inter.tfr \
      | compute-transcriber-correlations \
          -v alternates="CD,FG,JI,KQ,LM" \
      > inter.trcorrs

CREATING THE CONSENSUS AND MAJORITY VERSIONS

Now we read the interlinear file and produce another file with two
extra variants: a "majority" version (first in each batch, transcriber
code "A") and a "consensus" version (last in each batch, transcriber
code "Y"). See the scripts "compute-consensus-table" and
"compute-majority-table" for definitions of these terms.

    cat inter.evt \
      | egrep -v '^<.*;[AY]>' \
      | unbasify-weirdos \
      | combine-versions \
          -f tuple-procs.gawk \
          -v code=Y \
          -v position=last \
          -v table=tuple-to-consensus.tbl \
      > inter-c.evt

    cat inter-c.evt \
      | combine-versions \
          -f tuple-procs.gawk \
          -v ignore=Y \
          -v code=A \
          -v position=first \
          -v table=tuple-to-majority.tbl \
      | basify-weirdos \
      > inter-cm.evt

Extracting the bare text of the majority and consensus versions:

    cat inter-cm.evt \
      | sed -e '/^## <[^<>.]*>/s/^## *//g' \
      | egrep -v '^#' \
      | egrep -v '^<.*;[^A]>' \
      | unbasify-weirdos \
      > only-m.evt

    cat inter-cm.evt \
      | sed -e '/^## <[^<>.]*>/s/^## *//g' \
      | egrep -v '^#' \
      | egrep -v '^<.*;[^Y]>' \
      | unbasify-weirdos \
      > only-c.evt

Publishing:

    foreach f ( only-c only-m )
      cat $f.evt | gzip > $f.evt.gz
      zip $f $f.evt
    end

DISPLAYING THE DIFFERENCES BETWEEN VERSIONS

Each discrepancy file will show the majority line at the top and the
variants below, in the format

    fNNN.UU LLL A EEEEEEEEEE...
                T E EE
                T E EE
                T E EE
                Y EEEEEEEEE..

where LLL is a line number, T is a transcriber code, and E an EVA
character.

    rm disc/*.html

    cat inter-cm.evt \
      | egrep -v '^## *<[^<>.]*[.][^<>.]*>' \
      | egrep -v '^#([ ]|$)' \
      | unbasify-weirdos \
      | show-discrepancies \
          -v title='EVA interlinear 1.6e6 - Discrepancies between versions' \
          -f tuple-procs.gawk \
          -v dir=disc

Publishing the concordance:

    ( cd disc && rm -f disc.zip && pkzip disc index.html legend.html f*.html )

CREATING PER-PAGE AND PER-SECTION FILES

Now let's produce the following files:

    pages-m/FNUM.evt      the majority version split into one file
                          per page.

    pages-m/all.names     the f-numbers of all existing pages, in
                          natural reading order.

    subsecs-m/TAG.evt     the majority version split into one file
                          per subsection.

    subsecs-m/all.names   the tags of all existing sections, in some
                          nice order.

    subsecs-m/TAG.fnums   the f-numbers of all existing pages in
                          section TAG, in natural reading order.

Gathering the page lists:

    set pages = ( `cat .all.units | egrep -v 'f0' | egrep -v '[.]'` )

    mkdir pages-m
    /bin/rm -f pages-m/all.names pages-m/*.evt .foo

    cat only-m.evt \
      | basify-weirdos \
      | sed -e 's/[&][*]/**/g' \
      | egrep -v '^<[^<>.]*>' \
      | split-pages \
          -v outdir=pages-m \
      > pages-m/all.names

Collecting the list of pages in each section:

    mkdir subsecs-m

    set subsecs = ( \
      `cat fnum-to-subsec.tbl | gawk '($2 \!~ /xxx/){print $2}' | sort | uniq` \
    )
    echo "subsecs = ( ${subsecs} )"

    /bin/rm -f subsecs-m/all.names subsecs-m/*.fnums subsecs-m/.foo
    foreach tag ( ${subsecs} )
      echo "${tag}"
      cat fnum-to-subsec.tbl \
        | grep -w ${tag} \
        | gawk '/./{print $1;}' \
        > subsecs-m/${tag}.fnums
      cat `cat subsecs-m/${tag}.fnums | sed -e 's@^\(.*\)$@pages-m/\1.evt@g'` \
        > subsecs-m/${tag}.evt
      echo ${tag} >> subsecs-m/all.names
    end

    dicio-wc subsecs-m/*.evt

        lines   words     bytes  file
      ------- ------- --------- ------------
          916    1832     62661  subsecs-m/bio.1.evt
           13      26      1132  subsecs-m/cos.1.evt
          399     798     19260  subsecs-m/cos.2.evt
          186     372      9994  subsecs-m/cos.3.evt
         1066    2132     64512  subsecs-m/hea.1.evt
          134     268      8660  subsecs-m/hea.2.evt
          316     632     24711  subsecs-m/heb.1.evt
           61     122      4644  subsecs-m/heb.2.evt
          174     348     10021  subsecs-m/pha.1.evt
          284     568     15718  subsecs-m/pha.2.evt
           80     160      6158  subsecs-m/str.1.evt
         1084    2168     90650  subsecs-m/str.2.evt
           53     106      2535  subsecs-m/unk.1.evt
           52     104      2476  subsecs-m/unk.2.evt
            7      14       461  subsecs-m/unk.3.evt
           82     164      3762  subsecs-m/unk.4.evt
           35      70      2844  subsecs-m/unk.5.evt
           45      90      3845  subsecs-m/unk.6.evt
           39      78      3002  subsecs-m/unk.7.evt
            1       2        67  subsecs-m/unk.8.evt
          335     670     15343  subsecs-m/zod.1.evt

Let's list the pages in each section:

    ( cd pages-m && ls f*.evt ) \
      | sed -e 's/\.evt/ +/' \
      > /tmp/present.tbl

    /bin/rm -f pages-summary.txt
    foreach sec ( `cat subsecs-m/all.names` )
      echo "${sec}"
      echo "subsection ${sec}" \
        >> pages-summary.txt
      cat subsecs-m/${sec}.fnums \
        | map-field \
            -v table=/tmp/present.tbl \
            -v default='-' \
        | sed -e 's/[+] //' -e 's/- \(f[0-9vr]*\)/(\1)/' \
        | fmt -w 50 \
        | sed -e 's/^/ /' \
        >> pages-summary.txt
      echo " " \
        >> pages-summary.txt
    end