Hacking at the Voynich manuscript - Side notes
201 Statistics of crescent and circle sequences

Last edited on 2002-01-04 00:50:45 by stolfi

INTRODUCTION

  In this note we compute some statistics about the crescent and circle glyphs
  with the purpose of deciding how to parse strings of those elements
  in the fine-structure model.

SETTING UP THE ENVIRONMENT

  Commands:
  
    foreach prog ( \
        capitalize-ligatures \
        compute-cum-cum-freqs \
        compute-cum-freqs \
        compute-freqs \
        combine-counts \
        remove-freqs \
        totalize-fields \
        select-units \
        words-from-evt \
        format-counts-packed \
      )
      ln -s ../../${prog}
    end
    
  Data:

    foreach tgt ( \
        ../100/subsections.tags \
        ../100/data \
        ../101/lang \
        ../103/factor-word-basic \
        ../103/count-elems \
        ../103/count-elem-pairs \
        ../103/compute-row-col-freqs \
        ../103/tex-format-elem-pair-freqs \
        ../103/parse-elem-list.gawk \
      )
      ln -s ${tgt}
    end

  Paper directories:

    set paperdir = "/home/staff/stolfi/papers/voynich-words/techrep"
    set tbldir = "${paperdir}/tables/auto"
    set figdir = "${paperdir}/figures/auto"

  Result directories:
  
    mkdir -p stats/text/t stats/text/w stats/labs/t stats/labs/w
    
  Section tags:

    set secs = ( `cat subsections.tags` )
    set secscm = `echo ${secs} | sed -e 's/ /,/g'`
    echo ${secs}
    echo ${secscm}

INVESTIGATING THE CONTEXT STATISTICS OF CIRCLE AND CRESCENT GLYPHS

  Collecting all the crescent and circle letter slots:

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach glyphs ( aoy e aoye )
          set ifile = "lang/voyn/${wkind}/gud.wfr"
          set ofile = "stats/${wkind}/${tw}/slots-${glyphs}-in-ctx.frq"
          echo " "; echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | capitalize-ligatures -v field=3 \
            | factor-word-basic -v inField=3 -v outField=4 \
            | gawk -v tw="${tw}" '/./{print (tw == "w" ? 1 : $1), $4;}' \
            | extract-glyph-strings -v glyphs="${glyphs}" \
            | combine-counts \
            | sort -b -k1,1nr -k2,2 \
            | compute-freqs \
            > ${ofile}
        end
      end
    end
    
  Tabulating slots of each type:
  
    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach glyphs ( aoy e aoye )
          set ifile = "stats/${wkind}/${tw}/slots-${glyphs}-in-ctx.frq"
          set ofile = "stats/${wkind}/${tw}/slots-${glyphs}.frq"
          echo " "; echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | gawk '/./{print $1, $3}' \
            | sed -e 's/{[^{}]*}//g' \
            | combine-counts \
            | sort -b -k1,1nr -k2,2 \
            | compute-freqs \
            > ${ofile}
        end
      end
    end

DOUBLE CONTEXTS OF SINGLE CRESCENT AND CIRCLE GLYPHS

  Computing and formatting the two-sided contexts of single 
  <e>, <a>, <o>, <y>, and [aoy]:

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach fg ( aoy/a aoy/o aoy/y aoy/aoy e/e )
          set fam = "${fg:h}"; set glyphs = "${fg:t}"
          set ifile = "stats/${wkind}/${tw}/slots-${fam}-in-ctx.frq"
          set ofile = "stats/${wkind}/${tw}/single-${glyphs}-ctx.frq"
          echo " "; echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | gawk -v glyphs="${glyphs}" \
                ' ($3 ~ ("<[" glyphs "]>")){ \
                    wd = $3; gsub(/[<].[>]/, ":", wd); gsub(/[{}]/,"", wd); \
                    print $1, wd; \
                  } \
                ' \
            | combine-counts \
            | compute-row-col-freqs -v outputTotals=1 \
            | sort -b -k1,1nr -k5,5 \
            > ${ofile}
        end
      end
    end
 
  Format the results as TeX tables:

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach ge ( e/a,o,-,y  a/e o/e y/e aoy/e )
          set glyphs = "${ge:h}"
          set elems = "${ge:t}"
          set xelems = "_,-,q,-,${elems},-,l,d,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh"
          set ifile = "stats/${wkind}/${tw}/single-${glyphs}-ctx.frq"
          set ofile = "single-${glyphs}-ctx-${wkind}-${tw}.tex"
          echo " "
          echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
                -v rowList="${xelems}" \
                -v colList="${xelems}" \
                -v endMarker='_' \
                -v freqDigits=2 \
                -v minFreq=0.005 \
                -v showCounts=1 \
                -v showRowFreqs=0 \
                -v showColFreqs=0 \
            > ${ofile}
          mv -bv ${ofile} ${tbldir}/
        end
      end
    end
    
ONE-SIDED CONTEXTS OF SINGLE CRESCENT AND CIRCLE GLYPHS

  Computing and formatting the previous and next glyph distributions
  for single [e] and [aoy] strings:
  
    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach fg ( aoy/a aoy/o aoy/y aoy/aoy e/e )
          set fam = "${fg:h}"; set glyphs = "${fg:t}"
          set ifile = "stats/${wkind}/${tw}/slots-${fam}-in-ctx.frq"
          set ofile = "stats/${wkind}/${tw}/prev-next-single-${glyphs}.frq"
          echo " "; echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | gawk -v glyphs="${glyphs}" \
                ' ($3 ~ ("<[" glyphs "]>")){ \
                    a = $3; gsub(/[<].[>].*$/, "", a); gsub(/[{}]/,"",a); \
                    b = $3; gsub(/^.*[<].[>]/, "", b); gsub(/[{}]/,"",b); \
                    print $1,("prev:" a);  print $1,("next:" b); \
                  } \
                ' \
            | combine-counts \
            | compute-row-col-freqs -v outputTotals=1 \
            | sort -b -k1,1nr -k5,5 \
            > ${ofile}
        end
      end
    end
    
  Format the results as TeX tables:

    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach ge ( e/a,o,-,y  a/e o/e y/e aoy/e )
          set glyphs = "${ge:h}"
          set elems = "${ge:t}"
          set xelems = "_,-,q,-,${elems},-,l,d,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh"
          set ifile = "stats/${wkind}/${tw}/prev-next-single-${glyphs}.frq"
          set ofile = "prev-next-single-${glyphs}-${wkind}-${tw}.tex"
          echo " "
          echo "${ifile} -> ${ofile}"
          cat ${ifile} \
            | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
                -v rowList="prev,next" \
                -v colList="${xelems}" \
                -v endMarker='_' \
                -v freqDigits=2 \
                -v minFreq=0.005 \
                -v showCounts=0 \
                -v showRowFreqs=1 \
                -v showColFreqs=0 \
            > ${ofile}
          mv -bv ${ofile} ${tbldir}/
        end
      end
    end
    
COMPARING DISTRIBUTIONS OF MODIFIED AND UNMODIFIED GLYPHS

  Computing the basic symbol pair frequencies in the good text
  (for comparison with <e>-modified glyph frequencies):
  
    set bglyphs = '_,-,e,-,i,-,q,-,y,-,a,o,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set ifile = "lang/voyn/${wkind}/gud.wfr"
        set ofile = "stats/${wkind}/${tw}/basic-glyph-pairs.gfr"
        echo " "
        echo "${ifile} -> ${ofile}"
        cat ${ifile} \
          | capitalize-ligatures -v field=3 \
          | factor-word-basic -v inField=3 -v outField=4 \
          | gawk -v tw="${tw}" '/./{print (tw == "w" ? 1 : $1), $4;}' \
          | count-elem-pairs -f parse-elem-list.gawk \
              -v endMarker='_' \
              -v showBadWords=0 \
              -v rowList="${bglyphs}" \
              -v colList="${bglyphs}" \
          | compute-row-col-freqs -v outputTotals=1 \
          > ${ofile}
      end
    end

  Computing next-glyph and prev-glyph distributions for <Xe> and <eX>
  pairs, respectively; and ditto for <Xo> and <oX>, etc.
  
    foreach wkind ( text labs )
      foreach tw ( t w )
        foreach dir ( next prev )
          foreach fg ( aoy/a aoy/o aoy/y aoy/aoy e/e )
            set fam = "${fg:h}"; set glyphs = "${fg:t}"
            set ifile = "stats/${wkind}/${tw}/slots-${fam}-in-ctx.frq"
            set ofile = "stats/${wkind}/${tw}/elem-${glyphs}-${dir}.gfr"
            echo " "; echo "${ifile} -> ${ofile}"
            cat ${ifile} \
              | gawk -v dir="${dir}" -v glyphs="${glyphs}" \
                  ' BEGIN { \
                      gp = ("{" glyphs "}"); cp = ("[<][" glyphs "][>]"); \
                      rep = ("\\1" (dir == "next" ? (gp ":") : (":" gp)) "\\2"); \
                    } \
                    ($3 ~ cp){ a = $3; \
                      a = gensub(/^(.*)[<].[>](.*)$/, rep, "g", a); \
                      gsub(/[{}]/,"",a); print $1, a; \
                    } \
                  ' \
              | combine-counts \
              | compute-row-col-freqs -v outputTotals=1 \
              | sort -b -k1,1nr -k5,5 \
              > ${ofile}
          end
        end
      end
    end
    
  Formatting the data for <Xe> and <eX> as TeX tables:

    set nglyphs = '_,-,q,-,y,-,a,o,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    set xepairs = 'Ch,Che,-,Sh,She,-,k,ke,-,t,te,-,CKh,CKhe,-,CTh,CThe'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set sdir = "stats/${wkind}/${tw}"
        set pfile = "${sdir}/basic-glyph-pairs.gfr"
        set efile = "${sdir}/elem-e-next.gfr"
        set ofile = "e-elem-next-distr-${wkind}-${tw}.tex"
        echo " "
        echo "${pfile}, ${efile} -> ${ofile}"
        cat ${pfile} ${efile} \
          | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
              -v rowList="${xepairs}" \
              -v colList="${nglyphs},-,+" \
              -v endMarker='_' \
              -v minFreq=0.015 \
              -v showCounts=0 \
              -v showRowFreqs=1 \
              -v showColFreqs=0 \
          > ${ofile}
        mv -bv ${ofile} ${tbldir}/
      end
    end
      
    set pglyphs = '_,-,q,-,y,-,a,o,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    set expairs = '_,e_,-,y,ey,-,a,ea,-,o,eo,-,d,ed,-,Ch,eCh,-,Sh,eSh,-,k,ek,-,t,et,-,p,ep,-,f,ef,-,CKh,eCKh,-,CTh,eCTh'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set pfile = "stats/${wkind}/${tw}/basic-glyph-pairs.gfr"
        set efile = "stats/${wkind}/${tw}/elem-e-prev.gfr"
        set ofile = "e-elem-prev-distr-${wkind}-${tw}.tex"
        echo " "; echo "${pfile}, ${efile} -> ${ofile}"
        cat ${pfile} ${efile} \
          | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
              -v rowList="${pglyphs},-,+" -v colList="${expairs}" \
              -v endMarker='_' -v minFreq=0.015 \
              -v showCounts=0 -v showRowFreqs=0 -v showColFreqs=1 \
          > ${ofile}
        mv -bv ${ofile} ${tbldir}/
      end
    end
      
  Formatting the data for <Xo> and <oX> as TeX tables:

    set nglyphs = '_,-,q,-,e,-,i,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    set xopairs = '_,_o,-,d,do,-,r,ro,-,s,so,-,l,lo,-,Ch,Cho,-,Sh,Sho,-,k,ko,-,t,to,-,CKh,CKho,-,CTh,CTho'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set sdir = "stats/${wkind}/${tw}"
        set pfile = "${sdir}/basic-glyph-pairs.gfr"
        set efile = "${sdir}/elem-o-next.gfr"
        set ofile = "o-elem-next-distr-${wkind}-${tw}.tex"
        echo " "
        echo "${pfile}, ${efile} -> ${ofile}"
        cat ${pfile} ${efile} \
          | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
              -v rowList="${xopairs}" \
              -v colList="${nglyphs},-,+" \
              -v endMarker='_' \
              -v minFreq=0.015 \
              -v showCounts=0 \
              -v showRowFreqs=1 \
              -v showColFreqs=0 \
          > ${ofile}
        mv -bv ${ofile} ${tbldir}/
      end
    end
      
    set pglyphs = '_,-,q,-,e,-,i,-,d,l,r,s,-,n,m,-,Ch,Sh,-,k,t,f,p,-,CKh,CTh,CFh,CPh'
    set oxpairs = '_,o_,-,d,od,-,l,ol,-,r,or,-,s,os,-,Ch,oCh,-,Sh,oSh,-,k,ok,-,t,ot,-,p,op,-,f,of,-,CKh,oCKh,-,CTh,oCTh'
    foreach wkind ( text labs )
      foreach tw ( t w )
        set pfile = "stats/${wkind}/${tw}/basic-glyph-pairs.gfr"
        set efile = "stats/${wkind}/${tw}/elem-o-prev.gfr"
        set ofile = "o-elem-prev-distr-${wkind}-${tw}.tex"
        echo " "; echo "${pfile}, ${efile} -> ${ofile}"
        cat ${pfile} ${efile} \
          | tex-format-elem-pair-freqs -f parse-elem-list.gawk \
              -v rowList="${pglyphs},-,+" -v colList="${oxpairs}" \
              -v endMarker='_' -v minFreq=0.015 \
              -v showCounts=0 -v showRowFreqs=0 -v showColFreqs=1 \
          > ${ofile}
        mv -bv ${ofile} ${tbldir}/
      end
    end
      
INVESTIGATING SINGLE "E"s

  Count single <e>s (not preceded by gallows or benches), separately
  for the "t" statistics (each word weighted by its count field) and
  the "w" statistics (each word counted once):
  
    foreach wkind ( text labs )
      foreach tw ( t w )
        set ifile = "stats/${wkind}/${tw}/prev-next-single-e.frq"
        echo " "; printf "%s/%s: " "${wkind}" "${tw}"
        cat ${ifile} \
          | egrep 'prev:' \
          | egrep -v 'prev:([+]|Ch|Sh|[Cc]?[KTPFktpf][Hh]?)$' \
          | gawk '/./{s+=$1;} END{print s+0;}'
      end
    end

      text: 287
      labs: 23

  Show the offending words, by section:
  
    foreach wkind ( text labs )
      foreach sec ( ${secs} tot.n )
        set ifile = "data/gud/${wkind}/${sec}.wfr"
        set ofile = "stats/${wkind}/${sec}.wfr"
        echo " "; echo "${ifile} -> ${ofile}"
        cat ${ifile} \
          | capitalize-ligatures -v field=3 \
          | gawk \
              ' /./{ \
                  ct = $1; wd = $3; \
                  gsub(/ee/, "Ee", wd); \
                  wd = gensub(/(^|[^ktpfhEe])[e]/, "\\1:e", "g", wd); \
                  if (wd ~ /[:]/) { print ct, wd; } \
                } ' \
          | revbytes | sort -t: -k2,2 | revbytes \
          > ${ofile}
      end
    end

JUSTIFYING THE PARSING OF EE-GROUPS AND EEE-GROUPS AS SINGLE ELEMS

  Compute the frequency of ambiguous <eee> groups.
  
    echo "From text/tot.n.wfr:"; echo " "; \
    cat data/gud/text/tot.n.wfr \
      | gawk '($3 ~ /[ktpfhe]eee/){print $1, $3; }' \
      | sort -b -k1,1nr -k2,2 \
      | format-counts-packed \
      | sed -e 's/^/  /'
    
      From text/tot.n.wfr:

        qokeeey(27) okeeey(26) keeey(10) okeeedy(9) cheeey(8)
        lkeeey(8) olkeeey(7) oteeey(7) ykeeey(6) yteeey(5) keeedy(4)
        lkeeedy(4) qokeeedy(4) teeedy(4) okeeeo(3) olkeeedy(3)
        oteeedy(3) qokeees(3) qoteeedy(3) qoteeey(3) sheeey(3)
        ykeeedy(3) chokeeey(2) cholkeeey(2) keeeol(2) olcheeey(2)
        oteees(2) qoteees(2) sheeeky(2) cheeedaiin(1) cheeen(1)
        cheeeo(1) cheees(1) cheeetchol(1) cheeety(1) cheykeeed(1)
        chkeeey(1) cholkeeedy(1) choteeen(1) choteeey(1) ckheeey(1)
        ctheees(1) dalcheeeky(1) dalkeeey(1) dcheeey(1) deeeese(1)
        feeedy(1) keeed(1) keeedal(1) keeees(1) keeeo(1) keeeody(1)
        keeeos(1) keees(1) lcheeey(1) lkeeed(1) lkeeedam(1)
        lkeeeody(1) lkeees(1) oeeees(1) okchoteees(1) okeeeody(1)
        okeeeol(1) okeees(1) olcheees(1) olkeeeary(1) olkeeed(1)
        olkeees(1) olteeedy(1) orkeeey(1) orokeeeey(1) oteee(1)
        oteeen(1) oteeeo(1) oteeeodar(1) oteeeor(1) otokeeey(1)
        qkeeey(1) qoeeeey(1) qofsheeey(1) qokaekeeey(1) qokeee(1)
        qokeeen(1) qokeeeody(1) qokeeeos(1) qolkeeey(1) qoteeeo(1)
        qoykeeey(1) sheees(1) sheeetchy(1) shefeeedy(1) shokeeey(1)
        teeey(1) tolkeeedy(1) ykeeeedaiir(1) ykeeeos(1) ypcheeey(1)
        ysheees(1) yteeed(1) yteeedy(1) yteeeor(1)

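  Compare with the frequencies of <Kech> and <Kche>, where K is a gallows
  (k, t, p, f) or a bench (ch, sh), possibly followed by <e>s. In the
  output all gallows are mapped to <k> and all benches to <ch>: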
    
    echo "From text/tot.n.wfr:" ; echo " " ; \
    cat data/gud/text/tot.n.wfr \
      | gawk '($3 ~ /([ktpf]|[cs]h)e*([cs]he|e[cs]h)/){ \
          w = gensub(/^.*(([ktpf]|[cs]h)e*([cs]he|e[cs]h)e*).*$/, "\\1", "g", $3); \
          gsub(/[ktpf]/,"k",w); gsub(/[cs]h/,"ch",w); \
          print $1, w; }' \
      | combine-counts \
      | sort -b -k1,1nr -k2,2 \
      | format-counts-packed \
      | sed -e 's/^/  /'
    
      From text/tot.n.wfr:

        kche(1192) kchee(118) kech(90) keche(39) keech(25) chech(15)
        chche(9) keeche(5) cheche(2) kcheee(2) kechee(2) chchee(1)
        cheech(1)

  As we can see, <Kche> is about 12 times as common as <Kech>,
  and <Keech> plus <Kchee> is more common than <Keche>.