Hacking at the Voynich manuscript - Side notes
103 Statistics of basic glyphs, strokes, and pairs thereof

Last edited on 2012-05-03 20:45:07 by stolfilocal

INTRODUCTION

  In this note we decompose the Voynichese words into their basic glyphs,
  and compute glyph and digraph frequencies.
  
SETTING UP THE ENVIRONMENT

  Links:
  
    ln -s ../tr-stats/dat
    ln -s ../tr-stats/exp
    ln -s /home/staff/stolfi/voynich/work 

    ln -s work/basify-weirdos
    ln -s work/capitalize-ligatures
    ln -s work/compute-cum-cum-freqs
    ln -s work/compute-cum-freqs
    ln -s work/compute-freqs
    ln -s work/combine-counts
    ln -s work/remove-freqs
    ln -s work/totalize-fields
    ln -s work/select-units
    ln -s work/words-from-evt
    ln -s work/format-counts-packed
    ln -s work/factor-field-general
    ln -s work/update-paper-include
    ln -s work/factor-text-eva-to-basic.gawk
    
    ln -s ../100/subsections.tags

GLYPH AND GLYPH PAIR FREQUENCIES

  Tabulating "basic" and "rare" glyph frequencies in the text and lexicon:

    make -f glyph-freqs.make all

  Computing the "basic" symbol pair counts and next/prev 
  frequencies in the text and lexicon:
  
    make -f glyph-pair-freqs.make all
    
  Tabulating repeated glyphs in the text and lexicon:

    make -f glyph-rep-freqs.make all

  Tabulating inter-glyph stroke pairs (last stroke of a glyph against
  the first stroke of the next glyph):

    make -f stroke-pair-freqs.make all

TABULATING GLYPH COUNTS PER SECTION 

    # Builds dat/voyn/maj/glyph-counts-by-section.txt: one row per section
    # with the glyph totals for the "prs" and "lab" books and their sum.
    # (csh syntax; comments are only recognized when run as a script.)

    # Section tags, read as a csh word list, plus a comma-joined copy.
    # In this snippet the comma-joined copy is only echoed, never used.
    set secs = ( `cat dat/voyn/maj/subsections.tags` )
    set secscm = `echo ${secs} | tr ' ' ','`
    echo ${secs}; echo ${secscm}

    # Start from an empty table, because rows are appended with ">>" below.
    set tfile = "voyn/maj/glyph-counts-by-section.txt"
    /bin/rm -f dat/${tfile}
    # One pass per section tag, followed by the extra tag "tot.1".
    foreach sec ( ${secs} tot.1 )
      # "prs" and "lab" presumably stand for prose text and labels -- TODO confirm.
      foreach book ( prs lab )
        set dir = "voyn/${book}/${sec}"
        set ifile = "${dir}/raw.wfr"
        # Scratch file in the current directory; it is overwritten on every
        # section and is not removed by this snippet.
        set ofile = ".glyphs-${book}"
        echo "dat/${ifile} -> ${ofile}"
        # Pipeline: capitalize-ligatures acts on field 3, then
        # factor-field-general (with factor-text-eva-to-basic.gawk) reads
        # field 3 and writes field 4.  The final gawk takes the count from
        # field 1 and the factored word from field 4, inserts a blank
        # between adjacent "}{", splits on whitespace, and accumulates
        # count * (number of pieces); it prints that single total at the end.
        # Assumes field 4 is a run of "{...}" elements -- TODO confirm against
        # factor-field-general.
        cat dat/${ifile} \
          | capitalize-ligatures -v field=3 \
          | factor-field-general \
              -f factor-text-eva-to-basic.gawk \
              -v inField=3 -v outField=4 \
          | gawk \
              ' BEGIN{ s = 0; } \
                //{ ct = $1; w = $4; \
                    gsub(/}{/, "} {", w); \
                    m = split(w, els); \
                    s += ct * m; \
                  } \
                END{ print s; } \
              ' \
          > ${ofile}
      end
      # Read back the two per-book totals and add them with csh integer
      # arithmetic ("@").
      set nprs = "`cat .glyphs-prs`"
      set nlab = "`cat .glyphs-lab`"
      @ ntot = $nprs + $nlab
      # Append the row: section tag, prs total, lab total, combined total.
      printf "%s %7d %7d %7d\n" "${sec}" "$nprs" "$nlab" "$ntot" >> dat/${tfile}
    end
    # Show the finished table.
    cat dat/${tfile}
  
       sec      prs     lab     tot
      -----  ------  ------  ------
      hea.1   27925       6   27931
      hea.2    3783       0    3783
      heb.1   12755       0   12755
      heb.2    2471       0    2471
      cos.1     385      69     454
      cos.2    6714    1252    7966
      cos.3    3969     628    4597
      bio.1   30694     721   31415
      zod.1    4669    1893    6562
      pha.1    4044     537    4581
      pha.2    6354     835    7189
      str.1    3438       0    3438
      str.2   52179       0   52179
      unk.1     833       0     833
      unk.2     623       0     623
      unk.3     195       0     195
      unk.4    1404      67    1471
      unk.5    1621       0    1621
      unk.6    2261       0    2261
      unk.7    1707       0    1707
      unk.8       0       8       8
      tot.1  168020    6016  174036
      -----  ------  ------  ------
      tot.n  168020    6016  174036

>>> STOPPED HERE <<<

SORTING THE BASIC GLYPHS BASED ON DIGRAPH PROBABILITIES

  Let's try to find an optimum sequence for the glyphs --- one that
  brings glyphs with similar context statistics close together.  
  
  Let G be the set of glyphs, and d(u,v) be some penalty for placing
  glyphs u and v next to each other. We want to find a permutation
  u[0..n-1] of G that minimizes
  
    W(u) = sum{ d(u[i-1],u[i]) : i in [1..n-1] }
  
  First, let's compute the pairwise glyph distances d(u,v):
  
    # Computes a glyph-distance file from each glyph-pair frequency file.
    # NOTE(review): compute-elem-distances and parse-elem-list.gawk are not
    # among the links created in the setup section of this note -- confirm
    # where they come from before running.

    # Comma-separated glyph names, passed to compute-elem-distances as elemList.
    set bglyphs = "q,y,l,r,s,n,m,i,a,o,d,e,Ch,Sh,k,t,f,p,CKh,CTh,CFh,CPh"
    
    # "t" and "w" presumably select the text and lexicon variants -- TODO confirm.
    foreach tw ( t w )
      set ifile = "voyn-vms-glyph-pair-${tw}.gpf"
      set ofile = "voyn-vms-glyph-distances-${tw}.dst"
      echo "${ifile} -> ${ofile}"
      # For each non-empty input line, gawk prints field 1 (the count)
      # followed by field 5 with every ":" turned into a blank; presumably
      # field 5 is a "u:v" glyph pair -- TODO confirm the .gpf layout.
      # The result is fed to compute-elem-distances with exponent=1.0.
      cat ${ifile} \
        | gawk '/./{ ct=$1; w=$5; gsub(/[:]/, " ", w); print ct,w; }' \
        | compute-elem-distances -f parse-elem-list.gawk \
            -v elemList="${bglyphs}" \
            -v exponent=1.0 \
        > ${ofile}
    end
 
  #  d(u,v) to a fractional power, so that
  # keeping similar elements together is more important than rearranging
  # dissimilar ones.