Hacking at the Voynich manuscript - Side notes 108 Computing and comparing entropy profiles Last edited on 2012-05-03 20:46:09 by stolfilocal INTRODUCTION In this note we compute the conditional glyph entropies and the token entropy profile, for Voynichese and other languages. The conditional symbol entropy h[k] of order k is the expected number of bits that are provided by a random symbol in the text, given the previous k-1 symbols. The token entropy profile t[k] is the expected number of bits contained in the kth symbol of a token, given the previous k-1 symbols. The sum of t[k] for all k should be equal to the token entropy. SETTING UP THE ENVIRONMENT Links: ln -s ../tr-stats/dat ln -s ../tr-stats/exp ln -s /home/staff/stolfi/voynich/work ln -s work/capitalize-ligatures ln -s work/compute-cum-cum-freqs ln -s work/compute-cum-freqs ln -s work/compute-freqs ln -s work/combine-counts ln -s work/compute-cond-entropy ln -s work/remove-freqs ln -s work/totalize-fields ln -s work/select-units ln -s work/words-from-evt ln -s work/format-counts-packed ln -s work/parse-elem-list.gawk ln -s work/factor-text-trivial.gawk ln -s work/factor-text-viqr-to-phon.gawk ln -s work/factor-text-pinyin-to-phon.gawk ln -s work/factor-text-pinyin-std.gawk ln -s work/factor-text-pinyin-fix.gawk ln -s work/factor-text-eva-to-basic.gawk ln -s work/factor-text-eva-to-oko.gawk TESTING THE CONDITIONAL ENTROPY CALCULATOR Generating a list of tuples with known entropies rm -f .test.wct foreach wc ( \ "{0}{0}.{a}" "{0}{1}.{b}" "{0}{2}.{c}" "{0}{3}.{d}" \ "{1}{0}.{b}" "{1}{1}.{c}" "{1}{2}.{d}" "{1}{3}.{a}" \ "{2}{0}.{c}" "{2}{1}.{d}" "{2}{2}.{a}" "{2}{3}.{b}" \ "{3}{0}.{d}" "{3}{1}.{a}" "{3}{2}.{b}" "{3}{3}.{c}" \ ) echo "40 ${wc:r} ${wc:e}" >> .test.wct end foreach w ( \ "{0}{a}" "{0}{b}" "{0}{c}" "{0}{d}" \ "{1}{a}" "{1}{b}" "{1}{c}" "{1}{d}" \ "{2}{a}" "{2}{b}" "{2}{c}" "{2}{d}" \ "{3}{a}" "{3}{b}" "{3}{c}" "{3}{d}" \ ) foreach c ( "{0}" "{1}" "{2}" "{3}" ) echo "10 $w $c" >> .test.wct end end 
foreach w ( \ "{a}{0}" "{b}{0}" "{c}{0}" "{d}{0}" \ "{a}{1}" "{b}{1}" "{c}{1}" "{d}{1}" \ "{a}{2}" "{b}{2}" "{c}{2}" "{d}{2}" \ "{a}{3}" "{b}{3}" "{c}{3}" "{d}{3}" \ ) foreach c ( "{0}" "{1}" "{2}" "{3}" ) echo "10 $w $c" >> .test.wct end end Testing the script (the result should be 1.333 bits): cat .test.wct \ | sort -b +1 -2 \ | compute-cond-entropy \ > .test.ents cat .test.ents cat .test.ents \ | gawk '//{s+= $2;n++;} END{print s/n;}' EXTRACTING THE N-GRAM DISTRIBUTIONS Selecting the samples and length-defining encodings: set sampelems = ( \ voyn/maj.bgly \ voyn/prs.bgly \ voyn/lab.bgly \ \ voyn/maj.qoko \ voyn/prs.qoko \ voyn/lab.qoko \ \ engl/wow.lets \ engl/cul.lets \ latn/ptt.lets \ grek/nwt.lets \ span/qvi.lets \ geez/gok.sera \ viet/ptt.viqr \ viet/ptt.phon \ tibe/vim.acip \ tibe/ccv.acip \ chin/ptt.stpy \ chin/ptt.fxpy \ chin/ptt.phon \ chin/red.stpy \ chin/red.fxpy \ chin/red.phon \ \ enrc/wow.lets \ chrc/red.lets \ \ engl/wnm.lets \ engl/cnp.lets \ ) ###################################################################### TO FIX AND REDO Let's extract the N-grams (strings of consecutive glyphs) of Voynichese text, considering word space a glyph. We start from the `bad' text because we must reject any n-gram that touches a `bad' word. We also consider only the main text since the labels are not ordered in the VMS, and not meaningful in the other languages. 
set basicglyphs = 'e,i,o,a,y,q,l,d,r,s,n,m,Ch,Sh,k,t,CKh,CTh,f,p,CFh,CPh' set okoglyphs = 'q,y,a,o,k,t,f,p,ke,te,fe,pe,CKh,CTh,CFh,CPh,CKhe,CThe,CFhe,CPhe,Ch,Sh,ee,Che,She,eee,d,l,r,s,n,m,in,im,ir,iin,iim,iir,iiin' set wkind = "text"; foreach ekind ( basic oko ) set ifile = "lang/voyn/${wkind}/raw.wds" set ofile = "lang/voyn/${wkind}/ngrams-${ekind}.nct" echo "${ifile} -> ${ofile}" set glyphs = "${basicglyphs}" if ( "/${ekind}" == "/oko" ) set glyphs = "${okoglyphs}" cat ${ifile} \ | capitalize-ligatures -v field=1 \ | factor-word-${ekind} -v inField=1 -v outField=2 \ | gawk '/./{ print $2; }' \ | extract-and-count-ngrams -f parse-elem-list.gawk \ -v maxOrder=10 \ -v elemList="${glyphs}" \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end Extracting the N-letter ngrams of English and Latin tokens, and their counts: set trivialglyphs = 'a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z' foreach lang ( engl latn ) set wkind = "text"; set ekind = "trivial" set ifile = "lang/${lang}/${wkind}/raw.wds" set ofile = "lang/${lang}/${wkind}/ngrams-${ekind}.nct" echo "${ifile} -> ${ofile}" cat ${ifile} \ | factor-word-${ekind} -v inField=1 -v outField=2 \ | gawk '/./{ print $2; }' \ | extract-and-count-ngrams -f parse-elem-list.gawk \ -v maxOrder=10 \ -v elemList="${trivialglyphs}" \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end dicio-wc lang/*/*/ngrams-*.nct lines words bytes file ------- ------- --------- ------------ 586922 1173844 19463037 lang/engl/text/ngrams-trivial.nct 720891 1441782 24036696 lang/latn/text/ngrams-trivial.nct 485787 971574 16889950 lang/voyn/text/ngrams-basic.nct 529010 1058020 18794978 lang/voyn/text/ngrams-oko.nct Computing the conditional entropies of a random element, given the preceding k-1 elements. We must add an extra "{}" at the beginning of each ngram to avoid empty prefixes when k = 1. 
foreach le ( voyn/{basic,oko} {engl,latn}/trivial ) set lang = "${le:h}"; set ekind = "${le:t}" set ifile = "lang/${lang}/text/ngrams-${ekind}.nct" set ofile = "lang/${lang}/text/ngram-entropies-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | gawk '/./{ \ ct = $1; w = ("{}" $2); \ w = gensub(/([{][^{}]*[}])$/, " \\1", "s", w); \ print ct, w; \ }' \ | sort -b +1 -2 \ | compute-cond-entropy \ | sort -b +1 -2gr \ > ${ofile} end dicio-wc lang/*/*/ngram-entropies-[a-z]*.tbl lines words bytes file ------- ------- --------- ------------ 440491 1321473 18597686 lang/engl/text/ngram-entropies-trivial.tbl 540140 1620420 22928920 lang/latn/text/ngram-entropies-trivial.tbl 349254 1047762 15270975 lang/voyn/text/ngram-entropies-basic.tbl 395069 1185207 17587988 lang/voyn/text/ngram-entropies-oko.tbl Extract the entropy after specific characters and character pairs: foreach le ( voyn/{basic,oko} {engl,latn}/trivial ) foreach order ( 01 02 03 ) set lang = "${le:h}"; set ekind = "${le:t}" set ifile = "lang/${lang}/text/ngram-entropies-${ekind}.tbl"; set ofile = "lang/${lang}/text/ngram-entropies-${order}-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | gawk -v order="${order}" \ '/./{w=$3; gsub(/[^{]/,"",w); if(length(w)==order) {print;}}' \ > ${ofile} end end Computing the k-order entropy, defined as h[k] = average entropy of the kth character (including word-stop) given the k-1 preceding characters. foreach le ( voyn/{basic,oko} {engl,latn}/trivial ) set lang = "${le:h}"; set ekind = "${le:t}" set ifile = "lang/${lang}/text/ngram-entropies-${ekind}.tbl"; set ofile = "lang/${lang}/text/hk-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | compute-hk-entropies \ > ${ofile} end Plots of the average k-order conditional entropy h[k] as a function of k. (These commands must be executed on a Sun.) 
foreach fmt ( eps gif ) foreach ekind ( basic oko ) set ofile = "hk-plots-${ekind}.${fmt}"; echo "${ofile}" compare-entropy-profiles \ -column 3 -size 0.75,0.75 -maxlen 10 -format ${fmt} \ lang/voyn/text/hk-${ekind}.tbl "Voynichese" \ lang/engl/text/hk-trivial.tbl "English" \ lang/latn/text/hk-trivial.tbl "Latin" \ > ${ofile} mv -b ${ofile} ${figdir}/ end end Compacting the garbage: gzip lang/*/*/ngram-*.tbl gzip lang/*/*/ngrams-*.nct EXTRACTING THE N-LETTER PREFIX DISTRIBUTIONS Let's extract the N-symbol prefixes of Voynichese tokens, in the basic glyphs and OKO elements, and their counts. foreach wkind ( text labs ) foreach ekind ( basic oko ) set lang = "voyn" set ifile = "lang/${lang}/${wkind}/gud.wfr" set ofile = "lang/${lang}/${wkind}/prefs-${ekind}.pct" echo "${ifile} -> ${ofile}" cat ${ifile} \ | capitalize-ligatures -v field=3 \ | factor-word-${ekind} -v inField=3 -v outField=4 \ | gawk '/./{ print $1, $4; }' \ | extract-and-count-prefixes \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end end Extracting the N-letter prefixes of English and Latin tokens, and their counts: foreach lang ( engl latn ) set wkind = "text"; set ekind = "trivial" set ifile = "lang/${lang}/${wkind}/gud.wfr" set ofile = "lang/${lang}/${wkind}/prefs-${ekind}.pct" echo "${ifile} -> ${ofile}" cat ${ifile} \ | factor-word-trivial -v inField=3 -v outField=4 \ | gawk '/./{ print $1, $4; }' \ | extract-and-count-prefixes \ -v spaceMarker='_' \ -v showBadWords=0 \ | sort -b +0 -1nr \ > ${ofile} end dicio-wc lang/{labs,text}/*-prefs.pct lines words bytes file ------- ------- --------- ------------ 2446 4892 69818 lang/labs/voyn-basic-prefs.pct 2304 4608 63756 lang/labs/voyn-oko-prefs.pct 20040 40080 634290 lang/text/engl-trivial-prefs.pct 32932 65864 1133625 lang/text/latn-trivial-prefs.pct 19179 38358 572713 lang/text/voyn-basic-prefs.pct 17746 35492 504665 lang/text/voyn-oko-prefs.pct COMPUTING THE LETTER ENTROPIES Computing the conditional entropies of the last 
element in the token prefix, given the preceding letters: foreach lwe ( voyn/{text,labs}.{basic,oko} {engl,latn}/text.trivial ) set lang = "${lwe:h}"; set we = "${lwe:t}" set wkind = "${we:r}"; set ekind = "${we:e}" set ifile = "lang/${wkind}/${lang}-${ekind}-prefs.pct" set ofile = "lang/${wkind}/${lang}-prefix-entropies-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | gawk '/./{ \ ct = $1; w = $2; w = gensub(/([{][^{}]*[}])$/, " \\1", "s", w); \ print ct, w; \ }' \ | sort -b +1 -2 \ | compute-cond-entropy \ | sort -b +1 -2gr \ > ${ofile} end Computing the entropy profile, defined as h[k] = average entropy of the kth character (including word-stop) given the k-1 preceding characters. foreach lwe ( voyn/{text,labs}.{basic,oko} {engl,latn}/text.trivial ) set lang = "${lwe:h}"; set we = "${lwe:t}" set wkind = "${we:r}"; set ekind = "${we:e}" set ifile = "lang/${wkind}/${lang}-prefix-entropies-${ekind}.tbl"; set ofile = "lang/${wkind}/${lang}-entropy-profile-${ekind}.tbl"; echo "${ifile} --> ${ofile}" cat ${ifile} \ | compute-token-entropy-profile \ > ${ofile} end Checking whether the sum of the weighted conditional entropies is equal to the token entropy: foreach lwe ( voyn/{text,labs}.{basic,oko} {engl,latn}/text.trivial ) set lang = "${lwe:h}"; set we = "${lwe:t}" set wkind = "${we:r}"; set ekind = "${we:e}" set ifile = "lang/${wkind}/${lang}-entropy-profile-${ekind}.tbl"; echo " " printf "%s %s %-6s: " "${lang}" "${wkind}" "${ekind}" cat ${ifile} \ | gawk '/./{ s+= $5; } END { print s; }' end voyn text basic : 10.121 voyn text oko : 10.123 voyn labs basic : 9.204 voyn labs oko : 9.204 engl text trivial : 9.176 latn text trivial : 10.618 Plots of the conditional entropy for each character position: foreach fmt ( eps gif ) foreach ekind ( basic oko ) set ofile = "entropy-profiles-${ekind}.${fmt}"; echo "${ofile}" compare-entropy-profiles \ -column 4 -size 1.50,0.75 -maxlen 20 -format ${fmt} \ lang/text/voyn-entropy-profile-${ekind}.tbl "Voynichese (text)" 
\ lang/labs/voyn-entropy-profile-${ekind}.tbl "Voynichese (labels)" \ lang/text/engl-entropy-profile-trivial.tbl "English" \ lang/text/latn-entropy-profile-trivial.tbl "Latin" \ > ${ofile} mv -b ${ofile} ${figdir}/ end end