Hacking at the Voynich manuscript - Side notes
054 Word length distribution, revisited

Last edited on 2000-10-11 06:59:04 by stolfi

INTRODUCTION

  This note attempts to reproduce Gabriel Landini's study of the 
  word length and word frequency distributions of the Voynich
  manuscript, and to compare them with those of Latin and English texts.
  
GATHERING THE DATA

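  For the Voynich manuscript, we can use the texts from the 
  majority version.  Note that the final ">" of the pipeline below
  rewrites the output file from scratch, so the two "#" header lines
  echoed just before it do not survive in "texts/voyn.*.txt":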
  
    mkdir texts
    
    foreach sec ( bio hea heb str cos pha zod unk ) 
      set ifiles = ( `cd ../045/subsecs-m && ls ${sec}.?.evt | sort` )
      set ofile = "texts/voyn.${sec}.txt"
      echo "${ifiles} -> ${ofile}"
      echo "# Voynich Manuscript, majority version, ${sec} section" > ${ofile}
      echo "#" >> ${ofile}
      ( cd ../045/subsecs-m && cat ${ifiles} ) \
        | select-units \
            -v types='parags,starred-parags,circular-lines,circular-text,radial-lines,titles' \
            -v table=unit-to-type.tbl \
        | lines-from-evt \
        | egrep '.' \
        | tr '.' ' '\
        > ${ofile}
    end
  
    mkdir wfreqs
    
  We will also include some dummy sections containing text in other
  languages (Latin, English, Chinese), as well as a random "monkey"
  version of each.
  
  Let's make a list of books per language, excluding "rnd":
  
    foreach lang ( voyn latn engl chin )
      mkdir wfreqs/${lang}/
      set texts = ( \
        `cd texts && ls ${lang}.*.txt | egrep -v '[.]rnd[.]txt$'` \
      )
      echo "=== ${lang} ==="
      echo ${texts} \
        | tr ' .' '\012 ' \
        | gawk '/./{print $2;}' \
        > wfreqs/${lang}/all.names
      cat wfreqs/${lang}/all.names
    end

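  Now extract the words and count them per "section" (book actually),
  excluding "rnd".  The loops below read the list of languages from
  "wfreqs/all.names", which the commands above do not create; it is
  assumed to contain the names "voyn", "latn", "engl" and "chin".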

    foreach lang ( `cd wfreqs && cat all.names`  )
      set secs = ( `cd wfreqs/${lang} && cat all.names` )
      foreach sec ( $secs )
        compute-word-frequencies ${lang} ${sec}
      end
    end
    
  Create the total frequencies per language:
  
    foreach lang ( `cd wfreqs && cat all.names`  )
      set ifiles = ( `cd wfreqs/${lang} && cat all.names | sed -e 's/$/.frq/'` )
      set ofile = "wfreqs/${lang}/tot.frq"
      echo "$ifiles -> $ofile"
      ( cd wfreqs/${lang} && cat ${ifiles} ) \
        | gawk '/./{ print $1,$3;}' \
        | combine-counts \
        | compute-freqs \
        | sort -b +0 -1nr +2 -3 \
        > ${ofile} 
      dicio-wc ${ofile}
    end

  Let's create the monkey language:
  
    foreach lang ( `cd wfreqs && cat all.names` )
      cat wfreqs/${lang}/tot.frq \
        | gawk '/./{print $1, $3;}' \
        | generate-monkey-text \
        > texts/${lang}.rnd.txt
    end
    
  Checking: 
  
    foreach lang ( `cd wfreqs && cat all.names` )
      cat texts/${lang}.rnd.txt \
        | egrep -v '^[#]' \
        | tr ' ' '\012' \
        | sort | uniq -c | expand \
        | generate-monkey-text \
        > .${lang}.rn2.txt 
    end
    
  Let's compute the frequencies for the random version:

    foreach lang ( `cd wfreqs && cat all.names`  )
      compute-word-frequencies ${lang} rnd
    end

COMPUTING THE WORD LENGTH DISTRIBUTIONS

  Let's compute the word length distribution per language, for each
  section separately and all together. Remember to exclude the random
  text from the totals!
  
  The "szf" tables are for standard EVA encoding; the "szu" tables
  use a more compact encoding, appropriate to each language.
  
    mkdir szfreqs
    foreach lang ( `cd wfreqs && cat all.names` )
      mkdir szfreqs/${lang}
    end
    
    foreach type ( szf szu )
      foreach lang ( `cd wfreqs && cat all.names` )
        foreach sec ( `cat wfreqs/${lang}/all.names` tot rnd )
          set ifile = ( "${sec}.frq" )
          set abbr = "cat"; if ( "$type" == "szu" ) set abbr = "abbrev-${lang}"
          set ofile = "szfreqs/${lang}/${sec}.${type}"
          echo "${ifile} -> ${ofile}"
          ( cd wfreqs/${lang} && cat ${ifile} ) \
            | egrep -v '[?*]' \
            | ${abbr} \
            | gawk \
                ' /./{ m=length($3); ct[m] += $1; if(\! (m in wd)) {wd[m] = $3;} \
                  } \
                  END{for(m in ct){printf "%7d %02d:%s\n",ct[m],m,wd[m];}} \
                ' \
            | compute-freqs \
            | tr ':' ' ' \
            | sort -b +2 -3n \
            > ${ofile}
        end
      end
    end

  Generating separate plots per language
  
    foreach etype ( normal.szf abbreviated.szu )
      set etit = "${etype:r}"
      set eext = "${etype:e}"
      foreach lang ( `cd wfreqs && cat all.names` )
        plot-lang-word-x-length "${lang}" "${eext}" "${etit} ${lang}"
      end
    end
  
  Now a plot comparing the three languages other than Chinese:
  
    foreach etype ( normal.szf abbreviated.szu )
      set etit = "${etype:r}"
      set eext = "${etype:e}"
        plot-multilang-word-x-length \
          "${eext}" "${etit}" \
          `cd wfreqs && cat all.names | grep -v 'chin'`
    end
  
  Now a plot comparing all four languages, including Chinese:
  
    foreach etype ( normal.szf abbreviated.szu )
      set etit = "${etype:r}"
      set eext = "${etype:e}"
        plot-multilang-word-x-length \
          "${eext}" "${etit}" \
          `cd wfreqs && cat all.names`
    end
  
GENERATING THE ZIPF PLOTS

  Generating plots of word frequency versus word frequency rank.
  
    mkdir zipf
    foreach lang ( `cd wfreqs && cat all.names` )
      mkdir zipf/${lang}
    end

    foreach etype ( normal.szf abbreviated.szu )
      set etit = "${etype:r}"
      set eext = "${etype:e}"
      foreach lang ( `cd wfreqs && cat all.names` )
        plot-lang-zipf ${lang} ${eext} "${etit}"
      end
    end

  Plots with the three languages together:

    foreach etype ( normal.szf abbreviated.szu )
      set etit = "${etype:r}"
      set eext = "${etype:e}"
      plot-multilang-zipf \
          "${eext}" "${etit}" \
          `cd wfreqs && cat all.names | grep -v 'chin'`
    end

  The plots for all sections are similar.  For the RAW files,
  the plot is rather flat at the beginning (up to about rank 10)
  but then follows the 1/x slope quite well.  For the EQV files,
  the plot is markedly convex in the middle part.
  
VOCABULARY GROWTH

  Let's plot the number of distinct words (the vocabulary size)
  as a function of sample size, i.e. of the number of words read
  so far. Better plot the number of new words 
  at each doubling of the sample:

    mkdir vocsizes
    foreach lang ( `cd wfreqs && cat all.names` )
      mkdir vocsizes/${lang}
    end
    
    foreach lang ( `cd wfreqs && cat all.names` )
      foreach sec ( `cat wfreqs/${lang}/all.names` rnd )
        set ifile = "${lang}.${sec}.txt"
        set ofile = "vocsizes/${lang}/${sec}.vsz"
        echo "${ifile} -> ${ofile}"
        ( cd texts && cat ${ifile} ) \
          | tr '. -' '\012' \
          | egrep -v '[?*]' \
          | egrep '.' \
          | gawk \
              ' BEGIN { nread=0; mvoc=0; split("", voc); } \
                /./{ nread++; w = $1; \
                  if (\! (w in voc)) \
                    { mvoc++; voc[w] = 1; printf "%7d %7d %s\n",nread,mvoc,w; } \
                } \
              ' \
          > ${ofile}
      end
    end

  Generating per-language plots with individual sections:

    foreach lang ( `cd wfreqs && cat all.names` )
      set psfile = "vsz.eps"
      set ifiles = ( `cat wfreqs/${lang}/all.names | sed -e 's/$/.vsz/'` )
      echo "${ifiles} -> ${psfile}"
      ( cd vocsizes/${lang} && ../../make-vocsize-plot \
          ${ifiles} \
        > ${psfile} \
      )
      ghostview vocsizes/${lang}/${psfile}
    end

  Plots with the four languages together, using the largest section of each:
  
    make-vocsize-plot \
        -keys 'voyn.str engl.wow latn.gen chin.tao' \
        vocsizes/{voyn/str,engl/wow,latn/gen,chin/tao}.vsz \
      > vocsizes/vsz.eps
    ghostview vocsizes/vsz.eps

=== TO MERGE WITH THE ABOVE ==========================================

TABULATING WORDS BY LENGTH AND GALLOWOSITY

  Tabulating words without gallows, and words with exactly one gallows
  minus the gallows (and the letters attached to it):
    
    cat prob/obs/txt.n/word.frq \
      | gawk '($3 \!~ /[ktpf]/) { print $1,$3; }' \
      | sort -b +0 -1nr +1 -2 \
      | compute-freqs \
      > .gal-0.frq
      
    cat prob/obs/txt.n/word.frq \
      | egrep -v '[ktpf].*[ktpf]' \
      | gawk \
          '($3 ~ /[ktpf]/) { \
            w=$3; gsub(/eeee/,"EE",w); gsub(/eee/,"eE",w); gsub(/ee/,"E",w); \
            gsub(/[aoy]*[ic]*[ktpf][h]*([e]|)/,"",w); \
            if (w == "") { w = "_"; } gsub(/[E]/,"ee",w); \
            print $1,w; \
          }' \
      | combine-counts \
      | sort -b +0 -1nr +1 -2 \
      | compute-freqs \
      > .gal-1.frq
      
    foreach k ( 0 1 ) 
      cat .gal-${k}.frq \
        | gawk \
          ' /./ { \
              w=$3; gsub(/[_]/,"",w); print $1,length(w); \
          } ' \
        | combine-counts \
        | sort -b +1 -2n \
        | compute-freqs \
        > .ngal-${k}.frq
    end
    
    plot-word-lengths-by-gallows \
        .ngal-{0,1}.frq \
      > .ngal.gif


      | gawk \
          ' /./{ \
              w = $4; gsub(/[_]/,"",w); gsub(/[{][}]/,"",w); gsub(/[aoy]/,"o",w); \
              gsub(/[{][^{}]*[ktpf][^{}]*[}]/,"#",w); \
              gsub(/[{][cs]h[^{}]*[}]/,"m",w); \
              gsub(/[{]ee[^{}]*[}]/,"m",w); \
              gsub(/[{][^{}]*[}]/,"x",w); \
              s=w; gsub(/[^#]/,"",s); k = length(s); \
              printf "%02d %02d %5d %s\n", k, length(w), $1, ( "(" w ")" ); \
            } ' \
      | sort -b +0 -1n +1 -2n +3 -4 \
      | gawk '/./{print $3,$4;}' \
      | combine-counts \
      | compute-freqs \
      | sort -b +0 -1nr +2 -3 \
      > .gal.distr
=== STOPPED HERE ===================================================================

  Tabulating words by length, number of gallows, and position 
  of gallows:

    cat prob/obs/txt.n/word.frq \
      | factor-field-OK -v inField=3 -v outField=4 \
      | gawk \
          ' /./{ \
              w = $4; gsub(/[_]/,"",w); gsub(/[{][}]/,"",w); gsub(/[aoy]/,"o",w); \
              gsub(/[{][^{}]*[ktpf][^{}]*[}]/,"#",w); \
              gsub(/[{][cs]h[^{}]*[}]/,"m",w); \
              gsub(/[{]ee[^{}]*[}]/,"m",w); \
              gsub(/[{][^{}]*[}]/,"x",w); \
              s=w; gsub(/[^#]/,"",s); k = length(s); \
              printf "%02d %02d %5d %s\n", k, length(w), $1, ( "(" w ")" ); \
            } ' \
      | sort -b +0 -1n +1 -2n +3 -4 \
      | gawk '/./{print $3,$4;}' \
      | combine-counts \
      | compute-freqs \
      | sort -b +0 -1nr +2 -3 \
      > .gal.distr

  Tabulating words by length and number of gallows
  (irrespective of position):

    cat .gal.distr \
      | gawk '/./{ \
            w = $3; gsub(/[()]/,"",w); \
            gsub(/[aoy][#]/,"#",w); r=length(w); \
            gsub(/[a-z]/,"",w); g=length(w); \
            printf "%5d %d-%02d\n", $1, g, r-g; \
          }' \
      | combine-counts \
      | sort -b +1 -2 +0 -1 \
      | compute-cum-freqs \
      > .ngal.distr
      
    foreach k ( 0 1 2 3 )
      cat .ngal.distr \
        | gawk -v k=${k} \
            '(substr($5,1,1) == k){ print $1,substr($5,3);}' \
        | sort -b +1 -2n +0 -1 \
        | compute-cum-freqs \
        > .ngal-${k}.distr
    end

    --------------------------------------
          3 0.0001       3 0.0001 _@@@_
        323 0.0092     326 0.0093 _@@_
      17439 0.4964   17765 0.5057 _@_
      17363 0.4943   35128 1.0000 __
    --------------------------------------

  Note the remarkable balance between words with 0 and 1 
  gallows.

TABULATING CORE-MANTLE PATTERNS  

  Tabulating words with/without core/mantle. The following results are
  obtained when the gallows "cth" are counted as pure core, not core +
  mantle.

    cat prob/obs/txt.n/word.frq \
      | tabulate-core-mantle-bits \
      > .cmbits.frq
  
     ----------------------  
       8772 0.25205 --
       8591 0.24685 -e
       9016 0.25907 k-
       8423 0.24203 ke
     ----------------------  

  Tabulating core-mantle patterns:

    cat prob/obs/txt.n/word.frq \
      |  tabulate-core-mantle-patterns \
      | sort -b +3 -4nr +0 -3 \
      > .cm.sizes

TABULATING NUMBER OF DEALERS PER WORD

  Tabulate separately words with and without "q":
  
    cat prob/obs/txt.n/word.frq \
      | gawk \
          ' /./{ \
              w=$3; gsub(/[cs]h/,"e",w); gsub(/[ci][ktpf]h/,"k",w); \
              gsub(/[ktpfceh]+/,"-",w); \
              gsub(/[i]*[dlrsnxmg]/, "r", w); \
              gsub(/[aoy]/,"",w); if (w == "") { w = ".";} \
              if(w \!~ /[-].*[-]/){s+=$1;p+=$2;print $1,w;} \
            } ' \
      | combine-counts | compute-freqs \
      | gawk \
          ' /./{ \
              w=$3; gsub(/[^r]/,"",w); hasq=match($3,/q/) \
              printf "%d %02d %02d %s\n", hasq, length(w),length($3), $0; \
            } ' \
      | sort -b +0 -1 +1 -2 +2 -3 +5 -6 \
      > .dealer.distr

TABULATING CORES

    cat prob/obs/txt.n/word.frq \
      |  tabulate-core-patterns \
      > .core.sizes

TABULATING CIRCLES

    cat prob/obs/txt.n/word.frq \
      | tabulate-circles-in-context \
      | sort -b +0 -1 +4 -5 \
      > .circles.tbl