Hacking at the Voynich manuscript - Side notes
102 Tabulating the most popular words and labels

Last edited on 2012-05-05 15:07:38 by stolfilocal

INTRODUCTION

  Here we tabulate the most popular words and labels, 
  globally and by section.

SETTING UP THE ENVIRONMENT

  Links:

    # Link the tr-stats "dat" and "exp" directories and the voynich "work"
    # directory into the current directory.  With a single argument, ln
    # names each link after the last path component of its target.
    for target in ../tr-stats/dat ../tr-stats/exp ../../../voynich/work; do
      ln -s "${target}"
    done

    # Entries reached through the "work" link, also linked into the
    # current directory under their own names.
    for tool in capitalize-ligatures compute-cum-freqs update-paper-include; do
      ln -s "work/${tool}"
    done
    
  This note uses the "bash" shell.

TABULATING THE MOST POPULAR WORDS

  Make sure that every sample has at least a trivial tex-encoding function:
  
    # Names of the trivial re-encoding filter and of the per-sample TeX
    # re-encoding filter; exported so that child processes can see them.
    export ftri="reencode-words-trivial"
    export ftex="reencode-words-for-tex"

    # Visit every sample directory dat/XXXX/YYY and give it a TeX filter
    # if it does not already have an executable one.
    for smpdir in dat/????/??? ; do
      # Skip the literal pattern left behind by an unmatched glob, and any
      # match that is not a directory, instead of failing inside "cd".
      [[ -d "${smpdir}" ]] || continue
      filter="${smpdir}/${ftex}"
      if [[ ! -x "${filter}" ]]; then
        echo "linking ${filter} -> work/${ftri}" 1>&2
        # The link target is relative, so it is resolved from inside
        # ${smpdir}.  NOTE(review): this presumes each sample directory
        # has its own "work" entry -- confirm.
        ( cd "${smpdir}" && ln -s "work/${ftri}" "${ftex}" )
      fi
    done

  Samples and list sizes (as "sample:nwords") for the formatted lists of most popular words in the TeX report:
  
    # Table of "sample:nwords" entries: a sample path under dat/ and the
    # number of words to list for it.  Built up group by group; the order
    # of the entries is the order in which they are processed.
    sampsizes=()

    # voyn/prs and voyn/lab, each at several list sizes.
    sampsizes+=(
      voyn/prs:80
      voyn/prs:60
      voyn/prs:40
      voyn/prs:24
      voyn/prs:16
      voyn/lab:40
      voyn/lab:24
      voyn/lab:16
    )

    # Samples listed with 60 words each.
    sampsizes+=(
      engl/wow:60
      engl/cul:60
      engl/twp:60
      latn/ptt:60
      latn/nwt:60
      latn/ock:60
      grek/nwt:60
      span/qvi:60
      ital/psp:60
      fran/tal:60
      port/csm:60
      germ/sim:60
      russ/pic:60
      russ/ptt:60
      arab/quf:60
      arab/quv:60
      arab/qud:60
      arab/qph:60
      arab/qcs:60
      hebr/tav:60
      hebr/tad:60
      geez/gok:60
      viet/ptt:60
      viet/nwt:60
      chin/ptt:60
      chin/ptn:60
      chin/red:60
      chin/voa:60
      chip/voa:60
      tibe/vim:60
      tibe/ccv:60
      tibe/pmi:60
      chrc/red:60
      enrc/wow:60
      envt/wow:60
      envg/wow:60
    )

    # Samples listed with 40 words each.
    sampsizes+=(
      voyp/grs:40
      voyp/grm:40
      viep/grs:40
      viep/mky:40
      engl/cpn:40
      engl/wnm:40
      voyn/ini:40
      voyn/mid:40
      voyn/fin:40
    )
  
  Formatted "N top" word lists for TeX report:
  
    # For every "sample:nwords" entry of sampsizes: run get-top-words on
    # the sample's tot.1 data, keep fields 1, 2 and 5 of each non-empty
    # line, pass field 3 of the result through the sample's TeX
    # re-encoding filter, and format the list (4 columns, counts and
    # frequencies requested).  The result is written to dat/, shown on
    # stdout, and handed to update-paper-include together with its exp/ path.
    for entry in "${sampsizes[@]}"; do
      # Split "sample:nwords" at the first colon.
      IFS=: read -r sample nwords <<< "${entry}"
      tfile="${sample}/tot.1/top-${nwords}-words.tex"
      echo "sample = ${sample}  nwords = ${nwords}  tex file = ${tfile}" 1>&2
      get-top-words "${nwords}" "${sample}/tot.1" \
        | gawk '/./{ print $1, $2, $5; }' \
        | "dat/${sample}/reencode-words-for-tex" -v field=3 \
        | tex-format-word-freqs -v ncols=4 -v showCounts=1 -v showFreqs=1 \
        > "dat/${tfile}"
      cat "dat/${tfile}"
      update-paper-include "dat/${tfile}" "exp/${tfile}"
    done
    
  Formatted "24 bottom" and "40 bottom" word lists for TeX report:
  
    # For each list size (24 and 40) and each distinct sample named in
    # sampsizes: run get-bot-words on the sample's tot.1 data, keep fields
    # 1, 2 and 5 of each non-empty line, pass field 3 of the result through
    # the sample's TeX re-encoding filter, and format the list (8 columns,
    # counts and frequencies switched off).  The result is written to dat/,
    # shown on stdout, and handed to update-paper-include with its exp/ path.
    for nwords in 24 40 ; do
      # Space-delimited list of samples already handled for this size.
      # sampsizes names some samples several times (once per "top" size),
      # but the size part is ignored here, so repeats would only rebuild
      # the same file again.
      seen=" "
      for sn in "${sampsizes[@]}" ; do
        sample="${sn%%:*}"
        if [[ "${seen}" == *" ${sample} "* ]]; then
          continue
        fi
        seen="${seen}${sample} "
        tfile="${sample}/tot.1/bot-${nwords}-words.tex"
        echo "sample = ${sample}  nwords = ${nwords}  tex file = ${tfile}" 1>&2
        get-bot-words "${nwords}" "${sample}/tot.1" \
          | gawk '/./{ print $1, $2, $5; }' \
          | "dat/${sample}/reencode-words-for-tex" -v field=3 \
          | tex-format-word-freqs \
              -v ncols=8 \
              -v showCounts=0 -v showFreqs=0 \
          > "dat/${tfile}"
        cat "dat/${tfile}"
        update-paper-include "dat/${tfile}" "exp/${tfile}"
      done
    done
    
  Formatted per-section "top 16", "top 24", and "top 40" word lists:
  
    # Table of "sample/sections" entries: everything before the last "/"
    # is the sample path under dat/, and the last component is a
    # comma-separated list of section tags for that sample.
    # NOTE(review): the voyn/prs entry used to appear twice verbatim, which
    # only made the loop below rebuild the same file; the repeat was
    # dropped.  If the second copy was meant to name another sample
    # (voyn/lab, say), add that entry here.
    # NOTE(review): viet/nwt uses "jhn" where latn/nwt and grek/nwt use
    # "joh" -- confirm that this matches the section tags in the data.
    sampsecs=(
      voyn/prs/pha,hea,heb,cos,str,zod,bio
      engl/cul/pre,her,rec
      latn/ptt/gen,exo,lev,num,deu
      latn/nwt/mat,mrk,luk,joh
      grek/nwt/mat,mrk,luk,joh
      span/qvi/one,two
      russ/ptt/gen,exo,lev,num,deu
      viet/ptt/gen,exo,lev,num,deu
      viet/nwt/mat,mrk,luk,jhn
      chin/ptt/gen,exo,lev,num,deu
      chin/ptn/gen,exo,lev,num,deu
    )
    
    # For each list size (16, 24 and 40) and each "sample/sections" entry
    # of sampsecs: run get-top-words-per-section on the sample and its
    # section tags, keep fields 1, 2, 3 and 6 of each non-empty line, pass
    # field 4 of the result through the sample's TeX re-encoding filter,
    # and format the per-section list (frequencies requested, counts
    # switched off).  The result is written to dat/, shown on stdout, and
    # handed to update-paper-include together with its exp/ path.
    for nwords in 16 24 40; do
      for spec in "${sampsecs[@]}"; do
        echo "${spec}"  1>&2
        # Sample path: everything before the last "/".
        sample="${spec%/*}"
        # Section tags: the last component, split at the commas.
        seclist="${spec##*/}"
        mainsecs=( ${seclist//,/ } )
        tfile="${sample}/top-${nwords}-words-per-section.tex"
        texout="dat/${tfile}"
        echo "sample = ${sample}  mainsecs = (${mainsecs[@]})  nwords = ${nwords}  tex file = ${tfile}" 1>&2
        get-top-words-per-section "${nwords}" "${sample}" "${mainsecs[@]}" \
          | gawk '/./{ print $1, $2, $3, $6; }' \
          | "dat/${sample}/reencode-words-for-tex" -v field=4 \
          | tex-format-word-freqs-by-section -v showCounts=0 -v showFreqs=1 \
          > "${texout}"
        cat "${texout}"
        update-paper-include "${texout}" "exp/${tfile}"
      done
    done

#END