Hacking at the Voynich manuscript - Side notes
061 Comparing the Recipes section to the Shennong Bencao

Last edited on 2006-11-27 23:00:38 by stolfi

INTRODUCTION

  This note compares the Recipes (Starred Paragraphs) section
  of the VMS to the Chinese medical classic Shennong Bencao Jing.
  
  Most of this notebook was done around 02/2002. It was then expanded
  and redone in 05/2004, after fixing many transcription errors in the
  interlinear file.

SETTING UP THE ENVIRONMENT

  Links:
  
    # Make the directory two levels up reachable under the name work.
    ln -s ../.. work

    # Link compute-freqs from there into the current directory.
    ln -s work/compute-freqs
  
THE DATA

  Raw data files, with comments prefixed with "#", recipe numbers
  in the form S-NNN prefixed with "##", and each kanji surrounded 
  by ASCII spaces:
  
    # Symlink the two encodings of the raw text, Big5 and JIS going by the
    # file extensions, into the current directory.
    ln -s ~/IMPORT/texts/chinese/ShennongBencao/text.big5 bencao-raw.big5
    ln -s ~/IMPORT/texts/chinese/ShennongBencao/text.jis  bencao-raw.jis
  
  Data files without punctuation:

    # Strip punctuation from the Big5 text.  Comment lines pass through
    # unchanged.  On other lines, isolated brace tokens collapse to one
    # space, and two kinds of two-byte marks are deleted along with the
    # blanks before them.  The mark bytes are written as octal escapes,
    # as in the JIS command below, so the command does not depend on the
    # encoding of this notebook.
    cat bencao-raw.big5 \
      | gawk \
          ' /^#/ {print; next;} \
            // { \
              gsub(/[ ]+[{}][ ]+/, " ", $0); \
              gsub(/[ ]+[\241][\264]/, "", $0); \
              gsub(/[ ]+[\241][D]/, "", $0); \
              print; \
            } ' \
      > bencao.big5
      
    # Same clean-up for the JIS text, with the JIS byte values of the marks.
    cat bencao-raw.jis \
      | gawk \
          ' /^#/ {print; next;} \
            // { \
              gsub(/[ ]+[{}][ ]+/, " ", $0); \
              gsub(/[ ]+[\201][\234]/, "", $0); \
              gsub(/[ ]+[\201][D]/, "", $0); \
              print; \
            } ' \
      > bencao.jis
      
    # Line, word and byte counts of the raw and cleaned files and of the
    # Voynichese text.
    dicio-wc bencao{-raw,}.{big5,jis} vstars.eva

      lines   words     bytes file        
    ------- ------- --------- ------------
       2008   17532     57611 bencao-raw.big5
       1510   19003     70177 bencao-raw.jis
       2008   13705     46183 bencao.big5
       1510   15229     57427 bencao.jis
       1742   13642     86751 vstars.eva
       
  Extracted the Voynichese "stars" section from the Majority version,
  reformatted to be comparable to the Bencao (line numbers as
  NNNV-U-LL, recipe numbers as "## S-NNN", all words surrounded by
  ASCII space).  Fixed many errors by hand, against KHE's images
  (also in the interlinear file).
  
BASIC STATISTICS

  Checking whether each VMS page has been split into the correct number
  of recipes:
  
    # Write the recipes-per-page tally of the Voynichese text, then compare
    # it with true.rpp, presumably the hand-checked reference.  An empty
    # diff means the two agree.
    count-recipes-per-page < vstars.eva > vstars.rpp
    diff true.rpp vstars.rpp
  
      total 328 recipes

  Note that the total has changed. This is because, during the 05/2004 round of edits,
  some long recipes were split at paragraph breaks, even though there
  were no stars there.  This is not too unreasonable, because the stars seem 
  to have been placed without much care, as if the scribe did not understand 
  that they were associated with the paragraphs.
      
  Basic statistics - total tokens, words, recipes:
  
    # For each text, write the token and word count table to STEM.twct,
    # then write to STEM.twsr a copy ordered by decreasing third column,
    # ties broken by increasing first column, without the rows whose
    # first column is 000.
    # The sort keys use the -k form, because the old +POS -POS form is
    # rejected by current versions of sort.  Likewise grep -E replaces
    # the obsolescent egrep.
    foreach f ( bencao.big5 vstars.eva )
      printf "\n%-10s" "${f:r}:"
      cat $f \
        | print-tk-wd-counts \
        > ${f:r}.twct
      cat ${f:r}.twct \
        | sort -b -k3,3nr -k1,1n \
        | grep -E -v '^000 ' \
        > ${f:r}.twsr
    end

    bencao:   total 357 recipes, 12826 tokens (   0 bad), 35.93 tokens/recipe, 1113 good words
    vstars:   total 328 recipes, 10491 tokens (  38 bad), 31.98 tokens/recipe, 2996 good words

   Note that these counts have changed since 02/2002. They used to be
    
    vstars: total 323 recipes, 10542 tokens,  ( 595 bad), 32.64 tokens/recipe, 2767 good words

   During the 05/2004 round of edits, many tokens became joined with
   their neighbors, because the spaces were entered as faithfully as
   possible. However, if we believe the word structure paradigm, then
   many of those joined words should have been kept separate. Also
   note that over 550 "bad" tokens were fixed by those edits.
      
RECIPE LENGTH HISTOGRAMS

  Plotting the recipe length histograms:
  
    # Each tw item packs a count kind and a column number.  The :r modifier
    # yields the part before the dot, tk or wd, used in file names.  The :e
    # modifier yields the part after the dot, 3 or 4, which is the column
    # of the .twct file that gets extracted.
    foreach tw ( tk.3 wd.4 )
      foreach f ( bencao vstars )
        printf "\n%s (%s): " "${f}" "${tw:r}"
        # Extract the chosen column from every non-empty line and feed it
        # to the histogram tool with quantum=5.
        cat ${f}.twct \
          | gawk -v fld="${tw:e}" '/./{ print $(fld); }' \
          | compute-tk-wd-histogram -v quantum=5 \
          > ${f}.${tw:r}h
      end
      # One plot per count kind, with both texts in the same figure.
      # NOTE(review): the trailing 1 and 2 are presumably curve styles;
      # confirm against plot-twhi.
      foreach fmt ( png )
        plot-twhi -format ${fmt} \
          bencao.${tw:r}h Bencao 1 \
          vstars.${tw:r}h Voynich 2 \
        > recipe-${tw:r}-hist.${fmt}
      end
    end

RECIPE LENGTH PLOTS

  Plotting the recipe lengths as function of position in text:
  
    # Each f item packs a file stem and a plot label.  The :r modifier
    # gives the stem, bencao or vstars, and the :e modifier gives the
    # label, Bencao or Voynich.
    # NOTE(review): the numeric arguments after the label are presumably
    # column, style and scale; confirm against plot-recipe-attr.
    foreach fmt ( png )
      foreach f ( bencao.Bencao vstars.Voynich )
        plot-recipe-attr \
            -format ${fmt} \
            ${f:r}.twct "${f:e} (tk)" 3 1  1.0 \
          > ${f:r}-tk-counts.${fmt}
      end
    end

  Ditto, smoothed:

    # For each window width and each text, filter the first three columns
    # of the .twct table twice, once with avg=1 and once with dif=1, and
    # then plot the two filtered series together.
    # As before, :r of an f item is the file stem and :e is the plot label.
    foreach width ( 09 )
      foreach fmt ( png )
        foreach f ( bencao.Bencao vstars.Voynich )
          foreach type ( avg dif )
            # The type name doubles as the name of the filter switch and
            # as part of the output file name.
            cat ${f:r}.twct \
              | gawk '/./{ print $1, $2, $3; }' \
              | filter-recipe-data -v ${type}=1 -v width=${width} \
              > ${f:r}-${type}${width}.tct
          end
          # NOTE(review): the last argument differs between the two curves,
          # 1.0 versus 60.0, presumably a vertical scale; confirm against
          # plot-recipe-attr.
          plot-recipe-attr \
              -format ${fmt} \
              ${f:r}-avg${width}.tct "${f:e} avg${width}" 3 1  1.0 \
              ${f:r}-dif${width}.tct "${f:e} dif${width}" 3 2 60.0 \
            > ${f:r}-tk-counts-dif${width}.${fmt}
        end
      end
    end

COINCIDENCE IMAGES

  Computing the coincidence image:
  
    # Each et item has the form absErr/relErr/type.  The :h and :t
    # modifiers split it at the last slash: type receives the tail and
    # err receives the head, and err is then split the same way into the
    # values passed as absErr and relErr.
    foreach width ( 09 )
      foreach et ( 0.5/0.05/avg 0.01/0.01/dif )
        set err = "${et:h}"
        set type = "${et:t}"
        # Compare column 3 of the Bencao series, as x, with column 3 of
        # the Voynich series, as y.  The result is normalized, reduced to
        # 255 grey levels, saved as a PGM file and then shown on screen.
        compute-coincidence-image \
            -v absErr=${err:h} -v relErr=${err:t} \
            -v xFile=bencao-${type}${width}.tct -v xField=3 \
            -v yFile=vstars-${type}${width}.tct -v yField=3 \
          | pgmnorm | pnmdepth 255 \
          > recipe-tk-counts-${type}${width}.pgm
        display recipe-tk-counts-${type}${width}.pgm
      end
    end
    
INTERESTING WORDS

  Word frequency tables:
  
    # For each text: drop comment and blank lines, blank out the leading
    # line locator and the punctuation tokens, put one token per line,
    # count the distinct tokens, map each token through the three lookup
    # tables, order by decreasing count and then by token, and append
    # frequencies.  The first 100 rows are shown.
    # NOTE(review): forgiving=1 presumably makes map-field copy the key
    # when it is not in the table; confirm in map-field.
    # The sort keys use the -k form, because the old +POS -POS form is
    # rejected by current versions of sort.  Likewise grep -E replaces
    # the obsolescent egrep.
    foreach f ( bencao.big5 vstars.eva )
      echo " "; echo "=== ${f:r} ==="
      cat $f \
        | gawk \
            ' /^ *([#]|$)/{ next; } \
              //{ \
                gsub(/^[-.0-9a-zA-Z]*/, " ", $0); \
                gsub(/[ ][-={}]/, " ", $0); \
                print; \
              } ' \
        | tr ' ' '\012' \
        | grep -E '.' \
        | sort | uniq -c | expand \
        | map-field \
            -v table=big5-to-html.tbl \
            -v inField=2 -v outField=3 -v forgiving=1 \
        | map-field \
            -v table=html-to-py.tbl \
            -v inField=3 -v outField=4 -v forgiving=1 \
        | map-field \
            -v table=html-to-meaning.tbl \
            -v inField=3 -v outField=5 -v forgiving=1 \
        | gawk '//{ print $1, ($3 ($3==$4 ? "" : ("=" $4)) ($5==$3 ? "" : ("=" $5))); }' \
        | sort -b -k1,1nr -k2,2 \
        | compute-freqs \
        > ${f:r}.wfr
      head -100 ${f:r}.wfr
    end
        
    === bencao ===
    
        362 0.02823 &#29983;=(sheng1,5:sheng5)
        358 0.02791 &#21619;=(wei4)
        352 0.02745 &#27835;=(zhi4)
        313 0.02441 &#21517;=(ming2)
        308 0.02402 &#19968;=(yi1)
        299 0.02331 &#27683;=(qi4)
        293 0.02285 &#23506;=(han2)
        245 0.01910 &#35895;=(gu3,yu4)
        198 0.01544 &#29105;=(re4)
        168 0.01310 &#24179;=(ping2)
        161 0.01255 &#36523;=(shen1,juan1)
        154 0.01201 &#19981;=(bu4,5:bu5,bu2)
        149 0.01162 &#20037;=(jiu3)
        144 0.01123 &#20013;=(zhong1,zhong4)
        144 0.01123 &#24029;=(chuan1)
        143 0.01115 &#26381;=(fu2,fu4,5:fu5)
        136 0.01060 &#33510;=(ku3)
        136 0.01060 &#36629;=(qing1)
        132 0.01029 &#23665;=(shan1)
        129 0.01006 &#28331;=(wen1)

    === vstars ===
    
        189 0.01802 aiin=aiin
        189 0.01802 chedy=chedy
        155 0.01477 qokeey=qokeey
        146 0.01392 ar=ar
        134 0.01277 qokeedy=qokeedy
        131 0.01249 al=al
        127 0.01211 daiin=daiin
        121 0.01153 chey=chey
        119 0.01134 qokaiin=qokaiin
        115 0.01096 shedy=shedy
         96 0.00915 okeey=okeey
         96 0.00915 ol=ol
         95 0.00906 okaiin=okaiin
         89 0.00848 qokain=qokain
         76 0.00724 otaiin=otaiin
         75 0.00715 cheey=cheey
         70 0.00667 shey=shey
         69 0.00658 okain=okain
         63 0.00601 chol=chol
         63 0.00601 oteey=oteey


  Extract the list of the k-th word of each recipe, and the distribution of those words:
  
    # For k from 1 to 4 and for each text, collect the k-th word of the
    # first text line that follows each recipe header, one word per line,
    # into STEM-k.tks.
    foreach k ( 1 2 3 4 )
      foreach f ( bencao.big5 vstars.eva )
        printf "\n\n=== %s[%s] ===\n\n" "${f:r}" "$k"
        cat $f \
          | gawk -v which=${k} \
              ' /^[#][#]/{ fst = 1; next; } \
                /^ *([#]|$)/{ next; } \
                (fst){ \
                  gsub(/^[-.0-9a-zA-Z]*/, " ", $0); \
                  gsub(/[ ][-={}]/, " ", $0); \
                  print $(which); fst = 0; \
                } ' \
          | tr ' ' '\012' \
          | grep -E '.' \
          > ${f:r}-${k}.tks
        # Tabulate those words the same way as the word frequency tables
        # command earlier in this note.  Column 5 is looked up from column
        # 3, so it is compared with column 3, as in that command, and not
        # with column 4.  This assumes forgiving=1 makes map-field copy the
        # key on a failed lookup; confirm in map-field.
        # The sort keys use the -k form, because the old +POS -POS form is
        # rejected by current versions of sort.
        cat ${f:r}-${k}.tks \
          | sort | uniq -c | expand \
          | map-field \
              -v table=big5-to-html.tbl \
              -v inField=2 -v outField=3 \
              -v forgiving=1 \
          | map-field \
              -v table=html-to-py.tbl \
              -v inField=3 -v outField=4 \
              -v forgiving=1 \
          | map-field \
              -v table=html-to-meaning.tbl \
              -v inField=3 -v outField=5 -v forgiving=1 \
          | gawk '//{ print $1, ($3 ($3==$4 ? "" : ("=" $4)) ($5==$3 ? "" : ("=" $5))); }' \
          | sort -b -k1,1nr -k2,2 \
          | compute-freqs \
          > ${f:r}-${k}.wfr
        head -5 ${f:r}-${k}.wfr
      end
    end

      === bencao[1] ===

           19 0.05322 &#30333;=(bai2,5:bai5)
           15 0.04202 &#30707;=(shi2,dan4)
            6 0.01681 &#32043;=(zi3)
            5 0.01401 &#22823;=(da4,dai4)
            5 0.01401 &#27700;=(shui3)

      === vstars[1] ===

            6 0.01829 daiin=daiin
            5 0.01524 polaiin=polaiin
            5 0.01524 tchedy=tchedy
            4 0.01220 pchedal=pchedal
            4 0.01220 pcheor=pcheor


      === bencao[2] ===

           15 0.04202 &#23526;=(shi2)
           11 0.03081 &#30707;=(shi2,dan4)
            7 0.01961 &#33609;=(cao3)
            6 0.01681 &#21443;=(can1,cen1,shen1,san1)
            6 0.01681 &#33437;=(zhi1)

      === vstars[2] ===

            7 0.02134 ar=ar
            6 0.01829 shedy=shedy
            5 0.01524 chey=chey
            5 0.01524 qokaiin=qokaiin
            4 0.01220 cheo=cheo


      === bencao[3] ===

          169 0.47339 &#19968;=(yi1)
          111 0.31092 &#21619;=(wei4)
           13 0.03641 &#23376;=(zi5,zi3,zi2)
            3 0.00840 &#23526;=(shi2)
            3 0.00840 &#33609;=(cao3)

      === vstars[3] ===

            9 0.02744 shedy=shedy
            7 0.02134 qokain=qokain
            5 0.01524 chedy=chedy
            5 0.01524 okain=okain
            5 0.01524 qokaiin=qokaiin


      === bencao[4] ===

          169 0.47339 &#21517;=(ming2)
           44 0.12325 &#33510;=(ku3)
           36 0.10084 &#19968;=(yi1)
           32 0.08964 &#36763;=(xin1)
           26 0.07283 &#21619;=(wei4)

      === vstars[4] ===

            9 0.02744 qokeey=qokeey
            7 0.02134 shedy=shedy
            6 0.01829 qokeedy=qokeedy
            5 0.01524 oteedy=oteedy
            4 0.01220 okeey=okeey

REPEATED WORDS

  Checking for repeats
  
    # For each text, save the output of list-repeats in STEM.reps and print
    # its line count.  Then count the distinct values of its second column,
    # map them through the HTML and pinyin tables, and order the result by
    # decreasing count and then by word.  The top three rows are shown.
    # The sort keys use the -k form, because the old +POS -POS form is
    # rejected by current versions of sort.
    foreach f ( bencao.big5 vstars.eva )
      printf "\n%s: " "${f:r}"
      cat ${f} \
        | list-repeats \
        > ${f:r}.reps
      cat ${f:r}.reps | wc -l 
      cat ${f:r}.reps \
        | gawk '/./{ print $2; }' \
        | sort | uniq -c | expand \
        | map-field \
            -v table=big5-to-html.tbl \
            -v inField=2 -v outField=3 \
            -v forgiving=1 \
        | map-field \
            -v table=html-to-py.tbl \
            -v inField=3 -v outField=4 \
            -v forgiving=1 \
        | gawk '//{ print $1, ($3 "=" $4); }' \
        | sort -b -k1,1nr -k2,2 \
        > ${f:r}.rtop
      head -3 ${f:r}.rtop
    end
    
      bencao:      41
      8 &#27927;=(xi3,xian3)
      6 &#34880;=(xue4,xie3)
      5 &#23506;=(han2)

      vstars:      81
      10 qokeedy=qokeedy
      10 qokeey=qokeey
      7 ar=ar

  Build word-paragraph occurrence map.
  
    # For each text, write STEM.wpm with one row per distinct word: the
    # word, its total count, and then one mark per recipe in text order,
    # either the count of the word in that recipe or a dot when absent.
    # Rows are ordered by decreasing total count and then by word.
    #
    # The sed step turns each recipe header into a line starting with an
    # at-sign, removes the remaining comments, blanks out the leading line
    # locator and deletes empty lines.  The tr step puts one token per line.
    #
    # NOTE(review): a count of 10 or more prints as two characters, so the
    # marks of that row no longer line up with the recipe positions.
    # The sort keys use the -k form, because the old +POS -POS form is
    # rejected by current versions of sort.
    foreach f ( bencao.big5 vstars.eva )
      cat ${f} \
        | sed \
            -e 's/^[#][#] */@/' \
            -e 's/[#].*$//' \
            -e 's/^[0-9][-A-Za-z0-9]*[ ]/ /' \
            -e '/^[ ]*$/d' \
        | tr ' ' '\012' \
        | gawk \
            ' BEGIN{ \
                split("", map); \
                split("", wdct); \
                split ("", pg); npg = 0; p = "???"; \
              } \
              /^[@]/ { \
                p = $1; gsub(/[@]/, "", p); \
                pg[npg] = p; npg++; next; \
              } \
              /./ { \
                w = $1; \
                if (! (w in wdct)) \
                  { wdct[w] = 0; } \
                wdct[w]++; map[p,w]++; \
              } \
              END { \
                for (w in wdct) \
                  { printf "%-20s %5d ", w, wdct[w]; \
                    for (i = 0; i < npg; i++) \
                      { p = pg[i]; \
                        if ((p,w) in map) \
                          { printf "%d", map[p,w]; } \
                        else \
                          { printf "."; } \
                      } \
                    printf "\n"; \
                  } \
              } \
            ' \
        | sort -b -k2,2nr -k1,1 \
        > ${f:r}.wpm
    end