Hacking at the Voynich manuscript - Side notes
059 Spatial distribution of gallows and tables

Last edited on 2002-01-05 23:52:22 by stolfi

INTRODUCTION

  We map each word of the text to a simple numeric attribute,
  and study the properties of that sequence.
  
SETTING UP THE ENVIRONMENT

  Scripts
  
    # Link the shared helper scripts from two levels up into this directory.
    # One ln call with several targets and . as destination makes one link
    # per target, each named after the target.
    ln -s \
      ../../compute-cum-cum-freqs \
      ../../compute-cum-freqs \
      ../../combine-counts \
      ../../remove-freqs \
      ../../totalize-fields \
      ../../compute-freqs \
      ../../columnate \
      ../../compare-counts \
      ../../compare-freqs \
      ../../select-units \
      ../../words-from-evt \
      ../../map-field \
      .
    
  Unit-tag-to-text-type map:

    # Link the table from note 019 into the current directory under the same name.
    ln -s ../019/unit-to-type.tbl .
    
  Page-to-subsection map:
  
    # Link the table from note 019 into the current directory under the same name.
    ln -s ../019/fnum-to-subsec.tbl .

  Text in subsections:
  
    # Make the subsecs-m directory of note 045 available here as text-subsecs.
    ln -s ../045/subsecs-m ./text-subsecs

CREATING THE SUBSECTION-TO-FNUMS TABLE

    # Collapse each run of consecutive input lines that share the same
    # second field into one output line: that field followed by the
    # first and last values of the first field in the run, joined by a dash.
    # Presumably field 1 is the page f-number and field 2 the subsection tag.
    cat fnum-to-subsec.tbl \
      | gawk \
          ' function flush_run() { \
              if (cur != "") { print cur, (first "-" last); } \
            } \
            /./ { \
              if ($2 != cur) { flush_run(); first = $1; cur = $2; } \
              last = $1; \
            } \
            END { flush_run(); } \
          ' \
      > subsec-to-fnums.tbl
      
  (Had to edit the resulting table by hand.)

OBTAINING THE WORD STREAMS

    # Output directory for the per-section word files.
    mkdir word-seqs

    # List of stream names: the subsection names plus the two pooled
    # streams lab.n and txt.n.  The redirection now ends the command;
    # the old trailing backslash continued it onto the next line.
    ( cat text-subsecs/all.names ; echo "lab.n"; echo "txt.n" ) \
      > word-seqs/all.names

    # Subsection names only, without the two pooled streams.
    set secs = ( `cat text-subsecs/all.names` )
    echo ${secs}

    # One word file per subsection, built from the units whose type
    # is in the list given to select-units.  Labels are not in that list.
    foreach s ( ${secs} )
      echo ${s}
      cat text-subsecs/${s}.evt \
        | select-units \
            -v types='parags,starred-parags,circular-lines,circular-text,radial-lines,titles' \
            -v table=unit-to-type.tbl \
        | words-from-evt -v showLines=1 > word-seqs/${s}.wds
    end
 
    # Comma-separated subsection list, for use inside a brace pattern.
    set slist = `echo ${secs} | sed -e 's/ /,/g'`

    # txt.n is the concatenation of all per-subsection word files.
    cat word-seqs/{${slist}}.wds > word-seqs/txt.n.wds

    # lab.n holds the words of the label units of all subsections.
    cat text-subsecs/{${slist}}.evt \
      | select-units -v types='labels' -v table=unit-to-type.tbl \
      | words-from-evt -v showLines=1 \
      > word-seqs/lab.n.wds

  Word frequencies:

    # For every stream: drop empty lines, count each distinct word, order
    # by decreasing count and then by word, and feed the result to
    # compute-freqs.  Sort keys use the -k form because the obsolete
    # plus-position form is no longer accepted by current sort programs.
    foreach sec ( ${secs} txt.n lab.n )
      echo "${sec}"
      cat word-seqs/${sec}.wds \
        | grep -E '.' \
        | sort | uniq -c | expand \
        | sort -k1,1nr -k2,2 \
        | compute-freqs \
        > word-seqs/${sec}.wfr
    end


TABULATING ATTRIBUTES

  Tabulation by section:

    # Output directory for all tables and bit streams.
    mkdir stats

    # One attribute table per stream, pooled streams first.
    foreach s ( txt.n lab.n ${secs} )
      echo "${s}"
      cat word-seqs/${s}.wds \
        | tabulate-attributes -f extract-word-attributes.gawk \
        > stats/${s}.tbl
    end

  Global summary of gallows (g) and table-character (m) counts:
        
    # For each of the attributes g and m: gather the matching ct_ lines of
    # every stream table into the scratch file .tmp, replacing the ct_ tag
    # by the stream name, then write a header line followed by the output
    # of tabulate-attribute-totals to the totals file.
    foreach attr ( g m )
      /bin/rm -f .tmp
      set ofile = "stats/all-${attr}.tots"
      foreach s ( ${secs} txt.n lab.n )
        echo "${s} ->> .tmp"
        cat stats/${s}.tbl \
          | egrep "ct_${attr}" \
          | sed -e "s/^ *ct_[a-z]*/${s} /" \
          >> .tmp
      end
      echo ".tmp -> ${ofile}"
      ( echo 'Totals for attribute "'"${attr}"'"' ; cat .tmp | tabulate-attribute-totals ) \
        > ${ofile}
    end
    
GENERATING THE BIT STREAMS

  Generating digit streams that indicate words with gallows
  (-g.bst), words with table characters (-m.bst),
  the "ed" group (-ed.bst), and the "r" attribute (-r.bst):

    # One digit stream per stream and attribute, written to
    # stats as STREAM-ATTR.bst by make-attribute-string.
    foreach s ( txt.n lab.n ${secs} )
      foreach a ( g m ed r )
        echo "${s}-${a}"
        cat word-seqs/${s}.wds \
          | make-attribute-string -v attr=${a} -f extract-word-attributes.gawk \
          > stats/${s}-${a}.bst
      end
    end

  Separating the "ed" words (every other word is replaced by "*")
  and computing their frequencies:
 
   # Keep every word that contains ed, replace every other word by a
   # single star, and pass empty lines through unchanged.
   foreach s ( txt.n lab.n ${secs} )
      echo "${s}"
      cat word-seqs/${s}.wds \
        | gawk '{ w = (/ed/ || /^$/) ? $0 : "*"; print w; }' \
        > word-seqs/${s}-ed.wds
   end
   # Word frequencies of the masked streams: drop empty lines, count each
   # distinct entry, order by decreasing count and then by word, and feed
   # the result to compute-freqs.  Sort keys use the -k form because the
   # obsolete plus-position form is no longer accepted by current sort.
   foreach sec ( txt.n lab.n ${secs} )
      echo "${sec}"
      cat word-seqs/${sec}-ed.wds \
        | grep -E '.' \
        | sort | uniq -c | expand \
        | sort -k1,1nr -k2,2 \
        | compute-freqs \
        > word-seqs/${sec}-ed.wfr
   end
  
TABULATING THE RUN LENGTHS IN BIT SEQUENCES

  Consider only lines that have this minimum length:
  
    # Minimum line length, passed below to extract-runs as min_line_length.
    set minlen = 1
  
  Testing the run extraction scripts:
  
    # Trial run on the gallows stream of txt.n.  First reduce the stream
    # to 0 and 1 digits with no blanks, discarding any line that holds a
    # question mark or a star; the result goes to the scratch file .bar
    cat stats/txt.n-g.bst \
      | tr -d ' ' \
      | tr '23456789' '11111111' \
      | egrep -v '[?*]' \
      > .bar

    # Then feed it to extract-runs, with output in the scratch file .foo
    cat .bar \
      | extract-runs -v sticky_qmarks=0 -v min_line_length=${minlen} \
      > .foo

  Extracting and counting runs by value, length, and section:

    # For every stream and attribute: reduce the digit stream to 0 and 1
    # with no blanks, discard lines holding a question mark or a star,
    # extract the runs, count each distinct run, and order the counts by
    # the run itself.  The sort key uses the -k form because the obsolete
    # plus-position form is no longer accepted by current sort programs.
    foreach sec ( txt.n lab.n ${secs} )
      foreach attr ( g m ed r )
        echo "${sec}-${attr}.rct"
        cat stats/${sec}-${attr}.bst \
          | tr '23456789' '11111111' \
          | tr -d ' ' \
          | grep -E -v '[?*]' \
          | extract-runs \
              -v sticky_qmarks=0 \
              -v min_line_length=${minlen} \
          | sort | uniq -c | expand | sort -b -k2,2 \
          > stats/${sec}-${attr}.rct
      end
    end
    
  Computing the relative frequency of each 0/1 run:

    # Keep only the count lines whose run, in field 2, consists entirely
    # of zeros or entirely of ones, and pass them to compute-freqs.
    foreach s ( txt.n lab.n ${secs} )
      foreach a ( g m ed r )
        echo "${s}-${a}.rfr"
        cat stats/${s}-${a}.rct \
          | gawk '$2 ~ /^(0+|1+)$/' \
          | compute-freqs \
          > stats/${s}-${a}.rfr
      end
    end
    
  Computing the expected run frequencies:
  
    # Run compute-expected-run-stats on the run counts of every stream
    # and attribute, writing one .xfr file for each.
    foreach s ( txt.n lab.n ${secs} )
      foreach a ( g m ed r )
        echo "${s}-${a}.xfr"
        cat stats/${s}-${a}.rct | compute-expected-run-stats > stats/${s}-${a}.xfr
      end
    end
    
  Tabulating the relative frequencies of 0-words and 1-words
  per section:
  
    # One summary table per attribute.  For every stream, words 3, 6, 9
    # and 12 of the s0 and s1 lines of its .xfr file are collected in the
    # scratch file .tmp; map-field then adds a sixth field looked up in
    # subsec-to-fnums.tbl, and gawk lays out the columns.
    # NOTE(review): the four words are presumably the count and fraction
    # of 0-words and of 1-words; confirm against the .xfr format.
    foreach a ( g m ed r )
      set sfile = "stats/summary-${a}.bct"
      echo ${sfile}
      /bin/rm -f .tmp
      foreach s ( txt.n lab.n ${secs} )
        set xfile = stats/${s}-${a}.xfr
        set pr = ( `egrep '^s[01] =' ${xfile}` )
        echo "${s} ${pr[3]} ${pr[6]} ${pr[9]} ${pr[12]}" >> .tmp
      end
      cat .tmp \
        | map-field -v table=subsec-to-fnums.tbl -v inField=1 -v outField=6 \
        | gawk \
            ' BEGIN { fmt = "%-5s  %5d %5.3f  %5d %5.3f  %s\n"; } \
              /./ { printf fmt, $1, $2, $3, $4, $5, $6; } \
            ' \
        > ${sfile}
    end