Hacking at the Voynich manuscript - Side notes
050 Public text colorization service

Last edited on 1999-07-28 01:58:40 by stolfi

INTRODUCTION

  This note records the development and installation
  of an HTTP-accessible service that produces
  custom-colorized versions of the VMS pages.

  Ideally the client specifies 

    - the f-number of a VMS page,

    - a transcriber's tag (including "A" or "Y"),

    - a list of EVA words or regular expressions,

    - (optionally) colors for the same.

  The tool then outputs an HTML-formatted copy of that 
  page, where the selected words are highlighted with colors.
  Optionally, similar words are highlighted too.

CREATING THE PAGE DATABASE

  Creating the source pages, with all versions (including
  consensus and majority), with some of the comments:

    mkdir evt-pages
    /bin/rm -f evt-pages/f*

    cat inter-cm.evt \
      | split-pages -v dir="evt-pages"

  Testing the tool:

    test-colorize-page

LANGUAGE-BASED COLOR MAP

  Let's compute the most characteristic words in each language, 
  and develop a color map from them.
  
  First, let's make a table mapping f-number to language:
  
      cat L16+H-eva/INDEX \
        | gawk -v FS=':' \
            '{ s=$2; gsub (/[.].*$/, "", s); print s, $4; }' \
        | sort | uniq \
        > fnum-to-lang.tbl

  Now let's get the word frequencies per page (majority transcription),
  from Notes/019:

    ln -s ../019/RAW/wfreqs/pages page-wfreqs
    
    ls -l page-wfreqs/all.names
    ls -ld page-wfreqs/f33r.frq
    
  Compute word frequencies per language:
    
    foreach lang ( A B )
      echo '=== '${lang}' ==='
      set fnums = ( \
        ` cat fnum-to-lang.tbl | gawk '($2=="'${lang}'"){print ($1 ".frq");}'` \
      )
      ( cd page-wfreqs && cat ${fnums} ) \
        | gawk '/./{print $1, $3;}' \
        | combine-counts \
        | sort +1 -2 \
        | compute-freqs \
        > lang-${lang}.frq
    end
  
  Compute a language index "x" that is 1 for words strongly
  characteristic of language A, 0 for words strongly characteristic of
  language B, and near 0.5 for words that are either rare or equally
  frequent in both languages:
  
    /n/gnu/bin/join \
        -j 3 -a 1 -a 2 \
        -e 0 \
        -o1.1,1.2,2.1,2.2,0 \
        lang-{A,B}.frq \
      | compute-lang-index \
      | sort -b +4 -5gr +5 -6 \
      > langs.fr2

  Now select the 300 words of maximum specificity from each language
  and map them to colors that depend on their language specificity
  index "x". It seems preferable to exclude single-letter words, which
  have their frequencies messed up by the "key-like sequences".
    
    /bin/rm -f .wds
    foreach sel ( head tail )
      cat langs.fr2 \
        | egrep -v '[?]' \
        | gawk '(length($6) > 1) {print $6, $5; }' \
        | sort -b +1 -2gr \
        | ${sel} -300 \
        >> .wds
    end
    
    set cdic = "color-tables/langs.cdic"
    /bin/rm -f ${cdic}
    echo '# Most language-specific words' >> ${cdic}
    echo '# Orange = language A, Blue = language B' >> ${cdic}
    cat .wds \
      | sort -b +1 -2gr \
      | color-from-index \
      >> ${cdic}
      
PAGE SPECIFICITY COLOR MAP

  Another interesting idea is to color each word according to its
  page specificity, namely the number of bits of information
  that the word gives about the page's f-number.
  
    /bin/rm -f .wps
    foreach fnum ( ` cat fnum-to-lang.tbl | gawk '/./{print $1;}'` )
      echo $fnum
      cat page-wfreqs/${fnum}.frq \
        | gawk -v fnum=${fnum} '/./{print $1, $3, fnum;}' \
        | sort -b +1 -2 +0 -1nr \
        >> .wps
    end
    
    cat .wps \
      | compute-page-spec-index \
      | sort -b +1 -2n +2 -3gr \
      > .bar

    cat .bar \
      | sort +2 -3gr \
      | gawk '(FNR==1){m = $3;} ($3 >= 0.03){ print $4, sqrt($3/m); }' \
      | color-from-index \
      > color-tables/lumpy.cdic

    cat .bar \
      | sort +2 -3gr \
      | gawk '($2<75) {next;} //{ printf "%s %5.3f\n",$4,s; s+=0.12; if(s>=1){s -=1;}}' \
      | color-from-hue \
      > color-tables/smooth.cdic

LOW-FREQUENCY WORDS

  At the other end of the spectrum, let's make a table that highlights
  words that occur only a few times:
  
    cat inter-cm.evt \
      | egrep '^<[^<>]*;A>' \
      | words-from-evt \
      | sed -e 's/[?][?]*/?/g' \
      | sort | uniq -c | expand \
      | sort -b +0 -1nr \
      > .word-counts
      
    set cdic = "color-tables/rare-words.cdic"
    /bin/rm ${cdic}
    set fmax = 3
    echo "# Words that occur at most ${fmax} times" >> $cdic
    echo "# Red = once, green = $fmax times" >> $cdic
    cat .word-counts \
      | egrep -v '[?&*%]' \
      | gawk -v fmax=${fmax} \
          ' ($1 <= fmax) { print $2, ($1 - 1)/(3*fmax); } ' \
      | color-from-hue \
      >> $cdic

COLORING THE LABELS

  Yet another idea is to color occurrences of the labels according
  to the section where they were defined.
  
  Let's make a list of all units that contain labels:
  
    cat L16+H-eva/INDEX \
      | gawk -v FS=':' \
          '($6=="labels"){printf "<%s.\n", $2;}' \
      > .label-units
      
  Let's extract the text of those units (all versions,
  including majority):
  
    cat inter-cm.evt \
      | egrep '^<.*;' \
      | fgrep -f .label-units \
      > .label-text.evt
      
  Then extract the words, keeping the fnums:
      
    cat .label-text.evt \
      | extract-words-and-fnums \
      | sort -b +1 -2 +0 -1 \
      | uniq \
      > .fnum-labels
      
  Let's map the fnums to sections:
  
    cat .fnum-labels \
      | egrep -v '[?*]' \
      | map-field \
          -v inField=1 \
          -v outField=2 \
          -v table=fnum-to-section.tbl \
      | gawk '/./{print $2, $3;}' \
      | sort -b +1 -2 +0 -1 | uniq \
      > .section-labels 
      
  Now combine multiple definitions of the same label:
      
    cat .section-labels \
      | gawk \
          ' (FNR==1){os=$1; ow=$2; next} \
            /./{s=$1;w=$2; if(w!=ow){print os,ow; os=s;ow=w;} else {os=(os "+" $1);}} \
            END{ if(ow != "") {print os,ow;} } \
          ' \
      > .secset-labels
      
  Find the most frequent combinations of sections that occur:
  
    cat .secset-labels \
      | gawk '/./{print $1;}' \
      | sort | uniq -c | expand \
      | sort +0 -1nr +1 -2 \
      > .secset-freqs
      
      371 cos
      361 zod
      276 pha
      123 bio
       33 cos+zod
       17 cos+pha
       17 pha+zod
       14 cos+pha+zod
       13 bio+cos
       12 bio+cos+zod
       10 bio+cos+pha+zod
       10 bio+pha
        7 bio+pha+zod
        7 bio+zod
        4 bio+cos+pha
        1 hea
      
  Manually make a table (secset-to-color.tbl) mapping each secset to a color.
  
  Get a list of the common words:
  
    cat .word-counts \
      | gawk '($1 >= 30) { print $2;}' \
      | sort \
      > .common-words
      
  Delete labels that are common words, and labels with 
  one or two letters, then map secset to color:
  
    cat .secset-labels \
      | egrep -v '[?]' \
      | fgrep -v -w -f .common-words \
      | map-field \
          -v inField=1 \
          -v outField=3 \
          -v table=secset-to-color.tbl \
      | sort -b +2 -3 +1 -2 \
      | gawk \
          ' (FNR == 1){printf "# Uncommon labels\n"} \
            ($1 != os){printf "# Labels defined in %s\n", $1; os=$1;} \
            (length($2) > 2){print $2, $3;} \
          ' \
      > color-tables/labels.cdic

DAIIN COLOCATES

  Yet another idea: color all words that occur adjacent to "daiin"
  (and, separately, all words that occur adjacent to "dain").
  
    foreach wd ( dain daiin )
      cat inter-cm.evt \
        | egrep '^<.[^<>]*;A>' \
        | words-from-evt \
        | gawk -v wd="${wd}" \
            ' (prevdain == 1) { print $1; } \
              //{ prevdain = 0; } \
              ($1 == wd) { print prev; prevdain = 1; } \
              //{prev = $1;} \
            ' \
        | egrep -v '[?]' \
        | sort | uniq -c | expand \
        | compute-freqs \
        > .${wd}-coloc-freqs
    end
      
    cat .word-counts \
      | sort -b +1 -2 \
      | compute-freqs \
      > .word-freqs
      
   foreach wd ( dain daiin )
     /n/gnu/bin/join  \
         -j 3 -o 1.1,2.1,0 \
         .${wd}-coloc-freqs .word-freqs \
       | gawk '/./{printf "%7d %7d %7.5f %s\n", $1, $2, ($1+1)/($2+36), $3;}' \
       | sort -b +2 -3gr \
       > .${wd}-colocs
   end
      
  Convert to colors according to frequency:
  
    foreach wd ( dain daiin )
      set cdic = color-tables/${wd}-colocates.cdic

      /bin/rm -f ${cdic}
      echo '# colocates of '"${wd}" >> ${cdic}
      echo "${wd}"' 00aaff' >>  ${cdic}
      echo '#' >> ${cdic}
      cat .${wd}-colocs \
        | egrep -v '[?*]' \
        | gawk -v wd="${wd}" \
            ' (FNR==1) {m = $3;} \
              ($4 \!= wd){ printf "%s %7.5f\n", $4, (1 + $3/m)/2; } \
            ' \
        | color-from-index \
        >> ${cdic}
    end