# Last edited on 2025-05-12 19:29:26 by stolfi

105 Extracting images of weirdos and unreadable glyphs


  In this node we try to extract fragments of the page images centered
  on the weirdos, non-basic glyphs, and unreadable glyphs from the 
  interlinear.
  
  For now we use the last publicly released version (at this time, "text16e6.evt").
  
  For now we use only the transcriptions by Takahashi ('H') and Stolfi ('U')
  because the others have many "*" or disagreements with those two.
  
  To start, we try to identify:
  
    * All glyphs that were marked as weirdos with postfixed "{&...}" comments
      by the transcriber.
      
    * All occurrences of non-basic glyph codes, namely [bgjuvwxz] 
      or any character that is not a lowercase letter.
    
    * All substrings that cannot be parsed as ligatures "sh" 
      "ch", "c[ktpf]h[h]?".  This includes incomplete strings 
      with those sequences.      
      
    * All glyphs transcribed as "*".
    
    * All glyphs where the two transcriptions disagree.
    
  For now, we use Rene Zandbergen's noisy scans of old photographs (RZCD).
  We should use the Beinecke images, but I have already started collecting
  coordinates from RZCD.  Hopefully it will not be hard to collect 4 reference
  point pairs from each RZCD image to the Beinecke images, and then 
  we can projectively map the coordinates.

  We build a table "weirdos.tbl" with the following columns:
  
    "{FRAGNUM} | {LOC} | {XEVA} | {TEXSTR} | {IMGNAME} | {XPOS} | {YPOS} | {STRINGS} | {OBS}"
    
  where
           
    {FRAGNUM} "w{NNNNNN}" where {NNNNNN} is a unique number for the extracted image fragment.
    
    {LOC}     the locator of the line as in the EVT file, minus the "<>" and the
              transcriber code.
             
    {XEVA}    a tentative best encoding of the glyph(s) in that picture in an extended EVA 
              encoding.
             
    {TEXSTR}  a string that will best reproduce the glyph(s) when typeset in LaTeX
              using the voyeva12.mf font and the macros "\evab" etc in 
              "macros-voyeva-mf.tex"
             
    {IMGNAME} the name of the source image in the source image folder, minus any 
              common prefix and extension.
              
    {XPOS}    the column index of the pixel at the center of the glyph in that image.
    
    {YPOS}    the row index for the same.
    
    {STRINGS} one or more groups {TRANS}:"{READ}" joined by ";", where {TRANS}
              is one or more transcriber codes, and {READ} is a fragment of 
              text from the interlinear showing how those transcribers read
              that glyph or glyph group. In the {READ}, brackets are used to 
              show the actual reading of the glyph(s); what is outside the brackets
              is context provided for convenience.
              
    {OBS}     general observations about the glyph.

  The extracted fragments are "frags/{FRAGNUM}.png".  
  
  Prepared a file "weirdos-edt.tbl" with the hand-collected data
  but bogus fragment numbers. Extracting the fragments and assigning
  their numbers:
  
    # Run the extractor on the hand-edited table, then archive the old table
    # in a time-stamped folder under SAVE/ and install the new one in its place.
    ./extract_fragments.sh < weirdos-edt.tbl > weirdos-kuk.tbl
    now="$(date +'%Y-%m-%d-%H%M%S')"
    mkdir -p "SAVE/${now}"
    mv -v weirdos-edt.tbl "SAVE/${now}/"
    mv -vi weirdos-kuk.tbl weirdos-edt.tbl
    ( check and edit weirdos-edt.tbl )
    
MAPPING EVT PAGES TO PAGE IMAGES

  # Build the lookup table "fnum-to-img.tbl" from "page-table.txt".
  # Only input lines that begin with a digit are used.  For each of them the
  # output has column 3 followed by column 5, the latter with a leading "fol"
  # and any ".bmp" removed.
  cat page-table.txt \
    | gawk '
        $0 ~ /^[0-9]/ {
          key = $3
          name = $5
          sub(/^fol/, "", name)
          gsub(/[.]bmp/, "", name)
          print key, name
        }
      ' \
    > fnum-to-img.tbl
  
WEIRDOS IN TAKAHASHI'S TRANSCRIPTION

  # Collect into ".tak-weirdos.evt" every line of Takahashi's transcription
  # (transcriber code "H") that contains something other than basic glyphs.
  # Pipeline stages, in order:
  #   1. discard "#" comment lines and lines where a ">" is followed
  #      (after optional blanks) by "{$";
  #   2. keep only lines that start with a locator ending in ";H>";
  #   3. delete every "!" and blank, and every "{...}" comment whose first
  #      character is not "&" (so the "{&...}" weirdo markers survive);
  #   4. keep lines that have, after a ">", at least one character outside
  #      the set [-=acdefhiklmnopqrsty.,];
  #   5. put one blank back after the first ">" and print locator and text
  #      as two columns, the locator padded to 18 characters.
  # "grep -E" is used instead of "egrep": GNU grep documents "egrep" as
  # obsolescent and recent releases print a warning each time it is run.
  cat source-H.evt \
    | grep -E -v -e '(^[#]|[>][ ]*[{][$])' \
    | grep -E -e '^[<][^ <>]*[;][H]>' \
    | sed \
        -e 's:[! ]::g' \
        -e 's:[{][^&][^{}]*[}]::g' \
    | grep -E -e '[>].*[^-=acdefhiklmnopqrsty.,]' \
    | sed -e 's:>:> :' \
    | gawk '/[<]/{ printf "%-18s %s\n", $1, $2; }' \
    > .tak-weirdos.evt
  wc -l .tak-weirdos.evt
    
  # Produce ".tak-weirdos.txt" from ".tak-weirdos.evt".
  # The first gawk stage prepends to each line a key derived from the
  # locator in column 1: all "<" are removed and everything from the first
  # "." onwards is cut off.
  # NOTE(review): "map-field" is an external tool not shown here; presumably
  # it looks the key up in "fnum-to-img.tbl" and adds the result as a new
  # first column, using "???" when the key is missing -- confirm.
  # The last gawk stage prints the first four columns in fixed-width format.
  cat .tak-weirdos.evt \
    | gawk '
        /[<]/ {
          page = $1
          gsub(/[<]/, "", page)
          sub(/[.].*$/, "", page)
          print page, $0
        }
      ' \
    | map-field \
        -v inField=1 \
        -v outField=1 \
        -v table=fnum-to-img.tbl \
        -v defSubst='???' \
    | gawk '/[<]/ { printf "%-6s %-6s %-18s %s\n", $1, $2, $3, $4 }' \
    > .tak-weirdos.txt
  wc -l .tak-weirdos.evt .tak-weirdos.txt