COMPUTING THE LENGTH DISTRIBUTION OF ROMAN NUMERALS

  Computing the length distributions:

    # For each ekind in (old new): read ".roman-${ekind}.nums", print
    # length(first field) - 1 for every input line, and tabulate how many
    # times each value occurs.  The result, one "COUNT VALUE" line per
    # distinct value sorted by increasing VALUE, goes to
    # "dat/.roman-${ekind}-length.cts".
    # NOTE(review): the "-1" presumably discounts a delimiter character
    # attached to each numeral -- confirm against the .nums file format.
    # The final sort uses the POSIX "-k2,2n" key (field 2, numeric) in place
    # of the obsolete zero-origin "+1 -2n" form, which current sort
    # implementations no longer accept by default.
    foreach ekind ( old new )
      set ifile = ".roman-${ekind}.nums"
      set ofile = ".roman-${ekind}-length.cts"
      echo "${ifile} -> dat/${ofile}"
      cat ${ifile} \
        | gawk '//{ print length($1)-1; }' \
        | sort | uniq -c | expand \
        | sort -b -k2,2n \
        > dat/${ofile}
    end


  Make sure every sample has an appropriate element-factoring
  filter "factor-text-standard.gawk":
  
    # (Disabled.)  For each sample in ${samples}: if "sample/${sample}" has no
    # executable "factor-text-standard.gawk", create a symbolic link with that
    # name, inside that directory, pointing to the trivial filter below.
    # set trivialfilter = "/home/staff/stolfi/voynich/work/factor-text-trivial.gawk"
    # foreach sample ( ${samples} )
    #   set dir = "sample/${sample}"
    #   set filter = "${dir}/factor-text-standard.gawk"
    #   if ( ! ( -x ${filter} ) ) then
    #     ( cd ${dir} && ln -s ${trivialfilter} ${filter:t} )
    #   endif
    # end


>>> STOPPED HERE <<<
      
  Extracting sets of Voynichese words of the same length:

    # For every combination of sample (text labs), factoring kind (basic oko)
    # and length ${len}: pipe "lang/${sample}/voyn.wfr" through
    # capitalize-ligatures (field 3) and factor-field-general (which applies
    # "factor-text-${ekind}.gawk" reading field 3 and writing field 4), keep
    # only fields 1 and 4 of non-empty lines, pass them through
    # extract-words-by-elem-count with len=${len}, sort on field 2, and write
    # the result to "dat/lang/${sample}/voyn-${ekind}-${len}.cts".
    # NOTE(review): extract-words-by-elem-count presumably keeps only words
    # with exactly ${len} elements -- confirm against that script.
    # NOTE(review): the input is read from "lang/..." (no "dat/" prefix) while
    # the output goes under "dat/lang/..." -- confirm this is intended.
    # The banner and the final "wc" both refer to the file actually written
    # (under "dat/"), and the sort uses the POSIX "-k2,2" key in place of the
    # obsolete zero-origin "+1 -2" form.
    foreach sample ( text labs )
      foreach ekind ( basic oko )
        foreach len ( 01 02 03  09 10 11 )
          set ofile = "lang/${sample}/voyn-${ekind}-${len}.cts"
          echo "=== dat/${ofile} ==="
          cat lang/${sample}/voyn.wfr \
            | capitalize-ligatures -v field=3 \
            | factor-field-general \
                -f factor-text-${ekind}.gawk -v inField=3 -v outField=4 \
            | gawk '/./{ print $1, $4; }' \
            | extract-words-by-elem-count -v len=${len} \
            | sort -b -k2,2 \
            > dat/${ofile}
          wc dat/${ofile}
        end
      end
    end