# Last edited on 2002-01-18 09:44:19 by stolfi
#
# SECTION NAMES
#
# Get (sub)section names:

set secs = ( `cat subsections.tags` )
set secscm = `echo ${secs} | tr ' ' ','`
echo ${secs}; echo ${secscm}

# Checking whether we missed anything:

echo "${secs}" | tr ' ' '\012' | sort > .foo
diff .foo text-subsecs/all.names

# Extracting the good subsections:

cat subsections.tags \
  | egrep -v 'unk|xxx' \
  > subsections-ok.tags
echo `cat subsections-ok.tags`

# Per-section data will live in subdirectories sample/LANG/BUK/SEC.K
# where SEC.K is the section and subsection tag.  Let's create the
# respective book directories:

mkdir sample
foreach pd ( sample ${tbldir} ${figdir} )
  mkdir ${pd}/{${langscm}}
  mkdir ${pd}/{${langscm}}/vms
  mkdir ${pd}/{${langscm}}/vms/{${secscm},tot.t}
end

# Copy the list of section-subsection tags (except "tot.t") to handy places:

foreach lang ( ${langs} )
  (cd sample/${lang}/vms/ && ln -s ../../../subsections.tags )
  (cd sample/${lang}/vms/ && ln -s ../../../subsections-ok.tags )
end

# Copy the raw EVT-formatted text files to the appropriate
# sub-directories of "voyn":

set utypes = \
  'parags,starred-parags,circular-lines,circular-text,radial-lines,titles,labels,words'

foreach sec ( ${secs} "tot.t" )
  set ifile = "text-subsecs/${sec}.evt"
  if ( "$sec" == "tot.t" ) set ifile = "text-all.evt"
  set ofile = "sample/voyn/vms/${sec}/raw.evt"
  echo "${ifile} -> ${ofile}"
  cat ${ifile} \
    | sed -e 's/[&][*\!][*\!][*\!][*\!;]/*\!\!\!\!/g' \
    | basify-weirdos \
    | select-units \
        -v types="${utypes}" \
        -v table=unit-to-type.tbl \
    > ${ofile}
end
dicio-wc sample/voyn/vms/{${secscm},tot.t}/raw.evt

# Now separate the EVT-formatted files for running prose ("voyp") and
# labels ("voyl"), for each subsection SEC.K, including "tot.t".
ln -s ../019/unit-to-type.tbl

foreach lang ( voyp voyl )
  if ( ${lang} == voyp ) then
    set utypes = \
      'parags,starred-parags,circular-lines,circular-text,radial-lines,titles'
  else
    set utypes = \
      'labels,words'
  endif
  echo "utypes = ${utypes}"
  foreach sec ( ${secs} "tot.t" )
    set ifile = "sample/voyn/vms/${sec}/raw.evt"
    set ofile = "sample/${lang}/vms/${sec}/raw.evt"
    echo "${ifile} -> ${ofile}"
    cat ${ifile} \
      | select-units \
          -v types="${utypes}" \
          -v table=unit-to-type.tbl \
      > ${ofile}
  end
  dicio-wc sample/${lang}/vms/{${secscm},tot.t}/raw.evt
end

# EXTRACTING THE RAW TOKEN LISTS
#
# Now we extract the raw token lists for each sublanguage.  We treat
# line breaks as spaces, but preserve paragraph breaks as dummy "="
# words.

foreach lang ( voyn voyp voyl )
  foreach sec ( ${secs} "tot.t" )
    set ifile = "sample/${lang}/vms/${sec}/raw.evt"
    set ofile = "sample/${lang}/vms/${sec}/raw.tks"
    echo "${ifile} -> ${ofile}"
    cat ${ifile} \
      | words-from-evt -v showParags=1 \
      | sed -e 's/^ *$/=/' \
      > ${ofile}
  end
  dicio-wc sample/${lang}/vms/{${secscm},tot.t}/raw.tks
end

# Now we do the same for the line-initial, -medial, and -final
# sublanguages of "voyp":

foreach lang ( voyi voym voyf )
  set omi = 1; set omm = 1; set omf = 1
  if ( "${lang}" == "voyi" ) set omi = 0
  if ( "${lang}" == "voym" ) set omm = 0
  if ( "${lang}" == "voyf" ) set omf = 0
  foreach sec ( ${secs} "tot.t" )
    set ifile = "sample/voyp/vms/${sec}/raw.evt"
    set ofile = "sample/${lang}/vms/${sec}/raw.tks"
    echo "${ifile} -> ${ofile}"
    cat ${ifile} \
      | words-from-evt \
          -v showParags=1 \
          -v omitInitial=${omi} \
          -v omitMedial=${omm} \
          -v omitFinal=${omf} \
      | sed -e 's/^ *$/=/' \
      > ${ofile}
  end
  dicio-wc sample/${lang}/vms/{${secscm},tot.t}/raw.tks
end

# COMPUTING WORD OCCURRENCE COUNTS
#
# Counting word occurrences by subset and section:

foreach lang ( ${langs} )
  foreach sec ( ${secs} "tot.t" )
    set ifile = "sample/${lang}/vms/${sec}/raw.tks"
    set ofile = "sample/${lang}/vms/${sec}/raw.wfr"
    echo "${ifile} -> ${ofile}"
    cat ${ifile} \
      | egrep -v '=' \
      | sort | uniq -c | expand
\ | sort -b +0 -1nr +1 -2 \ | compute-freqs \ > ${ofile} end dicio-wc sample/${lang}/vms/{${secscm},tot.t}/raw.wfr \ | gawk '/./{ printf " %8s %s\n", $1,$4;}' end Tabulating the fraction of good and bad words per section (ppt = parts per thousand): foreach book ( ${books} ) set afile = ".raw-gud-bad-counts-${lang}-vms.txt"; echo " "; echo " Good/bad statistics for subset voyn/${book}:"; echo " " count-raw-gud-bad-toks-wrds voyn/${book} ${secs} / tot.1 \ > ${afile} cat ${afile} \ | sed -e 's:/::g' -e 's/^/ /' end Good/bad statistics for subset voyn/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 6867 6704 976 163 23 2132 1981 928 151 70 hea.2 868 823 947 45 51 554 509 917 45 81 heb.1 2901 2820 971 81 27 1189 1111 933 78 65 heb.2 557 510 913 47 84 331 288 867 43 129 cos.1 195 155 790 40 204 83 72 857 11 130 cos.2 1746 1590 910 156 89 1019 868 850 151 148 cos.3 1006 795 789 211 209 620 429 690 191 307 bio.1 6975 6697 960 278 39 1597 1382 864 215 134 zod.1 1370 988 720 382 278 884 555 627 329 371 pha.1 1023 944 921 79 77 561 483 859 78 138 pha.2 1588 1452 913 136 85 808 694 857 114 140 str.1 755 670 886 85 112 483 402 830 81 167 str.2 10768 10097 937 671 62 3225 2779 861 446 138 unk.1 213 202 943 11 51 162 153 938 9 55 unk.2 140 134 950 6 42 103 97 932 6 57 unk.3 47 44 916 3 62 46 43 914 3 63 unk.4 317 306 962 11 34 239 228 950 11 45 unk.5 342 309 900 33 96 246 214 866 32 129 unk.6 489 431 879 58 118 297 247 828 50 167 unk.7 387 357 920 30 77 235 208 881 27 114 unk.8 2 2 666 0 0 2 2 666 0 0 tot.1 38556 36030 934 2526 65 8591 6883 801 1708 198 Good/bad statistics for subset voyp/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 6866 6703 976 163 23 2131 1980 928 151 70 hea.2 
868 823 947 45 51 554 509 917 45 81 heb.1 2901 2820 971 81 27 1189 1111 933 78 65 heb.2 557 510 913 47 84 331 288 867 43 129 cos.1 185 146 784 39 209 73 63 851 10 135 cos.2 1491 1353 906 138 92 868 733 843 135 155 cos.3 884 713 805 171 193 533 380 711 153 286 bio.1 6828 6555 959 273 39 1536 1325 862 211 137 zod.1 1010 701 693 309 305 641 379 590 262 408 pha.1 926 858 925 68 73 485 418 860 67 137 pha.2 1426 1309 917 117 81 684 587 856 97 141 str.1 755 670 886 85 112 483 402 830 81 167 str.2 10768 10097 937 671 62 3225 2779 861 446 138 unk.1 213 202 943 11 51 162 153 938 9 55 unk.2 140 134 950 6 42 103 97 932 6 57 unk.3 47 44 916 3 62 46 43 914 3 63 unk.4 302 292 963 10 33 226 216 951 10 44 unk.5 342 309 900 33 96 246 214 866 32 129 unk.6 489 431 879 58 118 297 247 828 50 167 unk.7 387 357 920 30 77 235 208 881 27 114 unk.8 0 0 0 0 0 0 0 0 0 0 tot.1 37385 35027 936 2358 63 8105 6525 804 1580 194 Good/bad statistics for subset voyl/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 1 1 500 0 0 1 1 500 0 0 hea.2 0 0 0 0 0 0 0 0 0 0 heb.1 0 0 0 0 0 0 0 0 0 0 heb.2 0 0 0 0 0 0 0 0 0 0 cos.1 10 9 818 1 90 10 9 818 1 90 cos.2 255 237 925 18 70 225 208 920 17 75 cos.3 122 82 666 40 325 112 72 637 40 353 bio.1 147 142 959 5 33 127 122 953 5 39 zod.1 360 287 795 73 202 303 233 766 70 230 pha.1 97 86 877 11 112 92 81 870 11 118 pha.2 162 143 877 19 116 155 136 871 19 121 str.1 0 0 0 0 0 0 0 0 0 0 str.2 0 0 0 0 0 0 0 0 0 0 unk.1 0 0 0 0 0 0 0 0 0 0 unk.2 0 0 0 0 0 0 0 0 0 0 unk.3 0 0 0 0 0 0 0 0 0 0 unk.4 15 14 875 1 62 15 14 875 1 62 unk.5 0 0 0 0 0 0 0 0 0 0 unk.6 0 0 0 0 0 0 0 0 0 0 unk.7 0 0 0 0 0 0 0 0 0 0 unk.8 2 2 666 0 0 2 2 666 0 0 tot.1 1171 1003 855 168 143 882 721 816 161 182 Good/bad statistics for subset voyi/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw 
gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 1339 1313 979 26 19 709 683 961 26 36 hea.2 185 178 956 7 37 150 143 947 7 46 heb.1 440 427 968 13 29 326 313 957 13 39 heb.2 77 65 833 12 153 68 56 811 12 173 cos.1 3 2 500 1 250 3 2 500 1 250 cos.2 203 185 906 18 88 183 165 896 18 97 cos.3 90 71 780 19 208 82 63 759 19 228 bio.1 823 782 949 41 49 387 352 907 35 90 zod.1 30 20 645 10 322 26 17 629 9 333 pha.1 112 103 911 9 79 95 86 895 9 93 pha.2 161 137 845 24 148 129 107 823 22 169 str.1 80 73 901 7 86 76 69 896 7 90 str.2 1083 1005 927 78 71 675 606 896 69 102 unk.1 26 22 814 4 148 23 20 833 3 125 unk.2 32 31 939 1 30 29 28 933 1 33 unk.3 13 12 857 1 71 13 12 857 1 71 unk.4 33 32 941 1 29 31 30 937 1 31 unk.5 35 29 805 6 166 34 28 800 6 171 unk.6 45 43 934 2 43 39 37 925 2 50 unk.7 39 37 925 2 50 34 32 914 2 57 unk.8 0 0 0 0 0 0 0 0 0 0 tot.1 4849 4567 941 282 58 2159 1913 885 246 113 Good/bad statistics for subset voyf/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 1339 1302 971 37 27 646 613 947 33 51 hea.2 185 166 892 19 102 156 137 872 19 121 heb.1 440 424 961 16 36 270 255 940 15 55 heb.2 77 74 948 3 38 69 66 942 3 42 cos.1 3 2 500 1 250 3 2 500 1 250 cos.2 203 180 882 23 112 167 144 857 23 136 cos.3 90 67 736 23 252 77 54 692 23 294 bio.1 823 788 956 35 42 397 362 909 35 87 zod.1 30 12 387 18 580 30 12 387 18 580 pha.1 112 101 893 11 97 85 74 860 11 127 pha.2 161 132 814 29 179 134 108 800 26 192 str.1 80 73 901 7 86 74 67 893 7 93 str.2 1083 1002 924 81 74 678 600 883 78 114 unk.1 26 24 888 2 74 25 23 884 2 76 unk.2 32 28 848 4 121 27 23 821 4 142 unk.3 13 12 857 1 71 12 11 846 1 76 unk.4 33 32 941 1 29 32 31 939 1 30 unk.5 35 31 861 4 111 35 31 861 4 111 unk.6 45 34 739 11 239 42 31 720 11 255 unk.7 39 30 750 9 225 35 27 750 8 222 unk.8 0 0 0 0 0 0 0 0 0 0 tot.1 4849 
4514 930 335 69 2042 1748 855 294 143 Good/bad statistics for subset voym/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 4055 3966 977 89 21 1261 1175 931 86 68 hea.2 468 451 961 17 36 300 283 940 17 56 heb.1 2002 1950 973 52 25 809 758 935 51 62 heb.2 402 370 918 32 79 236 208 877 28 118 cos.1 112 98 867 14 123 71 61 847 10 138 cos.2 1077 981 910 96 89 618 524 846 94 151 cos.3 697 573 820 124 177 413 300 724 113 272 bio.1 5182 4985 961 197 38 1111 958 861 153 137 zod.1 949 668 703 281 295 606 364 599 242 398 pha.1 699 651 930 48 68 363 316 868 47 129 pha.2 1096 1033 941 63 57 498 444 889 54 108 str.1 595 524 879 71 119 376 309 819 67 177 str.2 8599 8087 940 512 59 2356 2034 862 322 136 unk.1 159 154 962 5 31 124 119 952 5 40 unk.2 71 70 972 1 13 53 52 962 1 18 unk.3 21 20 909 1 45 21 20 909 1 45 unk.4 236 228 962 8 33 177 169 949 8 44 unk.5 272 249 912 23 84 192 170 880 22 113 unk.6 399 354 885 45 112 243 205 840 38 155 unk.7 309 290 935 19 61 187 169 898 18 95 unk.8 0 0 0 0 0 0 0 0 0 0 tot.1 27400 25702 937 1698 61 5633 4486 796 1147 203 Formatting the tables for the tech report: foreach lang ( ${langs} ) set afile = ".raw-gud-bad-counts-${lang}-vms.txt"; set tfile = "voyn/${book}/tw-counts-by-sect.tex"; echo " "; echo " ${afile} -> ${tfile}"; echo " " cat ${afile} \ | tex-format-raw-gud-bad-counts \ > dat/${tfile} update-paper-include dat/${tfile} ${tbldir}/${tfile} end Extracting the main statistics for the tech report: foreach lang ( ${langs} ) set afile = ".raw-gud-bad-counts-${lang}-vms.txt"; set sfile = "voyn/${book}/tw-summary.tex"; echo " "; echo " ${afile} -> ${sfile}"; echo " " cat ${afile} \ | tex-format-raw-gud-bad-summary -v dat=${lang}Vms \ > dat/${sfile} update-paper-include dat/${sfile} ${tbldir}/${sfile} end