# Last edited on 2002-01-18 09:44:19 by stolfi
#
# SECTION NAMES
#
# Get (sub)section names:

set secs = ( `cat subsections.tags` )
set secscm = `echo ${secs} | tr ' ' ','`
echo ${secs}; echo ${secscm}

# Checking whether we missed anything:

echo "${secs}" | tr ' ' '\012' | sort > .foo
diff .foo text-subsecs/all.names

# Extracting the good subsections:

cat subsections.tags \
  | egrep -v 'unk|xxx' \
  > subsections-ok.tags
echo `cat subsections-ok.tags`

# Per-section data will live in subdirectories sample/LANG/BUK/SEC.K
# where SEC.K is the section and subsection tag.  Let's create the
# respective book directories:

mkdir sample
foreach pd ( sample ${tbldir} ${figdir} )
  mkdir ${pd}/{${langscm}}
  mkdir ${pd}/{${langscm}}/vms
  mkdir ${pd}/{${langscm}}/vms/{${secscm},tot.t}
end

# Copy the list of section-subsection tags (except "tot.t") to handy places:

foreach lang ( ${langs} )
  (cd sample/${lang}/vms/ && ln -s ../../../subsections.tags )
  (cd sample/${lang}/vms/ && ln -s ../../../subsections-ok.tags )
end

# Copy the raw EVT-formatted text files to the appropriate
# sub-directories of "voyn":

set utypes = \
  'parags,starred-parags,circular-lines,circular-text,radial-lines,titles,labels,words'

foreach sec ( ${secs} "tot.t" )
  set ifile = "text-subsecs/${sec}.evt"
  if ( "$sec" == "tot.t" ) set ifile = "text-all.evt"
  set ofile = "sample/voyn/vms/${sec}/raw.evt"
  echo "${ifile} -> ${ofile}"
  cat ${ifile} \
    | sed -e 's/[&][*\!][*\!][*\!][*\!;]/*\!\!\!\!/g' \
    | basify-weirdos \
    | select-units \
        -v types="${utypes}" \
        -v table=unit-to-type.tbl \
    > ${ofile}
end
dicio-wc sample/voyn/vms/{${secscm},tot.t}/raw.evt

# Now separate the EVT-formatted files for running prose ("voyp") and
# labels ("voyl"), for each subsection SEC.K, including "tot.t".
ln -s ../019/unit-to-type.tbl

foreach lang ( voyp voyl )
  if ( ${lang} == voyp ) then
    set utypes = \
      'parags,starred-parags,circular-lines,circular-text,radial-lines,titles'
  else
    set utypes = \
      'labels,words'
  endif
  echo "utypes = ${utypes}"
  foreach sec ( ${secs} "tot.t" )
    set ifile = "sample/voyn/vms/${sec}/raw.evt"
    set ofile = "sample/${lang}/vms/${sec}/raw.evt"
    echo "${ifile} -> ${ofile}"
    cat ${ifile} \
      | select-units \
          -v types="${utypes}" \
          -v table=unit-to-type.tbl \
      > ${ofile}
  end
  dicio-wc sample/${lang}/vms/{${secscm},tot.t}/raw.evt
end

# EXTRACTING THE RAW TOKEN LISTS
#
# Now we extract the raw token lists for each sublanguage.  We treat
# line breaks as spaces, but preserve paragraph breaks as dummy "="
# words.

foreach lang ( voyn voyp voyl )
  foreach sec ( ${secs} "tot.t" )
    set ifile = "sample/${lang}/vms/${sec}/raw.evt"
    set ofile = "sample/${lang}/vms/${sec}/raw.tks"
    echo "${ifile} -> ${ofile}"
    cat ${ifile} \
      | words-from-evt -v showParags=1 \
      | sed -e 's/^ *$/=/' \
      > ${ofile}
  end
  dicio-wc sample/${lang}/vms/{${secscm},tot.t}/raw.tks
end

# Now we do the same for the line-initial, -medial, and -final
# sublanguages of "voyp":

foreach lang ( voyi voym voyf )
  set omi = 1; set omm = 1; set omf = 1
  if ( "${lang}" == "voyi" ) set omi = 0
  if ( "${lang}" == "voym" ) set omm = 0
  if ( "${lang}" == "voyf" ) set omf = 0
  foreach sec ( ${secs} "tot.t" )
    set ifile = "sample/voyp/vms/${sec}/raw.evt"
    set ofile = "sample/${lang}/vms/${sec}/raw.tks"
    echo "${ifile} -> ${ofile}"
    cat ${ifile} \
      | words-from-evt \
          -v showParags=1 \
          -v omitInitial=${omi} \
          -v omitMedial=${omm} \
          -v omitFinal=${omf} \
      | sed -e 's/^ *$/=/' \
      > ${ofile}
  end
  dicio-wc sample/${lang}/vms/{${secscm},tot.t}/raw.tks
end

# COMPUTING WORD OCCURRENCE COUNTS
#
# Counting word occurrences by subset and section:

foreach lang ( ${langs} )
  foreach sec ( ${secs} "tot.t" )
    set ifile = "sample/${lang}/vms/${sec}/raw.tks"
    set ofile = "sample/${lang}/vms/${sec}/raw.wfr"
    echo "${ifile} -> ${ofile}"
    cat ${ifile} \
      | egrep -v '=' \
      | sort | uniq -c | expand
\ | sort -b +0 -1nr +1 -2 \ | compute-freqs \ > ${ofile} end dicio-wc sample/${lang}/vms/{${secscm},tot.t}/raw.wfr \ | gawk '/./{ printf " %8s %s\n", $1,$4;}' end Tabulating the fraction of good and bad words per section (ppt = parts per thousand): foreach book ( ${books} ) set afile = ".raw-gud-bad-counts-${lang}-vms.txt"; echo " "; echo " Good/bad statistics for subset voyn/${book}:"; echo " " count-raw-gud-bad-toks-wrds voyn/${book} ${secs} / tot.1 \ > ${afile} cat ${afile} \ | sed -e 's:/::g' -e 's/^/ /' end Good/bad statistics for subset voyn/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 6867 6704 976 163 23 2132 1981 928 151 70 hea.2 868 823 947 45 51 554 509 917 45 81 heb.1 2901 2820 971 81 27 1189 1111 933 78 65 heb.2 557 510 913 47 84 331 288 867 43 129 cos.1 195 155 790 40 204 83 72 857 11 130 cos.2 1746 1590 910 156 89 1019 868 850 151 148 cos.3 1006 795 789 211 209 620 429 690 191 307 bio.1 6975 6697 960 278 39 1597 1382 864 215 134 zod.1 1370 988 720 382 278 884 555 627 329 371 pha.1 1023 944 921 79 77 561 483 859 78 138 pha.2 1588 1452 913 136 85 808 694 857 114 140 str.1 755 670 886 85 112 483 402 830 81 167 str.2 10768 10097 937 671 62 3225 2779 861 446 138 unk.1 213 202 943 11 51 162 153 938 9 55 unk.2 140 134 950 6 42 103 97 932 6 57 unk.3 47 44 916 3 62 46 43 914 3 63 unk.4 317 306 962 11 34 239 228 950 11 45 unk.5 342 309 900 33 96 246 214 866 32 129 unk.6 489 431 879 58 118 297 247 828 50 167 unk.7 387 357 920 30 77 235 208 881 27 114 unk.8 2 2 666 0 0 2 2 666 0 0 tot.1 38556 36030 934 2526 65 8591 6883 801 1708 198 Good/bad statistics for subset voyp/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 6866 6703 976 163 23 2131 1980 928 151 70 hea.2 
868 823 947 45 51 554 509 917 45 81 heb.1 2901 2820 971 81 27 1189 1111 933 78 65 heb.2 557 510 913 47 84 331 288 867 43 129 cos.1 185 146 784 39 209 73 63 851 10 135 cos.2 1491 1353 906 138 92 868 733 843 135 155 cos.3 884 713 805 171 193 533 380 711 153 286 bio.1 6828 6555 959 273 39 1536 1325 862 211 137 zod.1 1010 701 693 309 305 641 379 590 262 408 pha.1 926 858 925 68 73 485 418 860 67 137 pha.2 1426 1309 917 117 81 684 587 856 97 141 str.1 755 670 886 85 112 483 402 830 81 167 str.2 10768 10097 937 671 62 3225 2779 861 446 138 unk.1 213 202 943 11 51 162 153 938 9 55 unk.2 140 134 950 6 42 103 97 932 6 57 unk.3 47 44 916 3 62 46 43 914 3 63 unk.4 302 292 963 10 33 226 216 951 10 44 unk.5 342 309 900 33 96 246 214 866 32 129 unk.6 489 431 879 58 118 297 247 828 50 167 unk.7 387 357 920 30 77 235 208 881 27 114 unk.8 0 0 0 0 0 0 0 0 0 0 tot.1 37385 35027 936 2358 63 8105 6525 804 1580 194 Good/bad statistics for subset voyl/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 1 1 500 0 0 1 1 500 0 0 hea.2 0 0 0 0 0 0 0 0 0 0 heb.1 0 0 0 0 0 0 0 0 0 0 heb.2 0 0 0 0 0 0 0 0 0 0 cos.1 10 9 818 1 90 10 9 818 1 90 cos.2 255 237 925 18 70 225 208 920 17 75 cos.3 122 82 666 40 325 112 72 637 40 353 bio.1 147 142 959 5 33 127 122 953 5 39 zod.1 360 287 795 73 202 303 233 766 70 230 pha.1 97 86 877 11 112 92 81 870 11 118 pha.2 162 143 877 19 116 155 136 871 19 121 str.1 0 0 0 0 0 0 0 0 0 0 str.2 0 0 0 0 0 0 0 0 0 0 unk.1 0 0 0 0 0 0 0 0 0 0 unk.2 0 0 0 0 0 0 0 0 0 0 unk.3 0 0 0 0 0 0 0 0 0 0 unk.4 15 14 875 1 62 15 14 875 1 62 unk.5 0 0 0 0 0 0 0 0 0 0 unk.6 0 0 0 0 0 0 0 0 0 0 unk.7 0 0 0 0 0 0 0 0 0 0 unk.8 2 2 666 0 0 2 2 666 0 0 tot.1 1171 1003 855 168 143 882 721 816 161 182 Good/bad statistics for subset voyi/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw 
gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 1339 1313 979 26 19 709 683 961 26 36 hea.2 185 178 956 7 37 150 143 947 7 46 heb.1 440 427 968 13 29 326 313 957 13 39 heb.2 77 65 833 12 153 68 56 811 12 173 cos.1 3 2 500 1 250 3 2 500 1 250 cos.2 203 185 906 18 88 183 165 896 18 97 cos.3 90 71 780 19 208 82 63 759 19 228 bio.1 823 782 949 41 49 387 352 907 35 90 zod.1 30 20 645 10 322 26 17 629 9 333 pha.1 112 103 911 9 79 95 86 895 9 93 pha.2 161 137 845 24 148 129 107 823 22 169 str.1 80 73 901 7 86 76 69 896 7 90 str.2 1083 1005 927 78 71 675 606 896 69 102 unk.1 26 22 814 4 148 23 20 833 3 125 unk.2 32 31 939 1 30 29 28 933 1 33 unk.3 13 12 857 1 71 13 12 857 1 71 unk.4 33 32 941 1 29 31 30 937 1 31 unk.5 35 29 805 6 166 34 28 800 6 171 unk.6 45 43 934 2 43 39 37 925 2 50 unk.7 39 37 925 2 50 34 32 914 2 57 unk.8 0 0 0 0 0 0 0 0 0 0 tot.1 4849 4567 941 282 58 2159 1913 885 246 113 Good/bad statistics for subset voyf/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 1339 1302 971 37 27 646 613 947 33 51 hea.2 185 166 892 19 102 156 137 872 19 121 heb.1 440 424 961 16 36 270 255 940 15 55 heb.2 77 74 948 3 38 69 66 942 3 42 cos.1 3 2 500 1 250 3 2 500 1 250 cos.2 203 180 882 23 112 167 144 857 23 136 cos.3 90 67 736 23 252 77 54 692 23 294 bio.1 823 788 956 35 42 397 362 909 35 87 zod.1 30 12 387 18 580 30 12 387 18 580 pha.1 112 101 893 11 97 85 74 860 11 127 pha.2 161 132 814 29 179 134 108 800 26 192 str.1 80 73 901 7 86 74 67 893 7 93 str.2 1083 1002 924 81 74 678 600 883 78 114 unk.1 26 24 888 2 74 25 23 884 2 76 unk.2 32 28 848 4 121 27 23 821 4 142 unk.3 13 12 857 1 71 12 11 846 1 76 unk.4 33 32 941 1 29 32 31 939 1 30 unk.5 35 31 861 4 111 35 31 861 4 111 unk.6 45 34 739 11 239 42 31 720 11 255 unk.7 39 30 750 9 225 35 27 750 8 222 unk.8 0 0 0 0 0 0 0 0 0 0 tot.1 4849 
4514 930 335 69 2042 1748 855 294 143 Good/bad statistics for subset voym/vms: # tokens words # ----------------------------- ----------------------------- # sec raw gud ppt bad ppt raw gud ppt bad ppt # ------ ----- ----- ---- ----- ---- ----- ----- ---- ----- ---- hea.1 4055 3966 977 89 21 1261 1175 931 86 68 hea.2 468 451 961 17 36 300 283 940 17 56 heb.1 2002 1950 973 52 25 809 758 935 51 62 heb.2 402 370 918 32 79 236 208 877 28 118 cos.1 112 98 867 14 123 71 61 847 10 138 cos.2 1077 981 910 96 89 618 524 846 94 151 cos.3 697 573 820 124 177 413 300 724 113 272 bio.1 5182 4985 961 197 38 1111 958 861 153 137 zod.1 949 668 703 281 295 606 364 599 242 398 pha.1 699 651 930 48 68 363 316 868 47 129 pha.2 1096 1033 941 63 57 498 444 889 54 108 str.1 595 524 879 71 119 376 309 819 67 177 str.2 8599 8087 940 512 59 2356 2034 862 322 136 unk.1 159 154 962 5 31 124 119 952 5 40 unk.2 71 70 972 1 13 53 52 962 1 18 unk.3 21 20 909 1 45 21 20 909 1 45 unk.4 236 228 962 8 33 177 169 949 8 44 unk.5 272 249 912 23 84 192 170 880 22 113 unk.6 399 354 885 45 112 243 205 840 38 155 unk.7 309 290 935 19 61 187 169 898 18 95 unk.8 0 0 0 0 0 0 0 0 0 0 tot.1 27400 25702 937 1698 61 5633 4486 796 1147 203 Formatting the tables for the tech report: foreach lang ( ${langs} ) set afile = ".raw-gud-bad-counts-${lang}-vms.txt"; set tfile = "voyn/${book}/tw-counts-by-sect.tex"; echo " "; echo " ${afile} -> ${tfile}"; echo " " cat ${afile} \ | tex-format-raw-gud-bad-counts \ > dat/${tfile} update-paper-include dat/${tfile} ${tbldir}/${tfile} end Extracting the main statistics for the tech report: foreach lang ( ${langs} ) set afile = ".raw-gud-bad-counts-${lang}-vms.txt"; set sfile = "voyn/${book}/tw-summary.tex"; echo " "; echo " ${afile} -> ${sfile}"; echo " " cat ${afile} \ | tex-format-raw-gud-bad-summary -v dat=${lang}Vms \ > dat/${sfile} update-paper-include dat/${sfile} ${tbldir}/${sfile} end