The number of raw tokens taken for each sample is adjusted so that
the number of good tokens matches (as far as possible) the number of
good tokens in the "prs" or "lab" samples, as appropriate. The
following numbers refer to SYMBOL and ALPHA tokens only.

  # Each command below stores, in one sample's trunc-raw.num file,
  # the number of raw tokens to take from that sample.

  echo 35040 > dat/engl/wow/tot.1/trunc-raw.num

  echo  3200 > dat/engl/cul/pre.1/trunc-raw.num
  echo 23054 > dat/engl/cul/her.1/trunc-raw.num
  echo  8985 > dat/engl/cul/rec.1/trunc-raw.num

  echo  9118 > dat/latn/ptt/gen.1/trunc-raw.num
  echo  7254 > dat/latn/ptt/exo.1/trunc-raw.num
  echo  4981 > dat/latn/ptt/lev.1/trunc-raw.num
  echo  6984 > dat/latn/ptt/num.1/trunc-raw.num
  echo  6690 > dat/latn/ptt/deu.1/trunc-raw.num

  echo  9921 > dat/grek/nwt/mat.1/trunc-raw.num
  echo  6156 > dat/grek/nwt/mrk.1/trunc-raw.num
  echo 10525 > dat/grek/nwt/luk.1/trunc-raw.num
  echo  8425 > dat/grek/nwt/joh.1/trunc-raw.num

  echo 35041 > dat/span/qvi/tot.1/trunc-raw.num
  echo 36963 > dat/arab/qur/tot.1/trunc-raw.num
  echo 35307 > dat/geez/gok/tot.1/trunc-raw.num
  echo 35027 > dat/viet/ptt/tot.1/trunc-raw.num

  echo 35048 > dat/tibe/vim/tot.1/trunc-raw.num
  echo 35041 > dat/tibe/ccv/tot.1/trunc-raw.num

  echo 35027 > dat/chin/ptt/tot.1/trunc-raw.num
  echo 35027 > dat/chin/red/tot.1/trunc-raw.num

  # Roman-coded texts:
  echo 35027 > dat/chrc/red/tot.1/trunc-raw.num
  echo 35040 > dat/enrc/wow/tot.1/trunc-raw.num

  echo 35027 > dat/rugg/sfw/tot.1/trunc-raw.num
  echo 35027 > dat/rugg/hnd/tot.1/trunc-raw.num

  echo  1003 > dat/engl/wnm/tot.1/trunc-raw.num
  echo  1003 > dat/engl/cnp/tot.1/trunc-raw.num # NOT DONE

#############################################################
#
# ROMAN NUMERALS
#
# Generating the Roman numerals 0-999, additive system:
# [Now replaced by roman-coded texts such as enrc/wow and
# chrc/red]
#
#   /bin/rm .roman-old.nums
#   foreach u ( '' I II III IIII V VI VII VIII VIIII )
#     foreach d ( '' X XX XXX XXXX L LX LXX LXXX LXXXX )
#       foreach c ( '' C CC CCC CCCC D DC DCC DCCC DCCCC )
#         echo "#$c$d$u" >> .roman-old.nums
#       end
#     end
#   end
#
# Generating the Roman numerals 0-999, subtractive
# system:
#
#   /bin/rm .roman-new.nums
#   foreach u ( '' I II III IV V VI VII VIII IX )
#     foreach d ( '' X XX XXX XL L LX LXX LXXX XC )
#       foreach c ( '' C CC CCC CD D DC DCC DCCC CM )
#         echo "#$c$d$u" >> .roman-new.nums
#       end
#     end
#   end
#
#
# JUNK
#
# Generating the TeX-formatted summary:
# [Now superseded by per-language summaries]
#
#   foreach kind ( raw gud bad )
#     set ifile = ".summary-${kind}"
#     set tfile = "otherlangs-${kind}-tw-summary.tex"
#     printf "%% Created by Note-101.txt\n" > ${tfile}
#     printf "%%\n" >> ${tfile}
#     cat ${ifile} \
#       | gawk \
#           ' /./ { \
#               smp = $1; tks = $2; wds = $3; \
#               gsub(/[\/]/, "", smp); \
#               printf "\\def\\%sGudTks{%d}\n",smp,tks; \
#               printf "\\def\\%sGudWds{%d}\n",smp,wds; \
#             } \
#           ' \
#       >> ${tfile}
#     update-paper-include ${tfile} ${tbldir}/
#   end
#
# Take a small sample from each language, and pretend it is labels:
# [Now replaced by more realistic samples such as engl/wnm and engl/cnp]
#
#   set ratio = \
#     `gawk -v nprose=${nprose} -v nlabs=${nlabs} 'BEGIN{print nlabs/(nprose - 2*nlabs);}'`
#   echo ${ratio}
#
#   foreach lp ( eng.0 lat.0 )
#     set lng = "${lp:r}"; set plus = "${lp:e}"
#     @ ntake = ${nlabs} + ${plus}
#     set ifile = "dat/${lng}/prose/raw.tks"
#     set ofile = "dat/${lng}/labs/raw.tks"
#     echo "${ifile} -> ${ofile}"
#     cat ${ifile} \
#       | gawk -v ratio=${ratio} '(rand() <= ratio){ print; }' \
#       | head -${ntake} \
#       > ${ofile}
#     cat ${ofile} | egrep -v '[^a-z]' > .gud
#     dicio-wc ${ofile} .gud
#   end