#! /bin/csh -f
# Last edited on 2000-06-08 04:16:19 by stolfi

set usage = "$0 PTAG"

# Gathers statistics about pairs of elements in Voynichese words
# (identified by the tag PTAG, of the form LTAG-RTAG).
#
# Assumes available the following files:
#
#   data/{words,labels}/all.names
#     a list of names of subsections
#
#   data/{words,labels}/XXX.wsp
#     the words of subsection XXX (where XXX is found in all.names),
#     in EVA encoding, factored into QOKOKOKOKO elements,
#     with mantle and core set off with "()" and "<>".
#
# Creates the following files:
#
#   stats/{words,labels}/PTAG/XXX.frq
#   stats/{words,labels}/PTAG/XXX-L.frq
#   stats/{words,labels}/PTAG/XXX-R.frq
#     counts and frequencies of the PTAG pairs in section XXX, and
#     their left and right members, for each XXX. Also the overall
#     counts and frequencies when XXX="tot".
#
#   stats/{words,labels}/PTAG/XXX.fcm
#   stats/{words,labels}/PTAG/XXX-L.fcm
#   stats/{words,labels}/PTAG/XXX-R.fcm
#     same as XXX.frq, with extra columns for cumulative counts
#     and frequencies.
#
#   stats/{words,labels}/PTAG/XXX.mtx
#     same as XXX.frq, restricted to the 10 most common
#     prefixes and suffixes, in tabular format.
#
# NOTE(review): this script itself only writes XXX.frq, XXX.fcm, tot.frq
# and tot.fcm.  The -L/-R and .mtx files are presumably produced by
# "tabulate-pairs" -- confirm.  Also, "tabulate-pairs" is called with
# "24 24" below, which may not match the "10 most common" stated above.

if ( $#argv != 1 ) then
  echo "usage: ${usage}"; exit 1
endif

set ptag = "$1"; shift;

# Prefix for scratch files; everything matching ${tmp}.* is removed at the end.
# NOTE(review): /tmp/$$ is a predictable name; consider mktemp if this
# ever runs on a shared machine.
set tmp = "/tmp/$$"

# Decide whether complex or simple words are allowed.
# This depends only on PTAG, so it is decided once, outside the loops.
if ( "$ptag" =~ k-* ) then
  set complex = 1
else
  set complex = 0
endif

foreach f ( words labels )

  # Create the output directory (and any missing parents), or clean out
  # the results of a previous run.
  if ( ! ( -d stats/${f}/${ptag} ) ) then
    mkdir -p stats/${f}/${ptag}
  else
    echo "cleaning out stats/${f}/${ptag}"
    ( cd stats/${f}/${ptag}/ && /bin/rm -f *.frq *.fcm all-cmp.* )
  endif

  # Gather statistics by section
  set secs = ( `cat data/${f}/all.names` )

  foreach sec ( ${secs} )
    set wspfile = "data/${f}/${sec}.wsp"
    set frqfile = "stats/${f}/${ptag}/${sec}.frq"
    set fcmfile = "stats/${f}/${ptag}/${sec}.fcm"

    # Count the distinct non-empty extracted items, most frequent first.
    # "-k1,1nr" replaces the obsolete "+0 -1nr" key syntax, which
    # current sort implementations no longer accept.
    echo "${wspfile} -> ${frqfile}"
    cat ${wspfile} \
      | select-simple-words \
          -v complex=${complex} \
      | extract-components \
          -f get-components.gawk \
          -v select=${ptag} \
      | tr -d '{}' \
      | grep -E '.' \
      | sort | uniq -c | expand \
      | sort -b -k1,1nr \
      | compute-freqs \
      > ${frqfile}

    # Feed columns 1 and 3 of the .frq file to compute-cum-freqs.
    echo "${frqfile} -> ${fcmfile}"
    cat ${frqfile} \
      | gawk '/./{print $1, $3;}' \
      | compute-cum-freqs \
      > ${fcmfile}
  end

  # Compute total counts and frequencies over all sections
  set frqfile = "stats/${f}/${ptag}/tot.frq"
  set fcmfile = "stats/${f}/${ptag}/tot.fcm"

  # List of per-section .frq file names, one per line.
  echo ${secs} \
    | tr ' ' '\012' \
    | sed -e 's/$/.frq/' \
    > ${tmp}.ifiles

  echo "${frqfile}"
  ( cd stats/${f}/${ptag} && cat `cat ${tmp}.ifiles` ) \
    | gawk '/./{print $1, $3;}' \
    | combine-counts \
    | sort -b -k1,1nr \
    | compute-freqs \
    > ${frqfile}

  echo "${fcmfile}"
  cat ${frqfile} \
    | gawk '/./{print $1, $3;}' \
    | compute-cum-freqs \
    > ${fcmfile}

  # Assemble comparative tables
  foreach sec ( tot ${secs} )
    tabulate-pairs \
      ${ptag} ${sec} 24 24
  end

end

# Remove scratch files.
/bin/rm -f ${tmp}.*