#! /bin/csh -f
# Last edited on 2000-06-08 04:11:03 by stolfi

set usage = "$0 CTAG"

# Gathers statistics about components of Voynichese words
# (identified by the tag CTAG).  Assumes the following files are
# available:
#
#   data/{words,labels}/all.names
#     a list of names of all subsections
#
#   data/{words,labels}/XXX.wsp
#     the words/labels of subsection XXX (for each XXX in
#     all.names), in EVA encoding, factored into QOKOKOKOKO
#     elements, with mantle and core set off with "()" and "<>".
#
# Creates the following files:
#
#   stats/{words,labels}/CTAG/XXX.frq
#     counts and frequencies of the CTAG components in section XXX,
#     for each XXX.  Also the overall counts and frequencies,
#     when XXX="tot".
#
#   stats/{words,labels}/CTAG/XXX.fcm
#     same as XXX.frq, with extra columns for cumulative counts
#     and frequencies.
#
# Relies on these external commands being reachable from the current
# directory / PATH: extract-components (given get-components.gawk),
# compute-freqs, compute-cum-freqs, combine-counts, and gawk.
#

if ( $#argv != 1 ) then
  echo "usage: ${usage}"; exit 1
endif

set ctag = "$1"; shift;

# Pass 1: per-section statistics, for labels and then for words.
foreach f ( labels words )

  # Start from an empty output directory: create it if missing,
  # otherwise remove the results of any previous run.
  if ( ! ( -d stats/${f}/${ctag} ) ) then
    mkdir -p stats/${f}/${ctag}
  else
    echo "cleaning out stats/${f}/${ctag}"
    ( cd stats/${f}/${ctag}/ && /bin/rm -f *.frq *.fcm all-cmp.* )
  endif

  # Gather statistics by section

  set secs = ( `cat data/${f}/all.names` )

  foreach sec ( ${secs} )
    set wspfile = "data/${f}/${sec}.wsp"
    set frqfile = "stats/${f}/${ctag}/${sec}.frq"
    set fcmfile = "stats/${f}/${ctag}/${sec}.fcm"

    # Extract the selected components, drop empty lines, count the
    # distinct values, and list them by decreasing count.
    # "-k1,1nr" is the POSIX spelling of the obsolete "+0 -1nr" key.
    echo "${wspfile} -> ${frqfile}"
    cat ${wspfile} \
      | extract-components \
          -f get-components.gawk \
          -v "select=${ctag}" \
      | grep '.' \
      | sort | uniq -c | expand \
      | sort -b -k1,1nr \
      | compute-freqs \
      > ${frqfile}

    # Keep fields 1 and 3 of each non-empty .frq line (presumably the
    # count and the component -- confirm against compute-freqs) and
    # derive the cumulative table from them.
    echo "${frqfile} -> ${fcmfile}"
    cat ${frqfile} \
      | gawk '/./{print $1, $3;}' \
      | compute-cum-freqs \
      > ${fcmfile}
  end
end

# Pass 2: compute total counts and frequencies over all sections

foreach f ( labels words )
  set frqfile = "stats/${f}/${ctag}/tot.frq"
  set fcmfile = "stats/${f}/${ctag}/tot.fcm"

  # Re-read the section list for THIS category.  Reusing the value left
  # over from pass 1 would apply the "words" section names to "labels".
  set secs = ( `cat data/${f}/all.names` )

  # Names of the per-section .frq files, relative to the output
  # directory.  Built in a shell variable so that no scratch file in
  # /tmp is needed.
  set ifiles = ( )
  foreach sec ( ${secs} )
    set ifiles = ( ${ifiles} "${sec}.frq" )
  end

  # The trailing /dev/null adds nothing to the output; it only keeps
  # cat from reading standard input when the section list is empty.
  echo "${frqfile}"
  ( cd stats/${f}/${ctag} && cat ${ifiles} /dev/null ) \
    | gawk '/./{print $1, $3;}' \
    | combine-counts \
    | sort -b -k1,1nr \
    | compute-freqs \
    > ${frqfile}

  echo "${fcmfile}"
  cat ${frqfile} \
    | gawk '/./{print $1, $3;}' \
    | compute-cum-freqs \
    > ${fcmfile}
end