#! /bin/csh -f
# Last edited on 2000-01-22 21:09:08 by stolfi

set usage = "$0 FTAG"

# Gathers statistics about a specific feature of Voynichese words
# (identified by the tag FTAG). Assumes available the following files:
#
#   text-subsecs/all.names
#     a list of names of subsections
#
#   text-subsecs/XXX.evt
#     the text of subsection XXX (where XXX is found in all.names),
#     in EVMT format.
#
#   extract-FTAG-strings
#     a filter that takes a stream of Voynichese words, one per line,
#     and outputs the corresponding value(s) of the FTAG feature,
#     one per line. Empty lines in the output will be ignored.
#     Will be called with "-f factor-text.gawk"
#
#   unit-to-type.tbl
#     a table that maps each unit number (e.g. f82v.P3) to the
#     corresponding text type (e.g. "parags", "labels", etc.)
#
# Creates the following files:
#
#   stats-subsecs/FTAG/XXX.frq
#     counts and frequencies of the FTAG values in section XXX,
#     for each XXX. Also the overall counts and frequencies
#     when XXX="tot".
#
#   stats-subsecs/FTAG/XXX.fcm
#     same as XXX.frq, with extra columns for cumulative counts
#     and frequencies.
#
#   stats-subsecs/FTAG/all-cmp.cts
#   stats-subsecs/FTAG/all-cmp.frq
#     a table showing the counts and frequencies, respectively, of each
#     FTAG value in each section, side by side.
#
#   stats-subsecs/FTAG/all-cmp.top
#     a table showing the FTAG values that occur in each section,
#     and their frequencies, sorted by the latter, side by side.
#
# NOTE(review): tabulate-frequencies is also run a second time with
# "-out some" on a fixed list of sections; presumably that writes a
# parallel set of some-cmp.* tables, which are neither listed above nor
# removed by the cleanup step below -- confirm against that tool.

# Exactly one argument (the feature tag) is required.
if ( $#argv != 1 ) then
  echo "usage: ${usage}"; exit 1
endif

set ftag = "$1"; shift;

# Prefix for scratch files; all of them are removed at the end.
set tmp = "/tmp/$$"

# Create the output directory, or purge results of a previous run.
# "mkdir -p" also creates the parent stats-subsecs directory if missing.
if ( ! ( -d stats-subsecs/${ftag} ) ) then
  mkdir -p stats-subsecs/${ftag}
else
  echo "cleaning out stats-subsecs/${ftag}"
  ( cd stats-subsecs/${ftag}/ && /bin/rm -f *.frq *.fcm all-cmp.* )
endif

# Gather statistics by section

foreach f ( `cat text-subsecs/all.names` )
  set frqfile = "stats-subsecs/${ftag}/${f}.frq"
  set fcmfile = "stats-subsecs/${ftag}/${f}.fcm"
  echo ${frqfile}
  # Select the wanted unit types, split into words, drop words that
  # contain "?" or "*", map each word to its feature value(s), discard
  # empty lines, count the distinct values, and order them by
  # decreasing count (first field, numeric, reversed) before handing
  # them to compute-freqs.
  cat text-subsecs/${f}.evt \
    | select-units \
        -v types='parags,starred-parags,circular-lines,circular-text,radial-lines,titles' \
        -v table=unit-to-type.tbl \
    | words-from-evt \
    | grep -E -v '[?*]' \
    | extract-${ftag}-strings \
        -f factor-text.gawk \
    | grep -E '.' \
    | sort | uniq -c | expand \
    | sort -b -k1,1nr \
    | compute-freqs \
    > ${frqfile}
  # Feed fields 1 and 3 of every non-empty .frq line to compute-cum-freqs.
  cat ${frqfile} \
    | gawk '/./{print $1, $3;}' \
    | compute-cum-freqs \
    > ${fcmfile}
end

# Compute total counts and frequencies over all sections

set frqfile = "stats-subsecs/${ftag}/tot.frq"
set fcmfile = "stats-subsecs/${ftag}/tot.fcm"
echo "${fcmfile}"
# List of the per-section .frq file names, one per line.
cat text-subsecs/all.names \
  | sed -e 's/$/.frq/' \
  > ${tmp}.ifiles
# Concatenate the per-section tables, merge them with combine-counts,
# and re-sort by decreasing first field before computing frequencies.
( cd stats-subsecs/${ftag} && cat `cat ${tmp}.ifiles` ) \
  | gawk '/./{print $1, $3;}' \
  | combine-counts \
  | sort -b -k1,1nr \
  | compute-freqs \
  > ${frqfile}
cat ${frqfile} \
  | gawk '/./{print $1, $3;}' \
  | compute-cum-freqs \
  > ${fcmfile}

# The plot runs in the background; the script does not wait for it.
plot-histogram ${fcmfile} &

# Assemble comparative tables

# Table over "tot" plus every section named in all.names.
cat text-subsecs/all.names \
  > ${tmp}.ifiles
tabulate-frequencies \
  -dir stats-subsecs/${ftag} \
  -out all \
  -title "feature" \
  -maxLines 999999 \
  tot `cat ${tmp}.ifiles`

# Table over "tot" plus a fixed selection of sections (one name per line).
echo "pha.2 hea.1 cos.2 zod.1 heb.1 str.2 bio.1" \
  | tr ' ' '\012' \
  > ${tmp}.sfiles
tabulate-frequencies \
  -dir stats-subsecs/${ftag} \
  -out some \
  -title "feature" \
  -maxLines 999999 \
  tot `cat ${tmp}.sfiles`

# Remove the scratch files.
/bin/rm ${tmp}.*