#! /bin/csh -f

set usage = "$0 [ -recode FILTER ] [ -chars CHARS ] INFILE OUTPREFIX"

# Extracts a list of words from a single-version transcription file
# in EVT format, arbitrary alphabet.
#
# Options:
#   -recode FILTER   command that the comment-stripped text is piped
#                    through (default "/bin/cat", i.e. no recoding).
#   -chars CHARS     the set of "good" characters (default: the ASCII
#                    letters and digits).
#
# First, removes all comments, location codes, the fillers "!" "%",
# and any leading and trailing blanks (spaces, dots, and commas).
# Replaces embedded strings of blanks by a single ".".
#
# The comment-stripped text is piped through the specified filter, if given.
#
# Then inserts a newline after each "=" and "/", if there is none.
# Inserts a "/" before each newline, if there is none.
# Maps "," and " " to ".".
# Replaces strings of ".-/=" characters by a single copy of the
# highest character of the string, in that order.
# Deletes any leading "." or "-".
# Deletes any leading "/" unless it is the only char in the line.
# Maps "*" to "?", and shortens any run of four or more "?" to "???".
#
# What results is the "base text".
#
# Writes a bunch of files with names beginning in OUTPREFIX:
#
#   OUTPREFIX.txt
#     A readable version of the base text, with "." and "-" replaced by " ",
#     "/" omitted (implied by newlines), "=" replaced by " =".
#
#   OUTPREFIX.wds
#     List of all words, one per line, in original order.
#     The codes "-" "=" "/" are treated as separate words.
#     The code "." is treated as a word separator but omitted.
#
#   OUTPREFIX.dic
#     The set of all words, including "/" "=" "-", sorted and uniquified.
#
#   OUTPREFIX.frq
#     Frequency counts for all words, in decreasing freq order.
#
#   OUTPREFIX-gut.wds
#   OUTPREFIX-gut.dic
#   OUTPREFIX-gut.frq
#     Same as OUTPREFIX.wds, OUTPREFIX.dic, and OUTPREFIX.frq,
#     but including only the "good" words (those made entirely of CHARS).
#
#   OUTPREFIX-bad.wds
#   OUTPREFIX-bad.dic
#   OUTPREFIX-bad.frq
#     Ditto, but including only the "bad" words (those that contain
#     one or more "?"s).
#
#   OUTPREFIX-fun.wds
#   OUTPREFIX-fun.dic
#   OUTPREFIX-fun.frq
#     Ditto, but including only the "funny" words (those
#     that do not contain "?" but contain some non-CHARS).
#
# The CHARS argument must be a fully expanded string (no ranges,
# wildcards, etc). It may include "=" "-" "/" but not "?" or ".".
# NOTE: CHARS is pasted verbatim into grep bracket expressions below,
# so a "-" should be given as the LAST character of CHARS; anywhere
# else grep would read it as a range operator.

set chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
set wc = "${STOLFIHOME}/bin/dicio-wc"
set recode = "/bin/cat"

# Parse the leading "-option VALUE" pairs.
while ( ( $#argv > 0 ) && ( "x$1" =~ x-* ) )
  if ( ( $#argv >= 2 ) && ( "x$1" == "x-chars" ) ) then
    set chars = "$2"; shift; shift
  else if ( ( $#argv >= 2 ) && ( "x$1" == "x-recode" ) ) then
    set recode = "$2"; shift; shift
  else
    echo "usage: ${usage}"; exit 1
  endif
end

# Exactly two positional arguments must remain: INFILE and OUTPREFIX.
if ( $#argv < 2 ) then
  echo "usage: ${usage}"; exit 1
endif

set infile = "$1"; shift;
set prefix = "$1"; shift;

if ( $#argv > 0 ) then
  echo "usage: ${usage}"; exit 1
endif

# Scratch file that holds the base text.  Created with mktemp so that
# its name is neither predictable nor shared with another process.
set temp = `mktemp /tmp/words-from-evt.XXXXXX`
if ( "x${temp}" == "x" ) then
  echo "$0: could not create a temporary file"; exit 1
endif

# Remove the scratch file if the script is interrupted.
onintr cleanup

# Build the base text:
#   sed #1: strip comments, location codes and fillers; trim and
#           normalize blanks to ".".
#   recode: user-supplied filter.
#   sed #2 + tr: break the line after each "=" and "/".
#   sed #3: terminate every line with "/", collapse delimiter runs,
#           drop leading delimiters, normalize "*" and long "?" runs.
cat "${infile}" \
  | sed \
      -e '/^#/d' \
      -e 's/{[^}]*}//g' \
      -e 's/^<[^>]*;[A-Z][A-Za-z0-9]*> *//g' \
      -e 's/[\!%]//g' \
      -e 's/^[., ][., ]*//g' \
      -e 's/[., ][., ]*$//g' \
      -e 's/[., ][., ]*/./g' \
  | ${recode} \
  | sed \
      -e 's/[., ][., ]*/./g' \
      -e 's:=\(.\):= \1:g' \
      -e 's:/\(.\):/ \1:g' \
  | tr ' ' '\012' \
  | sed \
      -e 's:$:/:g' \
      -e 's/[.]*-[-.]*/-/g' \
      -e 's:[-.]*/[-./]*:/:g' \
      -e 's:[-./]*=[-./=]*:=:g' \
      -e 's/^[-.]*//g' \
      -e 's:^/\(.\):\1:g' \
      -e 's/[*]/?/g' \
      -e 's/?????*/???/g' \
  > ${temp}

# Readable text: delimiters become blanks, "/" is dropped, "=" gets a
# blank in front of it, and empty lines are discarded.
cat ${temp} \
  | sed \
      -e 's/[-.]/ /g' \
      -e 's:/::g' \
      -e 's:^ *::g' \
      -e 's: *$::g' \
      -e 's/\(.\)=/\1 =/g' \
  | grep -E '.' \
  > ${prefix}.txt

# Word list: "-" "=" "/" are words on their own, "." only separates.
cat ${temp} \
  | sed -e 's:\([-=/]\): \1 :g' \
  | /bin/tr '. ' '\012\012' \
  | grep -E '.' \
  > ${prefix}.wds

# Good words: made entirely of CHARS.
cat ${prefix}.wds \
  | grep -E '^['"${chars}"']*$' \
  > ${prefix}-gut.wds

# Funny words: no "?" but at least one character outside CHARS.
cat ${prefix}.wds \
  | grep -v '?' \
  | grep -E '[^'"${chars}"']' \
  > ${prefix}-fun.wds

# Bad words: contain at least one "?".
cat ${prefix}.wds \
  | grep '?' \
  > ${prefix}-bad.wds

# Dictionary and frequency table for the full list and each subset.
foreach mod ( '' '-gut' '-fun' '-bad' )
  cat ${prefix}${mod}.wds \
    | sort | uniq \
    > ${prefix}${mod}.dic
  # "-k1,1nr" replaces the obsolete "+0 -1nr" key syntax, which
  # current versions of sort take for a file name.
  cat ${prefix}${mod}.wds \
    | sort | uniq -c | expand \
    | sort -k1,1nr \
    > ${prefix}${mod}.frq
end

# Summary counts for the generated files.
${wc} ${prefix}.txt ${prefix}{,-gut,-fun,-bad}.{wds,dic}

echo "Sample from ${prefix}.txt:"
echo " "
cat ${prefix}.txt | head -8 | sed -e 's/^/  /'
echo " "

# Digraph statistics of the base text.
cat ${temp} \
  | count-digraph-freqs \
      -v pad="/" \
      -v showentropy=1 \
      -v chars="./=${chars}?-"

/bin/rm -f ${temp}
exit 0

# Reached only through "onintr": discard the scratch file and fail.
cleanup:
  /bin/rm -f ${temp}
  exit 1