This script does it: cat base.elt \ | sed \ -e 's/{ee/{X}{/g' \ -e 's/{[ceh]}/{E}/g' \ -e 's/{\(..*\)e}/{\1}{E}/g' \ -e 's/{[csi][h]}/{X}/g' \ -e 's/{[ci][ktpf][h]}/{X}/g' \ -e 's/{[ci][ktpf]}/{X}/g' \ -e 's/{[ktpf]}/{X}/g' \ -e 's/{[rlsn]}/{R}/g' \ -e 's/{[mdgj]}/{R}/g' \ -e 's/{[aoy]}/{O}/g' \ -e 's/{[q]}/{Q}/g' \ -e 's/{[i][i]*}/{I}/g' \ -e 's/{[ceh]}/{E}/g' \ -e 's/\([A-Z]\)/{\1}/g' \ > base.clt but this splitting makes the counts smaller hence more affected by error; and the tables get longer and harder to grok. Anyway, here is the script: cat base.txt \ | sed \ -e 's/ee/S/g' \ -e 's/[csi][h]/S/g' \ -e 's/[ci][ktpf][h]/G/g' \ -e 's/[ci][ktpf]/G/g' \ -e 's/[ktpf]/H/g' \ -e 's/[rlsn]/L/g' \ -e 's/[mdgj]/D/g' \ -e 's/[aoy]/O/g' \ -e 's/[q]/Q/g' \ -e 's/[i][i]*/I/g' \ -e 's/[ceh]/E/g' \ -e 's/\([A-Z]\)/{\1}/g' \ > base.flt Comparing pair frequencies: multicol -v titles="lin fig std non" {lin,fig,std,non}-${map}-${v}.frq \ > all-${map}-${v}.frq compare-freqs {lin,fig,std,non}-${map}-${v}.frq \ | tr ':' ' ' \ | sort +0.0 -0.1r +8b -9b +0b -1nr +4b -4nr +6b -7nr \ | gawk '/^[^#]/{if($9\!=c){print "";c=$9}} //{print}' \ > all-${map}-${v}.cmpfrq Relative frequency at breaks ---------------------------- The absolute frequency of each pair X-Y around line or word breaks is affected by the frequency of the constituent letters. So a more relevant quantity is the ratio (freq at line break)/(freq at word break). 
TO BE DONE foreach brk ( std lin ) cat ${brk}-elt.frq \ | gawk '//{$3=("{" $3 "}"); gsub(/:/, "}:{", $3); print $1,$3;}' \ | elt2slt \ | tr -d '{}' \ | combine-counts \ | sort -b +0 -1nr \ | compute-freqs \ > ${brk}-slt.frq end foreach brk ( std lin ) cat ${brk}-slt.frq \ | gawk '//{gsub(/^.*:/, "", $3); print $1,$3;}' \ | combine-counts \ | sort -b +0 -1nr \ | compute-freqs \ > ${brk}-aft-slt.frw cat ${brk}-slt.frq \ | gawk '//{gsub(/:.*$/, "", $3); print $1,$3;}' \ | combine-counts \ | sort -b +0 -1nr \ | compute-freqs \ > ${brk}-bef-slt.frw end foreach f ( sec-all{-bol,,-eol}.ect ) cat $f \ | gawk \ ' /../{ t += $1; print; } \ END { printf " %7s TOTAL\n", t; } \ ' \ > /tmp/$f end compare-counts /tmp/sec-all{-bol,,-eol}.ect \ | gawk \ ' /../{ \ M=($1+$3+2); B=($1+1)/M; X=M/($2+3); \ printf " %7s %7s %7s %7.4f %7.4f %s\n",$1,$2,$3,B,X,$4;} \ ' \ | sort -b +1 -2nr +3 -4nr \ > /tmp/cmp-counts.txt let's plot the bol/eol and end/mid ratios for the most significant entries: cat /tmp/cmp-counts.txt \ | gawk '($2 > 32){print;}' \ > /tmp/cmp-counts-sig.txt gnuplot < $f.epr cat $f.epr \ | gawk '/./{printf "%d %s\n", 10000*$2, $3;}' \ > /tmp/$f.eprx end JUNK--- Comparing element frequencies (×10000): b-o-l, overall, e-o-l cat `cat txt.pages | sed -e 's@^\(.*\)$@pages-evt/\1.els@'` \ | sed -e 's/^\({[^{}]*}\).*/\1/' \ | egrep '^{.*}$' \ | tr -d '{}' \ | egrep '.' \ | sort | uniq -c | expand \ | sort -b +0 -1nr \ > sec-all-bol.ect cat `cat txt.pages | sed -e 's@^\(.*\)$@pages-evt/\1.els@'` \ | sed -e 's/.*\({[^{}]*}\)$/\1/' \ | egrep '^{.*}$' \ | tr -d '{}' \ | egrep '.' 
\ | sort | uniq -c | expand \ | sort -b +0 -1nr \ > sec-all-eol.ect foreach f ( sec-all-bol sec-all sec-all-eol ) cat $f.ect \ | est-probs \ > $f.epr cat $f.epr \ | gawk '/./{printf "%d %s\n", 10000*$2, $3;}' \ > /tmp/$f.eprx end /usr/ucb/echo -n " "; \ echo "bol all eol b-d" | sed -e 's/ / /g' ;\ ( compare-counts /tmp/sec-all{-bol,,-eol}.eprx \ | gawk \ ' /--/{printf " %7s %7s %7s %7s %s\n",$1,$2,$3,$3,$4;next;} \ /../{printf " %7s %7s %7s %7.4f %s\n",$1,$2,$3,($1-$3)/(32*($2+1)),$4;} \ ' \ | sort -b +3 -4nr ) bol all eol b-d ------- ------- ------- ------- --------- 947 98 10 0.2958 p 5 . . 0.1562 c? 972 160 310 0.1285 s 1277 402 . 0.0990 q 896 333 10 0.0829 t 81 25 15 0.0793 f 376 176 5 0.0655 sh 5 2 . 0.0521 de 5 2 . 0.0521 x 1518 903 277 0.0429 d 15 11 . 0.0391 cph 66 64 . 0.0317 te 15 15 . 0.0293 cthe 1285 1689 122 0.0215 o 102 150 . 0.0211 she 274 480 10 0.0172 ch 36 60 8 0.0143 cth 10 21 . 0.0142 e? 254 571 10 0.0133 k 79 301 . 0.0082 che 13 118 . 0.0034 ke 8 53 5 0.0017 ckh 13 301 8 0.0005 ee 38 990 36 0.0001 a . . . 0.0000 '? . . . 0.0000 cf . . . 0.0000 cp . . . 0.0000 iiid . . . 0.0000 iis . . . 0.0000 ij . . . 0.0000 iph . . . 0.0000 ith . . . 0.0000 pe . 1 . 0.0000 cfhe . 1 . 0.0000 ck . 1 . 0.0000 ct . 1 . 0.0000 h? . 1 . 0.0000 id . 1 . 0.0000 iid . 1 . 0.0000 iil . 1 . 0.0000 ikh . 2 . 0.0000 iiir . 2 . 0.0000 il . 2 . 0.0000 is . 4 . 0.0000 cfh . 5 . 0.0000 cphe . 5 . 0.0000 i? . 18 . 0.0000 ckhe . 25 . 0.0000 eee 56 32 81 -0.0237 ? . 39 36 -0.0281 ir . 8 10 -0.0347 iir 180 711 1071 -0.0391 l 30 461 828 -0.0540 r 1429 1220 3859 -0.0622 y . 95 292 -0.0951 in . 333 1117 -0.1045 iin . 7 38 -0.1484 iiin . . 5 -0.1562 b . . 5 -0.1562 iiil . . 5 -0.1562 u 5 2 28 -0.2396 j 5 9 102 -0.3031 n . 1 23 -0.3594 g . 4 61 -0.3812 im . 1 28 -0.4375 iim 5 72 1587 -0.6772 m