#! /usr/bin/gawk -f # Last edited on 2000-10-05 00:29:29 by stolfi # Prints to stdout a table of digraph occurences in stdin. # Also computes several derived tables (probabilities, entropies, etc.). # # Line breaks are ignored. The file is implicitly prefixed by a # user-given char, but is not padded at the end. Thus, if # N bytes are read, N digraphs are counted. # Externally defined parameters (with "-v VAR=VALUE"): # pad the character implicitly prefixed to the file. # chars string specifying the row and column ordering. # showentropy 1 to print entropy tables. # showstrangeness 1 to print strangeness/normalness tables. # Global variables: # n total digraph count # xn[x] num occurrences of letter x # xyn[x,y] num occurences of digraph xy function checksym(x, z) { # Checks whether character "x" is expected. # If not, prints warning. # Also allocates the table if this is the first occurence. if (!(x in xn)) { for (z in xn) { xyn[x,z] = 0; xyn[z,x] = 0; } xn[x] = 0 } if (!(x in mark)) { printf "Warning: undeclared character '%s'\n", x > "/dev/stderr"; mark[x] = 1; } } function setmarks(chars, mark, i,x) { # Sets mark[x] = true for every letter in chars split("", mark) for(i=1;i<=length(chars);i++) { x = substr(chars,i,1); if (x in mark) { printf "duplicate char '%s'\n", x; exit 1 } mark[x] = 1; } } BEGIN { n = 0; split("", xn); setmarks(chars, mark); if (length(pad) != 1) { printf "invalid padding char\"%s\"\n", pad > "/dev/stderr"; ok = 0; exit 1 } } /^$/ { if(die) exit 1; next; } /./ { if(die) exit 1; m = length($0); y = pad; checksym(y); for(i=1;i<=m;i++) { x = y; y = substr($0,i,1); checksym(y); xyn[x,y]++; xn[x]++; n++; } pad = y; } function fixchars(chars, xt, rows, temp, i, x) { # Sets rows[x] = 1 for every character x that has an entry in xt. # Returns a string consisting only of the defined characters, in the # order given by "chars". Characters that do not occur in "chars" are # appended at the end. temp = "" for (i=1;i 0 ? 0.5 : -0.5)) } function strangeness( \ n, xk, yk, xyk, \ fx,fy,fxy,fmax,fmin,fexp,tmax,tmin,tsum,texp \ ) { if ((xk == 0) || (yk == 0)) { return 0 } else { fx = xk/n; fy = yk/n; fxy = xyk/n; fmax = (fx < fy ? fx : fy); fexp = fx*fy; fmin = 0; if (fxy <= fmin) { return -1 } else if (fxy >= fmax) { return +1 } else { tmax = (fmax - fxy)/(fmax - fexp); tmin = (fxy - fmin)/(fexp - fmin); tsum = (log(tmin) - log(tmax))/log(2.0); if ( tsum > 0 ) { texp = exp(-2*tsum); return (1 - texp)/(1 + texp) } else { texp = exp( 2*tsum); return (texp - 1)/(texp + 1) } } } } function normalness(n, xk, yk, xyk, str) { str = strangeness(n, xk, yk, xyk); return 1 - str*str } END { printf "Digraph counts:\n"; printf "\n"; ptable(n, xn, xn, xyn, "%d", 5, chars) printf "\n"; if (showstrangeness != 0) { printf "Strangeness (× 99):\n"; printf "\n"; for (x in xn) xr[x] = pscale((1+0)/2, 99); for (y in xn) yr[y] = pscale((1+0)/2, 99); for (x in xn) for (y in xn) { xyr[x,y] = pscale((1+strangeness(n, xn[x], xn[y], xyn[x,y]))/2, 99); } ptable(50, xr, yr, xyr, "%d", 2, chars) printf "\n"; printf "Normalness (× 99):\n"; printf "\n"; for (x in xn) xr[x] = pscale(1.000, 99); for (y in xn) yr[y] = pscale(1.000, 99); for (x in xn) for (y in xn) { xyr[x,y] = pscale(normalness(n, xn[x], xn[y], xyn[x,y]), 99); } ptable(50, xr, yr, xyr, "d", 2, 2, chars) printf "\n"; } if (1) { printf "Next-symbol probability (× 99):\n"; printf "\n"; for (x in xn) xp[x] = pscale(1.000, 99); for (y in xn) yp[y] = pscale(xn[y]/n, 99); for (x in xn) for (y in xn) { xyp[x,y] = pscale(xyn[x,y]/xn[x], 99); } ptable(99, xp, yp, xyp, "%d", 2, chars) printf "\n"; printf "Previous-symbol probability (× 99):\n"; printf "\n"; for (x in xn) xp[x] = pscale(xn[x]/n, 99); for (y in xn) yp[y] = pscale(1.000, 99); for (x in xn) for (y in xn) { xyp[x,y] = pscale(xyn[x,y]/xn[y], 99); } ptable(99, xp, yp, xyp, "%d", 2, chars) printf "\n"; } if (showentropy != 0) { printf "Symbol entropy: " h = 0.000 for (x in xn) { h += entropy(xn[x]/n); } printf "%.3f\n\n", h; printf "Next-symbol entropy:\n" printf "\n"; for (x in xn) xh[x] = 0.000; for (y in xn) yh[y] = entropy(xn[y]/n); h = 0.000 for (x in xn) { for (y in xn) { xyh[x,y] = entropy(xyn[x,y]/xn[x]); xh[x] += xyh[x,y]; } h += xh[x] * (xn[x]/n); } ptable(h, xh, yh, xyh, "%.3f", 5, chars) printf "\n"; printf "Previous-symbol entropy:\n" printf "\n"; for (x in xn) xh[x] = entropy(xn[x]/n); for (y in xn) yh[y] = 0.000; h = 0.000 for (y in xn) { for (x in xn) { xyh[x,y] = entropy(xyn[x,y]/xn[y]); yh[y] += xyh[x,y]; } h += yh[y] * (xn[y]/n); } ptable(h, xh, yh, xyh, "%.3f", 5, chars); exit 0; } }