#! /usr/bin/gawk -f
# Usage: $0 [ -v chars=XXXX ] < NGRAMFILE > PTABLES

# Prints to stdout a table of transition counts.
# Also computes several derived tables (probabilities, entropies, etc.).
# Each line of input should consist of N>1 characters, of which
# the first N-1 are interpreted as a "state" and the last one as a "transition".
# Each table has one row for each state and one column for each transition.
#
# Diagnostics (undeclared or duplicate characters) are written to stderr
# so that they never end up inside the tables written to stdout.

# Externally defined parameters (with "-vVAR=VALUE"):
#   chars       string specifying the ordering of transition chars

# Global variables:
#   n           total number of input n-grams (state/transition pairs)
#   xn[x]       num occurrences of state x
#   yn[y]       num occurrences of transition y
#   xyn[x,y]    num occurrences of pair (x,y)
#   mark[y]     set for every transition char declared in "chars"
#               (or already reported as undeclared)
#   maxslen     max state length

BEGIN {
  n = 0;
  split("", xn);    # split("", a) is the portable way to empty an array
  split("", yn);
  setmarks(chars, mark);
  maxslen = 1;
}

# Registers state x if it has not been seen before: creates a zero count
# for x paired with every known transition, a zero total in xn[x], and
# updates maxslen.  Does nothing for an already-known state.
#   x      the state string
#   z, m   locals
function checkstate(x,    z, m)
{
  if (x in xn) { return; }
  for (z in yn) { xyn[x,z] = 0; }
  xn[x] = 0
  m = length(x);
  if (m > maxslen) { maxslen = m; }
}

# Registers transition y if it has not been seen before (zero count for
# every known state, zero total in yn[y]).  Independently, warns once on
# stderr if y was not declared in "chars", then marks it so the warning
# is not repeated.
#   y      the transition character
#   z      local
function checktrans(y,    z)
{
  if (!(y in yn))
    {
      for (z in xn) { xyn[z,y] = 0; }
      yn[y] = 0
    }
  if (!(y in mark))
    {
      # Warning goes to stderr: stdout carries the generated tables.
      printf "Warning: undeclared character '%s'\n", y > "/dev/stderr";
      mark[y] = 1;
    }
}

# Sets mark[y] = true for every letter in chars.
# Reports a duplicated letter on stderr and exits with status 1.
# NOTE: when this runs from BEGIN, awk still executes any END rule
# as part of the "exit".
#   chars      string of transition characters
#   mark       (out) array, cleared and then filled
#   i, y, len  locals
function setmarks(chars, mark,    i, y, len)
{
  split("", mark)
  len = length(chars);
  for (i = 1; i <= len; i++)
    {
      y = substr(chars, i, 1);
      if (y in mark)
        {
          printf "duplicate char '%s'\n", y > "/dev/stderr";
          exit 1
        }
      mark[y] = 1;
    }
}

# NOTE(review): the text of fixchars is truncated in this copy of the file --
# it breaks off inside the "for" header below and the remainder of the
# function is missing.  The fragment is kept exactly as found; restore the
# full definition from the original source.
function fixchars(chars, yt, rows, temp, i, x) {
  # Sets rows[y] = 1 for every character y that has an entry in yt.
  # Returns a string consisting only of the defined characters, in the
  # order given by "chars". Characters that do not occur in "chars" are
  # appended at the end.
  temp = ""
  for (i=1;i 0 ?
0.5 : -0.5)) }
# NOTE(review): the line above is the tail of a definition whose beginning
# is missing from this copy of the file; it is kept exactly as found.

# Non-empty input line: everything but the last character is the state,
# the last character is the transition.  Tally the pair.
/./ {
  w = $0;
  m = length(w);
  x = substr(w, 1, m-1); checkstate(x);
  y = substr(w, m, 1);   checktrans(y);
  xyn[x,y]++;
  xn[x]++;
  yn[y]++;
  n++;
  next;
}

# Any other (i.e. empty) line is ignored.
// { next; }

# Prints the entropy summaries and the two tables.
# entropy(), ptable() and pscale() are defined elsewhere in the file
# (not visible here); their exact contracts should be checked there.
END {
  # Sum of entropy() terms over the state frequencies.
  printf "State entropy: "
  h = 0.000
  for (x in xn) { h += entropy(xn[x]/n); }
  printf "%.3f\n\n", h;

  # Sum of entropy() terms over the transition frequencies.
  printf "Transition entropy: "
  h = 0.000
  for (y in yn) { h += entropy(yn[y]/n); }
  printf "%.3f\n\n", h;

  # xh[x]: sum of entropy() terms over the transitions out of state x;
  # h: the xh[x] values averaged with weights xn[x]/n.
  for (x in xn) xh[x] = 0.000;
  h = 0.000
  for (x in xn)
    {
      for (y in yn) { xh[x] += entropy(xyn[x,y]/xn[x]); }
      h += xh[x] * (xn[x]/n);
    }

  printf "Transition counts:\n";
  printf "\n";
  ptable(n, h, xn, xh, yn, xyn, "%d", 5, chars)
  printf "\n";

  printf "Transition probabilities (× 99):\n";
  printf "\n";
  for (y in yn) yp[y] = pscale(yn[y]/n, 99);
  for (x in xn)
    for (y in yn)
      { xyp[x,y] = pscale(xyn[x,y]/xn[x], 99); }
  ptable(n, h, xn, xh, yp, xyp, "%d", 2, chars)
  printf "\n";

  # Deliberately no "exit 0" here: awk runs END even after an "exit 1"
  # issued from BEGIN (duplicate character in "chars"), and an explicit
  # "exit 0" would overwrite that failure status.  Falling off the end
  # keeps an earlier exit status and yields 0 on a normal run.
}