#! /usr/bin/gawk -f # Prints to stdout a table of pattern classifications, produced # by compare-freqs and compute-count-ratio. # Each line of input should have the format # "NT FT NL FL rat mk patt" # where NT,NL are counts, FT,FL are frequancies, # rat is the ratio NL/NT, patt is a pattern of the # form "x:y", and mk is a two-byte class marker. # Externally defined parameters (with "-vVAR=VALUE"): # colchars string specifying column ordering for single-char right patterns # rowchars string specifying row ordering for single-char left patterns # Global variables: # xn[x] 1 if string x occurred on left side # yn[y] 1 if string y occurred on right side # xyn[x,y] classification of pattern x:y # maxlen max length of left or right side of pattern (min 2) BEGIN { n = 0; split("", xn); clearcounts(rowchars, xn); split("", yn); clearcounts(colchars, yn); maxlen = 2 } function clearcounts(chars, zn, i,z) { # Sets zn[z] = 0 for every letter "z" in "chars" for(i=1;i<=length(chars);i++) { z = substr(chars,i,1); if (z in zn) { printf "duplicate char '%s'\n", x; n=-1; exit 1 } zn[z] = 0; } } function checkrow(x, z, len) { # Called for each input pair "x:y". # Checks whether left string "x" is known and sets "xn[x]" to 1.. # Prints warning if "x" is a char and was not predeclared with "-v rowchars=..." # If this is the first occurrence of "x" in the file, initializes the row. len = length(x) if (len == 0) { printf "bad left string '%s'\n", $0 > "/dev/stderr"; exit 1 } if (len > maxlen) maxlen = len; if ((!(x in xn)) && (len == 1)) { printf "Warning: undeclared row character '%s'\n", x; xn[x] = 0; } if (xn[x] == 0) { for (z in yn) { xyn[x,z] = ""; } xn[x] = 1 } } function checkcol(y, z, len) { # Called for each input pair "x:y". # Checks whether right string "y" is known and sets "yn[y]" to 1.. # Prints warning if "y" is a char and was not predeclared with "-v colchars=..." # If this is the first occurrence of "y" in the file, initializes the column. len = length(y) if (len == 0) { printf "bad right string '%s'\n", $0 > "/dev/stderr"; exit 1 } if (len > maxlen) maxlen = len; if ((!(y in yn)) && (length(y) == 1)) { printf "Warning: undeclared col character '%s'\n", y; yn[y] = 0; } if (yn[y] == 0) { for (z in xn) { xyn[z,y] = ""; } yn[y] = 1 } } /^[^#]/ { mk = $6; pt = $7; if(match(pt, /:/)) { x = substr(pt,1,RSTART-1); checkrow(x); y = substr(pt,RSTART+1); checkcol(y); xyn[x,y] = mk } else { printf "bad pattern %s\n", pt > "/dev/stderr"; exit 1 } } /^$/ { next; } function fixchars(chars, zt, temp, tmpt, i, z) { # Returns a string consisting only of the characters "z" of "chars" # that have "zt[z] == 1", in the order given by "chars". # Characters that have "zt[z] == 1" but do not occur in "chars" # are appended at the end. temp = "" split("", tmpt) for (i=1;i<=length(chars);i++) { z = substr(chars,i,1); if(! (z in zt)) { printf "internal error #67221 %s\n" > "/dev/stderr"; exit 1 } else if(zt[z] == 0) { delete zt[z] } else if(zt[z] == 1) { temp = (temp z); tmpt[z] = 1 } } for(z in zt) { if ((length(z) == 1) && (! (z in tmpt))) { temp = (temp z); } } return temp; } function pelem(mk, esz) { # Prints one entry of the mark table # Inputs: # mk = pattern classification # esz = formatted element size in bytes # colch = list of characters to use as colum indices printf " %*s", esz, (mk == "" ? "." : mk); } function prow(x, yt, xyt, esz, colch, \ y, ncols, j) { # Prints one row of the mark table # Inputs: # x = row label and index # yt = column indicators (0 = omit row) # xyt = pattern classifications # esz = formatted element size in bytes # colch = list of characters to use as colum indices ncols = length(colch) printf "%*s |", esz, x; for(j=1;j<=ncols;j++) { y = substr(colch,j,1); pelem(xyt[x, y], esz); } for(y in yt) { if (length(y) != 1) { pelem(xyt[x, y], esz); } } printf "\n"; } function ptable(xt, yt, xyt, esz, rowch, colch, \ x, y, nrows, i,j, temp) { # Prints the mark table # Inputs: # xt = row indicators (0 = omit row) # yt = column indicators (0 = omit row) # xyt = pattern classifications # esz = formatted element size in bytes # rowch = list of characters to use as row indices # colch = list of characters to use as colum indices # Reduce/complete "chars" to include all characters that occured one or more times: rowch = fixchars(rowch, xt); colch = fixchars(colch, yt); nrows = length(rowch) ncols = length(colch) printf "%*.*s ", esz, esz, " "; for(j=1;j<=ncols;j++) { y = substr(colch,j,1); printf " %*s", esz, y; } for(y in yt) { if(length(y) != 1) { printf " %*s", esz, y; } } printf "\n"; printf "%*.*s ", esz, esz, " "; for(j=1;j<=ncols;j++) { printf " %*.*s", esz, esz, "---------"; } for(y in yt) { if(length(y) != 1) { printf " %*.*s", esz, esz, "---------"; } } printf "\n"; for(i=1;i<=nrows;i++) { x = substr(rowch,i,1); prow(x, yt, xyt, esz, colch); } for(x in xt) { if (length(x) != 1) { prow(x, yt, xyt, esz, colch); } } } END { printf "Pattern classes:\n"; printf "\n"; ptable(xn, yn, xyn, maxlen, rowchars, colchars); printf "\n"; exit 0; }