#! /usr/bin/gawk -f
# Last edited on 2004-10-04 15:02:50 by stolfi

# Reads a text from standard input, one token per line, and tabulates
# its character-level Markov transitions of a given {order}.
#
# Usage:  gawk -f THIS_SCRIPT -v order=NUM < SAMPLE.tks > SAMPLE.pfr
#
# Input: the first field of each line is taken as a word. Words must not
# contain spaces or "_". A line whose first field is empty is a
# paragraph break; a run of consecutive blank lines counts as a single
# break, and blank lines before the first word are ignored.
#
# Output: one line per observed transition, in the format
#   "{COUNT} {FREQ} {STATE} {CHAR}"
# The {STATE} is a string of {order} B-language chars, prefixed by "$".
# The {FREQ} is {COUNT} divided by the total count for the {STATE}.
# The table includes word breaks, denoted by "_".
#
# NOTE: the lines are emitted in awk's "for (key in array)" order, which
# is unspecified; pipe the output through sort(1) if a particular order
# is needed.
#
# {order} must be an integer in the range 0..5.

BEGIN {
  abort = 0;
  usage = ( ARGV[0] " -v order=NUM < SAMPLE.tks > SAMPLE.pfr" );

  if (order == "") { arg_error("must define \"order\""); }
  # Require an unsigned decimal integer. A plain range test would let
  # values such as "3x" (compared as a string) or "2.5" slip through and
  # only fail at the very end with a misleading "bad state" message.
  if ((order !~ /^[0-9]+$/) || ((order + 0) > 5))
    { arg_error(("bad order = \"" order "\"")); }
  order = order + 0;

  split("", ct);    # {ct[st,ch]} is the count for char {ch} after state {st}.
  split("", totct); # {totct[st]} is the total count of state {st}.
  split("", chus);  # {chus[ch]} is 1 iff character {ch} is used.

  # Source of "_" padding; long enough for any valid {order}:
  underscores = "___________________________";

  # Initial state before each parag:
  inistate = substr(underscores, 1, order);
  # Current state (always the last {order} characters seen):
  st = inistate;
  # Count of successive blank lines; starts at 1 so that blank lines
  # at the beginning of the input are ignored:
  nblanklines = 1;
}

# Main rule: runs for every input line.
{
  # Each word is followed by a word break "_":
  w = ($1 "_");
  if (length(w) == 1)
    { # Blank line: paragraph break.
      nblanklines++;
      # Ignore multiple blank lines:
      if (nblanklines > 1) { next; }
      # The previous word left {st} ending with exactly one "_" (when
      # {order} > 0). Feed {max(1,order-1)} more "_"s, so that {st}
      # returns to {inistate}:
      if (order > 2) { w = substr(underscores, 1, order - 1); }
    }
  else
    { nblanklines = 0; }

  # Tally each character of {w} against the current state, then shift
  # the character into the state:
  m = length(w);
  for (i = 1; i <= m; i++)
    { ch = substr(w, i, 1);
      ct[st,ch]++;
      totct[st]++;
      chus[ch] = 1;
      # Append {ch} and drop the oldest character (yields "" if order is 0):
      st = substr((st ch), 2);
    }
}

END {
  # An "exit" executed in BEGIN still runs the END rule; do not print a
  # table (or a spurious "bad state" error) after an argument error.
  if (abort) { exit 1; }

  # Sanity check: the state must still have exactly {order} characters.
  if (length(st) != order) { prog_error(("bad state = \"" st "\"")); }

  for (st in totct)
    for (ch in chus)
      if ((st,ch) in ct)
        { printf "%7d %8.6f $%s %s\n", \
            ct[st,ch], ct[st,ch]/totct[st], st, ch;
        }
}

# Reports a command-line error {msg} and the usage line on stderr, sets
# {abort} so that the END rule is skipped, and exits with status 1.
function arg_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1
}

# Reports an internal consistency error {msg} on stderr, sets {abort},
# and exits with status 1.
function prog_error(msg)
{
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1
}