#! /usr/bin/gawk -f
# Last edited on 2004-10-03 20:46:51 by stolfi

BEGIN {
  usage = ( ARGV[0] " < SAMPLE.tks > SAMPLE.lfr" );

  # Reads a text, one token per line. Words must not contain spaces or
  # "_". Outputs a table of character counts, in the format
  #   "{COUNT} {FREQ} {CHAR}"
  # sorted by decreasing {COUNT}. The table includes word
  # breaks, denoted by "_". Blank lines (including lines that
  # contain only whitespace) are ignored.
  #
  # {FREQ} is {COUNT} divided by the total number of characters
  # counted, word breaks included.

  # c[CHAR] = number of occurrences of CHAR seen so far.
  split("", c);

  # Total number of characters counted (sum of all c[] entries).
  totct = 0;

  # Ask gawk to traverse arrays by decreasing numeric value, so that the
  # "for (x in c)" loop in END emits the table sorted by decreasing count
  # as documented above. This needs a gawk that supports "sorted_in"
  # (4.0 or later); older versions are expected to treat this as an
  # ordinary array assignment and emit the table in unspecified order.
  PROCINFO["sorted_in"] = "@val_num_desc";
}

# Process every line that has at least one field. Using NF (rather than
# the pattern /./) also skips whitespace-only lines, which would otherwise
# contribute a spurious "_" word break for an empty word.
NF > 0 {
  # Only the first field is used; append "_" to mark the word break.
  w = ($1 "_");
  m = length(w);
  for (i = 1; i <= m; i++) {
    c[substr(w, i, 1)]++;
    totct++;
  }
}

END {
  # If no characters were counted, c is empty and nothing is printed,
  # so the division by totct below is never reached with totct == 0.
  for (x in c) {
    printf "%7d %8.6f %s\n", c[x], c[x]/totct, x;
  }
}