#! /usr/bin/gawk -f
# Last edited on 2004-01-20 03:21:19 by stolfi

# Enumerates prefix/middle/suffix splittings of factored words.
#
# Input is a word frequency file, with fields {COUNT WORD},
# where {WORD} is factored into elems, each delimited by braces.
#
# For each input record, writes zero or more records with fields
# {COUNT MID PRE SUF} where {PRE·MID·SUF = WORD}.
#
# Only generates combinations where the {MID} field has at least
# {minmid} elems; {PRE} has between 1 and {maxpre} elems, and
# similarly for {SUF}.
#
# Input lines that do not start with (optionally blank-padded) digits
# are silently skipped.

BEGIN {
  usage = ( ARGV[0] "\\\n" \
    " -v maxpre=NUM -v minmid=NUM -v maxsuf=NUM \\\n" \
    " < INFILE.wfr > OUTFILE.pms" \
  );

  # {abort} stays negative while everything is OK; the error
  # routines set it to the desired exit status.
  abort = -1;

  # Defaults for parameters not given with "-v".
  # NOTE: {bias} is defaulted here but is not read by any rule or
  # function in this script.
  if (bias == "") { bias = 1; }
  if (maxpre == "") { maxpre = 2; }
  if (minmid == "") { minmid = 4; }
  if (maxsuf == "") { maxsuf = 2; }

  # Reject non-numeric or negative limits up front; a non-numeric
  # value would otherwise turn the loop tests below into string
  # comparisons.
  check_count("maxpre", maxpre);
  check_count("minmid", minmid);
  check_count("maxsuf", maxsuf);
}

# Once an error has been flagged, stop processing input.
(abort >= 0) { exit abort; }

# Data record: {COUNT WORD}.
/^ *[0-9]/ {
  if (NF != 2) { data_error(("bad NF = " NF)); }
  ct = $1; wd = $2;
  if (! match(wd, /^[{].*[}]$/))
    { data_error(("unfactored word = «" wd "»")); }

  # Strip the outermost braces, then split at each "}{" boundary,
  # leaving the bare elems in {fld[1..n]}.
  wd = substr(wd, 2, length(wd)-2);
  n = split(wd, fld, /[}][{]/);

  # Need at least one elem each for {PRE} and {SUF}, plus {minmid}.
  if (n < minmid + 2) { next; }

  # {i} = elems in {PRE}, {j} = elems in {SUF}, {m} = elems in {MID}.
  for (i = 1; i <= maxpre; i++)
    { for (j = 1; j <= maxsuf; j++)
        { m = n - i - j;
          if (m >= minmid)
            { printf "%7d ", ct;
              output_part(i, m);
              printf " ";
              output_part(0, i);
              printf " ";
              output_part(i+m, j);
              printf "\n";
            }
        }
    }
  next;
}

function output_part(skp,num,   k)
{
  # Prints elems {fld[skp+1..skp+num]} to standard output, each
  # re-wrapped in braces, with no separators and no newline.
  for (k = 1; k <= num; k++) { printf "{%s}", fld[skp+k]; }
}

function check_count(name,val)
{
  # Fails through {arg_error} unless {val} is a non-negative whole
  # number; {name} is the parameter name used in the message.
  if ((val != int(val)) || (val < 0))
    { arg_error(("bad " name " = \"" val "\" (must be a non-negative integer)")); }
}

function data_error(msg)
{
  # Reports a malformed input record (with its line number) on
  # standard error and terminates with status 1.
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  abort = 1; exit 1;
}

function arg_error(msg)
{
  # Reports a bad command line parameter on standard error,
  # followed by the usage summary, and terminates with status 1.
  printf "%s\n", msg >> "/dev/stderr";
  printf "usage: %s\n", usage >> "/dev/stderr";
  abort = 1; exit 1;
}