#! /n/gnu/bin/gawk -f
# Last edited on 2000-02-04 16:36:01 by stolfi

# Reads from stdin a bunch of lines of the form
#
#   NT PROB DEFN
#
# where NT is a non-terminal symbol, PROB is a probability
# (not necessarily normalized) and DEFN a definition for NT.
# Outputs the same grammar in "grx" format:
#
#   NT:
#     PROB DEFN-with-fields-joined-by-"."
#
# The "NT:" header is printed only when NT differs from the one on the
# previous rule line.  An empty DEFN is written as "=".
# Comment lines (starting with "#") and blank lines are copied through.

BEGIN {
  # abort < 0 means "no error so far"; error() sets it to the exit status.
  abort = -1;
  usage = ( ARGV[0] " < INFILE.fcm > OUTFILE.grm" );

  # The script takes no command-line arguments; input comes from stdin.
  if (ARGC != 1) { error(("usage: " usage)); }

  # Non-terminal of the most recently printed "NT:" header.
  onsy = "";
}

# If an error was flagged earlier, stop processing input at once.
(abort >= 0) { exit abort; }

# Pass comment lines and blank lines through unchanged.
/^ *([#]|$)/ { print; next; }

# Main rule: "NT PROB DEFN...".
# PROB must be followed by a space or by the end of the line; the latter
# lets a rule with an empty definition be written without a trailing blank
# (it is emitted as "=" below).
/^[-_:A-Za-z0-9]+ +[0-9]+[.]?[0-9]*( |$)/ {
  nsy = $1;
  prb = $2;

  # Join the definition fields with "." as separator.
  def = "";
  sep = "";
  for (i = 3; i <= NF; i++) {
    def = (def sep $(i));
    sep = ".";
  }
  if (def == "") { def = "="; }

  # Start a new group whenever the non-terminal changes.
  if (nsy != onsy) {
    printf "%s:\n", nsy;
    onsy = nsy;
  }
  printf " %7s %s\n", prb, def;
  next;
}

# Fallback for lines with three or more fields that the main rule rejected:
# the first field is taken as a count/probability and the last field as the
# definition, which is stripped of "-" and blanks and has "." turned into
# spaces.  The line is printed as "NT COUNT DEFN" using the non-terminal
# remembered from an earlier line.
# NOTE(review): this output layout differs from the "grx" layout produced by
# the main rule; it looks like a leftover from a related script -- confirm
# whether it is still wanted.
(NF >= 3) {
  frq = $1;
  def = $(NF);
  if (! match(frq, /^[0-9]+[.]?[0-9]*$/)) {
    error(("line " FNR ": bad count/prob format \"" $0 "\""));
  }
  gsub(/[- ]/, "", def);
  gsub(/[.]/, " ", def);
  if (nsy == "") {
    error(("line " FNR ": missing non-terminal"));
  }
  printf "%-7s %7s %s\n", nsy, frq, def;
  next;
}

# Anything that reached this point matched none of the rules above.
// { error(("line " FNR ": bad format \"" $0 "\"")); }

# Prints msg to standard error, records the failure in the global `abort`
# (checked by the guard rule above), and exits with status 1.
function error(msg) {
  printf "%s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1;
}