#! /usr/bin/gawk -f # Last edited on 2000-05-21 20:01:30 by stolfi BEGIN { abort = -1; usage = ( ARGV[0] \ " -v delims=XY\\\n" \ " [ -v separator=Z ]\\\n" \ " [ -v axiom=AXIOM ]\\\n" \ " [ -v ignorecounts=BOOL ] \\\n" \ " [ -v eps=NUMBER ] \\\n" \ " [ -v truncate=NUMBER ] \\\n" \ " < GRAMMAR.grx > WFREQS \\\n" \ ); # Reads a finite grammar from stdin, writes the words and their # computed probabilities to stdout. # # The "delims" parameter should be a pair of characters X and Y to be used # as open and close delimiters in the derived strings (default "«»"). # # The grammar is a set of rules of the form # # NTSYMB: # COUNT1 OTHER1... DEFN1 # COUNT2 OTHER2... DEFN2 # ... ... ... # # where NTSYMB is a non-terminal symbol, OTHER is zero or more optional # numeric fields (ignored), and DEFNi is an alternative to NTSYMB. # # The NTSYMB must be a string of [A-Za-z0-9()*+_] and must begin [A-Z(] # # The counts COUNTi must be nonnegative; they are implicitly # normalized to probabilities that add to 1. # # Each alternative DEFNi must be a sequence of symbols, separated by # the "separator" character (default "."), which must not contain # blanks or the two characters XY. Symbols in DEFNi that are not # defined by any rule are assumed to be terminal. Use the # separator alone to indicate the empty string as an alternative. # # If the "axiom" (root symbol) is not specified, the left-hand side # of the first rule is used by default. # # The output is a list of lines of the form # # PROB PHRASE # # where PHRASE is a terminal phrase derived from the root symbol by some # sequence of leftmost reductions, and PROB is the product of all # probabilities associated to the rules used in the derivation. # # The PHRASE and any substituted phrases inside it are # bracketed with the characters X and Y. # # If the grammar is structurally ambiguous, the same PHRASE may # appear more than once in the output (in spite of the bracketing). # # If "truncate" is positive, any derivation tree whose probabilit # is less than "truncate" is omitted from the output. (Use with care, # since many small numbers may add up to a sizabel probablity.) # # The script will loop forever if the grammar is recursive and # "truncate" is zero. # # Global variables: # "defn[firstrule[s]]" is the RHS of the last rule of non-terminal "s". # # "prob[k]" is the probability for "defn[k]". # # "defn[nextrule[k]]" is the next rule after "defn[k]" # of the same non-term symbol (or -1). split("", defn); split("", prob); split("", firstrule); split("", nextrule); nrules = 0; if (eps == "") { eps = 0; } if (truncate == "") { truncate = 0; } if (delims == "") { dop = "«"; dcl = "»"; delims = (dop dcl); } else if ( \ (length(delims) != 2) || match(delims, /[A-Za-z0-9_()*+\[\]\\ ]/) \ ) { arg_error("bad delims"); } else { dop = substr(delims,1,1); dcl = substr(delims,2,1); } dpat = ("[" delims "]"); etypat = ("[" dop "][" dcl "]"); if (separator == "") { separator = "."; } else { if ( \ (length(separator) != 1) || (separator == dop) || (separator == dcl) || match(separator, /[A-Za-z0-9_()*+\[\]\\ ]/) \ ) { arg_error("bad symbol separator"); } } spat = ("[" separator "]"); } (abort >= 0) { exit abort; } /^ *$/ { next; } /^ *[#]/ { next; } /^[A-Z(][A-Za-z0-9_()*+]*[ ]*[:][ ]*$/ { s = $0; gsub(/[ ]*[:][ ]*$/, "", s); if (s in firstrule) { grammar_error(("repeated symbol \"" s "\"")); } firstrule[s] = -1; if (axiom == "") { axiom = s; } cursymb = s; nsymb++; next; } /^ *[0-9.]/ { if (cursymb == "") { grammar_error("rule without head symbol"); } s = cursymb; if (NF < 2) { grammar_error("bad rule format"); } if (! match($1, /[0-9]*([0-9]|([0-9][.]|[.][0-9]))[0-9]*/)) { grammar_error("bad rule count field"); } if (ignorecounts) { ct = 1; } else { ct = $1; } # Add delimiters around symbols in RHS: def = $(NF); if (def ~ dpat) { grammar_error("rule uses reserved delims"); } def = (dop def dcl); gsub(spat, (dcl dop), def); gsub(etypat, "", def); if (def == "") { def = ( dop dcl ); } # printf "%s -> %s\n", s, def > "/dev/stderr"; # Store rule: nextrule[nrules] = firstrule[s]; defn[nrules] = def; prob[nrules] = ct; firstrule[s] = nrules; nrules++; next; } /./ { grammar_error("bad line format"); } END { if (abort >= 0) { exit abort; } if (axiom == "") { exit 0; } if (! (axiom in firstrule)) { arg_error(("no rule for axiom «" axiom "»")); } # Normalize probabilities printf "normalizing..." > "/dev/stderr"; for (s in firstrule) { k = firstrule[s]; sp = 0; nr = 0; while (k >= 0) { sp += prob[k]; nr++; k = nextrule[k]; } k = firstrule[s]; while (k >= 0) { prob[k] = (sp == 0 ? 1.0/nr : prob[k]/sp); k = nextrule[k]; } } printf "done\n" > "/dev/stderr"; # Enumerate language: top = 1; split("", hstack); hstack[top] = (dop axiom dcl); # Incompletely expanded phrases. split("", pstack); pstack[top] = 1; # Their probabilities. split("", xstack); xstack[top] = 2; # Index of leftmost non-terminal. split("", zstack); zstack[top] = length(axiom); # Its length. split("", astack); astack[top] = firstrule[axiom]; # Next rule that hasn't been used yet. while(top > 0) { # Enumerate all terminal phrases derived from "rstack[top]", times "pstack[top]". # Specifically, replace the non-terminal that starts at "xstack[top]" and has # length "zstack[top]" by each of its alternatives beginning with "defn[astack[top]]", # and recursively enumerate the sentences derived from that. # Convention: if "xstack[top] == 0", the phrase "hstack[top]" is terminal. h = hstack[top]; p = pstack[top]; x = xstack[top]; z = zstack[top]; a = astack[top]; # printf "debug: h = [%s] p = %9.7f x = %d z = %d a = %d\n", h,p,x,z,a > "/dev/stderr"; if (p < truncate) { # probability too small, ignore. top--; } else if (x == 0) { # Sentence is terminal: printf "%9.7f %s\n", p, h; top--; } else if (a == -1) { # All rules have been tried: top--; } else { # Replace symbol and recurse: astack[top] = nextrule[a]; h = (substr(h,1,x-1) defn[a] substr(h,x+z)); p *= prob[a]; top++; hstack[top] = h; pstack[top] = p; find_nt(h); xstack[top] = RSTART; zstack[top] = RLENGTH; astack[top] = RALT; } } } function find_nt(h, x,y,z,s,n,a,b) { # Finds the first non-terminal in "h", sets RSTART, RLENGTH like "match" # If there is a non-terminal, also set RALT to the index of its first rule # in "defn[]" and "prob[]". x = 1; n = length(h); a = substr(h,x,1); while (x < n) { # At this point "a" must be an open delimiter: if (a != dop) { prog_error(("phrase delims h = [" h "] x = " x)); } y = x+1; if (y > n) { prog_error(("phrase trunc h = [" h "] y = " y)); } b = substr(h,y,1); if (b == dop) { x = y; a = b; } else { # Now "b" is a close delim or symbol letter: while ( b != dcl) { y++; if (y > n) { prog_error(("missing close h = [" h "] x = " x " y = " y)); } b = substr(h,y,1); if (b == dop) { prog_error(("unexp open h = [" h "] y = " y)); } } z = y - x - 1; if (z > 0) { # Found a symbol (non-empty innermost bracketed string) s = substr(h,x+1,z); if (s in firstrule) { RSTART = x+1; RLENGTH = z; RALT = firstrule[s]; return; } } # This inner bracket was no good, find next open delim: x = y + 1; while ((x < n) && ((a = substr(h,x,1)) == dcl)) { x++; } } } RSTART = 0; RLENGTH = 0; RALT =-1; } function arg_error(msg) { printf "%s\n", msg > "/dev/stderr"; printf "usage: %s\n", usage > "/dev/stderr"; abort = 1; exit abort; } function grammar_error(msg) { printf "input grammar, line %d: %s\n", FNR, msg > "/dev/stderr"; abort = 1; exit abort; } function prog_error(msg) { printf "*** program error: %s\n", msg > "/dev/stderr"; abort = 1; exit abort; }