#! /usr/bin/gawk -f
# Last edited on 2000-06-09 15:32:15 by stolfi

# under development

BEGIN {
  abort = -1;
  usage = ( ARGV[0] " \\\n" \
    " [ -v freqs=BOOL ] \\\n" \
    " [ -v prec=NUM ] \\\n" \
    " [ -v titles=\"TITLE1 TITLE2 ... TITLEn\" ] \\\n" \
    " [ -v terse=BOOL ] \\\n" \
    " INFILE1.grx INFILE2.grx ... INFILEn.grx \\\n" \
    " > OUTFILE.grx" \
  );

  # Reads one or more counted grammar files INFILE.grx... which should
  # differ only in the counts associated with each alternative. Writes a
  # new grammar OUTFILE.grx where each alternative is labeled with the
  # corresponding counts of all input grammars.
  #
  # The grammar files INFILE.grx... must contain one or
  # more rules of the form
  #
  #   SYMBOL:
  #     COUNT1 OTHER1... PROD1
  #     COUNT2 OTHER2... PROD2
  #     ...    ...       ...
  #
  # where SYMBOL is a non-terminal symbol, each COUNTi is an integer
  # or fractional count (digits with at most one decimal point), the
  # OTHERi fields are zero or more fields which are ignored, and each
  # PRODi is an alternative for SYMBOL. Each definition PRODi must be a
  # single string without embedded blanks; it is taken to be the last
  # field of the line. (The definitions are not interpreted in any way.)
  #
  # The TITLES string must be a list of N short names, separated by
  # blanks, one per input file.
  #
  # The output OUTFILE.grx will be in the same format, except that
  # each rule will have the format
  #
  #   SYMBOL:
  #       CT[1,1] CT[1,2] CT[1,3] ... CT[1,n] DEF1
  #       CT[2,1] CT[2,2] CT[2,3] ... CT[2,n] DEF2
  #       ...     ...     ...
  #   #   ------- ------- ------- ... -------
  #   #   title1  title2  title3  ... titlen
  #
  # where CT[i,j] is the COUNT associated with the alternative DEFi in
  # the input grammar number j. The two trailing "#" lines are written
  # only when "titles" is given.
  #
  # If "freqs" is set to 1, the COUNTs in each input grammar are
  # first converted to fractions relative to the respective total
  # COUNT for the non-terminal SYMBOL.
  #
  # In any case, if "prec" is set to a positive number, the COUNT
  # fields of the output grammar will be printed as fractions with
  # that many decimal fraction digits (leading zeros stripped).
  # Otherwise the COUNT field will be rounded to the nearest integer.
  # A count that is zero is always printed as ".". When "prec" is not
  # given it defaults to 5 if "freqs" is set, and to 0 otherwise.
  #
  # "# Data-File:" lines, if any, are deleted from the input files.
  #
  # Unless "terse" is set, a progress line is written to stderr for
  # each input file.

  # Global variables:

  nfiles = ARGC-1;      # Number of input grammars.
  nsymb = 0;            # Number of non-terminal symbols.
  split("", grname);    # "grname[j]" = name of input grammar file "j".
  split("", symbol);    # "symbol[i]" = the "i"th non-terminal symbol.
  split("", comment);   # "comment[s]" = the concatenated comments of symbol "s".
                        # "comment[s,k]" = the concatenated comments of rule "[s,k]".
  final_comment = "";   # Concat comments at the end of the grammar.
  split("", nprod);     # "nprod[s]" = the number of rules for symbol "s".
  split("", prod);      # "prod[s,k]" = definition "k" of symbol "s".
  split("", ct);        # "ct[s,k,j]" = count/prob of defn "k" of symbol "s" in file "j".

  # Arguments:

  if (freqs == "") { freqs = 0; }
  if (prec == "") { prec = (freqs ? 5 : 0); }

  if (nfiles < 1) { arg_error("must specify at least one input file"); }

  if (titles == "") {
    ntitles = 0;
  } else {
    ntitles = split(titles, tit);
    if (ntitles != nfiles) { arg_error("titles don't match files"); }
  }

  for (j = 1; j <= nfiles; j++) { grname[j] = ARGV[j]; }

  # All input is read explicitly with getline inside BEGIN; there are
  # no pattern rules, so gawk never processes ARGV as main input.
  for (j = 1; j <= nfiles; j++) {
    inhale_grammar(j);
    if (freqs) { normalize_counts(j); }
  }

  write_new_grammar();
}

function inhale_grammar(j,    fname,ns,nf,k,lin,fld,s,cmt,nlines,def,status)
{
  # Reads input grammar number "j" (file "grname[j]").
  #
  # For the first file (j == 1) this defines the global tables
  # "symbol", "nprod", "prod", "comment", "final_comment" and "nsymb".
  # For later files it checks that symbols and definitions appear in
  # the same order as in the first file, and keeps whichever comment
  # text is longer. In all cases it stores the first field of each rule
  # in "ct[s,k,j]". Any inconsistency or read error terminates the
  # program through "grammar_error" / "arg_error".

  # Variables used while inhaling grammar:
  fname = grname[j];  # Grammar file name.
  cmt = "";           # Comments for next symbol or rule.
  s = "";             # Current non-terminal symbol.
  ns = 0;             # Number of symbols seen so far in this file.
  nlines = 0;         # Number of lines read from this file.

  if (! terse) { printf "reading grammar %d = %s ...\n", j, fname > "/dev/stderr"; }

  # Keep getline's return value: 1 = got a line, 0 = EOF, -1 = error.
  while ((status = (getline lin < fname)) > 0) {
    nlines++;
    if (lin ~ /^ *$/) {
      # Blank line: kept as part of the pending comment text.
      cmt = ( cmt "\n" );
    } else if (lin ~ /^[#][ ]*Data-File[ ]*[:]/) {
      # "# Data-File:" lines are dropped.
    } else if (lin ~ /^[#]/) {
      cmt = ( cmt lin "\n" );
    } else if (lin ~ /^[A-Z(][A-Za-z0-9_()*+]*[ ]*[:][ ]*$/) {
      # Rule head "SYMBOL:".
      s = lin;
      gsub(/[ ]*[:][ ]*$/, "", s);
      if (j == 1) {
        if (s in nprod) { grammar_error(fname, nlines, ("repeated symbol \"" s "\"")); }
        symbol[ns] = s;
        comment[s] = cmt;
      } else {
        if (s != symbol[ns]) { grammar_error(fname, nlines, "inconsistent symbols"); }
        if (length(comment[s]) < length(cmt)) { comment[s] = cmt; }
      }
      cmt = "";
      k = 0;
      ns++;
    } else if (lin ~ /^ *[0-9.]/) {
      # Alternative "COUNT OTHER... PROD".
      if (s == "") { grammar_error(fname, nlines, "rule without head symbol"); }
      nf = split(lin, fld);
      if (nf < 2) { grammar_error(fname, nlines, "bad rule format"); }
      # The pattern is anchored so that the WHOLE field must be a plain
      # integer or decimal number; unanchored, any field containing a
      # single digit would be accepted.
      if (fld[1] !~ /^[0-9]*([0-9]|([0-9][.]|[.][0-9]))[0-9]*$/) {
        grammar_error(fname, nlines, "bad rule count field");
      }
      def = fld[nf];
      if (j == 1) {
        prod[s,k] = def;
        comment[s,k] = cmt;
        nprod[s]++;
      } else {
        if (prod[s,k] != def) { grammar_error(fname, nlines, "inconsistent definitions"); }
        if (length(comment[s,k]) < length(cmt)) { comment[s,k] = cmt; }
      }
      ct[s,k,j] = fld[1];
      cmt = "";
      k++;
    } else {
      grammar_error(fname, nlines, "bad line format");
    }
  }

  # Detect read/open failures from getline's status rather than from the
  # initial value of ERRNO, which is not "0" in current gawk releases.
  if (status < 0) {
    grammar_error(fname, nlines, (ERRNO != "" ? ERRNO : "read error"));
  }
  close (fname);
  if (nlines == 0) { arg_error(("file \"" fname "\" empty or missing")); }

  # Comments at end of grammar:
  gsub(/[\n]*$/, "", cmt);
  if (j == 1) {
    nsymb = ns;
    final_comment = cmt;
  } else {
    if (ns != nsymb) { grammar_error(fname, nlines, "inconsistent symbol counts"); }
    if (length(final_comment) < length(cmt)) { final_comment = cmt; }
  }
  if (nsymb == 0) { grammar_error(fname, nlines, "empty grammar"); }
}

function normalize_counts(j,    i,s,m,k,tot)
{
  # Replaces each count "ct[s,k,j]" of grammar "j" by its fraction of
  # the total count of symbol "s" in that grammar. When the total is
  # zero a warning goes to stderr and all fractions are set to 0.
  for (i = 0; i < nsymb; i++) {
    s = symbol[i];
    tot = 0.000000;
    m = nprod[s];
    for (k = 0; k < m; k++) { tot += ct[s,k,j]; }
    if (tot == 0) {
      printf "warning: zero total count for \"%s\" in file %s\n", s,grname[j] > "/dev/stderr";
    }
    for (k = 0; k < m; k++) {
      ct[s,k,j] = (tot == 0 ? 0 : ct[s,k,j]/tot);
    }
  }
}

function write_new_grammar(    i,s,k,j,m,def,np,c,wd,fmtc)
{
  # Writes the merged grammar to standard output: for every symbol its
  # comment, the "SYMBOL:" head, one line per alternative with one
  # count column per input file, and (when titles were given) a row of
  # dashes and a row of titles as "#" lines after the alternatives.

  # Column width of each count field.
  if (prec == 0) {
    wd = (freqs ? 1 : 5);
  } else {
    wd = (freqs ? prec + 2 : prec + 6);
  }

  for(i = 0; i < nsymb; i++) {
    s = symbol[i];
    np = nprod[s];
    printf "%s", comment[s];
    printf "%s:\n", s;
    for (k = 0; k < np; k++) {
      printf "%s", comment[s,k];
      printf " ";
      for (j = 1; j <= nfiles; j++) {
        c = ct[s,k,j];
        if (c == 0) {
          # Zero (or missing) counts are shown as a single dot.
          fmtc = ".";
        } else if (prec > 0) {
          fmtc = sprintf("%.*f", prec, c);
          # Drop leading zeros, so "0.25000" prints as ".25000".
          gsub(/^[0]*/,"",fmtc);
        } else {
          fmtc = sprintf("%d", int(c+0.5));
        }
        printf " %*s", wd, fmtc;
      }
      printf " %s\n", prod[s,k];
    }
    if (ntitles != 0) {
      printf "# ";
      for (j = 1; j <= nfiles; j++) { printf " %*.*s", wd, wd, "-----"; }
      printf "\n";
      printf "# ";
      for (j = 1; j <= nfiles; j++) { printf " %*s", wd, tit[j]; }
      printf "\n";
    }
  }
  printf "%s", final_comment;
  printf "\n";
  fflush("/dev/stdout");
}

function arg_error(msg)
{
  # Reports a command-line error plus the usage text on stderr and
  # terminates with exit status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}

function grammar_error(fname, line, msg)
{
  # Reports an error at line "line" of file "fname" on stderr and
  # terminates with exit status 1.
  printf "file %s, line %d: %s\n", fname, line, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function word_error(msg)
{
  # NOTE(review): "wordcounts" and "nwords" are not set anywhere in this
  # script and nothing here calls this function; it looks like a
  # leftover from a related script -- confirm before relying on it.
  printf "file %s, line %d: %s\n", wordcounts, nwords, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function prog_error(msg)
{
  # Reports an internal error on stderr and terminates with exit status 1.
  printf "*** program error: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit abort;
}