#! /bin/gawk -f
# Last edited on 2023-05-10 12:08:08 by stolfi

# Converts a text from ".src" format to a list of words, one per line.
#
# See "src-format.txt" for the input file format.
# See "wds-format.txt" for the output file format.
#
# Records written to standard output, one per line:
#   "# TEXT"        a comment, blank, or "@"-directive input line, echoed;
#   "$ {T1}{T2}.."  section locator: tags of all currently open sections;
#   "@ N"           line locator: {FNR} of the contents line that follows;
#   "K WORD"        a word of type {K}.
#
# Type codes are the first letter of the class names accepted by the
# "@chars" and "@wordmap" directives:
#   "a" alpha, "s" symbol, "p" punct, "b" blank, "n" null, "i" invalid.
#
# Progress (a trace of opened/closed sections) and all error messages
# go to "/dev/stderr".
#
# TO DO: !!! Use locator lines to print errors with correct filename and line

BEGIN {
    # {abort} stays negative until an error routine sets the exit status.
    abort = -1;
    usage = (ARGV[0] " \\\n" " < main.src > main.wds");

    # Section stack:
    split("", sectag);        # {sectag[n]} is the tag of level {n}.
    minlevel = 1;             # Minimum section level.
    curlevel = minlevel - 1;  # Current section level (stack is empty).
    prevtag = "";             # Tag of section just ended, if any.
    seccount = 0;             # Counts sections, for debugging output.

    # Character type table; these entries are fixed and cannot be
    # redefined with a different type by "@chars":
    split("", chtype);
    chtype[" "] = "b";
    chtype["@"] = "i";
    chtype["#"] = "i";
    chtype["{"] = "i";
    chtype["}"] = "i";

    # Word type and remapping tables, filled by "@wordmap":
    split("", wdtype);  # {wdtype[w]} is the type code of word {w}.
    split("", wdmap);   # {wdmap[w]} is the replacement for word {w}.
}

# Safety net: stop reading input as soon as an exit status has been set.
(abort >= 0) {
    exit abort;
}

# Normalization applied to every input line before any other rule.
{
    # Get rid of funny spaces (octal 011, 014, 015, 240):
    gsub(/[\011\014\015\240]/, " ");
    # Remove trailing blanks:
    sub(/[ ]+$/, "");
}

# Blank line or "#"-comment line.
/^ *([#]|$)/ {
    # Output it as a comment entry:
    printf "# %s\n", $0;
    next;
}

# "@chars CLASS {CHARS}": assigns the type of CLASS to each char in CHARS.
/^ *[@]chars[ ]*(alpha|symbol|punct|blank|null|invalid)[ ]*[{].*[}][ ]*$/ {
    # Output it as comment entry:
    printf "# %s\n", $0;

    # Extract character type (first letter of the class name):
    type = $0;
    sub(/^[ ]*@chars[ ]*/, "", type);
    sub(/[ ]*[{].*[}][ ]*$/, "", type);
    type = substr(type, 1, 1);

    # Extract character list (everything between the outer braces):
    chars = $0;
    sub(/^[ ]*@chars[ ]*[a-z]+[ ]*[{]/, "", chars);
    sub(/[}][ ]*$/, "", chars);

    # Set character table; a char may be repeated only with the same type:
    for (i = 1; i <= length(chars); i++) {
        c = substr(chars, i, 1);
        if (!(c in chtype)) {
            chtype[c] = type;
        } else if (chtype[c] != type) {
            data_error(("bad character {" c "} in chars"));
        }
    }
    next;
}

# "@wordmap CLASS {FILE}": loads a word table of type CLASS from FILE.
/^ *[@]wordmap[ ]*(alpha|symbol|punct|blank|null|invalid)[ ]*[{].*[}][ ]*$/ {
    # Output it as comment entry:
    printf "# %s\n", $0;

    # Extract word type (first letter of the class name):
    type = $0;
    sub(/^[ ]*@wordmap[ ]*/, "", type);
    sub(/[ ]*[{].*[}][ ]*$/, "", type);
    type = substr(type, 1, 1);

    # Extract file name:
    fname = $0;
    sub(/^[ ]*@wordmap[ ]*[a-z]+[ ]*[{]/, "", fname);
    sub(/[}][ ]*$/, "", fname);

    # Read file and save data in tables:
    read_word_table(fname, type);
    next;
}

# "@begin {TAG}": opens a section one level below the current one.
/^[@]begin[ ]*[{][^ {}]+[}][ ]*$/ {
    # Output it as comment entry:
    printf "# %s\n", $0;

    # Extract the section tag:
    tag = $0;
    sub(/^[ ]*@begin[ ]*[{]/, "", tag);
    sub(/[}][ ]*$/, "", tag);

    begin_section(tag);

    # Output the section locator:
    output_section_locator();
    next;
}

# "@end {TAG}": closes open sections up to and including the one with TAG.
/^[@]end[ ]*[{][^ {}]+[}][ ]*$/ {
    # Output it as comment entry:
    printf "# %s\n", $0;

    # Extract the section tag:
    tag = $0;
    sub(/^[ ]*@end[ ]*[{]/, "", tag);
    sub(/[}][ ]*$/, "", tag);

    # Unstack sections until the given tag:
    end_section(tag);

    # Output the section locator:
    output_section_locator();
    next;
}

# "@section LEVEL {TAG}": closes every open section at LEVEL or deeper,
# then opens a new section with TAG at that LEVEL.
/^[@]section[ ]*[0-9]+[ ]*[{][^ {}]+[}][ ]*$/ {
    # Output it as comment entry:
    printf "# %s\n", $0;

    # Extract the nesting level:
    lev = $0;
    sub(/^[ ]*@section[ ]*/, "", lev);
    sub(/[ ]*[{].*[}][ ]*$/, "", lev);
    # Force a number: as a string, {lev} would be compared lexically with
    # {minlevel} and {curlevel} (so "2" > "10"), and "02" would not index
    # the same {sectag} entry as 2.
    lev = lev + 0;

    # Extract the section tag:
    tag = $0;
    sub(/^[ ]*@section[ ]*[0-9]+[ ]*[{]/, "", tag);
    sub(/[}][ ]*$/, "", tag);

    # Unstack sections until the given level:
    if ((lev < minlevel) || (lev > curlevel + 1)) {
        data_error(("@invalid level \"" lev "\""));
    } else if (lev <= curlevel) {
        end_section(sectag[lev]);
    }
    if (lev != curlevel + 1) {
        data_error(("program bug: curlevel"));
    }
    begin_section(tag);

    # Output the section locator:
    output_section_locator();
    next;
}

# Anything else that is not empty is a contents line, phew!
/./ {
    # Print the line locator:
    printf "@ %d\n", FNR;

    # Parse and output the words:
    process_contents_line($0);

    # It is OK to repeat a section tag after some contents:
    prevtag = "";
    next;
}

END {
    if (abort >= 0) {
        exit abort;
    }
    # Implicitly close whatever sections are still open:
    if (curlevel >= minlevel) {
        end_section(sectag[minlevel]);
    }
    # Terminate the progress trace:
    printf "\n" > "/dev/stderr";
}

function output_section_locator(   lev)
{
    # Writes the section locator record "$ {T1}{T2}...", listing the tags
    # of all open sections from level {minlevel} to {curlevel}.
    # With no open section the record is just "$ ".
    printf "$ ";
    for (lev = minlevel; lev <= curlevel; lev++) {
        printf "{%s}", sectag[lev];
    }
    printf "\n";
}

function begin_section(tag,   j)
{
    # Pushes a new section with the given {tag} onto the section stack
    # and reports it on the progress trace.
    # Fails (via {data_error}) if {tag} is empty, equals the tag of the
    # section that was just closed, or equals the tag of an open section.
    if (tag == "") {
        data_error(("empty section tag"));
    }
    if (tag == prevtag) {
        data_error(("consecutive sections with same tag \"" tag "\""));
    }
    for (j = minlevel; j <= curlevel; j++) {
        if (tag == sectag[j]) {
            data_error(("nested sections with same tag \"" tag "\""));
        }
    }

    curlevel++;
    sectag[curlevel] = tag;

    # Next "@begin" will be the first in its parent section:
    prevtag = "";

    # Report opening section; start a new indented trace line for the two
    # outermost levels, or after more than 7 entries on the current line:
    if ((seccount > 7) || (curlevel - minlevel < 2)) {
        printf "\n%*s", 2*(curlevel-minlevel), "" > "/dev/stderr";
        seccount = 0;
    } else {
        printf " " > "/dev/stderr";
    }
    printf "{ %s", sectag[curlevel] > "/dev/stderr";
    seccount++;
}

function end_section(tag)
{
    # Pops sections off the stack until the one with the given {tag} has
    # been closed.  Fails (via {data_error}) if no open section has {tag}.
    while ((curlevel >= minlevel) && (sectag[curlevel] != tag)) {
        end_current_section();
    }
    if (curlevel < minlevel) {
        data_error(("@end tag mismatch \"" tag "\""));
    }
    end_current_section();
}

function end_current_section()
{
    # Pops the innermost section and remembers its tag in {prevtag}.

    # Report closure of section:
    printf " }" > "/dev/stderr";
    # After 2 "@ends" in a row, force a line break in the trace:
    if (prevtag != "") {
        seccount = 100;
    }
    # Next "@begin" will be a sibling of this one:
    prevtag = sectag[curlevel];
    curlevel--;
}

function process_contents_line(lin,   c,ct,w,wt,len)
{
    # Splits the contents line {lin} into words and outputs them.
    #
    # The line is consumed left to right as a sequence of:
    #   "@K{TEXT}"  explicit-type text, {K} one of "a","s","p","n","b";
    #   "{TEXT}"    embedded comment, discarded;
    #   any other single character, handled according to {chtype}.
    # {w} accumulates the current word and {wt} its type ("a" unless a
    # symbol character has been appended).

    # We must be inside a section:
    if (curlevel < minlevel) {
        data_error(("missing a @begin or @section"));
    }

    # Parse line:
    w = "";
    wt = "a";
    while (lin != "") {
        c = substr(lin, 1, 1);
        if (c == "@") {
            # Explicit-type text construct:
            if (! match(lin, /^[@][aspnb][{][^{}]+[}]/)) {
                data_error(("malformed embedded @-construct \"" lin "\""));
            }
            # Save the match length; the construct must be skipped
            # exactly once, whatever its type.
            len = RLENGTH;
            ct = substr(lin, 2, 1);
            if (ct != "n") {
                # Non-null text, flush {w}:
                lookup_and_output_word(w, wt);
                w = "";
                wt = "a";
                if (ct != "b") {
                    # Output argument words (the text between the braces):
                    output_words(substr(lin, 4, len - 4), ct);
                }
            }
            # Null text ("n") is simply dropped; it does not break {w}.
            lin = substr(lin, len + 1);
        } else if (c == "{") {
            # Embedded {}-comment, ignore:
            if (! match(lin, /^[{][^{}]*[}]/)) {
                data_error(("malformed {}-comment \"" lin "\""));
            }
            lin = substr(lin, RLENGTH + 1);
        } else {
            # Single character:
            if (! (c in chtype)) {
                # Character never declared by "@chars":
                data_error(("illegal character \"" c "\""));
            }
            ct = chtype[c];
            if (ct == "i") {
                # Character declared invalid:
                data_error(("illegal input character \"" c "\""));
            } else if (ct == "n") {
                # Null char, ignore.
            } else if (ct == "a") {
                # Alpha char: append to word, preserve type:
                w = (w c);
            } else if (ct == "s") {
                # Symbol char: append to word, mark it as symbol:
                w = (w c);
                wt = "s";
            } else {
                # Word delimiter; flush current word:
                lookup_and_output_word(w, wt);
                w = "";
                wt = "a";
                if (ct == "p") {
                    # Punct char: a word unto itself:
                    lookup_and_output_word(c, "p");
                } else if (ct == "b") {
                    # Blank char: ignore it.
                } else {
                    # Program bug:
                    data_error(("invalid class \"" ct "\" for char \"" c "\""));
                }
            }
            lin = substr(lin, 2);
        }
    }
    # Flush the word pending at end of line:
    lookup_and_output_word(w, wt);
}

function lookup_and_output_word(w, wt)
{
    # Outputs word {w} of provisional type {wt}, after consulting the
    # word tables: if {w} is in {wdtype}, both its type and its spelling
    # are replaced by the table entries.  Does nothing if {w} is empty.
    # Words whose final type is "b" or "n" are dropped; type "i" is an error.
    if (w == "") {
        return;
    }
    if (w in wdtype) {
        wt = wdtype[w];
        w = wdmap[w];
    }
    if (wt == "i") {
        # Invalid word:
        data_error(("invalid word \"" w "\""));
    } else if ((wt == "b") || (wt == "n")) {
        # Blank or null word, ignore.
    } else {
        output_word(w, wt);
    }
}

function output_words(wds, wt,   w,nw,iw)
{
    # Splits {wds} into fields with the default separator and outputs
    # each field as a word of type {wt}, without table lookup.
    nw = split(wds, w);
    for (iw = 1; iw <= nw; iw++) {
        output_word(w[iw], wt);
    }
}

function output_word(w, wt)
{
    # Writes one word record "TYPE WORD" to standard output.
    printf "%s %s\n", wt, w;
}

function read_word_table(fname, wt,   nwords,nlines,lin,fld,nfld,wa,wb,status)
{
    # Reads the word table file {fname} and enters each word in {wdtype}
    # (with type {wt}) and {wdmap} (with its replacement).
    #
    # Lines that are blank or start with "#" are skipped; a trailing
    # " #..." comment is removed.  Each remaining line has one field
    # (word maps to itself) or two fields (word, replacement).
    # Fails if a line has more than two fields, if a word was already
    # entered, on a read error, or if the file has no lines at all.
    nwords = 0;
    nlines = 0;
    printf "reading wordmap of type = %s \"%s\"... ", wt, fname > "/dev/stderr";
    while ((status = (getline lin < fname)) > 0) {
        nlines++;
        if (! match(lin, /^[ \011]*([#]|$)/)) {
            sub(/[ ]+[#].*$/, "", lin);
            sub(/^[ ]+/, "", lin);
            nfld = split(lin, fld, " ");
            if (nfld > 2) {
                tbl_error(fname, nlines, ("bad table entry = \"" lin "\""));
            }
            if (nfld < 1) {
                tbl_error(fname, nlines, ("program error: nfld"));
            }
            wa = fld[1];
            wb = (nfld < 2 ? wa : fld[2]);
            if (wa in wdtype) {
                tbl_error(fname, nlines, ("repeated word in tables = \"" lin "\""));
            }
            wdmap[wa] = wb;
            wdtype[wa] = wt;
            nwords++;
        }
    }
    # A negative {getline} result means the file could not be opened or
    # read; test that rather than {ERRNO} alone, which may be stale.
    if (status < 0) {
        tbl_error(fname, nlines, (ERRNO != "" ? ERRNO : "read error"));
    }
    close(fname);
    if (nlines == 0) {
        arg_error(("file \"" fname "\" empty or missing"));
    }
    printf " %d words\n", nwords > "/dev/stderr";
}

function arg_error(msg)
{
    # Reports {msg} plus the usage text on stderr and exits with status 1.
    printf "%s\n", msg > "/dev/stderr";
    printf "usage: %s\n", usage > "/dev/stderr";
    abort = 1;
    exit 1;
}

function data_error(msg)
{
    # Reports {msg} on stderr, prefixed by the current input file name and
    # line number, and exits with status 1.  The leading newline ends the
    # progress trace line that may be pending on stderr.
    printf "\n" > "/dev/stderr";
    printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
    abort = 1;
    exit 1;
}

function tbl_error(f, n, msg)
{
    # Reports {msg} on stderr, prefixed by table file name {f} and line
    # number {n}, and exits with status 1.
    printf "%s:%d: %s\n", f, n, msg > "/dev/stderr";
    abort = 1;
    exit 1;
}