#! /usr/bin/gawk -f
# Last edited on 2022-06-15 21:00:57 by stolfi

# NOTE(review): this file had been mangled by an HTML-stripping tool: every
# literal HTML tag (in strings, regexes, and comments) had been deleted,
# leaving empty regexes and truncated messages.  The tags below were
# reconstructed from context -- confirm the exact markup against a
# known-good copy or against the expected output.

BEGIN {
  usage = ( \
    "cat INFILE \\\n" \
    "  | add_date_name_to_urls.gawk \\\n" \
    "      -v prefix=URLPREFIX \\\n" \
    "      -v table=TBLFILE \\\n" \
    "  > OUTFILE " \
  );

  # Reads an HTML file from stdin that contains lines with format
  #   "{SECURL} @ {COMMENT}"
  # where {SECURL} is the URL of a letter sent to the SEC, and {COMMENT}
  # is an arbitrary text.
  #
  # Converts each of those lines to an HTML table row showing the date
  # and sender name of the letter, with an HTML link to the {SECURL}, and
  # the {COMMENT}, suitably formatted.  Each row is preceded with an
  # HTML comment with the date and the letter number, for use as a sort
  # key.  The rows of each table are then sorted by that key.
  # Other lines are left unchanged.
  #
  # Every {SECURL} in the file should start with the given {URLPREFIX}.
  #
  # Each line of the {TBLFILE} should have three fields {NUMEX} {DATE}
  # {NAME} where {NUMEX} is the URL of the letter minus the {URLPREFIX}
  # but with the extension (".htm" or ".pdf"), {DATE} is the ISO date of
  # the letter, and {NAME} is the sender's name with blanks replaced by
  # "_".  Comments starting with "#" and blank lines are ignored in
  # {TBLFILE}.
  #
  # Whenever an input {SECURL} is not found in the table, the
  # strings "???-??-??" and "???" are substituted for the date and name.
  #
  # In any case, input lines that are blank or begin with "#" are not
  # changed.

  abort = -1;

  if (table == "") { arg_error("must specify \"-v table=TBLFILE\"\n"); }
  if (prefix == "") { arg_error("must specify \"-v prefix=URLPREFIX\"\n"); }

  # URL to date and name table:
  split("", tbdate); # Letter date, indexed with the {NUMEX}.
  split("", tbname); # Sender name, indexed with the {NUMEX}.
  read_table(table,tbdate,tbname);

  # Lines of a section to be sorted:
  section_table_reset()
}

(abort >= 0) { exit abort; }

/^[#]/ {
  # Comment lines are passed through unchanged:
  print; next;
}

/^ *$/ {
  # Blank lines are dropped inside a "<table>...</table>" section,
  # passed through elsewhere:
  if (nsecrow < 0) { print; }
  next;
}

/<table/ {
  # Start of a new section table:
  if (nsecrow >= 0) { data_error("missing '</table>'\n"); }
  section_table_clear();
  next;
}

/[@]/ {
  # A "{SECURL} @ {COMMENT}" line; must occur inside a table section.
  if (NF < 3) { data_error("not enough input fields\n"); }
  if ($2 != "@") { data_error("malformed table line\n"); }
  if (nsecrow < 0) { data_error("missing '<table>'\n"); }
  # Get the URL of the letter {url}:
  url = $1;
  # Get the comment {cm} = everything after the "@" (but preserving $0):
  lin = $0
  $2 = ""; $1 = "";
  cm = $0; $0 = lin
  # Split the prefix {pr} from the URL leaving {sn}:
  pr = substr(url, 1, length(prefix))
  sn = substr(url, length(prefix) + 1)
  if (pr != prefix) { data_error(("url \"" url "\" does not start with the given prefix\n")); }
  # Look up {sn} in the date/name table:
  if (sn in tbdate) {
    dt = tbdate[sn]; na = tbname[sn]
  } else {
    dt = "???-??-??"; na = "???";
  }
  section_table_save_row(sn, dt, na, cm);
  next;
}

/<[\/]table>/ {
  # End of section table:
  if (nsecrow < 0) { data_error("missing '<table>'\n"); }
  section_table_dump();
  section_table_reset();
  next;
}

// { print; next; }

END {
  if (nsecrow >= 0) { data_error("missing '</table>'\n"); }
}

function read_table(fname,tbdate,tbname,   ntbl,nlin,lin,linx,fld,nfld) {
  # Reads the "{NUMEX} {DATE} {NAME}" table from file {fname} into
  # {tbdate[NUMEX] = DATE} and {tbname[NUMEX] = NAME}.  Blank lines and
  # "#"-comments are ignored; a malformed entry or repeated key is fatal.
  ntbl=0; nlin=0;
  while((getline lin < fname) > 0) {
    nlin++;
    if (! match(lin, /^[ \011]*([#]|$)/)) {
      linx = lin;
      gsub(/ *[#].*$/, "", linx); # Strip trailing comment.
      nfld = split(linx, fld, " ");
      if (nfld != 3) { tbl_error(fname, nlin, ("bad table entry = \"" lin "\"")); }
      if (fld[1] in tbdate) { tbl_error(fname, nlin, ("repeated key = \"" lin "\"")); }
      tbdate[fld[1]] = fld[2];
      tbname[fld[1]] = fld[3];
      ntbl++;
    }
  }
  # {getline} sets ERRNO on read failure (e.g. missing file):
  if ((ERRNO != "0") && (ERRNO != "")) { tbl_error(fname, nlin, ERRNO); }
  close (fname);
  if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
  printf "loaded %6d table entries\n", ntbl > "/dev/stderr"
}

function section_table_save_row(sn,dt,na,cm,   tx,url,bn,key,tbrow) {
  # Saves in {secrow[++nsecrow]} the HTML table row for letter number
  # {sn}, date {dt}, sender name {na}, comment {cm}, prefixed with the
  # "<!-- {DATE} {NUM} -->" sort key.
  # Protect 'href' fields in comment so their quotes survive formatting:
  tx = cm;
  tx = gensub(/href=["]([^"]+)["]/, "href=@<<\\1@>>", "g", tx)
  # Format the quoted text in double quotes:
  # NOTE(review): the original markup around "\\1" was lost; italics are
  # a guess -- confirm.
  tx = gensub(/["]([^"]+)["]/, "<i>\\1</i>", "g", tx)
  # Unprotect 'href' fields in comment:
  tx = gensub(/href=@<<([^<>]+)@>>/, "href=\"\\1\"", "g", tx)
  # Bare number and extension:
  bn = sn; gsub(/^[0-9]+-/, "", bn)
  # Sort key:
  key = ("<!-- " dt " " bn " -->")
  # Full letter URL:
  url = (prefix sn)
  # NOTE(review): cell markup reconstructed -- confirm field layout.
  tbrow = sprintf( \
    "%-32s<td><a href=\"%s\">%s</a></td><td><a href=\"%s\">%s</a></td>\n      <td>%s</td>", \
    key, url, dt, url, na, tx \
  );
  nsecrow++
  secrow[nsecrow] = tbrow
}

# The rows of each table are temporarily saved in {secrow[1..nsecrow]}.
# If {nsecrow} is negative, we are not parsing a table.

function section_table_reset() {
  # Resets the state to 'not inside a table'.
  nsecrow = -1
}

function section_table_clear() {
  # Initializes the section table to empty.
  nsecrow = 0
  split("", secrow) # Indexed from 1 to {nsecrow}
}

function section_table_dump(   i,ns) {
  # Outputs the current section table, with rows sorted by the
  # "<!-- {DATE} {NUM} -->" key that prefixes each saved row.
  ns = asort(secrow)
  if (ns != nsecrow) { prog_error("row count inconsistency"); }
  # NOTE(review): exact table markup was reconstructed -- confirm.
  printf "<table>\n"
  printf "<tbody>\n"
  for (i = 1; i <= nsecrow; i++) {
    printf "  <tr>%s\n", secrow[i]
    printf "  </tr>\n"
  }
  printf "</tbody>\n</table>\n"
}

function arg_error(msg) {
  # Prints {msg} and the usage text to stderr, and aborts the run.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1; exit 1
}

function tbl_error(f,n,msg) {
  # Reports error {msg} at line {n} of table file {f}, and aborts.
  printf "%s:%d: ** %s\n", f, n, msg > "/dev/stderr";
  abort = 1; exit 1
}

function data_error(msg) {
  # Reports error {msg} at the current input line, and aborts.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1; exit 1
}

function prog_error(msg) {
  # Reports an internal inconsistency {msg}, and aborts.
  printf "%s:%d: ** prog error: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1; exit 1
}