#! /usr/bin/gawk -f # Last edited on 2016-05-09 23:49:53 by stolfilocal BEGIN { # Cleanups posts from a Bitcointalk "latest posts" page, # removing the site headers, writing each post # as a separate HTML file. # Assumes that every ""s. nclose = 0; # Number of closing ""s. anchor = "???"; # Anchor of post in bitcointalk. date = "???"; # Date of post. uname = "???"; # User name. ofile = "???"; # Output file of post. debug = 1; # If true, prints debugging information. verbose = 1; # If true, prints info for each post found. # USA to ISO month conversion table: split("", month_num); month_num["January"] = 1; month_num["February"] = 2; month_num["March"] = 3; month_num["April"] = 4; month_num["May"] = 5; month_num["June"] = 6; month_num["July"] = 7; month_num["August"] = 8; month_num["September"] = 9; month_num["October"] = 10; month_num["November"] = 11; month_num["December"] = 12; } /content="Latest posts of:/ { if (debug) { printf "!! grabbing uname\n" > "/dev/stderr"; } uname = $0; gsub(/^.*"Latest posts of: */, "", uname); gsub(/ *" *\/> *$/, "", uname); gsub(/[ ]/, "_", uname); if (uname !~ /^[-_A-Za-z0-9]+$/) { data_error("invalid uname"); } next; } /^[\011 ]*
" tag: if (debug) { printf "!! found opening
\n" > "/dev/stderr"; } skipping = 0; postlevel = 0; # Level of
inside a post. next; } (skipping) { # We are between posts: next; } /^[\011 ]*<[/]table>/ { # Closing "
" tag: if (debug) { printf "!! found closing \n" > "/dev/stderr"; } if (skipping) { prog_error("Duh?"); } if (postlevel > 0) { data_error("post div not closed"); } skipping = 1; next; } / "/dev/stderr"; } if (postlevel > 0) { data_error("post div not closed"); } if (match($0,//)) { anchor = substr($0,RSTART,RLENGTH); } else { data_error("missing message anchor"); } next; } /^[\011 ]*on: *[A-Z][^<>]* [AP]M/ { # Grab the post date: if (debug) { printf "!! grabbing post date\n" > "/dev/stderr"; } if (postlevel > 0) { data_error("post div not closed"); } date = $0; gsub(/^[\011 ]*on: */, "", date); gsub(/[\011 ]*$/, "", date); # Convert date to ISO UTC format: date = usa_date_to_iso_date(date); # Define the file name: ofile = date; gsub(/[ ]/, "-", ofile); gsub(/[:]/, "", ofile); ofile = (outDir "/" ofile ".html"); next; } /^
"/dev/stderr"; } if (postlevel > 0) { data_error("post div not closed"); } print_html_header(ofile,uname,date,anchor); printf "\n", FNR > ofile; nopen++; postlevel++; print > ofile; next; } /^ *
" inside the post's "...
": if (postlevel > 0) { postlevel++; } print > ofile; next; } /^ *<[/] *div *> *$/ { # Closing of a "
" inside the post's "...
": if (postlevel > 0) { print > ofile; postlevel--; if (postlevel == 0) { # End of post proper: if (debug) { printf "!! end of post\n" > "/dev/stderr"; } printf "\n", FNR > ofile; print_html_footer(ofile); close(ofile); nclose++; ofile = "???"; } next; } else { next; } } /< *[/]? *div */ { # A
or
inside a line -- prefilter bug. data_error("
or
inside a line"); } (postlevel > 0) { # Non-special line between
...
": print > ofile; next; } // { # Non-special line outside
...
": next; }

END {
  # Final report: number of post openings and closings seen.
  printf "%d open post %d close post\n", nopen, nclose > "/dev/stderr";
  # In awk an "exit" executed in a rule or function still runs END.
  # If one of the *_error functions already set {abort}, stop here
  # instead of reporting a second, spurious error below.
  if (abort) { exit 1; }
  if (! skipping) { data_error("post's not closed"); }
}

function usa_date_to_iso_date(date, fld,nfld,yr,mo,dy,hh,mm,ss,ap,iso) {
  # Converts a date in US format to ISO format.
  #
  # {date} must split on blanks, colons and commas into exactly seven
  # fields: month name, day, year, hour, minute, second, "AM"/"PM"
  # (e.g. "May 09, 2016, 11:49:53 PM").  The result has the form
  # "YYYY-MM-DD hh:mm:ss" with a 24-hour clock; no time zone
  # adjustment is made.  The month name is looked up in the global
  # table {month_num}.  Any malformed field aborts the program through
  # {data_error}.  The remaining parameters are local variables.

  nfld = split(date, fld, /[ :,]+/);
  if (nfld != 7) { data_error("invalid date format"); }
  if (debug) {
    printf "!! date = %d [%s] [%s] [%s]", nfld, fld[1], fld[2], fld[3] > "/dev/stderr";
    printf " [%s] [%s] [%s] [%s]\n", fld[4], fld[5], fld[6], fld[7] > "/dev/stderr";
  }

  if (! (fld[1] in month_num)) { data_error("invalid month name"); }
  mo = month_num[fld[1]];

  # The two range tests must be joined with "||"; with "&&" the
  # condition can never hold and every day value would be accepted.
  dy = fld[2] + 0;
  if ((dy < 1) || (dy > 31)) { data_error("invalid day of month"); }

  yr = fld[3] + 0;
  if ((yr < 2009) || (yr > 2099)) { data_error("invalid year"); }

  # The input uses a 12-hour clock (the AM/PM tag is mandatory), so the
  # hour must lie in 1..12.  Accepting 13..23 would let the PM
  # adjustment below produce hours of 25..35.
  hh = fld[4] + 0;
  if ((hh < 1) || (hh > 12)) { data_error("invalid hour"); }

  mm = fld[5] + 0;
  if ((mm < 00) || (mm > 59)) { data_error("invalid minute"); }

  # Hope that we don't get a leap second:
  ss = fld[6] + 0;
  if ((ss < 00) || (ss > 59)) { data_error("invalid second"); }

  # Fix AM/PM: 12 AM is hour 0, 12 PM stays 12, other PM hours add 12.
  ap = fld[7];
  if ((ap != "AM") && (ap != "PM")) { data_error("invalid AM/PM tag"); }
  if ((ap == "PM") && (hh != 12)) { hh += 12; }
  if ((ap == "AM") && (hh == 12)) { hh = 0; }

  iso = sprintf("%04d-%02d-%02d %02d:%02d:%02d", yr, mo, dy, hh, mm, ss);
  return iso;
}

function print_html_header(ofile,uname,date,anchor, styleurl,xname) { if (verbose) { printf " new post:\n" > "/dev/stderr"; printf " uname = [%s]\n", uname > "/dev/stderr"; printf " date = [%s]\n", date > "/dev/stderr"; printf " anchor = [%s]\n", anchor > "/dev/stderr"; } if (date == "???") { data_error("missing post date"); } if (anchor == "???") { data_error("missing post anchor"); } if (uname == "???") { data_error("missing user name"); } styleurl = "https://bitcointalk.org/Themes/custom1/style.css"; xname = uname; gsub(/[_]/, " ", xname); printf "\n" > ofile; printf "\n" > ofile; printf "\n" > ofile; printf "\n" > ofile; printf " 
%s - %s\n", uname, date > ofile; printf " \n", styleurl > ofile; printf "\n" > ofile; printf "\n" > ofile; printf "

%s - %s%s

\n", xname, anchor, date > ofile; printf "
\n" > ofile; } function print_html_footer(ofile) { printf "
\n" > ofile; printf "\n" > ofile; printf "\n" > ofile; }

function arg_error(msg) {
  # Reports a command-line problem {msg} followed by the global
  # {usage} text on stderr, then terminates with status 1.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  bail_out();
}

function prog_error(msg) {
  # Reports an internal inconsistency {msg} on stderr,
  # then terminates with status 1.
  printf "**PROG ERROR: %s\n", msg > "/dev/stderr";
  bail_out();
}

function data_error(msg) {
  # Reports a problem {msg} with the input on stderr, tagged with
  # the current file name and record number, then terminates with
  # status 1.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  bail_out();
}

function bail_out() {
  # Common tail of the error reporters: sets the global {abort}
  # flag and exits with status 1.
  abort = 1;
  exit 1
}