#! /usr/bin/gawk -f
# Last edited on 2016-05-10 12:12:44 by stolfilocal

# NOTE(review): the copy of this script that was reviewed had its line breaks
# collapsed and every substring that looked like an HTML tag or comment
# stripped out.  The layout below is a reconstruction.  Each place where
# missing text had to be guessed is marked with its own NOTE(review); please
# compare those spots with a pristine copy before relying on this file.

BEGIN {
  # Extracts the text from bitcointalk posts.
  # Assumes that the messages have been processed by {do-split-posts.sh}
  # and {remove-quoted-text.gawk}.  In particular, that every <div ...> and
  # </div> starts a new line (the rules below only look for them there).

  # Current <div> nesting depth; 0 means "not inside a post".
  divlevel = 0;
}

/^ *< *h1/ {
  # Header with URL to original in bitcointalk:
  # NOTE(review): the tag name inside this message was stripped; "<h1>" is
  # inferred from the pattern of this rule.
  if (divlevel > 0) { data_error("<h1> inside post text"); }
  title = $0;
  # NOTE(review): only "/^ *" and "]*>/" survived of the next pattern; the
  # "<h1[^<>" part is a reconstruction.
  gsub(/^ *<h1[^<>]*>/, "# ", title);
  gsub(/ *< *[/] *h1 *>/, "", title);
  # NOTE(review): the replacement uses \\1 and \\2 but only the second
  # group survived.  The leading '<a href="([^"]*)">' is a guess at the
  # stripped text (group 1 = link target, group 2 = the digits/dashes/colons
  # run that precedes </a>).
  title = gensub(/<a href="([^"]*)">[- ]*([-: 0-9]+)< *[/] *a *>/, "\n# \\2\n# \\1\n", "g", title);
  print title;
  next;
}

# NOTE(review): the pattern of this rule and the tag inside its message were
# stripped; 'class="post"' is a guess based on the surviving comments, which
# speak of "a post div".
/^ *<div class="post"/ {
  # Opening <div> of a post; posts must not nest.
  if (divlevel > 0) { data_error("nested <div class=\"post\">"); }
  divlevel++;
  next;
}

# NOTE(review): pattern reconstructed; only "/^ *" survived.
/^ *<div/ {
  # Some other <div>: counted only when inside a post, so that the matching
  # </div> is not mistaken for the end of the post.
  if (divlevel > 0) { divlevel++; }
  next;
}

/^<[/]div/ {
  # Some </div>: only meaningful while inside a post div.
  if (divlevel > 0) { divlevel--; }
  next;
}

(divlevel == 0) {
  # Some other line outside the post:
  next;
}

# NOTE(review): the surviving pattern was "/^ * *$/"; the "<!--.*-->" in the
# middle is a reconstruction based on the comment below.
/^ *<!--.*--> *$/ {
  # Line with just a comment:
  next;
}

/^ *$/ {
  # Blank line:
  next;
}

{
  # Some other line inside a post: rewrite HTML entities and tags as
  # @p{...} / @s{...} markers and print the result.

  # HTML character encodings.  "&amp;" is decoded last so that text such as
  # "&amp;lt;" yields "&lt;" rather than being decoded twice.
  gsub(/[&][\#]039;/, "'", $0);
  gsub(/[&][\#]3647;/, "@s{BTC}", $0);
  gsub(/[&]nbsp;/, " ", $0);
  gsub(/[&]quot;/, "\"", $0);
  # NOTE(review): "lt" is emitted as @p{} but "gt" as @s{}; kept as found.
  gsub(/[&]lt;/, "@p{lt}", $0);
  gsub(/[&]gt;/, "@s{gt}", $0);
  # A bare "&" in an awk replacement stands for the matched text, so the
  # literal ampersand must be written "\\&".
  gsub(/[&]amp;/, "\\&", $0);

  # Line and paragraph breaks:
  gsub(/< *br *[/]? *> *< *br *[/]? *>/, " @p{par}\n\n", $0);
  gsub(/< *br *[/]? *>/, " @p{brk}\n", $0);
  gsub(/< *hr *[/]? *>/, "\n@p{hrule}\n", $0);

  # Dashes.  Done in a single pass (here "&" deliberately means the matched
  # dashes): a separate second pass for "--" would also rewrite the dashes
  # inside the "@p{---}" markers just inserted.
  gsub(/---?/, "@p{&}", $0);

  # Markup:
  gsub(/< *del *>/, "@p{(so}", $0);
  gsub(/< *[/] *del *>/, "@p{so)}", $0);
  gsub(/< *i *>/, "@p{(it}", $0);
  gsub(/< *[/] *i *>/, "@p{it)}", $0);
  gsub(/< *b *>/, "@p{(bf}", $0);
  gsub(/< *[/] *b *>/, "@p{bf)}", $0);
  gsub(/< *span *style *= *"width: *0; *margin: *0 *-0.6ex *0 *-1px *; *" *>[ ]*< *[/] *span *>/, "", $0);
  gsub(/< *span *style *= *"text-decoration: *underline *; *" *>/, "@p{(ul}", $0);
  gsub(/< *span *style *= *"white-space: *pre *; *" *>/, "@p{(tt}", $0);
  # A closing </span> does not say which span it closes, hence "??".
  gsub(/< *[/] *span *>/, "@p{??)}", $0);
  gsub(/< *[/]? *[ou]l *[^<>]*>/, "", $0);
  gsub(/< *li *[^<>]*>/, "\n@p{(li}", $0);
  gsub(/< *[/] *li *[^<>]*>/, "@p{li)}\n", $0);
  $0 = gensub(/< *sup *>([^<>]*)< *[/] *sup *>/, "^{\\1}", "g", $0);
  $0 = gensub(/< *sub *>([^<>]*)< *[/] *sub *>/, "_{\\1}", "g", $0);

  # Emoticons and images.  Any {Smileys}/NAME.gif image becomes @s{NAME}
  # (smiley, wink, tongue, cool, huh, rolleyes, ...); every other image
  # becomes @s{image}.
  gsub(/https:[/][/]bitcointalk.org[/]Smileys[/]default/, "{Smileys}", $0);
  $0 = gensub(/< *img *src="{Smileys}[/]([a-z]+).gif" *[^<>]* [/]? *>/, "@s{\\1}", "g", $0);
  gsub(/< *img *[^<>]* [/]? *>/, "@s{image}");

  # HTTP links:
  gsub(/< *a [^<>]*>/, "@p{(link}", $0);
  gsub(/< *[/] *a *[^<>]*>/, "@p{link)}", $0);
  gsub(/http[s]?:[/][/][-_.:+=/&?%A-Za-z0-9]*/, "@s{(link)}", $0);
  gsub(/www[.][-.A-Za-z0-9]+/, "@s{(link)}", $0);
  # A link whose visible text was just the URL collapses to one marker.
  gsub(/@p{[(]link}@s{[(]link[)]}@p{link[)]}/, "@s{(link)}", $0);

  print;
  next;
}

END {
  # An error helper already reported the problem and requested exit; do not
  # add a second, misleading message.
  if (abort) { exit 1; }
  if (divlevel > 0) {
    # NOTE(review): this message may have lost a tag name in extraction;
    # the text is kept exactly as found.
    printf "** not closed!\n" > "/dev/stderr";
    exit(1);
  }
}

function arg_error(msg)
{
  # Reports a command line error plus the usage text and exits with status 1.
  # NOTE(review): {usage} is not assigned anywhere in the visible code.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1
}

function prog_error(msg)
{
  # Reports an internal program error and exits with status 1.
  printf "**PROG ERROR: %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1
}

function data_error(msg)
{
  # Reports a problem in the input, tagged with file name and line number,
  # and exits with status 1.
  printf "%s:%d: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1
}