#! /usr/bin/gawk -f
# Last edited on 2014-05-07 01:45:41 by stolfilocal

# Removes duplicate transactions from a transaction log file.
# Assumes that the input is a sequence of segments, where
# each segment was produced by copy-pasting the transaction log
# from bitcoinwisdom.com and running it through
# ../../cleanup_bitcoinwisdom_trans_log.gawk
# In particular, assumes that each segment starts with a
# few header lines including one line "Timestamp ! Price ! V.BTC"
# followed by transaction entries in the format "{DATE} {TIME} | {PRICE} | {VOLUME}"
# and these entries are sorted by reverse chrono order (most recent first).
# Finally assumes that the ranges of "{DATE} {TIME}" of the segments overlap
# so that the second entry of one segment occurs somewhere in the next segment.
#
# On output, the first entry of each segment and any duplicate
# entries are marked with "### " at the front.  The entries
# of each segment are printed in chronological order.
#

BEGIN {
  abort = -1;   # Exit status requested by an error function; negative means "no error".
  nsegs = 0;    # Number of segments seen.
  ntrans = 0;   # Number of non-duplicated transactions.
  ntins = 0;    # Number of transactions in the current segment.
  dup = 0;      # If true, entry is in duplicated portion of segment.
  plin = "";    # Second entry of previous segment.
  slin = "";    # Second entry of current segment.
}

# Once an error has been flagged, stop processing further records.
(abort >= 0) { exit(abort); }

/^[ ]*([\#]|$)/ {
  # Comment or blank line
  if (nsegs == 0) {
    # Before first segment:
    print;
  } else {
    # Ignore.
  }
  next;
}

/[!]/ {
  # Segment header line
  if (nsegs == 0) {
    # Header of first segment:
    print;
  } else {
    # Every segment after the first must overlap its predecessor.
    if ((nsegs >= 2) && (dup == 0)) {
      data_error(("segment " nsegs " had no duplicated part"));
    }
    # Flush the segment that just ended.
    dump_seg();
  }
  printf "### segment %d\n", nsegs;
  nsegs++;
  ntins = 0;        # Next entry will be the first of segment.
  split("", lin);   # Entries in segment (cleared).
  dup = 0;          # Next entry is not duplicated in principle.
  plin = slin;      # Second entry of the segment just ended marks where duplicates begin.
  slin = "";
  odttm = "";       # Date and time of previous entry in segment.
  next;
}

/^20[0-9][0-9].*/ {
  # Transaction entry.
  # If {dup}, entry is in duplicated portion.
  # Expected fields: {DATE} {TIME} | {PRICE} | {VOLUME}
  if (NF != 6) { data_error(("bad NF = " NF)); }
  dt = $1;   # Date field (was mistakenly a constant, which broke ordering across midnight).
  tm = $2;   # Time field.
  pr = $4;   # Price field (parsed, not otherwise used here).
  vb = $6;   # Volume field (parsed, not otherwise used here).
  dttm = (dt " " tm);   # Timestamp of entry; compared as a string.
  # Entries must be in reverse chronological order within a segment.
  if ((ntins > 0) && (dttm > odttm)) {
    data_error(("entries out of order " odttm " < " dttm));
  }
  if (ntins == 1) {
    # Save second line of segment:
    slin = $0;
  }
  if ($0 == plin) {
    # Start of duplicate part:
    dup = 1;
  }
  if ((ntins == 0) || (dup != 0)) {
    # First line of segment, or duplicated line:
    lin[ntins] = ("### " $0);
  } else {
    # Usable line:
    lin[ntins] = $0;
    ntrans++;
  }
  odttm = dttm;
  ntins++;
  next;
}

END {
  # An error function already requested termination: just propagate the status.
  if (abort >= 0) { exit(abort); }
  if ((nsegs >= 2) && (dup == 0)) {
    end_error(("previous segment had no duplicated part"));
  }
  # Flush the last segment.
  if (nsegs > 0) { dump_seg(); }
  printf "found %d segments\n", nsegs > "/dev/stderr";
  printf "found %d non-duplicated transactions\n", ntrans > "/dev/stderr";
}

function dump_seg(   i) {
  # Prints the saved entries {lin[0..ntins-1]} of the current segment
  # in reverse of their input order (i.e. chronological order).
  for (i = ntins-1; i >= 0; i--) {
    print lin[i];
  }
}

function arg_error(msg) {
  # Reports {msg} to stderr and terminates with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  # printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1
}

function data_error(msg) {
  # Reports {msg} to stderr, tagged with the current input file and line,
  # and terminates with status 1.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1
}

function tbl_error(f,n,msg) {
  # Reports {msg} to stderr, tagged with file name {f} and line number {n},
  # and terminates with status 1.
  printf "%s:%d: ** %s\n", f, n, msg > "/dev/stderr";
  abort = 1;
  exit 1
}

function end_error(msg) {
  # Reports {msg} to stderr (no file/line tag) and terminates with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit 1
}