#! /usr/bin/gawk -f
# Last edited on 2014-06-26 05:18:02 by stolfilocal

# Removes duplicate transactions from a transaction log file.
#
# Assumes that the input is a sequence of segments, where
# each segment was produced by copy-pasting the transaction log
# from bitcoinwisdom.com and running it through
# ../../cleanup_bitcoinwisdom_trans_log.gawk
#
# In particular, assumes that each segment starts with a
# few header lines including one line "Timestamp ! Price ! V.BTC"
# followed by transaction entries in the format "{DATE} {TIME} | {PRICE} | {VOLUME}"
# and these entries are sorted by reverse chrono order (most recent first).
# Finally assumes that the ranges of "{DATE} {TIME}" of the segments overlap
# so that the second entry of one segment occurs somewhere in the next segment.
#
# On output, the first entry of each segment and any duplicate
# entries are marked with "### " at the front.  The entries
# of each segment are printed in chronological order.
#
# Diagnostics and a short summary go to "/dev/stderr".  The exit status
# is 1 if any data error was reported.

BEGIN {
  abort = -1;       # Exit status requested by a fatal error, or -1 if none.
  nsegs = 0;        # Number of segments seen.
  ntrans = 0;       # Total number of non-duplicated transactions.
  dup = 0;          # If true, entry is in duplicated portion of current segment.
  idup = -1;        # If {dup} is true, current line should be matched to {lin_prev[idup]}.
  ntins = 0;        # Number of transactions in the current segment.
  ntins_prev = 0;   # Number of transactions in the previous segment.
  slin = "";        # Second entry of current segment, unmodified.
  slin_prev = "";   # Second entry of previous segment, unmodified.
  odttm = "";       # Date and time of previous entry in segment.
  olin = "";        # Previous line in segment.
  nerrs = 0;        # Number of data errors seen.
  split("", lin);       # Entries of the current segment.
  split("", lin_prev);  # Entries of the previous segment.
}

# A fatal error was flagged by one of the {*_error} functions: stop reading.
(abort >= 0) { exit(abort); }

# Comment or blank line.
/^[ ]*([\#]|$)/ {
  # Copied to the output only before the first segment; ignored afterwards.
  if (nsegs == 0) { print; }
  next;
}

# Segment header line (any non-comment line that contains "!").
/[!]/ {
  if (nsegs == 0) {
    # Header of first segment:
    print;
  } else {
    # Header of a later segment: close and output the segment just read.
    printf "segment %d last line = \"%s\"\n\n", nsegs, olin > "/dev/stderr";
    if ((nsegs >= 2) && (dup == 0)) {
      # Fatal.  Use {end_error} so that {abort} is set and the END
      # action does not report the same problem a second time.
      end_error(("segment " nsegs " had no duplicated part"));
    }
    dump_seg();
  }
  printf "### segment %d\n", nsegs;

  # Save old segment lines:
  split("", lin_prev);
  for (i = 0; i < ntins; i++) { lin_prev[i] = lin[i]; }
  ntins_prev = ntins;
  slin_prev = slin;

  # Reset the state for the new segment:
  nsegs++;
  ntins = 0;        # Next entry will be the first of segment.
  split("", lin);   # Entries in segment; duplicated ones are prefixed with "### ".
  dup = 0;          # Next entry is not duplicated in principle.
  idup = -1;        # If {dup} is true, current line should be matched to {lin_prev[idup]}.
  slin = "";
  odttm = "";       # Date and time of previous entry in segment.
  olin = "";        # Previous line in segment.
  next;
}

# Transaction entry.  Leading blanks are accepted (and stripped below)
# so that an indented entry is not silently skipped.
/^[ ]*20[0-9][0-9]/ {
  if (nsegs == 0) {
    # Entries seen before any header would be stored but never printed.
    data_error(("transaction entry before first segment header"));
    exit(1);
  }

  # Normalize spaces to make sure that matching lines match:
  gsub(/^[ ]+/, "", $0);
  gsub(/[ ]+$/, "", $0);
  gsub(/[ ][ ]+/, " ", $0);
  if (NF != 6) { data_error(("bad NF = " NF)); exit(1); }

  # Fields are "{DATE} {TIME} | {PRICE} | {VOLUME}"; $3 and $5 are the bars.
  dt = $1; tm = $2; pr = $4; vb = $6;
  dttm = (dt " " tm);  # Timestamp of entry.

  # Entries must not increase in time within a segment (string comparison):
  if ((ntins > 0) && (dttm > odttm)) {
    data_error(("entries out of order " odttm " < " dttm));
  }

  if (ntins == 1) {
    # Save second line of segment:
    slin = $0;
    printf "segment %d second line = \"%s\"\n", nsegs, slin > "/dev/stderr";
  }

  if ((dup == 0) && ($0 == slin_prev)) {
    # Start of duplicate part:
    dup = 1;
    idup = 1;  # Should be the second line.
    printf "segment %d first duplicated line = \"%s\"\n", nsegs, $0 > "/dev/stderr";
  }

  if ((ntins == 0) || (dup != 0)) {
    # First line of segment, or duplicated line -- exclude:
    lin[ntins] = ("### " $0);
  } else {
    # Usable line:
    lin[ntins] = $0;
    ntrans++;
  }
  odttm = dttm;
  olin = $0;

  # While inside the duplicated part, check the entry against the
  # corresponding entry of the previous segment, as long as there is one:
  if ((dup > 0) && (idup < ntins_prev)) {
    compare_lines(lin[ntins], lin_prev[idup]);
    idup++;
  }
  ntins++;
  next;
}

END {
  if (abort >= 0) { exit(abort); }
  if ((nsegs >= 2) && (dup == 0)) {
    end_error(("segment " nsegs " had no duplicated part"));
  }
  if (nsegs > 0) {
    # Output the last segment, which has no following header to trigger it:
    printf "segment %d last line = \"%s\"\n", nsegs, olin > "/dev/stderr";
    dump_seg();
  }
  if (nerrs > 0) { end_error(("aborted")); }
  printf "found %d segments\n", nsegs > "/dev/stderr";
  printf "found %d non-duplicated transactions\n", ntrans > "/dev/stderr";
}

function compare_lines(L,P)
{
  # Compares lines {L} and {P} minus leading "### ".
  # Prints error message if they do not match.
  gsub(/^[#][#][#][ ]+/, "", L);
  gsub(/^[#][#][#][ ]+/, "", P);
  if (L != P) {
    data_error(("segment lines do not match \"" L "\" \"" P "\""));
  }
}

function dump_seg(   i)
{
  # Prints the entries {lin[0..ntins-1]} of the current segment in
  # reverse of input order.  {i} is a local variable.
  for (i = ntins-1; i >= 0; i--) { print lin[i]; }
}

function arg_error(msg)
{
  # Prints {msg} to "/dev/stderr" and exits with status 1.
  printf "** %s\n", msg > "/dev/stderr";
  # printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit 1
}

function data_error(msg)
{
  # Prints {msg} to "/dev/stderr", prefixed with the current file name
  # and line number, and counts the error in {nerrs}.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  nerrs ++;  # Abort only at end.
}

function end_error(msg)
{
  # Prints {msg} to "/dev/stderr", prefixed with the current file name
  # and line number, then exits with status 1.  Setting {abort} makes
  # the END action exit at once instead of running its checks.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit 1
}