#! /usr/bin/gawk -f
# Last edited on 2014-06-26 05:18:02 by stolfilocal

# Removes duplicate transactions from a transaction log file.
# Assumes that the input is a sequence of segments, where
# each segment was produced by copy-pasting the transaction log
# from bitcoinwisdom.com and running it through 
# ../../cleanup_bitcoinwisdom_trans_log.gawk
# In particular, assumes that each segment starts with a 
# few header lines including one line "Timestamp ! Price ! V.BTC"
# followed by transaction entries in the format "{DATE} {TIME} | {PRICE} | {VOLUME}"
# and these entries are sorted by reverse chrono order (most recent first).
# Finally assumes that the ranges of "{DATE} {TIME}" of the segments overlap
# so that the second entry of one segment occurs somewhere in the next segment.
#
# On output, the first entry of each segment and any duplicate 
# entries are marked with "### " at the front.  The entries 
# of each segment are printed in chronological order.
# 

BEGIN {
  # Global state shared by all rules and functions below.

  # Exit status requested by a fatal-error handler; negative means "none yet".
  abort = -1;

  # Error bookkeeping:
  nerrs = 0;      # Count of data errors reported so far.

  # File-wide counters:
  nsegs = 0;      # Segments whose header has been seen so far.
  ntrans = 0;     # Entries accepted as non-duplicated, over all segments.

  # Current segment:
  ntins = 0;      # Entries read so far in the current segment.
  slin = "";      # Its second entry, without any "### " mark.
  dup = 0;        # Nonzero once the current segment has entered its duplicated part.

  # Previous segment (what the current one is matched against):
  ntins_prev = 0; # Number of entries it had.
  slin_prev = ""; # Its second entry, without any "### " mark.
}

# Once a fatal-error handler has set {abort}, stop consuming input
# and leave with that status.
(abort >= 0) {
  exit abort;
}

/^[ ]*([\#]|$)/ {
  # Comment or blank line.  Those that come before the first segment
  # header are copied to the output; all later ones are dropped.
  if (nsegs == 0) { print; }
  next;
}

/[!]/ {
  # Segment header line (any non-comment line containing "!").
  # Closes the segment read so far, if any, and gets ready for the next one.
  if (nsegs > 0) {
    # There is a pending segment: check it and write it out.
    printf "segment %d last line = \"%s\"\n\n", nsegs, olin  > "/dev/stderr";
    if ((dup == 0) && (nsegs >= 2)) {
      # Every segment after the first must overlap its predecessor.
      data_error(("segment " nsegs " had no duplicated part"));
      exit(1);
    }
    dump_seg();
  } else {
    # Header of first segment: copy it to the output.
    print;
  }
  # Separator line; {nsegs} here is the count of segments completed so far.
  printf "### segment %d\n", nsegs;

  # The segment just closed becomes the "previous" one:
  slin_prev = slin;
  ntins_prev = ntins;
  split("", lin_prev);
  i = 0;
  while (i < ntins) { lin_prev[i] = lin[i]; i++; }

  # Fresh state for the segment that starts here:
  nsegs++;
  split("", lin); # Entries in segment; duplicated ones are prefixed with "### ".
  ntins = 0;      # Next entry will be the first of segment.
  slin = "";      # Second entry not seen yet.
  dup = 0;        # Next entry is not duplicated in principle.
  idup = -1;      # If {dup} is true, current line should be matched to {lin_prev[idup]}.
  olin = "";      # Previous line in segment.
  odttm = "";     # Date and time of previous entry in segment.
  next;
}

/^20[0-9][0-9].*/ {
  # Transaction entry, expected as "{DATE} {TIME} | {PRICE} | {VOLUME}".
  # Stores the entry in {lin[ntins]}, prefixed with "### " if it is the
  # first entry of the segment or lies in the part that duplicates the
  # previous segment, and cross-checks duplicated entries against
  # the corresponding entries of the previous segment.

  # An entry that precedes every segment header belongs to no segment:
  # it would be counted in {ntrans} but never printed (the END rule only
  # dumps when {nsegs > 0}), and would be used as a bogus "previous
  # segment" for the first real one.  Treat it as a fatal data error.
  if (nsegs == 0) {
    data_error(("transaction entry before first segment header"));
    exit(1);
  }

  # Normalize spaces to make sure that matching lines match:
  gsub(/^[ ]+/, "", $0);
  gsub(/[ ]+$/, "", $0);
  gsub(/[ ][ ]+/, " ", $0);
  # After normalization the six fields are: date, time, "|", price, "|", volume.
  if (NF != 6) { data_error(("bad NF = " NF)); exit(1); }
  dt = $1;
  tm = $2;
  pr = $4;  # Price; not otherwise used in this rule.
  vb = $6;  # Volume; not otherwise used in this rule.
  dttm = (dt " " tm); # Timestamp of entry.

  # Entries must come in reverse chronological order; the timestamps
  # are compared as strings.  Reported, but processing continues.
  if ((ntins > 0) && (dttm > odttm)) { data_error(("entries out of order " odttm " < " dttm)); }

  if (ntins == 1) {
    # Save second line of segment:
    slin = $0;
    printf "segment %d second line = \"%s\"\n", nsegs, slin > "/dev/stderr";
  }

  if ((dup == 0) && ($0 == slin_prev)) { 
    # Start of duplicate part: this line is the previous segment's second line.
    dup = 1;
    idup = 1; # Should be the second line. 
    printf "segment %d first duplicated line = \"%s\"\n", nsegs, $0 > "/dev/stderr";
  }

  if ((ntins == 0) || (dup != 0)) {
    # First line of segment, or duplicated line -- exclude:
    lin[ntins] = ("### " $0);
  } else {
    # Usable line:
    lin[ntins] = $0;
    ntrans++;
  }
  odttm = dttm;
  olin = $0;

  # While inside the duplicated part, each line must agree with the
  # corresponding line of the previous segment, as long as that one lasts.
  if ((dup > 0) && (idup < ntins_prev))
    { compare_lines(lin[ntins], lin_prev[idup]);
      idup++;
    }
  ntins++;
  next;
}

END {
  # Final processing: validates and writes the last segment, then either
  # aborts (if any data error was reported) or prints the summary.

  # A fatal-error handler already chose the exit status: just honor it.
  if (abort >= 0) { exit(abort); }

  # The last segment, like every segment after the first, must overlap
  # its predecessor.
  if (nsegs >= 2) {
    if (dup == 0) { end_error(("segment " nsegs " had no duplicated part")); }
  }

  # Write out the pending segment, if there is one:
  if (nsegs > 0) { 
    printf "segment %d last line = \"%s\"\n", nsegs, olin  > "/dev/stderr";
    dump_seg();
  }

  if (nerrs == 0) {
    # Clean run: report totals.
    printf "found %d segments\n", nsegs > "/dev/stderr";
    printf "found %d non-duplicated transactions\n", ntrans > "/dev/stderr";
  } else {
    # Data errors were deferred until now; fail the run.
    end_error(("aborted"));
    exit(1);
  }
}

function compare_lines(L,P)
{ 
  # Checks that lines {L} and {P} are equal once a leading "### "
  # mark, if present, is stripped from each.  Reports a data error
  # (through {data_error}) when they differ; otherwise does nothing.
  # The pattern is anchored at the start, so at most one match exists.
  sub(/^[#][#][#][ ]+/, "", L);
  sub(/^[#][#][#][ ]+/, "", P);
  if (L == P) { return; }
  data_error(("segment lines do not match \"" L "\"  \"" P "\""));
}

function dump_seg(  i) {
  # Prints the stored entries {lin[0..ntins-1]} of the current segment
  # from the last index down to the first, i.e. in the reverse of the
  # order in which they were read.
  i = ntins;
  while (i > 0) {
    i--;
    print lin[i];
  }
}

function arg_error(msg) { 
  # Fatal error: prints {msg} to standard error, records exit status 1
  # in {abort}, and exits right away.
  printf "** %s\n", msg > "/dev/stderr";
  # printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort
}

function data_error(msg) { 
  # Non-fatal error in the input data: prints {msg} to standard error,
  # tagged with the current file name and line number, and counts it
  # in {nerrs}.  Does not exit by itself -- abort only at end.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  nerrs += 1;
}

function end_error(msg) { 
  # Fatal error: prints {msg} to standard error, tagged with the current
  # file name and line number, records exit status 1 in {abort},
  # and exits right away.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort
}