#! /usr/bin/gawk -f
# Last edited on 2014-05-07 01:45:41 by stolfilocal

# Removes duplicate transactions from a transaction log file.
# Assumes that the input is a sequence of segments, where
# each segment was produced by copy-pasting the transaction log
# from bitcoinwisdom.com and running it through 
# ../../cleanup_bitcoinwisdom_trans_log.gawk
# In particular, assumes that each segment starts with a 
# few header lines including one line "Timestamp ! Price ! V.BTC"
# followed by transaction entries in the format "{DATE} {TIME} | {PRICE} | {VOLUME}"
# and these entries are sorted by reverse chrono order (most recent first).
# Finally assumes that the ranges of "{DATE} {TIME}" of the segments overlap
# so that the second entry of one segment occurs somewhere in the next segment.
#
# On output, the first entry of each segment and any duplicate 
# entries are marked with "### " at the front.  The entries 
# of each segment are printed in chronological order.
# 

BEGIN {
  # Error status: stays negative while no error has been flagged; the
  # {*_error} functions set it to 1 just before calling {exit}.
  abort = -1;

  # Verbatim second entry of the previous and of the current segment.
  # Seeing {plin} again in the current segment starts its duplicated part.
  plin = "";
  slin = "";

  # Counters:
  nsegs = 0;  # Segments seen so far.
  ntins = 0;  # Entries stored so far for the current segment.
  ntrans = 0; # Entries accepted as non-duplicated, over all segments.

  # Becomes 1 once the current segment reaches its duplicated portion.
  dup = 0;
}

# Guard: once an error status has been recorded, stop processing input.
abort >= 0 { exit abort; }

/^[ ]*([\#]|$)/ {
  # Blank line or "#" comment (possibly indented with spaces).
  # Such lines are copied to the output only while no segment header
  # has been seen yet; after that they are silently dropped.
  if (nsegs == 0) { print; }
  next;
}

/[!]/ {
  # Segment header line: any line containing "!" that was not already
  # consumed as a comment or blank line.
  if (nsegs > 0) {
    # A segment just ended.  Every segment after the first one must have
    # reached its duplicated part; then flush its stored entries.
    if ((dup == 0) && (nsegs >= 2)) { data_error(("segment " nsegs " had no duplicated part")); }
    dump_seg();
  } else {
    # Header of the very first segment is copied to the output.
    print;
  }
  printf "### segment %d\n", nsegs;

  # Reset the per-segment state for the segment that starts here:
  plin = slin; slin = ""; # Second entry of the finished segment becomes the duplicate marker.
  split("", lin);         # Clear the stored entries.
  ntins = 0;              # Next entry will be the first of the segment.
  dup = 0;                # Not in the duplicated part yet.
  odttm = "";             # Date and time of the previous entry in the segment.
  nsegs++;
  next;
}

/^20[0-9][0-9].*/ {
  # Transaction entry, expected as "{DATE} {TIME} | {PRICE} | {VOLUME}",
  # i.e. exactly 6 whitespace-separated fields (the "|" count as fields).
  # Entries of a segment are stored in {lin[0..ntins-1]} in input order;
  # {dump_seg} later prints them in reverse.
  # If {dup}, the entry is in the duplicated portion of the segment.
  if (NF != 6) { data_error(("bad NF = " NF)); }
  dt = $1; # Date of entry (was a hard-coded constant, which broke ordering across midnight).
  tm = $2; # Time of entry.
  pr = $4; # Price (parsed but not used further in this rule).
  vb = $6; # Volume (parsed but not used further in this rule).
  dttm = (dt " " tm); # Timestamp of entry.
  # Entries must be in reverse chronological order, so within a segment
  # the timestamp may never increase.  The comparison is a string
  # comparison of "{DATE} {TIME}".
  if ((ntins > 0) && (dttm > odttm)) { data_error(("entries out of order " odttm " < " dttm)); }
  if (ntins == 1) {
    # Save second line of segment; it becomes the duplicate marker
    # for the next segment:
    slin = $0;
  }
  if ($0 == plin) { 
    # Found the second entry of the previous segment:
    # this and all later entries of the segment are duplicates.
    dup = 1;
  }
  if ((ntins == 0) || (dup != 0)) {
    # First line of segment, or duplicated line: store it marked.
    lin[ntins] = ("### " $0);
  } else {
    # Usable line:
    lin[ntins] = $0;
    ntrans++;
  }
  odttm = dttm;
  ntins++;
  next;
}

END {
  # An error was already reported: just propagate its exit status.
  if (abort >= 0) { exit(abort); }

  if (nsegs > 0) {
    # The last segment is still pending.  Unless it is the only segment,
    # it must have reached its duplicated part before being flushed.
    if ((dup == 0) && (nsegs >= 2)) { end_error(("previous segment had no duplicated part")); }
    dump_seg();
  }

  # Summary goes to stderr so it does not mix with the cleaned log.
  printf("found %d segments\n", nsegs) > "/dev/stderr";
  printf("found %d non-duplicated transactions\n", ntrans) > "/dev/stderr";
}

function dump_seg(  i) {
  # Prints the stored entries {lin[0..ntins-1]} of the current segment,
  # from the last one stored back to the first, i.e. in the reverse of
  # the order in which they were read.  {i} is a local index.
  i = ntins;
  while (i > 0) {
    i--;
    print lin[i];
  }
}

function arg_error(msg) {
  # Reports {msg} on stderr, records status 1 in {abort} (so that the
  # END rule does nothing further), and exits with status 1.
  abort = 1;
  printf("** %s\n", msg) > "/dev/stderr";
  # printf "usage: %s\n", usage > "/dev/stderr";
  exit(1);
}

function data_error(msg) {
  # Reports {msg} on stderr, prefixed with the current input file name
  # and line number; records status 1 in {abort} and exits with status 1.
  abort = 1;
  printf("%s:%d: ** %s\n", FILENAME, FNR, msg) > "/dev/stderr";
  exit(1);
}

function tbl_error(f,n,msg) {
  # Reports {msg} on stderr, prefixed with the caller-supplied file
  # name {f} and line number {n}; records status 1 in {abort} and exits
  # with status 1.
  abort = 1;
  printf("%s:%d: ** %s\n", f, n, msg) > "/dev/stderr";
  exit(1);
}

function end_error(msg) {
  # Reports {msg} on stderr with no file/line prefix; records status 1
  # in {abort} and exits with status 1.
  abort = 1;
  printf("** %s\n", msg) > "/dev/stderr";
  exit(1);
}