#! /usr/bin/gawk -f
# Last edited on 2014-02-07 18:46:25 by stolfilocal

# Reads a file with summary of trade data for equal and equally-spaced time intervals.
# Uses the BTC volume and national-currency volume columns.
#
# Converts it to a log-price series where the independent variable is either
# time, BTC volume, or national currency volume.
#
# Input lines that are blank, that start with "#", or that contain "!" are
# skipped.  Data lines must start with "20NN-"; field 12 is read as the BTC
# volume, field 14 as the national-currency volume, and field 16 as the
# weighted mean price.  Any other line is a fatal error.
#
# The client must define (with "-v") the variables
#
#   {which}  "time" for time series, "vbtc" for BTC volume series,
#            "vnat" for national currency volume series.
#   {ustep}  the interval width (time or volume) for the independent variable;
#            must be a positive number.
#
# Writes to standard output a file with one line for each time or volume interval
# containing "{i} {UVAR[i]} {DUVAR[i]} {TIME[i]} {DTIME[i]} {VBTC[i]} {DVBTC[i]} {VNAT[i]} {DVNAT[i]} {ZVAR[i]}" where
#
#   {i}         is the index of the indep variable (time or volume) interval (from 0).
#   {UVAR[i]}   is the value of the indep variable at the center of that interval.
#   {DUVAR[i]}  is the width of that interval, that is, {ustep}.
#
#   {TIME[i]}   is the time in the middle of that interval.
#   {DTIME[i]}  is the time duration of that interval.
#
#   {VBTC[i]}   is the accumulated BTC volume up to the middle of that interval.
#   {DVBTC[i]}  is the BTC volume within that interval.
#
#   {VNAT[i]}   is the accumulated national currency volume up to the middle of that interval.
#   {DVNAT[i]}  is the national currency volume within that interval.
#
#   {ZVAR[i]}   is the log base 10 of the mean price in that interval,
#               that is, {log_10(DVNAT[i]/DVBTC[i])}.
#
# When {which} is "time", {DTIME[i]} is {ustep}.  When {which} is "vbtc" or
# "vnat", {TIME[i]} and {DTIME[i]} are measured in input sampling intervals
# (they are differences of fractional input indices), and any incomplete
# volume interval left over at the end of the input is not written.
#
# Diagnostics go to "/dev/stderr".  The exit status is 1 after an argument
# or data error.

BEGIN {
    abort = -1;  # Becomes the exit status (>= 0) once an error has been reported.

    if (which == "") { arg_error(("must define {which}")); }
    if ((which != "time") && (which != "vbtc") && (which != "vnat")) {
        arg_error(("invalid {which} = \"" which "\""));
    }
    if (ustep == "") { arg_error(("must define {ustep}")); }
    # A zero, negative, non-numeric or NaN step would make the volume
    # resampling loop in the END rule run forever (or fail obscurely):
    if (! ustep_is_valid(ustep)) {
        arg_error(("invalid {ustep} = \"" ustep "\" (must be a positive number)"));
    }

    n_tb = 0;  # Number of valid input entries (time series elements).

    # Summary trade data for the equal and equally-spaced time intervals in input file:
    split("", vbtc_tb);  # Total BTC volume in the sampling interval {i} is {vbtc_tb[i]}.
    split("", vnat_tb);  # Total national-currency volume in the sampling interval {i} is {vnat_tb[i]}.

    debug = 0;
}

# Stop reading input as soon as an error has been flagged:
(abort >= 0) { exit abort; }

# Gather the input data, save in {vbtc_tb[0..n_tb-1],vnat_tb[0..n_tb-1]}:

# Skip blank lines, "#" comment lines, and any line containing "!":
/(^[ ]*([#]|$))|[!]/ { next; }

/^20[0-9][0-9][-]/ {
    vbtc = 0+$(12);  # Volume in BTC.
    vnat = 0+$(14);  # Volume in national currency.
    wmp = 0+$(16);   # Weighted mean price from file.

    if (vbtc < 0) { data_error(("invalid BTC volume")); }
    if (vnat < 0) { data_error(("invalid nat currency volume")); }
    if (wmp < 0) { data_error(("invalid mean price")); }

    if ((vbtc > 0) || (vnat > 0)) {
        if (wmp <= 0) { data_error(("zero price with nonzero volume")); }
        if ((vbtc <= 0) || (vnat <= 0)) { data_error(("volumes are not positive")); }
        wex = vnat/vbtc;  # Expected weighted mean price.
        err = fabs(wmp - wex);
        # Complain only if the discrepancy is large both absolutely and relatively:
        if ((err > 0.05) && (err > 1.0e-5*wmp)) {
            data_warning(("inconsistent mean price " wmp " " wex " " err));
        }
    } else {
        data_warning(("zero volume in interval"));
    }

    vbtc_tb[n_tb] = vbtc;
    vnat_tb[n_tb] = vnat;
    n_tb++;
    next;
}

# Anything that reaches this rule is neither a comment nor a data line:
// {
    data_error("invalid line format");
}

END {
    if (abort >= 0) { exit abort; }
    printf "read %d valid data points\n", n_tb > "/dev/stderr";

    # Hack to avoid zero-volume intervals:
    for (i = 0; i < n_tb; i++) {
        if ((vbtc_tb[i] <= 0) != (vnat_tb[i] <= 0)) {
            data_error(("oops? " vnat_tb[i] " " vbtc_tb[i]));
        }
        if (vbtc_tb[i] <= 0) {
            # Locate the nearest intervals {im,ip} with nonzero volume:
            im = i - 1;
            while ((im >= 0) && (vbtc_tb[im] == 0)) { im --; }
            if (im < 0) { data_error(("cannot interpolate at beginning")); }
            ip = i + 1;
            while ((ip < n_tb) && (vbtc_tb[ip] == 0)) { ip ++; }
            if (ip >= n_tb) { data_error(("cannot interpolate at end")); }

            # Use geometric interpolation for the volumes:
            vbtc_tb[i] = geometric_interp(im,vbtc_tb[im], ip,vbtc_tb[ip], i);
            vnat_tb[i] = geometric_interp(im,vnat_tb[im], ip,vnat_tb[ip], i);

            # Scale the volumes to tiny magnitude to minimize distortion
            # (both by the same factor, so their ratio is unchanged):
            vsma = fmin(vbtc_tb[i], vnat_tb[i]);
            vbtc_tb[i] *= 0.001/vsma;
            vnat_tb[i] *= 0.001/vsma;
            data_warning(("interpolated volumes at " i ": " vbtc_tb[i] " " vnat_tb[i]));
        }
    }

    n_sr = 0;            # Number of elements in series.
    split("", dtime_sr); # {dtime_sr[0..n_sr-1]} is the duration of each series interval.
    split("", dvbtc_sr); # {dvbtc_sr[0..n_sr-1]} is the BTC volume in each series interval.
    split("", dvnat_sr); # {dvnat_sr[0..n_sr-1]} is the national currency volume in each series interval.

    if (which == "time") {
        # Output is time series, just copy the input:
        for (i = 0; i < n_tb; i++) {
            dtime_sr[i] = ustep;
            dvbtc_sr[i] = vbtc_tb[i];
            dvnat_sr[i] = vnat_tb[i];
        }
        n_sr = n_tb;
    } else if ((which == "vbtc") || (which == "vnat")) {
        # Compute volume series by linear interpolation:
        vbtc_ac = 0;  # Sum of BTC volumes of all previous time intervals.
        vnat_ac = 0;  # Sum of NAT volumes of all previous time intervals.
        n_sr = 0;     # Number of volume intervals already collected.
        i_pr = 0;     # Fractional index in time axis where last volume cut was made (0 = beg of first interval).
        vbtc_pr = 0;  # Sum of BTC volumes up to fractional time index {i_pr}.
        vnat_pr = 0;  # Sum of NAT volumes up to fractional time index {i_pr}.
        for (i = 0; i < n_tb; i++) {
            # Process data for time interval {i}, append to {u}-interval {n_sr}.
            u_ac = (which == "vbtc" ? vbtc_ac : vnat_ac);        # The variable {u} at the beg of this time interval.
            u_tb = (which == "vbtc" ? vbtc_tb[i] : vnat_tb[i]);  # Increment in the variable {u} during this time interval.
            vbtc_nx = vbtc_ac + vbtc_tb[i];  # Sum of BTC volumes of all prev time intervals including this one.
            vnat_nx = vnat_ac + vnat_tb[i];  # Sum of NAT volumes of all prev time intervals including this one.
            u_nx = u_ac + u_tb;              # The variable {u} at end of this time interval.
            if (debug) {
                printf "time interval %5d", i > "/dev/stderr";
                printf " vbtc = [%17.6f _ %17.6f]", vbtc_ac, vbtc_nx > "/dev/stderr";
                printf " vnat = [%17.6f _ %17.6f]", vnat_ac, vnat_nx > "/dev/stderr";
                printf " u = [%17.6f _ %17.6f]\n", u_ac, u_nx > "/dev/stderr";
            }
            while (1) {
                # Try to get another {u}-interval if possible:
                # Check if the end of the current {u}-interval is in {[u_ac _ u_nx]}:
                u_pr = n_sr*ustep;        # The variable {u} at the beg of the current {u}-interval.
                u_ct = (n_sr + 1)*ustep;  # The variable {u} at the end of the current {u}-interval.
                if (u_ct > u_nx) { break; }

                # We completed another {u}-interval:
                # Find the fractional index {i_ct} where the {u} variable reaches {u_ct}:
                if (u_ct < u_ac) { data_error(("duh? " u_ac " " u_ct)); }
                i_ct = i + (u_ct - u_ac)/(u_nx - u_ac);
                if (debug) {
                    printf " volume interval %5d = [%17.6f _ %17.6f]",n_sr, u_pr, u_ct > "/dev/stderr";
                    printf " time index range [%10.4f _ %10.4f]\n", i_pr, i_ct > "/dev/stderr";
                }

                # If two cuts fall in the same time interval, the volume step is too small:
                if (int(i_pr) == int(i_ct)) {
                    data_warning(("volume in interval too large, double interp " u_tb));
                }

                # Interpolate the volumes at {i_ct}:
                vbtc_ct = vbtc_ac + (i_ct - i)*vbtc_tb[i];
                vnat_ct = vnat_ac + (i_ct - i)*vnat_tb[i];

                # Save this {u}-interval:
                dtime_sr[n_sr] = i_ct - i_pr;
                dvbtc_sr[n_sr] = vbtc_ct - vbtc_pr;
                dvnat_sr[n_sr] = vnat_ct - vnat_pr;
                n_sr++;

                # Prepare for the next {u}-interval:
                i_pr = i_ct;
                vbtc_pr = vbtc_ct;
                vnat_pr = vnat_ct;
            }
            # Accumulate this time interval:
            vbtc_ac = vbtc_nx;
            vnat_ac = vnat_nx;
        }
    } else {
        data_error(("boh?" which));
    }

    # Output the series:
    printf "%d intervals in series\n", n_sr > "/dev/stderr";
    time_tt = 0;  # Accumulated time to middle of interval.
    vbtc_tt = 0;  # Accumulated volume in BTC up to middle of interval.
    vnat_tt = 0;  # Accumulated volume in nat currency up to middle of interval.
    for (k = 0; k < n_sr; k++) {
        # Value of indep variable at center of interval.
        uvark = (k + 0.5)*ustep;

        # Cumulative variables up to center of interval
        # (second half of the previous interval plus first half of this one):
        time_tt += 0.5*((k > 0 ? dtime_sr[k-1] : 0) + dtime_sr[k]);
        vbtc_tt += 0.5*((k > 0 ? dvbtc_sr[k-1] : 0) + dvbtc_sr[k]);
        vnat_tt += 0.5*((k > 0 ? dvnat_sr[k-1] : 0) + dvnat_sr[k]);

        # Log mean price in interval:
        zvark = log(dvnat_sr[k]/dvbtc_sr[k])/log(10);

        # Consistency: The {which} interval size must be {ustep}:
        duk = (which == "time" ? dtime_sr[k] : (which == "vbtc" ? dvbtc_sr[k] : (which == "vnat" ? dvnat_sr[k] : -1)));
        duerr = fabs(duk - ustep);
        if ((duerr > 1.0e-4) && (duerr > 1.0e-6*uvark)) { data_error(("heh?" duk " " ustep)); }

        printf "%5d %17.6f %17.6f", k, uvark, ustep;
        printf " %17.6f %17.6f", time_tt, dtime_sr[k];
        printf " %17.6f %17.6f", vbtc_tt, dvbtc_sr[k];
        printf " %17.6f %17.6f", vnat_tt, dvnat_sr[k];
        printf " %+12.8f", zvark;
        printf "\n";
    }
    fflush("/dev/stdout");
}

function ustep_is_valid(u) {
    # Returns 1 if {u} converts to a strictly positive number, 0 otherwise.
    # Written as a ">" test so that a NaN is rejected too.
    return ((u + 0) > 0 ? 1 : 0);
}

function geometric_interp(im,vm,ip,vp,i) {
    # Value at index {i} of the exponential curve that is {vm} at index {im}
    # and {vp} at index {ip}; both {vm} and {vp} must be positive.
    return exp(log(vm) + (log(vp) - log(vm))*(i - im)/(ip - im));
}

function fabs(x) {
    # Absolute value of {x}.
    if (x < 0) { x = -x; }
    return x;
}

function fmin(x,y) {
    # The smaller of {x} and {y}.
    return (x < y ? x : y);
}

function data_error(msg) {
    # Reports {msg} with the current file name, line number and line
    # contents to stderr, then terminates with exit status 1.
    printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
    printf " «%s»\n", $0 > "/dev/stderr";
    abort = 1;
    exit(abort);
}

function data_warning(msg) {
    # Reports {msg} with the current file name and line number to stderr
    # and lets processing continue.
    printf "%s:%s: !! warning: %s\n", FILENAME, FNR, msg > "/dev/stderr";
}

function arg_error(msg) {
    # Reports a command-line argument error to stderr, then terminates
    # with exit status 1.
    printf "** %s\n", msg > "/dev/stderr";
    abort = 1;
    exit(abort);
}