#! /usr/bin/gawk -f
# Last edited on 2014-10-21 18:07:12 by stolfilocal

BEGIN \
  {
    # The user must define (with {export}) the environment variable {TZ="UTC"}.
    # The user must define (with -v) {iniDate}, {finDate},
    # where {iniDate}, {finDate} are strings in the format "{yyyy}-{mm}-{dd}".
    #
    # Reads from {stdin} a trade summary series with 1 hour intervals.
    # Writes to stdout a file containing 24 data lines,
    # in the format
    #
    #   {hr} {NDAYVAL[hr]} {CURAVG[hr]} {DIFAVG[hr]} {DIFRMS[hr]}
    #
    # where {hr} is an hour-of-day (0 to 23), {NDAYVAL} is the number of
    # days with valid data at that hour, {CURAVG} is the average currency
    # volume weight at that hour, {DIFAVG} is the weighted average of the
    # changes in price at that hour, and {DIFRMS} is the weighted
    # root-mean-square average of those changes.
    #
    # More precisely, let
    #
    #   {hx} = hour index of a line in the input file, namely the timestamp
    #     of the date and time divided by 3600, counted from 2009-01-01 00:00:00.
    #   {hr} = the remainder of {hx} divided by 24, assumed to be the hour of day.
    #   {vcur[hx]} = total currency traded between hours {hx} and {hx+1}, from line {hx} of file.
    #   {vbtc[hx]} = total BTC traded in the same interval.
    #   {P[hx]} = weighted mean price in that interval = {vcur[hx]/vbtc[hx]}.
    #   {D[hx]} = change in price around hour {hx} = {log(P[hx]/P[hx-1])}
    #     (natural logarithm; taken as 0 if either price is unavailable).
    #   {W[hx]} = weight of {D[hx]} for averaging = {(vcur[hx-1] + vcur[hx])/2},
    #     or 0 if {hx-1} is absent or either hour has no valid price or volume.
    #
    # For all hours {hx} in the specified range of dates, the program accumulates
    #
    #   {sum_W[hr] += W[hx]}
    #   {sum_WD[hr] += W[hx]*D[hx]}
    #   {sum_WD2[hr] += W[hx]*D[hx]*D[hx]}
    #   {num_days[hr] += 1}
    #   {num_dval[hr] += (W > 0 ? 1 : 0)}   # Number of days with valid data in that hour.
    #
    # At the end, {num_days[hr]} should be the same for all {hr}, namely the
    # number of days in the range of dates given.  Then the program computes,
    # for each {hr} in {0..23},
    #
    #   {NDAYVAL[hr] = num_dval[hr]}
    #   {CURAVG[hr] = sum_W[hr]/num_dval[hr]}
    #   {DIFAVG[hr] = exp(sum_WD[hr]/sum_W[hr])}
    #   {DIFRMS[hr] = exp(sqrt(sum_WD2[hr]/sum_W[hr]))}
    #
    # On any argument or data error the program writes a message to
    # {stderr} and exits with status 1 without writing the table.

    abort = 0;  # Set to 1 by the error functions; tested by the {END} rule.

    if (iniDate !~ /^20(09|1[0-9])-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$/)
      { arg_error(("undefined or invalid {iniDate} = \"" iniDate "\"")); }
    if (finDate !~ /^20(09|1[0-9])-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$/)
      { arg_error(("undefined or invalid {finDate} = \"" finDate "\"")); }

    nh = 0;  # Total number of input data lines.

    # Input data, indexed {0..nh-1}:
    split("", date_ih);  # Date of each data line.
    split("", time_ih);  # Time of each data line.
    split("", hx_ih);    # Hour index of entry.
    split("", vbtc_ih);  # Volume of BTC traded in interval.
    split("", vcur_ih);  # Volume of national currency traded in interval.

    hgen = hours_from_epoch("2009-01-01","00:00:00"); # Hours from epoch to Bitcoin Year Zero.

    ohx = -1;  # Hour index of previous data line.
    odt = "";  # Date of previous data line.
    otm = "";  # Time of previous data line.
  }

# Discard comments, headers, blank lines:

// \
  { gsub(/[ ]*[\#].*$/, "", $0); }

/^[ ]*$/ \
  { next; }

/[!].+[!]/ \
  { next; }

# Gather data, store in arrays:

/^20[01]/ \
  {
    if (NF != 16) { data_error(("invalid num fields = " NF)); }

    # Fields 3, 5, ..., 15 must be the column separator "|":
    for (k = 3; k <= NF; k += 2)
      { if ($(k) != "|")
          { data_error(("expected \"|\" in field " k ", found \"" $(k) "\"")); }
      }

    dt = check_date($1);
    tm = check_time($2);
    vbtc = check_amount($12, "BTC volume");       # Volume in BTC.
    vcur = check_amount($14, "currency volume");  # Volume in nat currency.
    mp = check_amount($16, "mean price");         # Weighted price.

    if (vbtc > 0)
      { # Paranoia check of mean price, in both directions
        # (must exceed both an absolute and a relative tolerance to fail):
        err = mp - (vcur/vbtc);
        if (err < 0) { err = -err; }
        if ((err > 0.01) && (err > 0.01*mp))
          { data_error(("inconsistent mean price")); }
      }

    # Obtain the hour index:
    hx = hours_from_epoch(dt,tm) - hgen;
    if ((hx < 0) || (hx > 300000))
      { data_error(("crazy date-time, {hx} = " hx)); }

    # Check for holes in data:
    if (ohx != -1)
      { if (hx != ohx + 1)
          { data_error(("gap in file, from \"" odt " " otm "\" to \"" dt " " tm "\"")); }
      }
    ohx = hx; odt = dt; otm = tm;

    # Save the entry:
    date_ih[nh] = dt;
    time_ih[nh] = tm;
    hx_ih[nh] = hx;
    vbtc_ih[nh] = vbtc;
    vcur_ih[nh] = vcur;
    nh++;
    next;
  }

# Anything else is an error:

// \
  { data_error(("bad input line format")); }

END \
  {
    # An {exit} in {BEGIN} or in a main rule still runs this {END} rule,
    # so bail out here instead of writing a table after an error:
    if (abort) { exit(abort); }

    # Scan input data, accumulate the tables, generate output:
    oP = -1;     # Weighted mean price of previous hour, or -1 if none.
    ovcur = -1;  # Currency volume of previous hour, or -1 if none.

    clear_hr_tables();

    for (ih = 0; ih < nh; ih++)
      {
        dt = date_ih[ih];
        tm = time_ih[ih];
        hx = hx_ih[ih];
        vbtc = vbtc_ih[ih];
        vcur = vcur_ih[ih];

        # Compute {P,D,W}, accounting for missing data:
        P = (vbtc == 0 ? -1: vcur/vbtc);
        D = ((oP <= 0) || (P <= 0) ? 0 : log(P/oP));
        W = ((oP <= 0) || (P <= 0) || (ovcur <= 0) || (vcur <= 0) ? 0 : 0.5*(ovcur + vcur));

        if ((P > 0) && (oP > 0))
          { # Consistency: warn (but do not fail) on hour-to-hour jumps above 20%:
            if ((P/oP > 1.2) || (oP/P > 1.2))
              { printf "!! large price change %s %s: %.2f --> %.2f\n", dt, tm, oP, P > "/dev/stderr"; }
          }

        # Compute hour of day {hr}:
        hr = hx % 24;

        # Dates are "{yyyy}-{mm}-{dd}" strings, so string comparison orders them:
        if ((dt >= iniDate) && (dt <= finDate))
          { # Accumulate:
            sum_W_hr[hr] += W;
            sum_WD_hr[hr] += W*D;
            sum_WD2_hr[hr] += W*D*D;
            num_days_hr[hr] += 1;
            num_dval_hr[hr] += (W > 0 ? 1 : 0);
          }

        # Prepare for next row (done for all rows, so that the first hour
        # inside the date range can use the last hour before it):
        oP = P;
        ovcur = vcur;
      }

    dump_hr_tables(iniDate,finDate);
  }

function clear_hr_tables()
  {
    # Clears the totals by hour of day.
    split("", sum_W_hr);     # Sum of {W[hx]} in each hour of day.
    split("", sum_WD_hr);    # Sum of {W[hx]*D[hx]} in each hour of day.
    split("", sum_WD2_hr);   # Sum of {W[hx]*D[hx]*D[hx]} in each hour of day.
    split("", num_days_hr);  # Number of times each hour of day appeared.
    split("", num_dval_hr);  # Number of times each hour of day appeared with valid data.
  }

function dump_hr_tables(iniDate,finDate,   hr,sum_W,sum_WD,sum_WD2,ndy0,ndy,ndv,curavg,difavg,difrms)
  {
    # Writes to {stdout} one line per hour of day with the statistics
    # accumulated in the global tables {sum_W_hr,sum_WD_hr,sum_WD2_hr,
    # num_days_hr,num_dval_hr}.  The parameters {iniDate,finDate} are used
    # only in the error message.  Fails through {arg_error} if some hour
    # of day occurred a different number of times than hour 0.

    # Validate all hours before writing anything, so that a failure
    # does not leave a truncated table on {stdout}:
    ndy0 = num_days_hr[0];
    for (hr = 0; hr < 24; hr++)
      { ndy = num_days_hr[hr];
        if (ndy != ndy0)
          { arg_error(("range " iniDate "--" finDate " has incomplete days " hr " " ndy " " ndy0)); }
      }

    # Write the table:
    for (hr = 0; hr < 24; hr++)
      { sum_W = sum_W_hr[hr];
        sum_WD = sum_WD_hr[hr];
        sum_WD2 = sum_WD2_hr[hr];
        ndv = num_dval_hr[hr];

        # Hours with no valid data get zeros instead of a division by zero:
        curavg = (ndv <= 0 ? 0 : sum_W/ndv);
        difavg = (sum_W <= 0 ? 0 : exp(sum_WD/sum_W));
        difrms = (sum_W <= 0 ? 0 : exp(sqrt(sum_WD2/sum_W)));
        printf "%2d %2d %10.3f %10.4f %10.4f\n", hr, ndv, curavg, difavg, difrms;
      }
  }

function hours_from_epoch(date,time,   xdt,ts,th)
  {
    # Given a date "{yyyy}-{mm}-{dd}" and a time "{HH}:00:00", returns
    # the number of whole hours from the Unix Epoch to that date-time.
    # Fails through {data_error} if the timestamp is not a multiple of
    # 3600 seconds (which includes the -1 that {mktime} returns on failure).
    xdt = (date " " time " 0");  # Final 0 says "no daylight savings time".
    gsub(/[-:]/, " ", xdt);      # {mktime} wants spaces as separators.
    ts = mktime(xdt);
    if ((ts % 3600) != 0)
      { data_error(("timestamp of \"" date " " time "\" is not whole hours")); }
    th = int(ts/3600);
    return th;
  }

function check_date(dt)
  {
    # Returns {dt} if it has the format "{yyyy}-{mm}-{dd}" with year in
    # 2009..2019; otherwise fails through {data_error}.
    if (dt !~ /^20(09|1[0-9])-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])$/)
      { data_error(("invalid date \"" dt "\"")); }
    return dt;
  }

function check_time(tm)
  {
    # Returns {tm} if it has the format "{HH}:00:00" (a whole hour);
    # otherwise fails through {data_error}.
    if (tm !~ /^([01][0-9]|2[0-3]):00:00$/)
      { data_error(("invalid time \"" tm "\"")); }
    return tm;
  }

function check_amount(v,name)
  {
    # Returns {v} converted to a number if it is an unsigned decimal
    # string with an explicit decimal point; otherwise fails through
    # {data_error}, using {name} to identify the field in the message.
    if (v !~ /^[0-9]*[.][0-9]*$/)
      { data_error(("invalid " name " = \"" v "\"")); }
    return v + 0.0;
  }

function data_error(msg)
  {
    # Reports an error in the current input line and terminates with status 1.
    printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
    printf " «%s»\n", $0 > "/dev/stderr";
    abort = 1;
    exit(abort);
  }

function arg_error(msg)
  {
    # Reports an error in the program arguments and terminates with status 1.
    printf "** %s\n", msg > "/dev/stderr";
    abort = 1;
    exit(abort);
  }

function file_error(f,n,msg)
  {
    # Reports an error at line {n} of file {f} and terminates with status 1.
    printf "%s:%d: %s\n", f, n, msg > "/dev/stderr";
    abort = 1;
    exit 1
  }