#! /usr/bin/gawk -f
# Last edited on 2019-10-28 04:59:24 by jstolfi
# Reads two data files with daily volumes and smoothed prices.
# Combines them into a single reference price file.
#
# The user must define (with {export}) the environment variable {TZ="UTC"},
# load (with "-f") the libraries "useful_functions.gawk" and "index_file_functions.gawk",
# and define (with "-v") the program variables
#
# {indexFile}, the name of the index file (that specifies the data files to combine).
# {inDir}, the directory that contains the smoothed price files to combine.
#
# The files to be combined are specified in the index file.
# The format of the index file is described in the file "index_file_functions.gawk",
# function {ixf_read_index_file}.
#
# From each line of the index file the
# program gets the nominal date range {INIDATE .. FINDATE}, the exchange tag
# {EXTAG}, the currency tag {CRTAG}, and the date range to use {RLODATE .. RHIDATE}.
#
# Those parameters specify the file "{inDir}/{INIDATE}--{FINDATE}-{EXTAG}-{CRTAG}-01d.txt"
# which must contain daily price and volume data for the exchange {EXTAG} in currency {CRTAG}
# ("USD", "CNY", etc.), between dates {INIDATE} and {FINDATE} inclusive.
# The program will use only entries of the volume file whose date is between
# {RLODATE} and {RHIDATE}, inclusive both.
#
# The format of the input data files is the one used for original time series.
# Outputs a file in the same format with a reference USD price.
# The high and low prices are merged. The open and close prices are blended.
# The BTC and currency volumes are blended, and the weighted price is computed
# from them.
BEGIN \
  {
    # Main program: validates the arguments, reads the index file and the
    # daily price files it lists, and writes the blended reference price
    # file to standard output.  Everything is done in this {BEGIN} rule,
    # which ends with {exit(0)}, so no input records are processed.

    if (ENVIRON["TZ"] != "UTC") { arg_error(("must set TZ to 'UTC'")); }
    if (indexFile == "") { arg_error(("must define {indexFile}")); }
    if (inDir == "") { arg_error(("must define {inDir}")); }

    pi = 3.1415926;  # Global constant, used by {blend_weight}.

    # Initialize global tables to be read from the index file:
    ixf_initialize_index_tables();

    # Read data from file {indexFile}, saves in tables {inidate_fi[0..nfiles-1],.. color_fi[0..nfiles-1]}:
    nfiles = ixf_read_index_file( \
      indexFile, \
      inidate_fi,findate_fi,extag_fi,crtag_fi,exname_fi,rate_fi,rlodate_fi,rhidate_fi,color_fi \
    );

    # There must be two exchanges, MGOX and BSTP:
    if (nfiles != 2) { file_error(indexFile, 0, "must blend exactly two files"); }
    if ((extag_fi[0] != "MGOX") || (crtag_fi[0] != "USD")) { file_error(indexFile, 0, "first exchange should be MGOX.USD"); }
    if ((extag_fi[1] != "BSTP") || (crtag_fi[1] != "USD")) { file_error(indexFile, 0, "second exchange should be BSTP.USD"); }

    # Timestamps for start and end of MGOX-BSTP switch
    # (globals, used by {write_ref_price_file}):
    sec_sw_ini = usf_date_and_time_to_timestamp("2012-05-01", "00:00:00");
    sec_sw_fin = usf_date_and_time_to_timestamp("2013-01-31", "00:00:00");

    ndays = 0; # Number of data lines.

    # Series data from input, indexed with day date {dy}
    split("", date_dy); # Dates on successive lines, indexed {dy}.
    split("", pop_dy);  # Opening price, indexed with [dy,0..nfiles-1].
    split("", phi_dy);  # High price, indexed with [dy,0..nfiles-1].
    split("", plo_dy);  # Low price, indexed with [dy,0..nfiles-1].
    split("", pcl_dy);  # Closing price, indexed with [dy,0..nfiles-1].
    split("", vbt_dy);  # BTC volume, indexed with [dy,0..nfiles-1].
    split("", vcr_dy);  # Currency volume, indexed with [dy,0..nfiles-1].
    split("", pav_dy);  # Average price, indexed with [dy,0..nfiles-1].

    # Sorted dates:
    split("", date_kd); # Dates on successive lines, indexed {1..ndays}.

    # Precision (unit-in-last-place) of input and output values
    # (globals, used by the reading and writing functions):
    ulp_vbt = 0.0001;  # Unit in the last place of input {vbt}
    ulp_vcr = 0.0001;  # Unit in the last place of input {vcr}
    ulp_pav = 0.00001; # Unit in the last place of input average price {pav}
    ulp_phl = 0.00001; # Unit in the last place of {pop,phi,plo,pcl}.

    # Read the smoothed price data and store in series data tables:
    # Note that the indices are the date {dy} and the file index {kf}.
    for (kf = 0; kf < nfiles; kf++)
      { inidate = inidate_fi[kf];
        findate = findate_fi[kf];
        extag = extag_fi[kf];
        crtag = crtag_fi[kf];
        rlodate = rlodate_fi[kf];
        rhidate = rhidate_fi[kf];
        read_daily_price_file(\
          inDir,inidate,findate,kf,extag,crtag,rlodate,rhidate, \
          date_dy,pop_dy,phi_dy,plo_dy,pcl_dy,vbt_dy,vcr_dy,pav_dy \
        );
      }
    printf "done reading %d daily price files\n", nfiles > "/dev/stderr";

    # Sort lines by date:
    ndays = asorti(date_dy,date_kd);
    # Now {date_kd} has the existing dates, indexed {1..ndays}
    printf "%d days in reference file, from %s to %s\n", ndays, date_kd[1], date_kd[ndays] > "/dev/stderr";

    write_ref_price_file(\
      nfiles,extag_fi,crtag_fi,rate_fi,\
      ndays,date_kd,date_dy,\
      pop_dy,phi_dy,plo_dy,pcl_dy,vbt_dy,vcr_dy,pav_dy\
    );
    printf "done writing the reference file\n" > "/dev/stderr";
    exit(0);
  }

function read_daily_price_file \
  ( inDir,inidate,findate,kf,extag,crtag,rlodate,rhidate, \
    date_dy,pop_dy,phi_dy,plo_dy,pcl_dy,vbt_dy,vcr_dy,pav_dy, \
    fname,nlin,lin,ndays,nsave,fld,nfld,dy,tm,\
    pop,phi,plo,pcl,vbt,vcr,pav,j,ody \
  )
  {
    # Reads a file with price data, total BTC and currency volumes, in 1 day intervals.
    # The file name is "{inDir}/{inidate}--{findate}-{extag}-{crtag}-01d.txt".
    # Stores the data in {date_dy[dy]},
    # {pop_dy[dy,kf],phi_dy[dy,kf],plo_dy[dy,kf],pcl_dy[dy,kf]}
    # {vbt_dy[dy,kf],vcr_dy[dy,kf],pav_dy[dy,kf]},
    # for each date {dy} present in the file that lies in the range {rlodate..rhidate} inclusive.
    # Uses global parameters {ulp_vbt,ulp_vcr,ulp_pav,ulp_phl}.
    #
    # Every data line is validated (field count, '|' separators, time,
    # numeric fields, date order), even those outside {rlodate..rhidate}.
    # Parameters from {fname} on are local work variables.

    # Assemble the name of the input daily volume file:
    fname = ( inDir "/" inidate "--" findate "-" extag "-" crtag "-01d.txt" );
    printf "reading file %s ...\n", fname > "/dev/stderr";
    ERRNO = "";

    # Read the file:
    nlin = 0;  # Number of lines read.
    ndays = 0; # Number of non-blank, non-header, non-comment lines.
    nsave = 0; # Number of data lines saved in the output arrays.
    ody = "";  # Date on previous data line.
    while((getline lin < fname) > 0)
      { nlin++;
        # Remove tabs, inline comments, spurious blanks
        gsub(/[\011]/, " ", lin);
        gsub(/[\#].*$/, "", lin);
        gsub(/^[ ]+/, "", lin);
        gsub(/[ ]+$/, "", lin);
        gsub(/[ ][ ]+/, " ", lin);
        # Lines containing '!' are column headers and are skipped:
        if ((lin != "") && (! match(lin, /[!]/)))
          { # Data line:
            nfld = split(lin, fld, " ");
            if (nfld != 16) { file_error(fname, nlin, ("wrong field count = \"" lin "\"")); }

            # Fields 3,5,..,15 must be the '|' separators.  The bound must be
            # {nfld}: {getline lin < fname} does not set {NF}.
            for (j = 3; j <= nfld; j = j + 2)
              { if (fld[j] != "|") { file_error(fname, nlin, ("missing '|' in column " j ", line = \"" lin "\"")); } }

            # Get the input fields:
            dy = usf_check_date(fname,nlin,fld[1]);
            tm = fld[2];
            if (tm != "00:00:00") { file_error(fname, nlin, ("invalid time = \"" tm "\"")); }
            pop = usf_check_num(fname, nlin, fld[4]);
            phi = usf_check_num(fname, nlin, fld[6]);
            plo = usf_check_num(fname, nlin, fld[8]);
            pcl = usf_check_num(fname, nlin, fld[10]);
            vbt = usf_check_num(fname, nlin, fld[12]);
            vcr = usf_check_num(fname, nlin, fld[14]);
            pav = usf_check_num(fname, nlin, fld[16]);

            # Consistency checks:
            if ((dy,kf) in vbt_dy)
              { file_error(fname, nlin, ("repeated date = \"" dy "\"")); }
            if ((ody != "") && (! usf_dates_are_consecutive(ody,dy)))
              { file_error(fname,nlin, ("non-consecutive dates \"" ody "\" \"" dy "\"")); }
            ody = dy;

            if (pav != 0)
              { # Adjust {vcr} to be consistent with {vbt,pav}:
                vcr = pav*vbt;
              }
            usf_check_prices(fname,nlin, pop,phi,plo,pcl,vbt,vcr,pav, ulp_phl,ulp_vbt,ulp_vcr,ulp_pav);

            # Dates are compared as strings:
            if ((dy >= rlodate) && (dy <= rhidate))
              { # Save in arrays:
                date_dy[dy] = 1;
                pop_dy[dy,kf] = pop;
                phi_dy[dy,kf] = phi;
                plo_dy[dy,kf] = plo;
                pcl_dy[dy,kf] = pcl;
                vbt_dy[dy,kf] = vbt;
                vcr_dy[dy,kf] = vcr;
                pav_dy[dy,kf] = pav;
                nsave++;
              }
            ndays++;
          }
      }
    if ((ERRNO != "0") && (ERRNO != "")) { file_error(fname, nlin, ERRNO); }
    close (fname);
    if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
    printf "%6d lines read\n", nlin > "/dev/stderr";
    printf "%6d data lines found\n", ndays > "/dev/stderr";
    printf "%6d data lines used\n", nsave > "/dev/stderr";
  }

function write_ref_price_file\
  ( nfiles,extag_fi,crtag_fi,rate_fi,\
    ndays,date_kd,date_dy,\
    pop_dy,phi_dy,plo_dy,pcl_dy,vbt_dy,vcr_dy,pav_dy, \
    kd,kf,ody,dy,tm,sec,wt0,wt1,wtt,pop,phi,plo,pcl,vbt,vcr,rate,pav \
  )
  {
    # Assumes {date_kd[1..ndays]} are the merged dates, in order.
    # Writes the blended volumes {vbt,vcr}
    # blended prices {pop,pcl}, blended range {phi,plo}
    # and the mean price {pav} to standard output.
    # Recomputes the blended price from the volumes.
    #
    # Uses the global parameters {sec_sw_ini,sec_sw_fin} (blend interval)
    # and {ulp_vbt,ulp_vcr,ulp_pav,ulp_phl}.  As a side effect, divides
    # the entries of the {.._dy} price tables by the rates {rate_fi[kf]}.
    # Parameters from {kd} on are local work variables.

    if (nfiles != 2) { prog_error(("must blend exactly two files")); }

    printf "# Created by {compute_ref_price.gawk}\n";
    printf "# Blend of ";
    for (kf = 0; kf < nfiles; kf++) { printf " %s.%s", extag_fi[kf], crtag_fi[kf]; }
    printf "\n";

    printf "Timestamp ! Open ! High ! Low ! Close";
    printf " ! V.BTC ! V.USD ! WTPrice\n";
    printf "\n";

    ody = ""; # Previous date.
    for (kd = 1; kd <= ndays; kd++)
      { # Get date:
        dy = date_kd[kd];
        if (date_dy[dy] != 1) { prog_error(("inconsistent date tables")); }
        if ((ody != "") && (! usf_dates_are_consecutive(ody,dy)))
          { prog_error(("non-consecutive dates \"" ody "\" \"" dy "\"")); }
        tm = "00:00:00";

        # Apply the currency rate factors:
        for (kf = 0; kf < nfiles; kf++)
          { rate = rate_fi[kf];
            if (rate != 1)
              { pop_dy[dy,kf] /= rate;
                phi_dy[dy,kf] /= rate;
                plo_dy[dy,kf] /= rate;
                pcl_dy[dy,kf] /= rate;
                vcr_dy[dy,kf] /= rate;
                pav_dy[dy,kf] /= rate;
              }
          }

        # Compute blend weights of the two files (VERY SPECIFIC):
        sec = usf_date_and_time_to_timestamp(dy, "00:00:00");
        wt0 = blend_weight(sec, sec_sw_ini,sec_sw_fin);
        wt1 = 1 - wt0;

        # If either price data is missing, use the other one, unless its weight is zero:
        if (pav_dy[dy,0] <= 0.0) { wt0 = 0.0; }
        if (pav_dy[dy,1] <= 0.0) { wt1 = 0.0; }

        # Blend:
        wtt = wt0 + wt1;
        if (wtt <= 0.0)
          { # Reference price is undefined
            printf "!! reference price is undefined for %s\n", dy > "/dev/stderr";
            printf " wt0 = %8.6f pav0 = %11.5f", wt0, pav_dy[dy,0] > "/dev/stderr";
            printf " wt1 = %8.6f pav1 = %11.5f\n", wt1, pav_dy[dy,1] > "/dev/stderr";
            pop = 0; phi = 0; plo = 0; pcl = 0;
            vbt = 0; vcr = 0; pav = 0;
          }
        else
          { # Blend the ranges:
            phi = (wt0*phi_dy[dy,0] + wt1*phi_dy[dy,1])/wtt;
            if (phi < ulp_phl) { phi = ulp_phl; }
            plo = (wt0*plo_dy[dy,0] + wt1*plo_dy[dy,1])/wtt;
            if (plo < ulp_phl) { plo = ulp_phl; }

            # Blend the opening and closing prices, no better idea:
            pop = (wt0*pop_dy[dy,0] + wt1*pop_dy[dy,1])/wtt;
            if (pop < ulp_phl) { pop = ulp_phl; }
            pcl = (wt0*pcl_dy[dy,0] + wt1*pcl_dy[dy,1])/wtt;
            if (pcl < ulp_phl) { pcl = ulp_phl; }

            # Blend the volumes, clamping each one to its own minimum:
            vbt = (wt0*vbt_dy[dy,0] + wt1*vbt_dy[dy,1])/wtt;
            if (vbt < ulp_vbt) { vbt = ulp_vbt; }
            vcr = (wt0*vcr_dy[dy,0] + wt1*vcr_dy[dy,1])/wtt;
            if (vcr < ulp_vcr) { vcr = ulp_vcr; }

            # Recompute the price from the volumes:
            pav = vcr/vbt;
            if (pav < ulp_pav) { pav = ulp_pav; }

            # Make sure mean price is in interval:
            phi = max_price(phi,pav);
            plo = min_price(plo,pav);

            # Consistency of computed prices:
            usf_check_prices("BLEND",kd, pop,phi,plo,pcl,vbt,vcr,pav, ulp_phl,ulp_vbt,ulp_vcr,ulp_pav);
          }

        printf "%s %s | %12.5f | %12.5f | %12.5f | %12.5f", dy, tm, pop, phi, plo, pcl;
        printf " | %16.4f | %16.4f | %12.5f", vbt, vcr, pav;
        printf "\n";
        ody = dy;
      }
  }

function check_null_value(fname,nlin,val,name)
  {
    # Calls {file_error} for line {nlin} of file {fname} if the value {val}
    # of the field called {name} is not zero.
    if (val != 0.0)
      { file_error(fname,nlin, (name " = \"" val "\" should be zero")); }
  }

function check_non_null_value(fname,nlin,val,name)
  {
    # Calls {file_error} for line {nlin} of file {fname} if the value {val}
    # of the field called {name} is zero.
    if (val == 0.0)
      { file_error(fname,nlin, (name " = \"" val "\" should not be zero")); }
  }

function min_price(x,y)
  {
    # Min of {x,y}, ignoring undefined: a value that is numerically zero
    # counts as undefined, and then the other argument is returned.
    if (x+0 == 0)
      { return y; }
    else if (y+0 == 0)
      { return x; }
    else
      { return (x+0 < y+0 ? x : y); }
  }

function max_price(x,y)
  {
    # Max of {x,y}, compared numerically.  Returns {y} in case of a tie.
    return (x+0 > y+0 ? x : y);
  }

function blend_weight(sec,sec_ini,sec_fin, z,wt)
  {
    # A blend function, 1 before {sec_ini}, 0 after {sec_fin}.
    # In between, it follows the half-cosine ramp {0.5*(1 + cos(pi*z))}
    # where {z} is the fraction of the interval elapsed.
    # Uses the global constant {pi}.
    if (sec <= sec_ini)
      { return 1.0; }
    else if (sec >= sec_fin)
      { return 0.0; }
    else
      { z = (sec - sec_ini)/(sec_fin - sec_ini);
        wt = 0.5*(1 + cos(pi*z));
        return wt;
      }
  }