#! /usr/bin/gawk -f
# Last edited on 2015-03-04 23:24:14 by stolfilocal

# Reads a daily price/volume series file.  Writes to {stdout} a file
# with smoothed daily mean prices.
#
# The user must define (with {export}) the environment variable {TZ="UTC"}.
# The user must load (with "-f") the library {useful_functions.gawk}, and
# define (with "-v") the variables
#
#   {inFile}  name of input series file.
#   {hrad}    the half-width of the smoothing window (a non-negative integer).
#
# NOTE(review): this script also calls {arg_error}, {prog_error}, {file_error},
# the {usf_*} functions and {spf_smooth_series}, none of which is defined
# here; presumably they come from libraries loaded with "-f" -- confirm.
#
# The input file {inFile} must contain daily price and volume data for some
# exchange in some currency.  Each input line must have the format
#
#   "{DATE} {TIME} | {OPEN} | {HIGH} | {LOW} | {CLOSE} | {VBT} | {VCR} | {WTPRICE}"
#
# where {DATE} is the line's date (UTC), {TIME} must be "00:00:00",
# {VBT} is the total btc volume traded in that day, and {VCR} the total
# volume in the exchange currency.  The other fields are ignored, except for
# validation.  The {DATE}s must be consecutive days; days with
# missing data must be present and have {VBT} and {VCR} both zero.
#
# Writes to standard output one line per day, for each day number {i},
# in the format "{DATE} {TIME} | {VBT[i]} | {VCR[i]} | {PMD[i]}", where
#
#   {DATE} is every day in the input file;
#   {TIME} is always "00:00:00";
#   {VBT[i]} and {VCR[i]} are the input daily volumes on the {DATE};
#   {PMD[i]} is a smoothed average of the price {VCR[j]/VBT[j]} in a window
#     of several days around the {DATE}.

BEGIN \
  {
    # Validate the command-line variables:
    if (inFile == "") { arg_error(("must define {inFile}")); }
    if (hrad == "") { arg_error(("must define {hrad}")); }
    if (hrad !~ /^[0-9]+$/) { arg_error(("invalid {hrad}")); }

    # Not referenced elsewhere in this file; kept as a global in case
    # library code reads it.
    pi = 3.1415926;

    # Precision (unit-in-last-place) of input and output values:
    ulp_vbt = 0.0001;   # Unit in last place of input BTC daily volume {vbt}.
    ulp_vcr = 0.0001;   # Unit in last place of currency daily volume {vcr}.
    ulp_pav = 0.00001;  # Unit in the last place of input mean price {pav}.
    ulp_phl = 0.00001;  # Unit in the last place of input {plo,phi}.

    # Series data tables indexed by {[DATE]}:
    split("", date_dy);  # Set to "1" for dates that exist, undef otherwise.
    split("", vbt_dy);   # BTC volume.
    split("", vcr_dy);   # Currency volume.
    split("", pav_dy);   # Input average price per day.

    # Read the series and store in series data tables:
    read_daily_summary_file(\
      inFile, \
      date_dy,vbt_dy,vcr_dy,pav_dy \
    );

    # Output series tables, indexed {1..ndays}:
    split("", date_id);  # Date.
    split("", vbt_id);   # BTC volume.
    split("", vcr_id);   # Currency volume.
    split("", pav_id);   # Input average price per day.
    split("", pmd_id);   # Smoothed average price in window around date.

    # Sort lines by date:
    ndays = asorti(date_dy,date_id);
    # Now {date_id} has the existing dates, indexed {1..ndays}.

    # Reindex input data by day number, {1..ndays}:
    for (id = 1; id <= ndays; id++)
      { dy = date_id[id];
        if (date_dy[dy] != 1) { prog_error(("inconsistent {date_dy,date_id}")); }
        vbt_id[id] = vbt_dy[dy];
        vcr_id[id] = vcr_dy[dy];
        pav_id[id] = pav_dy[dy];
      }

    # Discard data points with very small volumes, by zeroing their price:
    for (id = 1; id <= ndays; id++)
      { if ((vcr_id[id] < 3*ulp_vcr) || (vbt_id[id] < 3*ulp_vbt))
          { pav_id[id] = 0.0; }
      }

    spf_smooth_series(1, ndays, pav_id, hrad, pmd_id);

    # Round values.  The range must be {1..ndays}: the previous bounds
    # {idmin,idmax} were never defined, so no entry was actually rounded
    # and a spurious element {pmd_id[""]} was created.
    for (id = 1; id <= ndays; id++)
      { pmd = pmd_id[id];
        pmd = (pmd == 0 ? 0.0 : usf_round_value(pmd,ulp_pav));
        pmd_id[id] = pmd;
      }

    write_smoothed_price(ndays,date_id,vbt_id,vcr_id,pmd_id);

    exit(0);
  }

function write_smoothed_price \
  ( ndays,date_id,vbt_id,vcr_id,pmd_id, \
    id,dy,tm,vbt,vcr,pmd \
  )
  {
    # Writes to standard output a header comment, a blank line, and then
    # one line "{DATE} 00:00:00 | {VBT} | {VCR} | {PMD}" for each {id} in
    # {1..ndays}, taken from {date_id,vbt_id,vcr_id,pmd_id}.  Prints a
    # warning to {stderr} for every day whose smoothed price {pmd_id[id]}
    # is zero.
    #
    # Assumes {date_id[1..ndays]} are the merged dates, in order.

    # Print header:
    printf "# Created by {compute_smoothed_price.gawk}\n";
    printf "\n";

    for (id = 1; id <= ndays; id++)
      { dy = date_id[id];
        tm = "00:00:00";
        vbt = vbt_id[id];
        vcr = vcr_id[id];
        pmd = pmd_id[id];
        printf "%s %s", dy, tm;
        if (pmd == 0)
          { printf "!! smoothed price undefined for %s\n", dy > "/dev/stderr"; }
        printf " | %.4f | %.4f | %.5f", vbt, vcr, pmd;
        printf "\n";
      }
    fflush("/dev/stdout");
  }

function read_daily_summary_file \
  ( fname, \
    date_dy,vbt_dy,vcr_dy,pav_dy, \
    nlin,lin,ndays,fld,nfld,dy,tm,pop,phi,plo,pcl,vbt,vcr,pav,j,ody \
  )
  {
    # Reads from file {fname} the trade summary data in 1 day intervals.
    # Sets {date_dy[dt]} to 1, and stores the BTC volume, the currency
    # volume and the input mean price in {vbt_dy[dt],vcr_dy[dt],pav_dy[dt]},
    # for each date {dt} present in the file.
    #
    # Blank lines, "#" comments, and lines containing "!" are skipped.
    # Days with no data must have a line with the corresponding date and
    # zero volumes.
    #
    # Reads the global precisions {ulp_phl,ulp_vbt,ulp_vcr,ulp_pav}.
    # Reports progress and line counts to {stderr}.

    # !!! Should be a library function, reading all series fields !!!
    # !!! Should return all fields indexed by line number {id} !!!
    # !!! Should take the time step as parameter and allow it !!!

    printf "reading file %s ...\n", fname > "/dev/stderr";
    ERRNO = "";

    # Read the file:
    nlin = 0;   # Number of lines read.
    ndays = 0;  # Number of non-blank, non-header, non-comment lines.
    ody = "";   # Date on previous data line.
    while((getline lin < fname) > 0)
      { nlin++;
        # Remove tabs, inline comments, spurious blanks:
        gsub(/[\011]/, " ", lin);
        gsub(/[\#].*$/, "", lin);
        gsub(/^[ ]+/, "", lin);
        gsub(/[ ]+$/, "", lin);
        gsub(/[ ][ ]+/, " ", lin);
        if ((lin != "") && (! match(lin, /[!]/)))
          { nfld = split(lin, fld, " ");
            if (nfld != 16) { file_error(fname, nlin, ("bad summary entry = \"" lin "\"")); }

            # Every odd field from 3 on must be a "|" separator.  The bound
            # is {nfld}, not {NF}: {getline lin < fname} does not set {NF},
            # so the old loop never ran and the check was skipped.
            for (j = 3; j <= nfld; j = j + 2)
              { if (fld[j] != "|")
                  { file_error(fname, nlin, ("missing '|' in column " j ", line = \"" lin "\"")); }
              }

            dy = usf_check_date(fname,nlin,fld[1]);
            if (dy in date_dy) { file_error(fname, nlin, ("repeated date = \"" dy "\"")); }
            tm = fld[2];
            if (tm != "00:00:00") { file_error(fname, nlin, ("invalid time = \"" tm "\"")); }

            pop = usf_check_num(fname, nlin, fld[4]);
            phi = usf_check_num(fname, nlin, fld[6]);
            plo = usf_check_num(fname, nlin, fld[8]);
            pcl = usf_check_num(fname, nlin, fld[10]);
            vbt = usf_check_num(fname, nlin, fld[12]);
            vcr = usf_check_num(fname, nlin, fld[14]);
            pav = usf_check_num(fname, nlin, fld[16]);

            if ((ody != "") && (! usf_dates_are_consecutive(ody,dy)))
              { file_error(fname,nlin, ("non-consecutive dates \"" ody "\" \"" dy "\"")); }
            ody = dy;

            # Consistency checks:
            usf_check_prices(fname,nlin, pop,phi,plo,pcl,vbt,vcr,pav, ulp_phl,ulp_vbt,ulp_vcr,ulp_pav);

            # Save in arrays:
            date_dy[dy] = 1;
            vbt_dy[dy] = vbt;
            vcr_dy[dy] = vcr;
            pav_dy[dy] = pav;
            ndays++;
          }
      }
    # A failed {getline} ends the loop and leaves its reason in {ERRNO}:
    if ((ERRNO != "0") && (ERRNO != "")) { file_error(fname, nlin, ERRNO); }
    close (fname);
    if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); }
    printf "%6d input lines read\n", nlin > "/dev/stderr";
    printf "%6d data lines found\n", ndays > "/dev/stderr";
  }