#! /usr/bin/gawk -f
# Last edited on 2014-02-07 18:34:46 by stolfilocal

# Reads a file with summary of trade data for equal and equally-spaced intervals of some
# independent variable {u} (time, BTC volume, or national currency volume),
# as produced by {extract_time_or_volume_series.gawk}.
#
# The client must define (with "-v") the variables
#
#   {opref}   output filename prefix.
#   {rmax}    max gap to consider, in multiples of the independent variable step.
#   {confpct} desired probability of inclusion in the confidence range,
#             given as a fraction between 0.01 and 0.99.
#
# Writes a file "{opref}_i.txt" with the raw increment data.  For each
# {r} up to {rmax}, and every input line index {i} that is a multiple of {r}
# (so that the strides {i..i+r} do not overlap) with {i+r} still in the file,
# writes a line in the format
# "{i} {j} {r} {DUVAR[i,r]} {DTIME[i,r]} {DVBTC[i,r]} {DVNAT[i,r]} {DZVAR[i,r]}",
# where
#
#   {j} is {i+r},
#
#   {DUVAR[i,r]} is the difference of the independent variable {u}
#   between the centers of intervals {i} and {i+r},
#
#   {DTIME[i,r],DVBTC[i,r],DVNAT[i,r]} are the total time, BTC volume, and
#   national currency volume between the centers of intervals {i} and {i+r},
#
#   {DZVAR[i,r]} is the increment in {log10(price)} between input lines
#   {i} and {i+r}, that is, {ZVAR[i+r] - ZVAR[i]}.
#
# The lines for each {r} are followed by a blank line.
#
# Also writes a file "{opref}_r.txt" with one line for each positive integer {r} up to {rmax}
# with the format "{r} {AVG[r]} {DEV[r]} {LOQ[r]} {HIQ[r]} {PDV[r]}"
# where
#
#   {AVG[r]} is the average of the increments in {log10(price)} after {r} input steps,
#   {DEV[r]} is the deviation of those increments, assuming zero mean,
#   {LOQ[r]} is the low (negative) end of the {confpct} percentile range of those increments,
#   {HIQ[r]} is the high (positive) end of the {confpct} percentile range of those increments,
#   {PDV[r]} is the expected deviation according to the {u}-based predictive model.
#
# The predictive model assumes that the increments in {log10(price)} between
# successive {u} intervals are independent identically distributed random
# variables.  Therefore, the variance of the total increment after {r}
# {u}-steps should be {C^2*r} for some constant {C}.
# Validates the command-line parameters {opref,rmax,confpct} and
# initializes the output file names and the per-interval input tables.
BEGIN {
  abort = 0;  # Set to nonzero by {data_error,arg_error}; checked at the start of {END}.

  if (opref == "") { arg_error(("must define {opref}")); }

  if (rmax == "") { arg_error(("must define {rmax}")); }
  # Force a numeric value, so that {r <= rmax} can never be a string comparison:
  rmax += 0;
  if (rmax < 1) { arg_error(("invalid {rmax}")); }

  # Fraction of the weight to include in the confidence range (a fraction, not a percentage):
  if (confpct == "") { arg_error(("must define {confpct}")); }
  confpct += 0;
  if ((confpct < 0.01) || (confpct > 0.99)) { arg_error(("invalid {confpct}")); }

  ifile = (opref "_i.txt");  # Output file with individual increments.
  rfile = (opref "_r.txt");  # Output file with parameters per step count {r}.

  n = 0;  # Number of valid input entries.

  # Summary trade data for the equal and equally-spaced intervals in the input file.
  # Indexed with interval number {i} in {0..n-1}:
  split("", uvar_in);   # Indep variable {u} in middle of interval.
  split("", duvar_in);  # Width of that interval in {u}.
  split("", time_in);   # Time at middle of interval.
  split("", dtime_in);  # Duration of interval.
  split("", vbtc_in);   # Accum BTC volume up to middle of interval.
  split("", dvbtc_in);  # BTC volume in interval.
  split("", vnat_in);   # Accum national currency volume to middle of interval.
  split("", dvnat_in);  # National currency volume in interval.
  split("", zvar_in);   # Log10 of weighted mean price in interval.
}

# Skip comment lines, blank lines, and any line that contains a "!":
/(^[ ]*([#]|$))|[!]/ { next; }

# Data line: ten fields, the first being the sequential line index {0,1,2,...}.
/^[ ]*[-+]?[0-9]*([0-9]|[.][0-9]*)[ ]*[-+]?[0-9]*([0-9]|[.][0-9]*)/ {
  if (NF != 10) { data_error("invalid field count"); }
  i_in = 0+$1;
  if (i_in != n) { data_error(("line number error")); }

  uvar_in[n]  = 0+$2;     # Indep variable in middle of interval.
  duvar_in[n] = 0+$3;     # Width of that interval.
  time_in[n]  = 0+$4;     # Time at middle of interval.
  dtime_in[n] = 0+$5;     # Duration of interval.
  vbtc_in[n]  = 0+$6;     # Accum BTC volume up to middle of interval.
  dvbtc_in[n] = 0+$7;     # BTC volume in interval.
  vnat_in[n]  = 0+$8;     # Accum national currency volume to middle of interval.
  dvnat_in[n] = 0+$9;     # National currency volume in interval.
  zvar_in[n]  = 0+$(10);  # Log10 of weighted mean price in interval.

  # Positions must be non-negative and widths strictly positive:
  if ((uvar_in[n] < 0) || (duvar_in[n] <= 0)) { data_error(("invalid {uvar,duvar}")); }
  if ((time_in[n] < 0) || (dtime_in[n] <= 0)) { data_error(("invalid {time,dtime}")); }
  if ((vbtc_in[n] < 0) || (dvbtc_in[n] <= 0)) { data_error(("invalid {vbtc,dvbtc}")); }
  if ((vnat_in[n] < 0) || (dvnat_in[n] <= 0)) { data_error(("invalid {vnat,dvnat}")); }

  n++;
  next;
}

# Anything that reaches this rule is neither a comment nor a data line:
// { data_error("invalid line format"); }

# Computes the increment statistics for each {r} in {1..rmax}, writes the
# individual increments to {ifile}, fits the constant {C} of the
# log-Brownian model, and writes the per-{r} summary to {rfile}.
END {
  # An {exit} executed in {BEGIN}, in a main rule, or in a function still
  # runs this rule.  Do not process partial data after an error:
  if (abort) { exit(abort); }

  printf "read %d valid data points\n", n > "/dev/stderr";

  # Computed data for step counts {r} in {1..rmax}:
  split("", ct);   # {ct[r]} is the number of valid pairs {i,i+r} present in data.
  split("", avg);  # {avg[r]} is the average increment in {log10(price)} from line {i} to {i+r}.
  split("", dev);  # {dev[r]} is the deviation of those increments assuming zero mean increment.
  split("", loq);  # {loq[r]} is the low end of the {confpct} percentile range of those increments.
  split("", hiq);  # {hiq[r]} is the high end of the {confpct} percentile range of those increments.
  split("", pdv);  # {pdv[r]} is the deviation of the increment after {r} steps in the log-Brownian model.

  # Sums to compute the parameter of the log-Brownian model.
  # In that model, the deviation {pdv[r]} is {C*sqrt(r)} for some constant {C}.
  # Each pair {i,i+r} contributes {dzvar^2/r} as an estimate of {C^2},
  # with weight {r} times the national currency volume spanned by the pair.
  sum_wC2 = 0;  # Sum of {w*(dzvar^2/r)} over all pairs and all {r}.
  sum_w = 0;    # Sum of the weights {w}.

  for (r = 1; r <= rmax; r++) {
    # Collect the increments {duvar_st[k],dtime_st[k],dvbtc_st[k],dvnat_st[k],dzvar_st[k]},
    # for {k} in {0..ct[r]-1}, for the pairs {i,i+r} taken from the data:
    ct[r] = 0;  # Count of pairs with increment {r}.
    split("", duvar_st);
    split("", dtime_st);
    split("", dvbtc_st);
    split("", dvnat_st);
    split("", dzvar_st);

    # Sums for weighted mean and deviation (weighted by national currency volume) for this {r}:
    sum_vd = 0;   # Sum of {dzvar_st[k]} weighted by {dvnat_st[k]}.
    sum_vd2 = 0;  # Sum of {dzvar_st[k]^2} weighted by {dvnat_st[k]}.
    sum_v = 0;    # Sum of {dvnat_st[k]}.

    istep = r;  # Increment between starts of strides, so that strides do not overlap.
    for (i = 0; i < n - r; i += istep) {
      j = i + r;
      if ((dvnat_in[i] > 0) && (dvnat_in[j] > 0)) {
        # Total {u}, time, BTC volume, and nat volume between mids of intervals {i} and {j}:
        k = ct[r];
        duvar_st[k] = uvar_in[j] - uvar_in[i];
        dtime_st[k] = time_in[j] - time_in[i];
        dvbtc_st[k] = vbtc_in[j] - vbtc_in[i];
        dvnat_st[k] = vnat_in[j] - vnat_in[i];

        # Increment in {log10(price)} between intervals {i} and {j}:
        dzvar_st[k] = zvar_in[j] - zvar_in[i];

        # Accumulate for mean and deviation computation for this {r}:
        vri = dvnat_st[k];
        dri = dzvar_st[k];
        sum_vd = sum_vd + vri*dri;
        sum_vd2 = sum_vd2 + vri*dri*dri;
        sum_v = sum_v + vri;

        # Accumulate for log-Brownian model fitting:
        wri = r*dvnat_st[k];
        C2ri = dri*dri/r;
        sum_wC2 += wri*C2ri;
        sum_w += wri;

        # The entry {k} is now complete and available for the percentile computation:
        ct[r] += 1;

        # Write the stride datum:
        printf "%7d %7d %5d", i, j, r > ifile;
        printf " %17.6f %17.6f", duvar_st[k], dtime_st[k] > ifile;
        printf " %17.6f %17.6f", dvbtc_st[k], dvnat_st[k] > ifile;
        printf " %+12.8f", dzvar_st[k] > ifile;
        printf "\n" > ifile;
      }
    }
    printf "\n" > ifile;  # Blank line for gnuplot.

    # Compute the mean and deviation of {log10(price)} increments, from {i} to {i+r}:
    if (ct[r] > 0) {
      avg[r] = sum_vd/sum_v;
      dev[r] = sqrt(sum_vd2/sum_v);

      # Determine the balanced range {loq[r],hiq[r]} meant to contain the
      # fraction {confpct} of the weight {dvnat_st[*]}.
      # NOTE(review): {find_percentile} is not defined in this file; it is
      # presumably provided by a library loaded together with this script.
      lopct = (1-confpct)/2;  # Percentile level for {loq[r]}.
      hipct = 1 - lopct;      # Percentile level for {hiq[r]}.
      zblur = 0.001;          # Assumed precision of {z} increments.
      loq[r] = find_percentile(ct[r],dzvar_st,zblur,dvnat_st,lopct);
      hiq[r] = find_percentile(ct[r],dzvar_st,zblur,dvnat_st,hipct);
    }
  }
  close(ifile);

  printf "log-price increment over {r} steps:\n" > "/dev/stderr";
  if (sum_w == 0) {
    # No pair contributed any weight (e.g. fewer than two data lines);
    # avoid a fatal division by zero.
    printf "** no valid increments, model not fitted\n" > "/dev/stderr";
    C2 = 0;
  } else {
    C2 = sum_wC2/sum_w;
  }
  C = sqrt(C2);
  printf " variance ~{%.9f*r}\n", C2 > "/dev/stderr";
  printf " deviation ~{%.7f*sqrt(r)}.\n", C > "/dev/stderr";

  # Print the summary file, one line per {r} in {1..rmax} that has data:
  for (r = 1; r <= rmax; r++) {
    if (ct[r] > 0) {
      # Deviation predicted by the log-Brownian model:
      pdv[r] = C*sqrt(r);

      # Print a line to {rfile}:
      printf "%7d %+11.7f %11.7f", r, avg[r], dev[r] > rfile;
      printf " %+11.7f %+11.7f", loq[r], hiq[r] > rfile;
      printf " %11.7f", pdv[r] > rfile;
      printf "\n" > rfile;
    }
  }
  close(rfile);
}

# Reports an error {msg} in the current input line (with file name, line
# number, and the line itself) to standard error, and terminates with status 1.
function data_error(msg) {
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " «%s»\n", $0 > "/dev/stderr";
  abort = 1;
  exit(abort);
}

# Reports an error {msg} in the command-line parameters to standard error,
# and terminates with status 1.
function arg_error(msg) {
  printf "** %s\n", msg > "/dev/stderr";
  abort = 1;
  exit(abort);
}