#! /usr/bin/gawk -f # Last edited on 2014-01-17 03:12:58 by stolfilocal BEGIN \ { # Reads a market data file, ignores headers and comments, converts # the weighted mean prices to log10 scale {z[t]}, computes # pairs of sucessive increments {d1 = z[i-1]-z[i-2]} and {d0 = z[i]-z[i-1]}, # writes out a 2D histogram of the pairs {(d1,d0)}. abort = -1; debug = 0; if (dmax == "") { arg_error(("must define {dmax}")); } dmax += 0; # Max absolute difference between log10 of consecutive prices to expect in histogram. if (dmax <= 0) { arg_error(("bad {dmax} = " dmax)); } if (nh == "") { arg_error(("must define {nh}")); } nh += 0; # Number of histogram bins on each side of zero. if ((nh < 1) || (int(nh) != nh)) { arg_error(("bad {nh} = " nh)); } split("", dbreak); # {dbreak[0]} is zero, {dbreak[1..nh]} are the positive bin breaks. compute_breaks(nh, dmax, dbreak); # The histogram {hist} is an array of {2*nh} by {2*nh} bins, each index is in {0..2*nh-1}. # The first index {i1} corresponds to the older increment {d1}, # The second index {i0} corresponds to the newer increment {d0}. nrecs = 0; # Number of data lines read. nzeros = 0; # Number of data lines with zero price. nincrs = 0; # Number of valid increments computed. npairs = 0; # Number of valid consecutive increment pairs considered. split("", hist); clear_hist(nh, hist); hist_overflow = 0; # Total mass outside the histogram. actual_dmax = 0; # Actual max absolute increment. z1 = "NONE"; z2 = "NONE"; # Last two values of {z[t]}. dsum = 0; # Sum of valid increments. dsum2 = 0; # Sum of squares of valid increments. dsumpr = 0; # Sum of products of consecutive valid increment pairs. } (abort >= 0) { exit(abort); } # discard comments, blank lines, table headers: /^[ ]*([#]|$)/ { next; } /[!]/ { next; } /^20[0-9][0-9]-/ \ { nrecs++; if (NF != 16) { data_error(("invalid field count" NF)); } v = $(16); if (v+0 == 0) { z0 = "NONE"; z1 = "NONE"; z2 = "NONE"; nzeros++; } else { z0 = log(v)/log(10); if (z1 != "NONE") { d0 = z0 - z1; nincrs++; # Accumulate for increment mean, variance: dsum += d0; dsum2 += d0*d0; # Keep track of max increment: if (d0 > actual_dmax) { actual_dmax = d0; } if (-d0 > actual_dmax) { actual_dmax = -d0; } if (z2 != "NONE") { # Got a pair of consecutive increments: d1 = z1 - z2; if (debug > 0) { printf "%+9.5f %9.5f\n", d1, d0 > "/dev/stderr"; } # Accumulate for covariance: dsumpr += d1*d0; accum_hist(nh, dbreak, hist, d1, d0); } } } # Advance: z2 = z1; z1 = z0; next; } // { data_error(("invalid line format")); } END \ { if (abort >= 0) { exit(abort); } printf "%d data lines read\n", nrecs > "/dev/stderr"; printf "%d data lines with zero prices\n", nzeros > "/dev/stderr"; printf "%d valid increments\n", nincrs > "/dev/stderr"; printf "%d valid consecutive increment pairs\n", npairs > "/dev/stderr"; printf "actual max absolute increment = %9.5f\n", actual_dmax > "/dev/stderr"; davg = dsum/nincrs; dvar = dsum2/nincrs; ddev = sqrt(dvar); printf "increment average = %+11.6f variance = %11.8f deviation = %11.6f\n", davg, dvar, ddev > "/dev/stderr"; dcov = dsumpr/npairs; dcorr = dcov/dvar; printf "increment pair covariance = %+11.8f correlation = %+11.6f\n", dcov, dcorr > "/dev/stderr"; if (hist_overflow > 0.1) { printf "lost %.2f mass outside histogram\n", hist_overflow > "/dev/stderr"; } dump_hist(nh, hist, dbreak); } function compute_breaks(nh,dmax,dbreak, i) { for (i = 0; i <= nh; i++) { dbreak[i] = (i*dmax)/nh; } } function clear_hist(nh,dbreak,hist, i1,i0) { # Assumes {hist} has been initialized as an array. # Bins are numbered from 0 to {2*nh-1} on each axis. for (i1 = 0; i1 < 2*nh; i1++) { for (i0 = 0; i0 < 2*nh; i0++) { hist[i1,i0] = 0; } } } function accum_hist(nh,dbreak,hist,d1,d0, x1,x0,i1,i0,j1,j0,f1,f0,e1,e0) { # Accumulate in the histogram the pair {d1,d0} (older and newer increment). # Also increments {hist_overflow,npairs}. # Map {d1,d0} to fractional low bin indices {x1,x0} in {[0 _ 2*nh]} x1 = increment_to_index(nh, dbreak, d1) - 0.5; x0 = increment_to_index(nh, dbreak, d0) - 0.5; # Compute indices {i1,i0,j1,j0} of relevant bins and their weights {e1,e0,f1,f0}: i1 = int(x1); j1 = i1 + 1; f1 = x1 - i1; e1 = 1 - f1; i0 = int(x0); j0 = i0 + 1; f0 = x0 - i0; e0 = 1 - f0; # Must add {e1*e0} to {hist[i1,i0]}, {e1*f0} to {hist[i1,j0]}, etc. # But beware of bins that do not exist: accum_hist_bin(nh,hist,i1,i0,e1*e0); accum_hist_bin(nh,hist,i1,j0,e1*f0); accum_hist_bin(nh,hist,j1,i0,f1*e0); accum_hist_bin(nh,hist,j1,j0,f1*f0); npairs++; } function increment_to_index(nh,dbreak,d, da,i,xa) { # Returns the fractional position of an increment {d} in the histogram. # The result is in the range {[0 _ 2*nh]} if {abs(d) <= dmax}, or a few # units outside if {abs(d) > dmax}. # Namely, let {da} be the absolute value of {d}. # if {da} is in the range {[dbreak[i] _ dbreak[i+1]]} for # some {i} in {0..nh-1}, maps it to a non-negative float {xa} in {[i _ i+1]} by linear interpolation. # If {da > dmax = dbreak[nh]}, maps it to {xa = nh + 3}. # Then returns {nh+xa} if {d} is non-negative, {nh-xa} if {d} is negative. da = (d < 0 ? -d : d); # !!! Should use binary search !!! i = 0; while ((i < nh) && (da > dbreak[i+1])) { i++; } if (i >= nh) { # The increment {d} is too big: xa = nh + 3; } else { # Increment is within valid range, interpolate indices: xa = i + (da - dbreak[i])/(dbreak[i+1] - dbreak[i]); } return (d < 0 ? nh - xa : nh + xa); } function accum_hist_bin(nh,hist,i1,i0,w) { # Adds the mass {w} to {hist[i1,i0]}; unless the indices are invalid, # in which case adds it to {hist_overflow}. if ((i0 < 0) || (i0 >= 2*nh) || (i1 < 0) || (i1 >= 2*nh)) { hist_overflow += w; } else { hist[i1,i0] += w; } } function dump_hist(nh,hist,dbreak, i1,i0) { for (i1 = 0; i1 < 2*nh; i1++) { for (i0 = 0; i0 < 2*nh; i0++) { # Increment ranges associated with bin: lo1 = (i1 < nh ? -dbreak[nh-i1] : dbreak[i1-nh]); # Low limit of {d1}. hi1 = (i1 < nh ? -dbreak[nh-i1-1] : dbreak[i1-nh+1]); # High limit of {d1}. lo0 = (i0 < nh ? -dbreak[nh-i0] : dbreak[i0-nh]); # Low limit of {d0}. hi0 = (i0 < nh ? -dbreak[nh-i0-1] : dbreak[i0-nh+1]); # High limit of {d0}. printf "%4d %4d", i1, i0; printf " %+9.5f %+9.5f", lo1, hi1; printf " %+9.5f %+9.5f", lo0, hi0; printf " %11.6f", hist[i1,i0]; printf "\n"; } printf "\n"; } } function data_error(msg) { printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr"; printf " «%s»\n", $0 > "/dev/stderr"; abort = 1; exit(abort); } function data_warning(msg) { printf "%s:%s: !! warning - %s\n", FILENAME, FNR, msg > "/dev/stderr"; printf " «%s»\n", $0 > "/dev/stderr"; } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; abort = 1; exit(abort); }