#! /usr/bin/gawk -f
# Last edited on 2014-02-07 18:34:46 by stolfilocal

# Reads a file with summary of trade data for equal and equally-spaced intervals of some
# independent variable {u} (time, BTC volume, or national currency volume),
# as produced by {extract_time_or_volume_series.gawk}.
# 
# The client must define (with "-v") the variables
# 
#    {opref}    output filename prefix.
#    {rmax}     max time gap to consider, in multiples of the independent variable step.
#    {confpct}  desired probability of inclusion in confidence range.
# 
# Writes a file "{opref}_i.txt" with the raw increment data. For each
# {r} up to {rmax}, and for each line index {i = 0, r, 2*r, ...} of the
# input file such that {i+r} is also a valid index (that is, for
# non-overlapping strides of {r} steps), writes a line in the format
# "{i} {i+r} {r} {DUVAR[i,r]} {DTIME[i,r]} {DVBTC[i,r]} {DVNAT[i,r]} {DZVAR[i,r]}",
# followed by a blank line after the last stride of each {r}, where 
#
#  {DUVAR[i,r]} is the increment of the independent variable {u} between
#    the centers of intervals {i} and {i+r},
#  
#  {DTIME[i,r],DVBTC[i,r],DVNAT[i,r]} are the total time, BTC volume, and
#    national currency volume between the centers of intervals {i} and {i+r},
# 
#  {DZVAR[i,r]} is the increment in {log10(price)} between input lines
#    {i} and {i+r}, that is, {ZVAR[i+r] - ZVAR[i]}.

# Also writes a file "{opref}_r.txt" with one line for each positive integer {r} up to {rmax}
# with the format "{r} {AVG[r]} {DEV[r]} {LOQ[r]} {HIQ[r]} {PDV[r]}"
# where 
# 
#   {AVG[r]} is the average of the increments in {log10(price)} after {r} input steps,
#   {DEV[r]} is the deviation of those increments, assuming zero mean,
#   {LOQ[r]} is the low (negative) end of some percentile range of those intervals,
#   {HIQ[r]} is the high (positive) end of some percentile range of those intervals,
#   {PDV[r]} is the expected deviation according to the {u}-based predictive model.
#
# The predictive model assumes that the increments in {log10(price)} between
# successive {u} intervals are independent identically distributed random
# variables. Therefore, the variance of the total increment after {r}
# {u}-steps should be {C^2*r} for some constant {C}.

BEGIN \
  { 
    # Validates the client-supplied ("-v") parameters and initializes
    # the (empty) tables that will hold the input data.

    if (opref == "") { arg_error(("must define {opref}")); }

    # Maximum stride {r}, in input steps. Forced to numeric so that the
    # loop tests on {r} are numeric comparisons; must be at least 1, 
    # otherwise no increment pair could ever be collected.
    if (rmax == "") { arg_error(("must define {rmax}")); }
    rmax += 0;
    if (rmax < 1) { arg_error(("invalid {rmax}")); }

    # Fraction (not percent, despite the name) of the weight to include
    # in the confidence range; must lie in {[0.01 _ 0.99]}.
    if (confpct == "") { arg_error(("must define {confpct}")); }
    confpct += 0;  
    if ((confpct < 0.01) || (confpct > 0.99)) { arg_error(("invalid {confpct}")); }
    
    ifile = (opref "_i.txt"); # Output file with individual increments.
    rfile = (opref "_r.txt"); # Output file with parameters per time step {r}.

    n = 0; # Number of valid input entries.
    # Summary trade data for the equal and equally-spaced time intervals in input file:
    # Indexed with interval number {i} in {0..n-1}:
    split("", uvar_in);    # Indep variable {u} in middle of interval.
    split("", duvar_in);   # Width of that interval in {u}.
    split("", time_in);    # Time at middle of interval.
    split("", dtime_in);   # Duration of interval.
    split("", vbtc_in);    # Accum BTC volume up to middle of interval.
    split("", dvbtc_in);   # BTC volume in interval.
    split("", vnat_in);    # Accum national currency volume to middle of interval.
    split("", dvnat_in);   # National currency volume in interval.
    split("", zvar_in);    # Log10 of Weighted mean price in interval.
  } 

# Ignore blank lines, lines whose first non-blank character is "#",
# and any line that contains a "!" anywhere:
$0 ~ /(^[ ]*([#]|$))|[!]/ { next }

/^[ ]*[-+]?[0-9]*([0-9]|[.][0-9]*)[ ]*[-+]?[0-9]*([0-9]|[.][0-9]*)/ \
  { # Data line (begins with two numeric fields).  It must have exactly
    # 10 fields, and its first field must be the next sequential index {n}.
    if (NF != 10) { data_error("invalid field count"); }
    i_in = $1 + 0;
    if (i_in != n) { data_error(("line number error")); }

    # Store fields 2..10, forced to numeric, as entry {n} of the input tables:
    uvar_in[n]  = $2 + 0;    # Indep variable in middle of interval.
    duvar_in[n] = $3 + 0;    # Width of that interval.
    time_in[n]  = $4 + 0;    # Time at middle of interval.
    dtime_in[n] = $5 + 0;    # Duration of interval.
    vbtc_in[n]  = $6 + 0;    # Accum BTC volume up to middle of interval.
    dvbtc_in[n] = $7 + 0;    # BTC volume in interval.
    vnat_in[n]  = $8 + 0;    # Accum national currency volume to middle of interval.
    dvnat_in[n] = $9 + 0;    # National currency volume in interval.
    zvar_in[n]  = $10 + 0;   # Log10 of Weighted mean price in interval.

    # Mid-interval values must be non-negative and interval sizes strictly
    # positive.  {data_error} does not return, so at most one message is printed.
    if ((uvar_in[n] < 0) || (duvar_in[n] <= 0)) 
      { data_error(("invalid {uvar,duvar}")); }
    else if ((time_in[n] < 0) || (dtime_in[n] <= 0)) 
      { data_error(("invalid {time,dtime}")); }
    else if ((vbtc_in[n] < 0) || (dvbtc_in[n] <= 0)) 
      { data_error(("invalid {vbtc,dvbtc}")); }
    else if ((vnat_in[n] < 0) || (dvnat_in[n] <= 0)) 
      { data_error(("invalid {vnat,dvnat}")); }

    # Entry accepted:
    n += 1;
    next;
  }
  
# Any record that reaches this point was neither skipped nor accepted
# as a data line by the rules above, so it is malformed:
{ data_error("invalid line format"); }
  
END \
  { # Computes, for each stride {r} in {1..rmax}, the statistics of the
    # increments of {log10(price)} over {r} input steps; writes the individual
    # increments to {ifile} and the per-{r} summary to {rfile}.

    # In awk, an {exit} executed in BEGIN or in a main rule still runs the
    # END rule.  If {arg_error} or {data_error} was called (they set {abort}),
    # quit now instead of processing missing or partial data.
    if (abort) { exit(abort); }

    printf "read %d valid data points\n", n > "/dev/stderr";
    # Computed data for time increments {r} in {1..rmax}:
    split("", ct);  # {ct[r]} is the number of valid pairs {i,i+r} present in data.
    split("", avg); # {avg[r]} is the average increment in {log10(price)} from discrete time {i} to {i+r}.
    split("", dev); # {dev[r]} is the deviation of those increments assuming zero mean increment.
    split("", loq); # {loq[r]} is the low end of the {confpct} percentile range of those increments.
    split("", hiq); # {hiq[r]} is the high end of the {confpct} percentile range of those increments.
    split("", pdv); # {pdv[r]} is the deviation of the increment after {r} steps in the log-Brownian model.

    # Sums to compute the parameters of the log-Brownian model.
    # In that model, the deviation {pdv[r]} is {C*sqrt(r)} for some constant {C}.
    # Each pair {i,i+r} contributes the estimate {dzvar[i,r]^2/r} of {C^2},
    # with weight {w[i,r] = r*dvnat[i,r]}.  
    sum_wC2 = 0; # Sum of {w[i,r]*(dzvar[i,r]^2/r)} for all {i,r}.
    sum_w = 0;  # Sum of weights {w[i,r]}.

    for (r = 1; r <= rmax; r++)
      { 
        # printf "=== r = %d ===\n", r > "/dev/stderr";
        # Collect the increments {duvar_st[k],dtime_st[k],dvbtc_st[k],dvnat_st[k],dzvar_st[k]},
        # for {k} in {0..ct[r]-1}, for all valid  pairs {i,i+r} present in data:
        ct[r] = 0; # Count of valid pairs with increment {r}.
        split("", duvar_st);
        split("", dtime_st);
        split("", dvbtc_st);
        split("", dvnat_st);
        split("", dzvar_st);
        # Sums for weighted means and variances (weighted by natural volume) of this {r}: 
        sum_vd = 0;   # Sum of {dzvar_st[k]} weighted by {dvnat_st[k]}.
        sum_vd2 = 0;  # Sum of {dzvar_st^2[k]} weighted by {dvnat_st[k]}.
        sum_v = 0;    # Sum of {dvnat_st[k]}.
        istep = r;  # Increment between starts of strides (so strides do not overlap).
        for (i = 0; i < n - r; i += istep)
          { j = i + r;
            if ((dvnat_in[i] > 0) && (dvnat_in[j] > 0)) 
              { # Compute the total time, btc volume, and nat volume between mids of intervals {i} and {j}:
                k = ct[r];
                duvar_st[k] = uvar_in[j] - uvar_in[i];
                dtime_st[k] = time_in[j] - time_in[i];
                dvbtc_st[k] = vbtc_in[j] - vbtc_in[i];
                dvnat_st[k] = vnat_in[j] - vnat_in[i];
                # Compute the increment {dzvar} in {log10(price)} between intervals {i} and {j}:
                dzvar_st[k] = zvar_in[j] - zvar_in[i];
                # Accumulate for mean and deviation computation for this r:
                vri = dvnat_st[k];
                dri = dzvar_st[k];
                sum_vd = sum_vd + vri*dri;
                sum_vd2 = sum_vd2 + vri*dri*dri;
                sum_v = sum_v + vri;
                # Accumulate for log-Brownian model fitting:
                wri = r*dvnat_st[k];
                C2ri = dri*dri/r;
                sum_wC2 += wri*C2ri;
                sum_w += wri;
                # Count this pair; its data stays in the {*_st} tables for the percentile computation:
                ct[r] += 1;
                # Write the stride datum:
                printf "%7d %7d %5d", i, j, r > ifile;
                printf "  %17.6f  %17.6f", duvar_st[k], dtime_st[k] > ifile;
                printf "  %17.6f  %17.6f", dvbtc_st[k], dvnat_st[k] > ifile;
                printf "  %+12.8f", dzvar_st[k] > ifile;
                printf "\n" > ifile;
              }
          }
        printf "\n" > ifile; # Blank line for gnuplot.
        # Compute the mean and deviation of {log10(price)} increments, from {i} to {i+r}:
        if (ct[r] > 0)
          { avg[r] = sum_vd/sum_v;
            dev[r] = sqrt(sum_vd2/sum_v);

            # Determine the balanced range {loq[r],hiq[r]} meant to contain the
            # fraction {confpct} of the weight {dvnat_st[*]}:
            lopct = (1-confpct)/2; # Percentile level for {loq[r]}.
            hipct = 1 - lopct;     # Percentile level for {hiq[r]}.
            zblur = 0.001; # Assumed precision of {z} increments.
            # NOTE(review): {find_percentile} is defined outside this file; 
            # presumably (count, values, blur, weights, level) -- confirm.
            loq[r] = find_percentile(ct[r],dzvar_st,zblur,dvnat_st,lopct);
            hiq[r] = find_percentile(ct[r],dzvar_st,zblur,dvnat_st,hipct);
          }
      }            
    close(ifile);

    # Without any pair (e.g. fewer than two data points) the model
    # constant {C} is undefined; report it instead of dividing by zero.
    if (sum_w == 0)
      { printf "** no valid increment pairs found\n" > "/dev/stderr";
        abort = 1;
        exit(abort);
      }

    printf "log-price increment over {r} steps:\n" > "/dev/stderr";
    C2 = sum_wC2/sum_w;
    C = sqrt(C2);
    printf "  variance ~{%.9f*r}\n", C2 > "/dev/stderr";
    printf "  deviation ~{%.7f*sqrt(r)}.\n", C > "/dev/stderr";

    # Print output file, one line for each {r} in {1..rmax} that has data:
    for (r = 1; r <= rmax; r++) 
      { if (ct[r] > 0)
          { # Compute the deviation predicted by the log-Brownian model:
            pdv[r] = C*sqrt(r);
            # Print a line to {rfile}:
            printf "%7d %+11.7f %11.7f", r, avg[r], dev[r] > rfile; 
            printf "  %+11.7f %+11.7f", loq[r], hiq[r] > rfile;
            printf "  %11.7f", pdv[r] > rfile;
            printf "\n" > rfile;
          } 
      }
    close(rfile);
  }

function data_error(msg)
  { # Reports {msg} to standard error, tagged with the current input file
    # name and record number and followed by the offending record; then
    # sets the global {abort} flag and terminates with that exit status.
    printf "%s:%s: ** %s\n  «%s»\n", FILENAME, FNR, msg, $0 > "/dev/stderr"; 
    abort = 1;
    exit abort;
  } 
          
function arg_error(msg)
  { # Reports the command-line parameter error {msg} to standard error,
    # then sets the global {abort} flag and terminates with that exit status.
    printf("** %s\n", msg) > "/dev/stderr"; 
    abort = 1;
    exit abort;
  }