#! /usr/bin/gawk -f # Last edited on 2014-01-28 19:14:25 by stolfilocal # A {gawk} library function to compute percentiles of weighted data. # To be included into other {gawk} programs with "-f". function find_percentile(n,x,xblur,w,pct, k,wtot,wgoal,xmin,xmax,ylo,wlo,yhi,whi,ymd,wmd,weps,yeps,est) { # Assumes that {x[0..n-1]} is an usorted list of real values with weights {w[0..n-1]}. # Returns the value {y} such that {sum_weight_below(n,x,w,y)} is approximately {pct*wtot} # where {wtot} is the sum of all weights. # Assumes that each {x[i]} is spread out over the interval {x[i]±xblur}. # printf "=== find_percentile(%.6f) ===\n", pct > "/dev/stderr"; # Computes the total weight {wtot} and the range of values {xmin,xmax}: wtot = 0; xmin = x[0]; xmin = y[0]; for (k = 0; k < n; k++) { wtot += w[k]; if (x[k] < xmin) { xmin = x[k]; } if (x[k] > xmax) { xmax = x[k]; } } # Binary search for {y}. wgoal = pct*wtot; ylo = xmin - xblur; wlo = 0.0; # {sum_weight_below(n,x,w,ylo)}. yhi = xmax + xblur; whi = wtot; # {sum_weight_below(n,x,w,yhi)}. est = 0; # Estimator to use. # Tolerances: weps = 0.0001*wtot; # Assumed tolerance in {wgoal}. yeps = 0.00000001; # Assumed tolerance in {y} while ((yhi - ylo > yeps) && ((whi - wlo) > weps)) { # At this point we must have {wlo <= wgoal < whi}. # printf " %10.7f %10.7f %10.7f %10.7f\n", ylo, wlo, yhi, whi > "/dev/stderr"; if (s == 0) { # Estimate {y} by bissection: ymd = (yhi + ylo)/2; } else { # Estimate {y} by secant method: ymd = (yhi*(whi-wgoal) + ylo*(wgoal-wlo))/(whi - wlo); } est = 1 - est; wmd = sum_weight_below(n,x,xblur,w,ymd); # Decide which side to go: if (wmd < wgoal - weps/2) { ylo = ymd; wlo = wmd; } else if (wmd > wgoal + weps/2) { yhi = ymd; whi = wmd; } else { ylo = ymd; wlo = wmd; yhi = ymd; whi = wmd; } } if ((yhi - ylo <= yeps) || (whi - wlo <= weps)) { ymd = (ylo + yhi)/2; } else { ymd = (yhi*(whi-wgoal) + ylo*(wgoal-wlo))/(whi - wlo); } # printf " y = %10.7f\n", ymd > "/dev/stderr"; return ymd; } function sum_weight_below(n,x,xblur,w,y, k,sw,dx) { # Assumes that {x[0..n-1]} is an usorted list of real values with weights {w[0..n-1]}. # Returns the sum of all {w[i]} with {x[i] < y-xblur}, plus a fracion of # all {w[i]} with {y-xblur <= x[i] <= y+xblur}. sw = 0; for (k = 0; k < n; k++) { if (x[k] < y - xblur) { sw += w[k]; } else if (x[k] <= y + xblur) { dx = (y - x[k] + xblur)/xblur/2; # Containment fraction. sw += dx*w[k]; } } return sw; }