#! /usr/bin/gawk -f
# Last edited on 2021-01-05 19:10:46 by jstolfi

# Splits a data file of US consumer prices from {stdin} into
# separate files in "00-DATA/split/", one for each series.
# Keeps only monthly data, discarding year and semester averages.
#
# Input lines whose first field starts with "CU" are data points with
# fields {series_id year period value [footnote...]}.  The line starting
# with "series_id" is the file header and is skipped.
#
# Each output line has the form
#   "{YEAR.FF} | {PERIOD} | {VALUE} | [{FOOTNOTES}]"
# where {YEAR.FF} is the year plus the month's fraction of the year
# (truncated to two decimals) and {FOOTNOTES} are the extra input
# fields joined by commas.
#
# A per-series count and the grand totals are written to stderr.
# On a malformed data line the script prints a message to stderr and
# exits with status 1 without printing the summary.

BEGIN {
  abort = -1;             # Exit status requested by {data_error}; negative means none.
  ndata = 0;              # Total data points read, including year/sem avgs.
  ndata_mt = 0;           # Total monthly data points read.
  cur_sid = "";           # Current series ID.
  cur_fname = "BUG.txt";  # Current file name (placeholder until first series).
  cur_nd = 0;             # Monthly data points written for the current series run.
  cur_append = 0;         # True if the current series file was already written in this run.
  # seen[sid] is set once series {sid} has been opened for writing.
}

# Defensive guard: stop processing input once an error has been flagged.
(abort >= 0) { exit abort; }

// {
  # Remove CR (the input may have DOS line endings):
  gsub(/[\015]/, "", $0)
}

/^series_id/ {
  # File header -- skip:
  next;
}

/^CU/ {
  # Data point:
  ndata++;
  if (NF < 4) { data_error("too few fields") }
  sid = $1; year = $2; per = $3; val = $4

  # Collect any remaining fields as a bracketed, comma-separated list:
  foots = "["
  for (i = 5; i <= NF; i++) { foots = ( foots "," $(i)); }
  foots = (foots "]")
  gsub(/\[,/, "[", foots);  # Drop the comma left after the opening bracket.

  # Loose sanity check: first digit 1 or 2, second digit 0, 8, or 9.
  if (year !~ /^[12][089][0-9][0-9]$/) { data_error("bad year \"" year "\""); }
  year += 0;

  if (per ~ /^(M13|S01|S02|S03)$/) {
    # Year or semester average; skip
    next
  }

  ndata_mt++;
  if (per !~ /^M(0[1-9]|1[0-2])$/) { data_error("bad month \"" per "\""); }

  # Convert month to fraction of year, add to year:
  mt = per; gsub(/^M/,"",mt); mt = mt - 1;
  if ((mt < 0) || (mt >= 12)) { data_error("bad numeric month \"" mt "\""); }
  mtf = int(100*(mt/12))/100   # Truncated (not rounded) to two decimals.
  year += mtf

  if (sid != cur_sid) {
    # Series changed: finish the previous file and switch to the new one.
    end_series(cur_sid,cur_fname,cur_nd);
    cur_sid = sid;
    cur_fname = ("00-DATA/split/" cur_sid ".txt");
    cur_nd = 0;
    # If this series already appeared earlier in the input, its file was
    # closed; reopening with ">" would truncate it and lose those points.
    cur_append = (cur_sid in seen);
    seen[cur_sid] = 1;
  }

  if (cur_append) {
    printf "%.2f | %s | %s | %s\n", year, per, val, foots >> cur_fname
  } else {
    printf "%.2f | %s | %s | %s\n", year, per, val, foots > cur_fname
  }
  cur_nd++;
}

END {
  # {exit} in {data_error} still runs this rule; do not print a summary
  # that would look like a successful run.
  if (abort >= 0) { exit abort; }
  end_series(cur_sid,cur_fname,cur_nd);
  printf "%d total data points read\n", ndata > "/dev/stderr"
  printf "%d monthly data points read\n", ndata_mt > "/dev/stderr"
}

function end_series(cur_sid,cur_fname,cur_nd) {
  # Closes the output file {cur_fname} of series {cur_sid} and reports
  # to stderr the number {cur_nd} of data points written to it.
  # Does nothing if {cur_sid} is empty (no series started yet).
  # A series that occurs in several separate runs of the input is
  # reported once per run.
  if (cur_sid != "") {
    close(cur_fname);
    printf "%8d %s\n", cur_nd, cur_sid > "/dev/stderr"
  }
}

function data_error(msg,lin) {
  # Prints "{FILENAME}:{FNR}: ** {msg}" to stderr, followed by the
  # offending line {lin} if given and non-empty.  Then sets {abort}
  # to 1 and exits with status 1.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  if (lin != "") {
    printf "  [[%s]]\n", lin > "/dev/stderr";
  }
  abort = 1;
  exit(1)
}