#! /bin/bash -eu
# Last edited on 2026-03-06 14:32:01 by stolfi
   
# USAGE: "{CMD} {USIZE} {BINSIZE} < {WPP_FILE} > {NWH_FILE}"
#  
# Each line of the input file {WPP_FILE} must be "{LOC} {NUNITS}"
# where {LOC} is the locus ID of a parag (like "b.1.2.033" or
# "f103v.12") and {NUNITS} is an integer count of text units (chars,
# words, etc) in that parag.
#
# The parameter {USIZE} is the nominal number of hanzi that are
# equivalent to one text unit.
# 
# Writes to {NWH_FILE} the histogram of those counts, each multiplied
# by {USIZE}, split into bins of size {BINSIZE}.

# Main script body.  Reads "{LOC} {NUNITS}" lines from stdin, multiplies
# each count by {USIZE}, and pipes the resulting "{LOC} {SCALED_COUNT}"
# lines into "./make_histogram.gawk", whose output goes to stdout.
# All diagnostics go to stderr.

# Repeat the shebang options here (plus pipefail) so that they also hold
# when the script is started as "bash {CMD} ...", which ignores the
# options given on the "#!" line.
set -eu -o pipefail

if [[ $# -lt 2 ]]; then
  echo "usage: $0 {USIZE} {BINSIZE} < {WPP_FILE} > {NWH_FILE}" 1>&2
  exit 1
fi

usize="$1"; shift
bin_size="$1"; shift

echo "  ~~~ $0 ~~~" 1>&2
echo "  using usize = ${usize}" 1>&2
echo "  using bin_size = ${bin_size}" 1>&2

# {bsplit} is the result of the "bc" comparison: 1 if {usize} differs
# from 1, 0 otherwise.  It is passed on to the histogram script.
bsplit=$( echo "${usize} != 1" | bc -lq )
if [[ "${bsplit}" != "0" && "${bsplit}" != "1" ]]; then
  # "bc" printed nothing usable, e.g. because {usize} is empty or malformed.
  echo "  ** could not evaluate usize = '${usize}' with bc" 1>&2
  exit 1
fi
echo "  using bsplit = ${bsplit}" 1>&2

# Keep only lines that start with a locus ID: a lowercase letter followed
# by either a digit (as in "f103v.12") or a dot (as in "b.1.2.033").
# Other lines are silently discarded.  The "+0" forces the count to be
# treated as a number before scaling.
# NOTE(review): the meaning of the "-v" options given to
# "./make_histogram.gawk" is defined in that script; presumably {col}
# selects the input column and {step} is the bin width -- confirm there.
gawk -v usize="${usize}" \
    '/^[a-z][.0-9]/{ loc = $1; wct = $2+0; print loc, wct*usize }' \
  | ./make_histogram.gawk \
      -v col=2 \
      -v sync=-0.5 -v step="${bin_size}" -v bsplit="${bsplit}" -v bround=0 \
      -v verbose=1

