#! /bin/bash -ue
# Last edited on 2025-09-24 19:32:01 by stolfi

# Command-line handling.  Exactly two positional arguments are required:
#   $1  the sample, written as {LANG}/{BOOK}; used below as a sub-path of "dat/" and "tex/".
#   $2  the size option ("whole" or "trunc" per the usage text); selects
#       which "${sizeopt}.tlw" file of each section is used as input.
cmd="${0##*/}"
usage="${cmd} {LANG}/{BOOK} [whole|trunc]"

if (( $# != 2 )); then
  echo "usage: ${usage}" 1>&2
  exit 1
fi

smp="$1"
sizeopt="$2"
shift 2

echo "=== creating the derived word files dat/${smp}/*/{raw,gud,bad}.{tlw,wdf,wfr} (${sizeopt}) ===" 1>&2

# The source links and the word-mapping tables for {fix_sample_tokens.gawk}
# must all be readable under "dat/${smp}/"; stop at the first one that is not.
required=( word-map.tbl org/main.wds sample_fns.gawk reencode_words_for_tex.gawk )
for req in "${required[@]}" ; do
  [[ -r "dat/${smp}/${req}" ]] && continue
  echo "*** missing dat/${smp}/${req}" 1>&2
  exit 1
done

# Build the files "{raw,gud,bad}.{tlw,wdf,wfr}" and the TeX summaries for
# every section tag listed in "dat/${smp}/sections-ok.tags", and also for
# the extra section "tot.1".
#
# For each section directory "dat/${smp}/${sec}":
#   * "${sizeopt}.tlw" is copied to "raw.tlw" and split by its first field
#     into "gud.tlw" (lines with $1 == "a") and "bad.tlw" (all other lines);
#   * "${kind}.wdf" gets field 3 of every non-comment, non-blank line,
#     piped through "format_words_filled.sh -v width=72";
#   * "${kind}.wfr" gets the same field 3 values counted with "uniq -c",
#     sorted by decreasing count, and piped through "compute_freqs.gawk";
#   * "${kind}-${sizeopt}-wds-summary.tex" is produced by
#     "tex_make_sample_summary.sh" and handed to "update_paper_include.sh"
#     together with the matching path under "tex/".
#
# All progress output goes to stderr.  The script exits with status 1 as
# soon as a required file or directory is missing.

# Fail with an explicit message if the tag list is missing, instead of
# relying on the error output of the command substitution below.
tagsFile="dat/${smp}/sections-ok.tags"
if [[ ! -r "${tagsFile}" ]]; then
  echo "*** missing ${tagsFile}" 1>&2; exit 1
fi

# The tags are whitespace-separated, so word splitting of the file
# contents is intended here.
# shellcheck disable=SC2207
secs=( $(cat "${tagsFile}") )

# Gawk program that prints field 3 of every line that is neither blank
# nor a "#" comment.  Used for both the ".wdf" and the ".wfr" files so that
# the two are derived from exactly the same set of tokens.
wordPgm=' /^ *([#]|$)/ { next; } // { print $3; } '

for sec in "${secs[@]}" "tot.1" ; do
  smpsec="${smp}/${sec}"

  echo "  ... creating word files dat/${smpsec}/{raw,gud,bad}.{tlw,wdf,wfr} ..." 1>&2

  # Both the data and the TeX directories of the section must exist:
  for pd in dat tex ; do
    dir="${pd}/${smpsec}"
    if [[ ! -d "${dir}" ]]; then echo "** directory ${dir} missing" 1>&2; exit 1 ; fi
  done

  # The main ".tlw" file must exist and be non-empty:
  rFile="dat/${smpsec}/${sizeopt}.tlw"
  if [[ ! -s "${rFile}" ]]; then
    echo "** file ${rFile} not found" 1>&2; exit 1
  fi
  wc -l "${rFile}" | sed -e 's:^:    :g' 1>&2

  /bin/rm -fv "dat/${smpsec}"/{raw,gud,bad}*.tlw 2>&1 | sed -e 's:^:    :g' 1>&2
  echo "    creating the token files {raw,gud,bad}.tlw" 1>&2
  cp -a "${rFile}" "dat/${smpsec}/raw.tlw" 2>&1 | sed -e 's:^:    :g' 1>&2
  # Input is given on stdin rather than as an operand, because gawk would
  # treat an operand containing "=" as a variable assignment.
  gawk ' ($1 == "a") { print } ' < "${rFile}" > "dat/${smpsec}/gud.tlw"
  gawk ' ($1 != "a") { print } ' < "${rFile}" > "dat/${smpsec}/bad.tlw"
  # Warn (but keep going) when no line was classified as bad; the message
  # names the file that is actually tested.
  if [[ ( -e "dat/${smpsec}/bad.tlw" ) && ( ! ( -s "dat/${smpsec}/bad.tlw" ) ) ]]; then
    echo "!! empty file dat/${smpsec}/bad.tlw" | sed -e 's:^:    :g' 1>&2
  fi

  for kind in raw gud bad ; do
    rkFile="dat/${smpsec}/${kind}.tlw"
    if [[ ! -e "${rkFile}" ]]; then echo "*** no ${rkFile}" 1>&2 ; exit 1 ; fi

    # Running text file:
    wkFile="dat/${smpsec}/${kind}.wdf"
    /bin/rm -fv "${wkFile}" 2>&1 | sed -e 's:^:    :g' 1>&2
    echo "    creating running text file ${wkFile}" 1>&2
    gawk "${wordPgm}" < "${rkFile}" \
      | format_words_filled.sh -v width=72 \
      > "${wkFile}"
    echo "      sample:" 1>&2
    head -n 20 "${wkFile}" | sed -e 's:^:        :g' 1>&2
    echo "        . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ." 1>&2
    tail -n 3 "${wkFile}" | sed -e 's:^:        :g' 1>&2

    # Lexeme frequency file.  Comment and blank lines are skipped here too,
    # so that they cannot show up as bogus or empty lexemes in the counts.
    fkFile="dat/${smpsec}/${kind}.wfr"
    /bin/rm -fv "${fkFile}" 2>&1 | sed -e 's:^:    :g' 1>&2
    echo "    creating the lexeme frequency file ${fkFile}" 1>&2
    gawk "${wordPgm}" < "${rkFile}" \
      | sort | uniq -c | expand \
      | sort -b -k1nr -k2 \
      | compute_freqs.gawk \
      > "${fkFile}"
    echo "      the 10 most common lexemes in ${rkFile}:" 1>&2
    head -n 10 "${fkFile}" | sed -e 's:^:        :g' 1>&2

    # TeX summary: written under "dat/", then passed to
    # "update_paper_include.sh" with the same relative path under "tex/".
    skFile="dat/${smpsec}/${kind}-${sizeopt}-wds-summary.tex"
    ekFile="tex/${smpsec}/${kind}-${sizeopt}-wds-summary.tex"
    /bin/rm -fv "${skFile}" "${ekFile}" 2>&1 | sed -e 's:^:    :g' 1>&2
    echo "    creating the TeX summary file ${skFile}" 1>&2
    tex_make_sample_summary.sh "${smpsec}" "${sizeopt}" "${kind}" > "${skFile}"
    sed -e 's:^:      :g' < "${skFile}" 1>&2
    update_paper_include.sh "${skFile}" "${ekFile}" | sed -e 's:^:    :g' 1>&2

  done
done

# Print summaries.  Everything is written to stderr, like the rest of the
# progress output, so that each table row stays on a single stream.

# Per kind: run "dicio-wc" on the ".wfr" files of all sections plus "tot.1".
for kind in raw gud bad ; do
  all_fkFiles=()
  for sec in "${secs[@]}" "tot.1" ; do
    all_fkFiles+=( "dat/${smp}/${sec}/${kind}.wfr" )
  done
  dicio-wc "${all_fkFiles[@]}" 2>&1 | sed -e 's:^:  :g' 1>&2
done
printf "\n" 1>&2

# Per section: one row with, for each kind, the sum of field 1 over the
# non-empty lines of its ".wfr" file (presumably the occurrence counts
# written by compute_freqs.gawk -- TODO confirm).  The "+0" makes the sum
# print as 0 for an empty file.
pgm='/./{ n+=$1; next; } END{ print n+0; }'
for sec in "${secs[@]}" "tot.1" ; do
  row="  ${sec}"
  for kind in raw gud bad ; do
    fkFile="dat/${smp}/${sec}/${kind}.wfr"
    # Input on stdin: gawk would treat an operand containing "=" as a
    # variable assignment.
    total="$(gawk "${pgm}" < "${fkFile}")"
    printf -v cell '   %s = %7d' "${kind}" "${total:-0}"
    row+="${cell}"
  done
  printf '%s\n' "${row}" 1>&2
done