#! /bin/bash -ue
# Last edited on 2025-05-04 23:04:23 by stolfi

# Tabulates how many of the bad tokens or lexemes were excluded because
# of not-so-weird weirdos (rare glyphs like <b>, <g>, <j>, <v>) or
# inconsistent readings of <iin>/<iiin>:

# Command-line arguments:
#   $1      language tag; selects the data directory "gen/${lang}".
#   $2...   names of the books to tabulate (sub-directories of that one).

# Use "${1-}" rather than "$1": under the "-u" option of the shebang line a
# missing argument would otherwise abort with "unbound variable" before the
# explicit check below could print its message.
lang="${1-}"
if [[ -z "${lang}" ]]; then echo "** lang not specified" 1>&2; exit 1; fi

# Shift only after the check, so that "shift" never runs with no arguments.
shift
books=( "$@" )

gen_lang_dir="gen/${lang}"

# Format shared by the two header lines.  Each "%-12s" column plus its
# separating blank spans the same 13 characters as one " %5d%7s" pair of
# counts in the table rows.
headfmt="      %-14s %-12s %-12s %-12s %-12s"

# Prints the column titles and the dashed rule under them.
# Arguments of the shell "printf" are separated by blanks only; a C-style
# comma after an argument would be printed as part of the output.
print_table_header() {
  printf "${headfmt}\n" \
    "lang/book/sec" "all bad   " "[?]    " "[bgjv...] " "[ai?n]   "
  printf "${headfmt}\n" \
    "--------------" "------------" "------------" "------------" "------------"
}

# Reads records from stdin and prints, without a newline, " NNNNN    (T)":
# NNNNN is the number of non-empty records (lexemes) and T is the sum of
# their first field (tokens).
print_lexeme_and_token_counts() {
  gawk '/./{w++;t+=$1;} END{printf " %5d%7s", w, sprintf("(%d)",t);}'
}

print_table_header
for book in "${books[@]}"; do
  for sec in tot.1 ; do
    printf "      %-14s" "${lang}/${book}/${sec}"
    bad_wfr="${gen_lang_dir}/${book}/${sec}/bad.wfr"

    # NOTE: "cat FILE | ..." is kept on purpose: when the file is missing
    # the row shows zero counts and the script goes on, as it always did.

    # Column 'all bad': every rejected entry.
    cat "${bad_wfr}" \
      | print_lexeme_and_token_counts

    # Column '[?]': field 3 contains "*" or "?".
    cat "${bad_wfr}" \
      | gawk '($3 ~ /[*?]/){ print; }' \
      | print_lexeme_and_token_counts

    # Column '[bgjv...]': field 3 consists of lowercase letters only.
    cat "${bad_wfr}" \
      | gawk '($3 ~ /^[a-z]*$/){ print; }' \
      | print_lexeme_and_token_counts

    # Column '[ai?n]': field 3 ends in <a> or <o>, a run of <i> with
    # exactly one "?" next to an <i>, and a final <n>.
    cat "${bad_wfr}" \
      | gawk '($3 ~ /^[a-z]*[ao][i]*([?][i]|[i][?])[i]*[n]$/){ print; }' \
      | print_lexeme_and_token_counts

    printf "\n"
  done
done

# Prints the legend that explains the columns of the summary table.
# The column names quoted here must match the titles in the table header
# ('all bad', '[?]', '[bgjv...]', '[ai?n]').
print_legend() {
  printf '%s\n' \
    "    Each pair of numbers is the count of rejected lexemes in the lexicon" \
    "    and (in parentheses) of tokens in the text." \
    "    " \
    "    Column 'all bad' is the total rejected lexemes (tokens)." \
    "    " \
    "    Column '[?]' counts only those that are unreadable, contentious," \
    "    or contain non-basic weirdos (non-lowercase EVA characters)." \
    "    " \
    "    Column '[bgjv...]' counts those that contain some of the rare" \
    "    EVA glyphs <b>, <g>, <j>, <v>, <u>, <z>, or" \
    "    nonstandard uses of <c> and <h>." \
    "    " \
    "    Column '[ai?n]' counts those that have <iin>/<iiin> discrepancies."
}

print_legend

# For each book, lists the rejected entries whose field 3 consists of
# lowercase letters only, most frequent first.
# The expansions are quoted so that a book name or path containing blanks
# or glob characters is passed through as a single word.
for book in "${books[@]}"; do
  for sec in tot.1 ; do
    printf "Lexemes rejected from %s:\n\n" "${lang}/${book}/${sec}.wfr"
    bad_wfr="${gen_lang_dir}/${book}/${sec}/bad.wfr"
    # Pipeline stages:
    #   gawk  keeps fields 1 and 3 of the all-lowercase entries;
    #   sort  orders by decreasing numeric count, then by the word;
    #   ./format_counts_packed.gawk  formats the listing (project script;
    #         presumably packs several entries per line -- TODO confirm);
    #   sed   indents every output line by four blanks.
    # "cat FILE | ..." is kept so that a missing file still yields an empty
    # listing instead of changing how the pipeline fails.
    cat "${bad_wfr}" \
      | gawk '($3 ~ /^[a-z]*$/){print $1, $3; }' \
      | sort -b -k1nr -k2 \
      | ./format_counts_packed.gawk \
      | sed -e 's/^/    /'
    printf "\n"
  done
done
