#! /bin/bash -ue
# Last edited on 2025-05-04 22:45:29 by stolfi

# Creates data files specific for a language ${lang}, book ${book}, and
# section ${sec}, all placed in folder "gen/${lang}/${book}/${sec}".

# First it creates an EVT-format file "raw.evt" with the subset of the
# book's EVT file "gen/${lang}/${book}/raw.evt" consisting of only the
# lines in section ${sec}. (If ${sec} is "tot.1", it is just a copy of
# the book's EVT.)

# Then it extracts from that EVT file a list "raw.tlw" with one token
# per line. The format is "{TYPE} {LOC} {TOKEN}" where {TYPE} is
# "a"=alpha, "p"=punct, "s"=symbol, and {LOC} is an EVT locator in the
# format "\{{FNUM}\}\{{UNIT}\}\{{LINE}\}".  For the "ini", "fin", and "mid"
# books, the file "raw.tlw" contains only the first token, only the last
# token, and all but the first and last tokens, respectively, of each
# EVT logical line.

# The script also creates links to "gud.tlw" with names "trunc.tlw" and "whole.tlw"
# for the benefit of other notes, e.g. note 110.

# The tokens in the "raw.tlw" file may contain "?" or "*" for
# contentious, unreadable, or un-transcribable glyphs. Two files
# "bad.tlw" and "gud.tlw" are created containing only the "bad" tokens,
# with "?" or "*", and the "good" tokens, without those characters
# (which normally will be the only ones used for further analysis).

# Then it creates a file "{XXX}.wfr", where {XXX} is "raw", "bad", or "gud", with 
# the counts and frequencies of occurrences of each lexeme in the corresponding "{XXX}.tlw" file.

# It also creates a file "{XXX}.wdf" with the tokens in the corresponding "{XXX}.tlw" file
# formatted as a running text -- separated by single spaces or line breaks,
# without locators, types, etc. and without extra breaks or symbols at 
# paragraph, folio, unit, or page boundaries.


# Command-line arguments.  Each one is fetched with an empty default and
# the "shift" is skipped once the argument list is exhausted, so that a
# missing argument reaches the explicit checks below instead of aborting
# earlier under "-u" (unbound "$1") or "-e" (failed "shift").
lang="${1:-}"; if [[ $# -gt 0 ]]; then shift; fi   # Language ("voyn", "chin", "engl", etc.)
book="${1:-}"; if [[ $# -gt 0 ]]; then shift; fi   # A book of language ${lang} ("prs", "cul", "lab", "maj", etc.)
sec="${1:-}";  if [[ $# -gt 0 ]]; then shift; fi   # A specific section (possibly "tot.1").

# All three arguments are mandatory and must be non-empty:
if [[ "/${lang}" == "/" ]]; then echo "** lang not specified" 1>&2; exit 1; fi
if [[ "/${book}" == "/" ]]; then echo "** book not specified" 1>&2; exit 1; fi
if [[ "/${sec}" == "/" ]]; then echo "** sec not specified" 1>&2; exit 1; fi

# Only the "voyn" language is accepted by this script:
if [[ "/${lang}" != "/voyn" ]]; then echo "** cannot handle language ${lang}" 1>&2; exit 1; fi

# Sample identifier "{lang}/{book}".  NOTE(review): not referenced anywhere
# else in this script; kept in case something outside this file expects it.
smp="${lang}/${book}"

# Top folders for output data and output TeX tables of this note.
# Both are created if missing; the expansions are quoted (and preceded by
# "--") so that unusual argument values cannot be split or taken as options.
gen_sec_dir="gen/${lang}/${book}/${sec}"; mkdir -p -- "${gen_sec_dir}"
tex_sec_dir="tex/${lang}/${book}/${sec}"; mkdir -p -- "${tex_sec_dir}"

# EVT file of the book, with all sections together.
# It must already exist and be non-empty:
book_evt="gen/${lang}/${book}/raw.evt"
if [[ ! ( -s "${book_evt}" ) ]]; then echo "** no file ${book_evt}" 1>&2; exit 1; fi

# Names of all files of this ${lang}/${book}/${sec} combination.
# Everything lives directly in ${gen_sec_dir}.

# The section-specific EVT file, from which all others are derived:
raw_evt="${gen_sec_dir}/raw.evt"

# All tokens: token list, count/frequency table, running text.
raw_tlw="${gen_sec_dir}/raw.tlw"; raw_wfr="${gen_sec_dir}/raw.wfr"; raw_wdf="${gen_sec_dir}/raw.wdf"

# "Good" tokens only (same three kinds of file):
gud_tlw="${gen_sec_dir}/gud.tlw"; gud_wfr="${gen_sec_dir}/gud.wfr"; gud_wdf="${gen_sec_dir}/gud.wdf"

# "Bad" tokens only (same three kinds of file):
bad_tlw="${gen_sec_dir}/bad.tlw"; bad_wfr="${gen_sec_dir}/bad.wfr"; bad_wdf="${gen_sec_dir}/bad.wdf"

# Alternative names for a token list, created as symbolic links:
trunc_tlw="${gen_sec_dir}/trunc.tlw"
whole_tlw="${gen_sec_dir}/whole.tlw"

# Build the section's EVT file ${raw_evt} from the book's EVT file.
# The pseudo-section "tot.1" stands for the whole book, so the file is
# copied verbatim; any other section is filtered through
# "extract_section_from_evt.sh", which is given the section name and a
# table file (presumably mapping folio numbers to sections -- see that script).
# All file names and the section name are quoted so that they always reach
# the commands as single words.
if [[ "${sec}" == "tot.1" ]]; then
  echo "copying the book EVT file ${book_evt} to ${raw_evt} ..." 1>&2
  cat "${book_evt}" > "${raw_evt}"
else
  echo "extracting ${raw_evt} with the lines of section ${sec} from ${book_evt} ..." 1>&2
  fnum_to_sec_tbl="inp/${lang}/fnum-to-subsec.tbl"
  cat "${book_evt}" \
    | ./extract_section_from_evt.sh "${sec}" "${fnum_to_sec_tbl}" \
    > "${raw_evt}"
fi
# Report on the input and output EVT files, then show the first 10 and
# last 5 lines of the result (as implied by the helper's name and arguments):
./vms_wc.sh "${book_evt}" "${raw_evt}"
./show_first_last_lines.sh 10 5 "${raw_evt}"

echo "creating file ${raw_tlw} from ${raw_evt} ..." 1>&2

# Options for "words_from_evt.gawk" that select which tokens of each line
# are kept.  The special books "ini", "mid", and "fin" suppress two of the
# three positions (initial, medial, final); any other book keeps them all.
case "${book}" in
  ini) line_sel=( -v omitMedial=1 -v omitFinal=1 ) ;;
  mid) line_sel=( -v omitInitial=1 -v omitFinal=1 ) ;;
  fin) line_sel=( -v omitInitial=1 -v omitMedial=1 ) ;;
  *)   line_sel=( ) ;;
esac

# The second gawk stage turns each "{LOC} {TOKEN}" line into a
# "{TYPE} {LOC} {TOKEN}" line:
#   * a blank input line becomes the marker line "# =";
#   * the locator (field 1) is wrapped in braces, with every "." or ";"
#     in it replaced by "}{";
#   * a line containing "*" or "?" gets type "s", any other gets type "a".
#
# The array is expanded as ${line_sel[@]+"${line_sel[@]}"} so that each
# option stays a separate, intact word and so that an empty array does not
# trigger an "unbound variable" error under "-u" in bash versions before 4.4.
cat "${raw_evt}" \
  | ./words_from_evt.gawk \
      -v showParags=1 \
      ${line_sel[@]+"${line_sel[@]}"} \
      -v showLocation=1 \
  | gawk \
      ' BEGIN { c = "{f0}{P0}{0}"; } \
        /^ *$/{ print "# ="; next; } \
        /./{ c = ( "{" $1 "}" );  gsub(/[.;]/, "}{", c); } \
        /[*?]/{ print "s", c, $2; next; } \
        /./{ print "a", c, $2; next; } \
      ' \
  > "${raw_tlw}"

# Split ${raw_tlw} into the "good" and "bad" token files.  Both runs of
# "select_gud_bad_voyn_words.gawk" look at field 3 (the token) and get the
# same identification of the sample; they differ only in the writeGud and
# writeBad flags.  The common options are kept in one array, with every
# value quoted so that it is passed as a single word.
sel_opts=( \
  -v field=3 \
  -v lang="${lang}" \
  -v book="${book}" \
  -v sec="${sec}" \
)

echo "separating good tokens file ${gud_tlw} from ${raw_tlw} ..." 1>&2
cat "${raw_tlw}" \
  | ./select_gud_bad_voyn_words.gawk \
      "${sel_opts[@]}" \
      -v writeGud=1 -v writeBad=0 \
  > "${gud_tlw}"

echo "separating bad tokens file ${bad_tlw} from ${raw_tlw} ..." 1>&2
cat "${raw_tlw}" \
  | ./select_gud_bad_voyn_words.gawk \
      "${sel_opts[@]}" \
      -v writeGud=0 -v writeBad=1 \
  > "${bad_tlw}"
  
# Create "whole.tlw" and "trunc.tlw" as relative symbolic links to the
# good-token file.  The links are made from inside ${gen_sec_dir} using
# bare file names, so the target and each link must live in that folder.
echo "creating links ${whole_tlw} and ${trunc_tlw} to ${gud_tlw} ..." 1>&2

# Sanity check on the link target (the file actually linked to is ${gud_tlw}):
if [[ "@${gen_sec_dir}" != "@${gud_tlw%/*}" ]]; then echo "** bad dir" 1>&2; exit 1; fi
gud_name="${gud_tlw##*/}"

for link_tlw in "${whole_tlw}" "${trunc_tlw}" ; do
  if [[ "@${gen_sec_dir}" != "@${link_tlw%/*}" ]]; then echo "** bad dir" 1>&2; exit 1; fi
  link_name="${link_tlw##*/}"
  if [[ -L "${link_tlw}" ]]; then
    # An old link is replaced.  Testing "-L" first also catches a dangling
    # link, for which "-e" is false and "ln -s" would fail with "File exists".
    rm -fv -- "${link_tlw}" 2>&1 | sed -e 's:^:  :g' 1>&2
  elif [[ -e "${link_tlw}" ]]; then
    # Never clobber a regular file or directory that has the link's name.
    echo "** ${link_tlw} is not a link" 1>&2; exit 1
  fi
  ( cd "${gen_sec_dir}" && ln -s -- "${gud_name}" "${link_name}" )
done

# For each token list ("raw", "gud", "bad"), create the count/frequency
# table ".wfr" and the running-text file ".wdf" next to it.
for tlw_file in "${raw_tlw}" "${gud_tlw}" "${bad_tlw}"; do
  # Swap only the trailing ".tlw" extension; a substitution over the whole
  # path would hit the first "tlw" found, which could be inside a folder name.
  wfr_file="${tlw_file%.tlw}.wfr"
  echo "creating the count and frequency file ${wfr_file} from ${tlw_file} ..." 1>&2
  # Drop blank and "#" lines, keep the token (field 3), count the distinct
  # tokens, and sort by decreasing count, then by token, before handing the
  # list to "compute_freqs.gawk".
  cat "${tlw_file}" \
    | grep -E -v '^ *([#]|$)' \
    | gawk '/./ { print $3; }' \
    | sort | uniq -c | expand \
    | sort -b -k1nr -k2 \
    | ./compute_freqs.gawk \
    > "${wfr_file}"
  # Print total token counts (sum of field 1 of the ".wfr" file):
  printf "\n    %-32s" "${wfr_file}: "
  cat "${wfr_file}" | gawk '/./{t += $1;} END{print t}'

  wdf_file="${tlw_file%.tlw}.wdf"
  echo "creating running text file ${wdf_file}" 1>&2 
  # Same line filter as above, with the tokens passed to
  # "format_words_filled.sh" with width=72:
  cat "${tlw_file}" \
    | gawk ' /^ *([#]|$)/ { next; } // { print $3; } ' \
    | ./format_words_filled.sh -v width=72 \
    > "${wdf_file}"
  # echo "    sample:" 1>&2 
  # ./show_first_last_lines.sh 20 3 ${wdf_file}  
done