#! /bin/bash -eu
# Last edited on 2026-01-16 19:00:48 by stolfi

# Merges the Stolfi transcriptions ${JS_TEXT_FILE} and ${JS_STAR_FILE}
# completing gaps with a modified version ${RZ_FILE} of Rene's modified
# transcription, while eliminating all complicated details.

# Positional arguments, consumed in order:
#   1. version tag of Stolfi's "text" and "star" transcription files;
#   2. version tag of the file derived from Rene's transcription.
JS_VERSION="$1"
shift
RZ_VERSION="$1"
shift

# Input files, named after the version tags:
JS_TEXT_FILE="text${JS_VERSION}.ivt"
JS_STAR_FILE="star${JS_VERSION}.ivt"
RZ_FILE="full${RZ_VERSION}.ivt"

# Output file with the merged transcription:
JOIN_IVT_FILE="join${JS_VERSION}.ivt"

# Clean up the two versions, add "{pnum}.{lseq}" fields,
# and sort:

echo "CLEANUP AND MERGE STOLFI'S TRANSCRIPTIONS, ADD {PNUM} {LSEQ} SORT ..." 1>&2

# Builds ".js.pevt" from Stolfi's two input files:
#   - both files are concatenated and piped through the project filters
#     "bare_bones_ivt.gawk" and "add_pnum_lseq.sh" (presumably the cleanup
#     and the insertion of the "{pnum}.{lseq}" key as field 1 -- confirm
#     in those scripts);
#   - "sort -k1.1" uses a key that starts at the first character of
#     field 1 and, having no end position, runs to the end of the line;
#   - the final gawk step rewrites the end of field 2 (the locator): the
#     closing ">", together with a ";U" right before it if present,
#     becomes ";U> " (note the trailing blank).
# The file names are quoted so that a version tag with blanks or glob
# characters cannot be split or expanded by the shell.
cat "${JS_TEXT_FILE}" "${JS_STAR_FILE}" \
  | bare_bones_ivt.gawk \
  | add_pnum_lseq.sh \
  | sort -k1.1 \
  | gawk '//{ gsub(/([;]U|)>$/, ";U> ", $2); print; }' \
  > .js.pevt

echo "CLEANUP RENE'S TRANSCRIPTION, ADD {PNUM} {LSEQ}, SORT ..." 1>&2

# Builds ".rz.pevt" from ${RZ_FILE} with the same pipeline used for
# Stolfi's files (project filters "bare_bones_ivt.gawk" and
# "add_pnum_lseq.sh", then "sort -k1.1").  The final gawk step rewrites
# the end of field 2 (the locator): the closing ">", together with a ";"
# plus one uppercase letter right before it if present, becomes ";Z> "
# (note the trailing blank).
# The file name is quoted so that a version tag with blanks or glob
# characters cannot be split or expanded by the shell.
bare_bones_ivt.gawk < "${RZ_FILE}" \
  | add_pnum_lseq.sh \
  | sort -k1.1 \
  | gawk '//{ gsub(/([;][A-Z]|)>$/, ";Z> ", $2); print; }' \
  > .rz.pevt

# Line and word counts of the two keyed files, on stdout:
wc -lw .js.pevt .rz.pevt

echo "SOME CONSISTENCY CHECKS ..." 1>&2

# Runs the same three checks on each of the two keyed files.  Each check
# saves its findings in a scratch file (".perrs", ".pdups", ".invds") and
# echoes them to stderr when that file is not empty.
for f in .js.pevt .rz.pevt; do
  echo "=== $f ===" 1>&2

  # Sorted list of the keys (field 1) of this file:
  gawk '//{ print $1; }' "$f" | sort > .ps

  # Check for invalid fnums (turned into "pXXX" or "p???").
  # grep exits with status 1 when nothing matches, which is the good
  # case here; only that status is tolerated, so that the "-e" shell
  # option does not abort the script on a clean file.
  grep -E -e 'p[?][?][?]|pXXX' .ps > .perrs || [[ $? -eq 1 ]]
  if [[ -s .perrs ]]; then echo "invalid {fnum}/{pnum}:" 1>&2; cat .perrs 1>&2; fi

  # Check for duplicated pnums ("uniq -d" prints one copy of each key
  # that is repeated on adjacent lines of the sorted list):
  uniq -d .ps > .pdups
  if [[ -s .pdups ]]; then echo "duplicated {pnum}.{lseq}:" 1>&2; cat .pdups 1>&2; fi

  # Check for validity with the project validator; because of the extra
  # key in field 1, the locator is field 2 and the data is field 3:
  validate_25e1_ivt_format.gawk \
      -v locField=2 \
      -v dataField=3 \
      -v validChars='abcdefghijklmnopqrstuvxyz' \
      -v requireUnitHeaders=0 \
      -v requirePageHeaders=0 \
      -v checkLineOrder=1 \
      -v allowAlignmentMarks=1 \
      -v allowParagMarks=1 \
    < "$f" \
    > .invds
  if [[ -s .invds ]]; then echo "invalid lines:" 1>&2; cat .invds 1>&2; fi

done

echo "JOIN FILES BY {PNUM}.{LSEQ} ..." 1>&2

# Full outer join of the two keyed files on field 1: "-a 1 -a 2" also
# outputs the lines that have no partner in the other file, and
# "-e MISSING" fills their absent fields.  Output fields are:
#   key, Stolfi locator, Rene locator, Stolfi data, Rene data.
# NOTE(review): join needs both inputs ordered on field 1 under the same
# collation that "sort" used when the files were built -- confirm that
# the locale in use keeps the two consistent.
join -1 1 -2 1 -a 1 -a 2 -e MISSING -o 0,1.2,2.2,1.3,2.3 .js.pevt .rz.pevt > .jsrz.pevev

echo "CHECK FOR STOLFI LINES NOT PRESENT IN RENE'S ..." 1>&2

# Prints the joined lines whose Rene locator (field 3) or Rene data
# (field 5) is the "MISSING" filler.  The filler must be compared as a
# quoted string: a bare MISSING is an unset awk variable, i.e. "".
# Arguments: input files, passed on to gawk (stdin when none is given).
list_missing_in_rene() {
  gawk '($3 == "MISSING") || ($5 == "MISSING") { print; }' "$@"
}

list_missing_in_rene .jsrz.pevev > .sbugs
if [[ -s .sbugs ]]; then echo "missing in Rene's:" 1>&2; cat .sbugs 1>&2; fi

echo "SELECTING STOLFI'S IF PRESENT ELSE RENE'S ..." 1>&2

# Reads joined lines from stdin, with the five fields
#   key, Stolfi locator, Rene locator, Stolfi data, Rene data
# and writes one "locator data" line per input line to stdout: the
# Stolfi pair when it is present, otherwise the Rene pair.
# When both are present, the two data strings are compared after
# deleting "," and "." and mapping "a" to "o"; the first 10 lines that
# still differ are shown on stderr (in that normalized form), followed by
# a count of the ones not shown.  The Stolfi pair is output either way.
# A malformed input line is reported on stderr and ends gawk with exit
# status 1.
select_js_else_rz() {
  gawk '
    # Reports the current line as bad and stops with exit status 1.
    function data_error(msg) {
      printf "line %d: ** %s\n  [[%s]]\n", FNR, msg, $0 > "/dev/stderr"
      nerr++
      exit(1)
    }

    // {
      if (NF != 5) { data_error("bad NF") }
      ps = $1; jsloc = $2; rzloc = $3; jsdat = $4; rzdat = $5;
      if ((jsloc == "MISSING") && (rzloc == "MISSING")) { data_error("bad join") }
      # No Stolfi version of this line: use the other one as is.
      if ((jsloc == "MISSING") && (jsdat == "MISSING")) { print rzloc,rzdat; next }
      # A Stolfi locator without data, or vice versa, is inconsistent.
      if ((jsloc == "MISSING") || (jsdat == "MISSING")) { data_error("bad join") }
      jsdat_s = jsdat; gsub(/[,.]/, "", jsdat_s); gsub(/[a]/, "o", jsdat_s)
      rzdat_s = rzdat; gsub(/[,.]/, "", rzdat_s); gsub(/[a]/, "o", rzdat_s)
      if (jsdat_s == rzdat_s) { print jsloc,jsdat; next }
      ndif++
      if (ndif <= 10) {
        printf "%9s %-14s %s\n", ps, jsloc, jsdat_s > "/dev/stderr";
        printf "%9s %-14s %s\n", "", rzloc, rzdat_s > "/dev/stderr";
        printf "\n" > "/dev/stderr";
      }
      print jsloc,jsdat; next
    }

    END {
      if ((nerr == 0) && (ndif > 10)) {
        printf "... and %d more differing lines not shown\n", ndif - 10 > "/dev/stderr"
      }
    }
  '
}

# The output file is left read-only by a previous run; make it writable
# again, but only if it exists, so that a first run is not aborted by the
# "-e" shell option.
if [[ -e "${JOIN_IVT_FILE}" ]]; then chmod u+w "${JOIN_IVT_FILE}"; fi
select_js_else_rz < .jsrz.pevev > "${JOIN_IVT_FILE}"
chmod a-w "${JOIN_IVT_FILE}"
  
echo "VALIDATING THE MERGED EVT FILE ..." 1>&2

# Runs the project validator on the merged file.  Unlike the checks on
# the keyed intermediate files, no locField/dataField overrides are
# given, since the merged lines carry no leading key field.  Whatever the
# validator prints is saved in ".invds" and echoed to stderr.
validate_25e1_ivt_format.gawk \
    -v validChars='abcdefghijklmnopqrstuvxyz' \
    -v requireUnitHeaders=0 \
    -v requirePageHeaders=0 \
    -v checkLineOrder=1 \
    -v allowAlignmentMarks=1 \
    -v allowParagMarks=1 \
  < "${JOIN_IVT_FILE}" \
  > .invds
if [[ -s .invds ]]; then echo "invalid lines:" 1>&2; cat .invds 1>&2; fi


# Split the merged file by transcriber code (";U>" or ";Z>" in the
# locator) and print the line counts of the parts and of the whole.
# grep exits with status 1 when no line matches; that is a legitimate
# outcome (e.g. no ";Z>" line was needed), so only that status is
# tolerated and the "-e" shell option does not abort the script.
grep -E -e '[;]U>' "${JOIN_IVT_FILE}" > .join-js.ivt || [[ $? -eq 1 ]]
grep -E -e '[;]Z>' "${JOIN_IVT_FILE}" > .join-rz.ivt || [[ $? -eq 1 ]]
wc -l .join-js.ivt .join-rz.ivt "${JOIN_IVT_FILE}"

# Per-page statistics, computed by a project script:
count_U_Z_per_page.gawk < "${JOIN_IVT_FILE}"

