#! /usr/bin/python3
# Last edited on 2026-03-11 09:41:24 by stolfi

# The command line arguments are an input file name {raw_file}, a unit type
# name {utype} ("ch" or "ec"), a keyword pattern {kword}, and an
# output file name {opa_file}.
#
# Reads the file {raw_file}. The file should be in an IVTFF-like format,
# with lines in the format "<{LOC}> {TEXT}".  Typically it should
# have one paragraph per line.
#
# Performs on the {TEXT} of each line the cleanup appropriate for the
# given {utype}, obtaining a cleaned text {TCLEAN}. See
# {clean_up_raw_text} in {size_position_funcs.py} for details. Then
# looks for the first occurrence of a given string or RE pattern {kword}
# in the {TCLEAN}.
# 
# Writes to {opa_file} a line for each input line that matches
# {kword}. Each output line has the format "{LOC} {WPOS} {PREF} {MIDF}
# {SUFF} {TEXT}" where {MIDF} is the substring of {TCLEAN} that matched
# the {kword}; "{PREF}", "{MIDF}", and "{SUFF}" are a tripartition of
# the cleaned text {TCLEAN}; {WPOS} is the position of the match
# (that is, the length of {PREF} in EVA or hanzi characters);
# and {TEXT} is the original text, before that cleanup but with
# inline comments and some punctuation stripped.
#
# Actually {PREF} is preceded by a period, {MIDF} is enclosed in
# brackets, and {SUFF} is ended by a period; where these delimiters are
# '.', '[', and ']' for "starps", and '。', '［', and '］' for "bencao".
# This way, the three fields will never be empty.
#
# If {utype} is "ec", the {kword} must match only the characters
# '[a-z?]'. If {utype} is "ch", it must match only simplified hanzi
# characters. In either case it must not contain any punctuation or the
# special patterns '^', '$', and '\b'.
# 
# In any case the input file is assumed to be in Unicode UTF-8 encoding,
# and so will be the output file.

import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats, name_for_tex_macro
import size_position_funcs as spf

def main(raw_file, utype, kword, opa_file):
  # Scans {raw_file} for data lines whose cleaned text matches {kword},
  # writes one line per matching input line to {opa_file}, prints
  # summary counts to {stderr}, and finally writes a companion TeX
  # parameters file via {write_TeX_parms_file}.
  #
  # {raw_file}  Name of input file.
  # {utype}     Either "ec" or "ch".
  # {kword}     String or RE pattern to search.
  # {opa_file}  Name of output file.
  #
  # Both files are read and written in UTF-8.  An invalid {utype} is
  # reported through {arg_error}; a malformed {kword} raises {re.error}
  # before any file is opened.
  
  # Field delimiters of the output lines:
  if utype == "ch":
    # Full-width delimiters, for the "bencao" book.
    dot_del = '。'; bra_del = '［'; ket_del = '］'
  elif utype == "ec":
    # ASCII delimiters, for the "starps" book.
    dot_del = '.'; bra_del = '['; ket_del = ']'
  else:
    arg_error(f"invalid {utype = !r}")

  # Compile the pattern once instead of passing the string on every line:
  kword_re = re.compile(kword)

  # Only {pat_line} is used here; the other patterns are ignored.
  pat_line, pat_unit, pat_sepa, clean_sepa = spf.get_parsing_patterns(utype)

  tot_para = 0 # Count of input data lines (parags).
  tot_with = 0 # Total input lines that matched {kword}.
  
  wr = None # Output stream; bound when the files are opened below.

  def process_input_line(nline, line):
    nonlocal tot_para, tot_with 
    # 
    # Parses a line {line} assuming it is line {nline} of the file.
    # The {line} is always a string (never {None}), but may be "" if the line
    # is empty.
    # 
    # Ignores the line if it is a blank or #-comment, or looks like
    # an IVTFF page header.
    # 
    # Otherwise the line must be a data line, matching {pat_line}
    # 
    # Increments {tot_para} for each data line.
    # 
    # Gets {tclean} by cleaning up {text}. Looks for the string or RE
    # pattern {kword} in {tclean}. If found, writes the corresponding
    # output line to {wr} and increments {tot_with}.

    # Should we debug the line?
    debug = False
    
    def data_error(msg):
      # Reports a problem with the current input line.
      # NOTE(review): {file_line_error} is presumably expected not to
      # return; the assertion below guards against it returning.
      file_line_error(raw_file, nline, msg, line)
      assert False
      # ----------------------------------------------------------------------
  
    assert line is not None, "The {line} arg must not be {None}" 

    # Ignore comments and blank lines:
    if re.match(r" *([#]|$)", line): return

    # Just in case, ignore IVTFF page headers:
    if re.match(r"<f[0-9]+[rv][0-9]*>", line): return
    
    tot_para += 1

    m = re.match(pat_line, line)
    if m is None: 
      # Invalid line format.
      data_error("invalid line format")

    # Parse the line into locus ID {loc} and raw text:
    assert m.lastindex == 2, f"bug {m.lastindex = }"
    loc = m.group(1)
    raw_text = m.group(2) 

    # Cleanup the raw text:
    tclean = spf.clean_up_raw_text(raw_text, utype, data_error)
    if debug: err.write(f"!~ {tclean = !r}\n")

    # Search for the first occurrence of {kword} in the {tclean}:
    m = kword_re.search(tclean)
    if m is None: return

    wbeg = m.start()
    wend = m.end()
    if debug: err.write(f"!~ {loc:<12s} {wbeg = } {wend = }\n")
    tot_with += 1

    # Tripartition of the cleaned text around the match:
    pref = tclean[0:wbeg]
    midf = tclean[wbeg:wend]
    suff = tclean[wend:]

    # Lightly cleaned copy of the raw text, written for reference:
    tdirt = raw_text
    if utype == "ec":
      tdirt = re.sub(r"[<][!][^<>]*[>]", "", tdirt) # Inline "<!...>" comments.
      tdirt = re.sub(r"[«=»]", "", tdirt)
      tdirt = re.sub(r"[,]", "", tdirt)
      tdirt = re.sub(r"[-]", ".", tdirt)
    elif utype == "ch":
      tdirt = re.sub(r"（[^（）]*）", "", tdirt) # Parenthesized insertions.
      # Character class, so that each punctuation mark is removed
      # wherever it occurs (not only the five marks in a row):
      tdirt = re.sub(r"[［］：。；]", "", tdirt)
    else:
      assert False

    wr.write( \
      "%-8s %3d %s%s %s%s%s %s%s %s\n" % \
      ( loc, wbeg, dot_del, pref, bra_del, midf, ket_del, suff, dot_del, tdirt )
    )

    return      
    # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

  # The {with} closes both files even if the line loop fails:
  with open(raw_file, "r", encoding='utf-8') as rd, \
       open(opa_file, "w", encoding='utf-8') as wr:
    wr.write("# -*- coding: utf-8 -*-\n")
    err.write(f"reading file '{raw_file}' ...\n")
    nread = basic_line_loop(rd, process_input_line)

  # Ratio of parags with matches (zero if there were no parags at all):
  frac_match = tot_with/tot_para if tot_para > 0 else 0.0

  err.write(f"{nread:6d} lines read\n")
  err.write(f"{tot_para:6d} parags read\n")
  err.write(f"{tot_with:6d} parags matched {kword}\n")
  err.write(f"{frac_match:8.2f} fraction of parags matching\n")
  
  tex_file = re.sub(r"[.]opa", "-parms.tex", opa_file)
  if tex_file == opa_file:
    # The name has no ".opa" part; append the suffix instead, so that
    # the TeX file does not overwrite the output just written.
    tex_file = opa_file + "-parms.tex"
  write_TeX_parms_file(tex_file, utype, tot_para, tot_with)

  return
  # ----------------------------------------------------------------------

def write_TeX_parms_file(tex_file, utype, tot_para, tot_with):
  # Writes the file {tex_file} with parameter definitions for LaTeX.
  #
  # {tex_file}  Name of the TeX file to write.
  # {utype}     Unit type, passed to {spf.hanzi_per_unit}.
  # {tot_para}  Number of input paragraphs.
  # {tot_with}  Number of those paragraphs that matched the keyword.
  #
  # The macro names are prefixed with the result of {name_for_tex_macro}
  # applied to the base name of {tex_file}, with any directory part
  # and everything from "-parms" on removed.  The file is written in
  # UTF-8, like the other files handled by this script.

  err.write("\n")

  unit_size = spf.hanzi_per_unit(utype) # Nominal avg num of Chinese chars per unit.
  err.write(f"assumed avg hanzi per unit = {unit_size:8.2f}\n")
  
  tex_name = re.sub(r"^.*[/]", "", tex_file)
  tex_name = re.sub(r"-parms.*$", "", tex_name)
  txpref = name_for_tex_macro(tex_name)
  
  tot_sans = tot_para - tot_with # Paragraphs without a match.

  # The {with} closes the file even if a write fails:
  with open(tex_file, "w", encoding='utf-8') as tex_wr:
    # These should match the defs from other TeX parms files:
    tex_wr.write(f"\\def\\{txpref}NumParags{{{tot_para}}}\n")
    tex_wr.write(f"\n") 

    tex_wr.write(f"\\def\\{txpref}ParagsWith{{{tot_with}}}\n")
    tex_wr.write(f"\\def\\{txpref}ParagsSans{{{tot_sans}}}\n")
    tex_wr.write(f"\n") 

    tex_wr.write(f"\\def\\{txpref}HanziPerUnit{{{unit_size:.3f}}}\n")
    tex_wr.write(f"\n") 

  return
  # ----------------------------------------------------------------------

def test_stuff():
  # Placeholder for the self-tests selected by the "test" argument.
  # There are none yet, so this only reports that through {arg_error}.
  arg_error("no tests yet\n")
  # ----------------------------------------------------------------------

# Command line driver.  Usage is either the single argument "test", or
# the four arguments {raw_file} {utype} {kword} {opa_file}.  Runs only
# when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
  narg = len(sys.argv)
  if narg >= 2 and sys.argv[1] == "test":
    test_stuff()
  elif narg < 5:
    # Report missing arguments instead of failing with {IndexError}:
    arg_error(f"expected 4 arguments (raw_file utype kword opa_file), got {narg - 1}")
  else:
    # Any arguments beyond the fourth are ignored.
    raw_file, utype, kword, opa_file = sys.argv[1:5]
    main(raw_file, utype, kword, opa_file)