# Last edited on 2026-04-11 22:07:55 by stolfi

  ???gsizes_ch[iv] = [ len(gs) for gs in gaps_ch[iv] ]
  ???hsizes_ch[iv] = [ len(hs) for hs in hits_ch[iv] ]
  err.write(f"!@ {???gsizes_ch[iv] = } {???hsizes_ch[iv] = }\n")

  # The hits must be the same in both long and trim versions:
  assert hits_ch[0] == hits_ch[1], "hits don't match"

  # Merge the two gap size list into a single list of gap size ranges: 
  tsize_ch = tuple(tsize_ch); assert len(tsize_ch) == 2

   ???gsizes_ch = list(zip(???gsizes_ch[0], ???gsizes_ch[1]))
  hits_ch = list(zip(hits_ch[0], hits_ch[1]))

  # Cleanup the SBJ hanzi text, just in case:
  text_ch = re.sub(r"<[^<>]*>", "", text_ch)
  text_ch = re.sub(r"[ \012]", "", text_ch)
  text_ch = re.sub(r"[：［］（），。；　]", "", text_ch)

  # Split the SBJ text with the hanzi pattern:
  tsize_ch = len(text_ch)
  gaps_ch, hits_ch = \
    mmf.find_multi_pattern_occurrences(text_ch, ???kwords_ch|kwords_list)
  score, loc_ec, size, gaps_ec, ???gsizes_ec, hits_ec, ???hsizes_ec = cand
  assert len(gaps_ec) == ng
  assert len(hits_ec) == nh

  assert len(???gsizes_ec) == ng
  assert len(???hsizes_ec) == nh
  

  exp_gsizes_str = spf.format_size_ranges(exp_gsizes_ec)
  exp_gsizes_ec = ???
  exp_???tgsize_ec = ???
  
  ??? exp_gsizes_wc = anf.compute_wc_gap_size_ranges(gsizes_ch)
  ??? exp_gsizes_ec = anf.compute_ec_gap_size_ranges(gsizes_ch)
  ??? 
  ??? exp_???tgsize_wc = anf.compute_total_ec_size_range(exp_gsizes_wc)
  ??? exp_???tgsize_ec = anf.compute_total_wc_size_range(exp_gsizes_ec)
  ??? 
  ??? exp_gsizes_wc_str = spf.format_size_ranges(exp_gsizes_wc)
  ??? exp_gsizes_ec_str = spf.format_size_ranges(exp_gsizes_ec)

  # The {score} will be zero if {tgsize_ec} is in the interval
  # {exp_tgsize_ec}, and every gap size {gsizes_ec[ig]} is in the interval
  # {exp_gsizes_ec[ig]}. The score increases as those numbers lie further
  # and further from those intervals.
  #

  # The parameter {exp_tgsize_ec} must be a range, and {exp_gsizes_ec}
  # must be a list of {ng = nh+1} ranges. Each range is a pair
  # {(min,max)}.
  #

  # The parameter {gsizes_ec} must be a list of {ng} integers
  # which are interpreted as the counts of EVA letters
  # before, between, and after some {nh = ng-1} instances of the
  # relevant keyword(s) in some EVA text.
  # 
  # The parameter {exp_gsizes_ec???} must be a list of {ng} integer pairs
  # {(lo,hi)} which are interpreted as the min and max values of {gsizes_ec}
  # predicted based on the gap sizes in the SBJ entry. The range should
  # consider excluding or including fields of the entry that may be
  # omitted in the SPS.
  exp_kwsize_ec = (4,5,) # Expected range of keyword size in EVA letters.
  for gaps_ec, hits_ec, exp_gsizes_ec??? in ( \
        ( [ 'nessuno', 'solouno', 'ambedue', 'tretreno', 'quattro', 'quinqux', ],
          [   'ONE',    'TWO',      'THREE',   'FOUR',     'FIVE', ],
          [ (15,18), (6,8), (28,29), (6,8), (2,3), (6,7), ]
        ),
        ( [ 'nessuno', 'solouno', 'ambedue', ],
          [   'ONE',     'TWO', ],
          [ (15,18), (6,8), (28,29), ],
        ),
        ( [ 'nessuno', 'soloBEHuno', ],
          [   'ONE', ],
          [ (15,18), (6,8), ],
        ),
      ):
    nh = len(hits_ec); ng = nh + 1
    kwords_ec = [ r"[A-Z]+" ] * nh
    gsizes_ec = [ len(g) for g in gaps_ec ]
    hsizes_ec = [ len(h) for h in hits_ec ]
    ???tgsize_ec = gsizes_ec[0]
    for hs, gs in zip(hsizes_ec[0:], gsizes_ec[1:]):
      ???tgsize_ec += hs + gs
    err.write(f"{???tgsize_ec =     !r} {gsizes_ec = !r}\n")
    exp_gsize_ec = spf.estimate_total_ec_gap_size_range(gsizes_ch)
    err.write(f"{exp_???tgsize_ec??? = !r} {exp_gsizes_ec??? = !r}\n")
    score = compute_score(???tgsize_ec, gsizes_ec, exp_???tgsize_ec???, exp_gsizes_ec???)
    err.write(f"{score = :6.1f} (from {{opt_gsizes_ec}})\n")

all: test-ana-cand
test-ana-cand: \
            analyze_candidate_matches.py 
	analyze_candidate_matches.py testo

def compute_total_gap_size_range(gsizes, hsize=(0, 0)):
  # Computes the range {(lo,hi)} of the total size of a text that
  # consists of {ng = len(gsizes)} gaps separated by {nh = ng-1}
  # keyword occurrences.
  #
  # The parameter {gsizes} must be a list of {ng} integer pairs
  # {(lo,hi)}, the estimated size range of each gap.
  #
  # The parameter {hsize} must be an integer pair {(lo,hi)}, the
  # estimated size range of each separating keyword. It defaults to
  # {(0,0)}, in which case the result is the range of the total size
  # of the gaps alone.
  #
  # If {gsizes} is empty there are no separators either, and the
  # result is {(0,0)}.
  ng = len(gsizes)
  nh = max(ng - 1, 0)  # Never negative, even when there are no gaps.
  tgsize_lo = nh*hsize[0] + sum(egs[0] for egs in gsizes)
  tgsize_hi = nh*hsize[1] + sum(egs[1] for egs in gsizes)
  return (tgsize_lo, tgsize_hi,)
  # ----------------------------------------------------------------------

  
def write_cands_file(cev_file, gsizes_ch, cands, kwords_ec, data, exp_tsize_ec, exp_gsizes_ec):
  # Writes a list of evaluated candidates to a file, preceded by
  # a header of "# {name} = {value}" comment lines.
  #
  #   {cev_file}      name of output file with evaluated candidates, or "-".
  #   {gsizes_ch}     gap sizes in the SBJ entry (for documentation).
  #   {cands}         sorted list of candidates.
  #   {kwords_ec}     list of EVA keyword patterns.
  #   {data}          counts from parag parsing and evaluation.
  #   {exp_tsize_ec}  expected total cand size in EVA letters.
  #   {exp_gsizes_ec} expected gap sizes in EVA letters.
  #
  # The {exp_gsizes_ec} is a list of integer pairs (each interpreted as a
  # range), and {exp_tsize_ec} is an integer pair (ditto).
  #
  # The {data} dict must have the keys 'npar_read', 'npar_with',
  # 'min_size', and 'max_size'.
  #
  # Takes a list of evaluated candidates {cands} as produced by
  # {analize_starps_cands}. Each element should be a tuple
  # {(score, loc_ec, tsize_ec, gaps_ec, gsizes_ec, hits_ec, hsizes_ec)}.
  # The function writes each candidate to {cev_file} with {output_cand},
  # followed by a blank line.
  #
  # Candidates with {hits_ec} equal to {None} are not written.
  #
  # If {cev_file} is "-", writes to the standard output stream {out},
  # which is flushed but left open. Otherwise the file is created with
  # UTF-8 encoding and closed on exit, even if an error occurs.

  ng = len(exp_gsizes_ec); nh = ng - 1

  gsizes_ch_str = spf.format_size_ranges(gsizes_ch)
  exp_gsizes_ec_str = spf.format_size_ranges(exp_gsizes_ec)
  exp_tsize_ec_str = spf.format_range(exp_tsize_ec)

  err.write(f"writing the file '{cev_file}' ...\n")
  to_stdout = (cev_file == "-")
  if to_stdout:
    wr = out
    wr.reconfigure(encoding='utf-8')
  else:
    wr = open(cev_file, "w", encoding='utf-8')

  try:
    # File header with the search parameters:
    wr.write("# -*- coding: utf-8 -*-\n")
    wr.write(f"# npar_read = {data['npar_read']}\n")
    wr.write(f"# npar_with = {data['npar_with']}\n")
    wr.write(f"# nh_min = {nh}\n")
    wr.write(f"# min_size = {data['min_size']}\n")
    wr.write(f"# max_size = {data['max_size']}\n")
    wr.write(f"# {kwords_ec = !r}\n")
    wr.write(f"# gsizes_ch = {gsizes_ch_str!r}\n")
    wr.write(f"# exp_gsizes_ec = {exp_gsizes_ec_str!r}\n")
    wr.write(f"# exp_tsize_ec = {exp_tsize_ec_str}\n")

    for cand in cands:
      # Unpack the whole tuple so that malformed candidates fail loudly:
      score, loc_ec, tsize_ec, gaps_ec, gsizes_ec, hits_ec, hsizes_ec = cand
      if hits_ec is not None:
        output_cand(wr, cand, kwords_ec, exp_tsize_ec, exp_gsizes_ec)
        wr.write("\n")
  finally:
    if to_stdout:
      # Do not close the standard output; later writes would fail.
      wr.flush()
    else:
      wr.close()

  return
  # ----------------------------------------------------------------------
def cands_eval_summary_from_cev_file(st, entry_name, kw_num):
  # Summary of search for matching candidates.
  #
  # Reads the parameters from the header of the file
  # "cands/{entry_name}_{kw_num}.cev" with {read_parms_from_file_header},
  # then calls {cands_eval_summary} with {st}, those parameters, and
  # their 'kwords_ec' entry.
  #
  # The file is read as UTF-8 and is closed even if parsing the
  # header fails.

  cev_file = f"cands/{entry_name}_{kw_num}.cev"
  with open(cev_file, "r", encoding="utf-8") as rd:
    data = read_parms_from_file_header(rd)

  kwords_ec = data['kwords_ec']
  cands_eval_summary(st, data, kwords_ec)
  return
  # ----------------------------------------------------------------------
  

  cev_file = f"cands/{code_ch}_{kwnum}.cev"
  anf.write_cands_file \
    ( cev_file, gsizes_ch, cands, kwords_ec, data, exp_tsize_ec, exp_gsizes_ec )
  
  #   {exp_tsize_ec}  range of expected total cand size, in EVA, sans puncts.
  #   {exp_gsizes_ec} ranges of expected sizes of gap, ditto.
  # between those occurrences are consistent
  # with the respctive expected values,looking for matching occurrences of the {nh}
  # patterns in {kwords_ec} and  
  #
  #  exp_tsize_ec, exp_gsizes_ec {exp_tsize_ec???}  range of expected total cand size, in EVA, sans puncts.
  #   {exp_gsizes_ec???} ranges of expected sizes of gap, ditto.
 #   {exp_tsize_ec???}  range of expected total cand size, in EVA, sans puncts.
  #   {exp_gsizes_ec???} ranges of expected sizes of gap, ditto.
  # between those occurrences are consistent
  # with the respctive expected values,looking for matching occurrences of the {nh}
  # patterns in {kwords_ec} and  {exp_tsize_ec???}and
  # {exp_gsizes_ec???[0..ng-1} where {ng=nh+1}. 
  # exp_tsize_ec???, exp_gsizes_ec???
  #   {exp_tsize_ec???}  range of expected total cand size, in EVA, sans puncts.
  #   {exp_gsizes_ec???} ranges of expected sizes of gap, ditto.
  # corresponding elements of {exp_gsizes_ec???}, and the
  # total size {tsize_ec} of {tclean_ec} with {exp_tsize_ec???}. 

def optimal_split(gsizes_ec, hsizes_ec, exp_gsizes_ec):
  # Condenses the gaps and hits of a candidate so that the number of
  # gaps matches the number of expected gap size ranges.
  #
  # The parameters {gsizes_ec} and {hsizes_ec} must be lists with the
  # sizes of {ng_ec} gaps and of the {nh_ec = ng_ec-1} hits between them.
  #
  # The parameter {exp_gsizes_ec} should be a list of {ng_ch} expected gap
  # size ranges in EVA letters. If {len(gsizes_ec) > ng_ch}, condenses
  # {gsizes_ec,hsizes_ec} to {ng_ch} gaps in the optimal way, namely
  # the one with the lowest {compute_gaps_score}. Condensing two
  # consecutive gaps replaces them and the hit between them by a single
  # gap whose size is the sum of the three sizes.
  #
  # Returns the condensed lists {opt_gsizes_ec,opt_hsizes_ec}. If there
  # are fewer hits than expected, returns copies of the input lists.
  # The result is {None,None} if no condensation has a score strictly
  # below {+inf}.
  
  ng_ec = len(gsizes_ec)
  nh_ec = len(hsizes_ec)
  assert ng_ec == nh_ec + 1
  
  ng_ch = len(exp_gsizes_ec)
  nh_ch = ng_ch - 1
  
  min_gaps_score = +inf # Lowest score found so far (score due to gaps only).
  
  if nh_ec < nh_ch:
    # Not enough hits. Returns a placeholder solution.
    opt_gsizes_ec = gsizes_ec.copy()
    opt_hsizes_ec = hsizes_ec.copy()
    return opt_gsizes_ec, opt_hsizes_ec
  
  # Best solution found so far:
  opt_gsizes_ec = None
  opt_hsizes_ec = None
  
  debug = False

  if debug: err.write(f"!- {ng_ch = } {nh_ch = }\n")
      
  def opt_aux(gsz, hsz, k):
    nonlocal min_gaps_score, opt_gsizes_ec, opt_hsizes_ec 
    # If {len(hsz) > nh_ch}, tries to condense {gsz[k:]} with the intervening
    # hits in all possible ways that result in {len(hsz) == nh_ch}.
    # The gaps {gsz[0:k]} and hits {hsz[0:k]} are kept as they are.
    # Remembers the lowest scoring one in {min_gaps_score,opt_gsizes_ec,opt_hsizes_ec}.
    # In case of ties, the first solution found is kept.
    
    mg = len(gsz)
    mh = len(hsz)
    assert mg == mh + 1

    if debug: err.write(f"\n")
    if debug: err.write(f"!- {' '*k} {mg = } {mh = } {k = }\n")
    if debug: err.write(f"!- {' '*k} {gsz = !r}\n")
    if debug: err.write(f"!- {' '*k} {hsz = !r}\n")

    if mh == nh_ch:
      # Fully condensed: evaluate this solution.
      sc = compute_gaps_score(gsz, exp_gsizes_ec)
      if debug: err.write(f"!- {' '*k} {sc = }\n\n")
      if sc < min_gaps_score:
        min_gaps_score = sc
        opt_gsizes_ec = gsz.copy()
        opt_hsizes_ec = hsz.copy()
      return
    else:
      assert mh > nh_ch
      if k+1 < mg:
        # Try condensing {gsz[k]} with {gsz[k+1]}, absorbing the hit
        # {hsz[k]} between them, and maybe more:
        gsz1 = gsz[0:k] + [gsz[k] + hsz[k] + gsz[k+1],] + gsz[k+2:]
        hsz1 = hsz[0:k] + hsz[k+1:]
        opt_aux(gsz1, hsz1, k)
      if k < mh:
        # Try keeping {gsz[k]} and condensing the rest:
        opt_aux(gsz, hsz, k+1)
    return
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
  opt_aux(gsizes_ec, hsizes_ec, 0)
  
  return opt_gsizes_ec, opt_hsizes_ec
  # ----------------------------------------------------------------------

    opt_gsizes_ec, opt_hsizes_ec = optimal_split(gsizes_ec, hsizes_ec, exp_gsizes_ec)
    err.write(f"{opt_gsizes_ec = !r}\n")
    err.write(f"{opt_hsizes_ec = !r}\n")


    fgaps_ec, fgsizes_ec, fhits_ec, fhsizes_ec = \
      combine_gaps_and_hits(gaps_ec, gsizes_ec, opt_gsizes_ec, hits_ec, hsizes_ec, opt_hsizes_ec)
    err.write(f"{fgaps_ec   = !r}\n")
    err.write(f"{fgsizes_ec = !r}\n")
    err.write(f"{fhits_ec   = !r}\n")
    err.write(f"{fhsizes_ec = !r}\n")

opt_gsizes_ec, opt_hsizes_ec = optimal_split(???gsizes_ec, ???hsizes_ec, exp_gsizes_ec)
  fgaps_ec, fgsizes_ec, fhits_ec, fhsizes_ec = \
    spf.combine_gaps_and_hits(gaps_ec, ???gsizes_ec, opt_gsizes_ec, hits_ec, ???hsizes_ec, opt_hsizes_ec)
  assert len(fgaps_ec) <= ng??????
  assert len(fhits_ec) <= nh??????
  

    loc4, tags4, starps_items = h77.split_formatted_entry("""
    <f108r.17>   
    (A)    | fcheokair.oke»         
    (A1)   | -
    (A3)   | ··daiin.
    (A31)  | ····chedy.qokeed.okain.chdy.
    (A32)  | ····laiin.ofar.
    (A33)  | ····chedy.tedam.
    (A34)  | ····okeedy.lkal.daiin.
    (A35)  | ····ykchedy.qokol.chedy.
    (A36)  | ····qokedy.lkedy.
    (A4)   | ··okalo.l.chedl.y.
    (A41)  | ····dchedy.okeedar.
    (A411) | ······shchy.okol.
    (A42)  | ····kedy.okeedy.
    (A43)  | ····chal.raiin.
    (A44)  | ····otedy.chtal.am.
    (A45)  | ····dain.chey.qokeedy.
    (A46)  | ····chol.cheey.dalkar.okedy.  
    (A61)  | -
    (A62)  | -
    (A63)  | -
    (A7)   | -     
    """)
    assert loc4 == best_loc
    assert tags4 == tags1

  
  if chosen_loc != None:
    h.parags(st, f"""Thus we will tentatively assign {entry_code} ({loc_ch}) to
    {chosen_loc}.""")
    chosen_cand = cands_1[0]
    chosen_score = chosen_cand[0];  
    assert chosen_cand[1] == chosen_loc
  else:
    h.parags(st, f"""Thus we will not assign {entry_code} ({loc_ch}) to any SPS parag.""")
    chosen_cand = None
    chosen_score = None;

def select_best_cand(cands_1, cands_2, max_score):
  # Selects the best candidate from two candidate lists, each
  # sorted by increasing badness score.
  #
  # Each of {cands_1} and {cands_2} may be {None} or an empty list,
  # both meaning "no candidates". Each candidate must be a tuple whose
  # element 0 is its score. Only the first candidate of each list
  # is examined.
  #
  # Returns the candidate with the lowest score, preferring the one
  # from {cands_1} in case of a tie. Returns {None} if both lists have
  # no candidates, or if the best score exceeds {max_score}.

  best_cand_1 = cands_1[0] if cands_1 is not None and len(cands_1) > 0 else None
  best_cand_2 = cands_2[0] if cands_2 is not None and len(cands_2) > 0 else None

  # A missing candidate loses against any finite score:
  score_1 = best_cand_1[0] if best_cand_1 is not None else +inf
  score_2 = best_cand_2[0] if best_cand_2 is not None else +inf

  if score_1 <= score_2:
    best_score = score_1; best_cand = best_cand_1
  else:
    best_score = score_2; best_cand = best_cand_2

  if best_score > max_score: best_cand = None

  return best_cand
  # ----------------------------------------------------------------------

def compute_total_gap_size(gsizes, egsizes):
  # Computes the total size of a list of gaps and the range of
  # its expected value.
  #
  # The parameter {gsizes} must be a list of gap sizes, and {egsizes}
  # a list of pairs {(lo,hi)}, the expected size range of each gap.
  #
  # Returns {tsize, (etsize_min, etsize_max)}, where {tsize} is the sum
  # of {gsizes}, {etsize_min} is the sum of the low ends of {egsizes},
  # and {etsize_max} is the sum of their high ends.
  tsize = sum(gsizes)                          # Total size of all gaps.
  etsize_min = sum(egs[0] for egs in egsizes)  # Expected min total size of gaps.
  etsize_max = sum(egs[1] for egs in egsizes)  # Expected max total size of gaps.
  return tsize, (etsize_min, etsize_max,)
  # ----------------------------------------------------------------------
  
avg_deltas=()
  echo "  computing average delta ..." 1>&2
  avg_delta=$( \
    gawk ' /^ *[a-z]/{ s += $2; s += $3; n += 2 } END { print s/n }' ${wpd_file} \
  )
  echo "    avg_delta = ${avg_delta}" 1>&2
  avg_deltas+=( ${avg_delta} )


########################################################################

  # Remove junk not counted as punctuation:
  if utype == "ch":
    # Nothing to remove besides punctuation.
    pass
  elif utype == "ps":
    # Nothing to remove besides punctuation.
    pass
  elif utype == "ec" or utype == "wc" or utype == "wp":
    # Delete all markup:
    text = re.sub(r"[<][!][^<>]*[>]", "", text)
    text = re.sub(r"[«=» ]", "", text)
    text = re.sub(r"[{]([^{}]*)[}]", r"\1", text)
    text = re.sub(r"^<[%]>", "", text)
    text = re.sub(r"<[$]>$", "", text)
    # Map weirdos to '?':
    text = re.sub(r"[&][0-9][0-9][0-9][;]?", "?", text)
    # Map all to lowercse:
    text = text.lower()
  else:
    arg_error(f"invalid unit type {utype = !r}")
    
######################################################################
???if utype == "ec":
    m_bad = re.search(r"[^ac-fhik-tvxy?]")
  elif utype == "wc" or utype == "wp":
    m_bad = re.search(r"[^.ac-fhik-tvxy?]", text)
  else:
    arg_error(f"invalid {utype = !r}")
  

      pat_ipu = r"[：。，\［\］]"     # Ideographic punctuation.
      pat_ann = r"[（][^（）]*[）]"   # Apocriphal annotations (ideographic parens).
      pat_ipu = r"[-,.;:'*]"        # Pinyin punctuation.
      pat_ann = r"[(][^()]*[)]"     # Apocriphal annotations (ascii parens).
    pat_junk???notneeded = f"{pat_ipu}|{pat_ann}"  # Junk to be deleted.

######################################################################

def ttype_from_utype(utype):
  # Maps a unit type {utype} to the text type {ttype} of the ".ivt"
  # file that is needed to extract or count units of that type:
  #
  #   "ch" (Chinese characters)           -> "chu"
  #   "ps" (isolated pinyin syllables)    -> "pys"
  #   "ec", "wc", "wp" (EVA chars/words)  -> "eva"
  #
  # For any other {utype}, calls {arg_error}; if that returns,
  # the result is {None}.

  utypes_by_ttype = (
      ( "chu", ("ch",) ),
      ( "pys", ("ps",) ),
      ( "eva", ("ec", "wc", "wp",) ),
    )
  for ttype, utypes in utypes_by_ttype:
    if utype in utypes: return ttype

  arg_error(f"invalid {utype = !r}")
  return None
  # ----------------------------------------------------------------------

########################################################################

def add_all_word_pos_pos_plot_rules(pre, mak, tit):
  # Placeholder for the rules that create the word position plots.
  #
  # Currently defines no rules: it leaves {pre}, {mak}, and {tit}
  # untouched and returns an empty list of targets. The Makefile rules
  # in the comment below have not been converted to Python yet.
  # ----------------------------------------------------------------------
  # pos-pos-plots: res/bencao-fu-zhu3-starps-${SPS_TAG}-wpos.png
  # pos-pos-plots: res/bencao-fu-zhu3-starps-${SPS_TAG}-nwo-hist.png
  #   res/starps-${SPS_TAG}.woc: \
  #              \
  #             res/starps-fu-par.ivt \
  #             ${MAKEFILE}
  #         ./list_wpositions_in_parags.py voyn-eva '${SPS_WORD}' res/starps-fu-par.ivt \
  #           > res/starps-${SPS_TAG}.woc
  # 
  #   res/bencao-fu-zhu3.woc: \
  #               list_wpositions_in_parags.py \
  #               in/bencao-fu.chu \
  #               ${MAKEFILE}
  #           ./list_wpositions_in_parags.py chin-chu '主' in/bencao-fu.chu \
  #             > res/bencao-fu-zhu3.woc
  # 
  #   res/bencao-fu-zhu3-starps-${SPS_TAG}-nwo-hist.png: \
  #               ${MAKEFILE} \
  #                \
  #               res/starps-${SPS_TAG}.woc \
  #               res/bencao-fu-zhu3.woc
  #           ./plot_two_word_pos_histograms.sh \
  #             starps-${SPS_TAG} '${SPS_WORD}' \
  #             bencao-fu-zhu3  zhu3
  # 
  #   res/bencao-fu-zhu3-starps-${SPS_TAG}-wpos.png: \
  #               ${MAKEFILE} \
  #               plot_two_word_pos_files.sh \
  #               res/starps-${SPS_TAG}.woc \
  #               res/bencao-fu-zhu3.woc
  #           ./plot_two_word_pos_files.sh \
  #             starps-${SPS_TAG} '${SPS_WORD}' \
  #             bencao-fu-zhu3  zhu3
  # 
  targets = []  # No rules defined yet.
  return targets
  # ----------------------------------------------------------------------

########################################################################
def add_all_ivt_rules(pre, mak, tit):
  # Adds to the tables {pre}, {mak}, and {tit} the prerequisites,
  # shell commands, and titles of the rules that create two ".ivt"
  # files from the starred parags text of Note/074:
  #
  #   "starps-fu-eva-lin.ivt"  the complete file, with all parags;
  #   "starps-gd-eva-lin.ivt"  the subset with good lines only.
  #
  # Both commands drop the lines of parag <f105r.10> and write the
  # result into the "res" directory. Returns the list of the
  # two target names, in that order.

  src = "../074/st_files/str-parags.ivt"

  # Command piece that excludes the lines of parag <f105r.10>:
  drop_parag = "  | egrep -v -e '^<f105r.10[;>]' \\"

  # Rule for the complete file:
  tg_full = "starps-fu-eva-lin.ivt"
  tit[tg_full] = f"copying full SPS IVTFF file {tg_full} from {src}"
  pre[tg_full] = [ src ]
  mak[tg_full] = ( f"cat {src} \\", drop_parag, f"  > res/{tg_full}" )

  # Rule for the good lines subset; it also depends on the
  # filtering script and on the gawk library that it imports:
  tg_good = "starps-gd-eva-lin.ivt"
  script = "remove_bad_lines_from_starps_ivt.gawk"
  gawk_lib = "work/error_funcs.gawk"
  tit[tg_good] = f"extracting the good SPS source file {tg_good} from {src}"
  pre[tg_good] = [ src, script, gawk_lib, ]
  mak[tg_good] = (
      f"cat {src} \\",
      drop_parag,
      f"  | {script} \\",
      f"      -i {gawk_lib} \\",
      f"  > res/{tg_good}",
    )

  return [ tg_full, tg_good ]
  # ----------------------------------------------------------------------
  
add_all_ivt_rules(pre, mak, tit) + \

########################################################################

    ivt_target = f"bencao-fu-{ttype}-lin.ivt"
    tit[ivt_target] = f"making link {ivt_target} to {source_ivt}"
    pre[ivt_target] = [ source_ivt, ]
    mak[ivt_target] = (
      f"( cd res && rm -f {ivt_target} ; ln -s ../{source_ivt} {ivt_target} )",
    )
    targets.append(ivt_target)

######################################################################## 

def add_single_loc_word_pos_file_rules(pre, mak, tit,  book, bsub, unit, sloc, word, tag):
  # Adds to the tables {pre}, {mak}, and {tit} the prerequisites, shell
  # command, and title of the rule that creates the file
  # "res/{book}-{bsub}-{unit}-{sloc}-{tag}.wpo".
  #
  # That file has the lines of "res/{book}-{bsub}-{unit}-{tag}.wpo"
  # that begin with {sloc} followed by a blank.
  #
  # The parameter {word} is not used by this function.
  #
  # Returns the name of the target, without the "res/" prefix.

  stem = f"{book}-{bsub}-{unit}"
  all_locs_file = f"res/{stem}-{tag}.wpo"    # Positions in all parags.
  one_loc_file = f"{stem}-{sloc}-{tag}.wpo"  # Positions in parag {sloc} only.

  command = f"cat {all_locs_file} | egrep -e '^{sloc}[ ]' | cat > res/{one_loc_file}"

  tit[one_loc_file] = f"making the single-parag word positions file {one_loc_file}"
  pre[one_loc_file] = [ all_locs_file, ]
  mak[one_loc_file] = ( command, )
  return one_loc_file
  # ----------------------------------------------------------------------

########################################################################

# Outputs a list of all tuples of {tsize} consecutive words,
# ignoring those that contain words that contain '*'.
#
# For each tuple of {tsize} consecutive words in the same line of the input,
# writes {tsize+1} lines in the output with the format 
# 
#    "«{LEFT}» «{MIDDLE}» «{RIGHT}» <{SEC}.{NLIN}> {KW} {SL} {SM} {SR}"
# 
# where {LEFT}, {MIDDLE}, and {RIGHT} are the words of the tuple; {SL}, {SM}, {SR}
# are the counts of words in each of these strings; {SEC} and {NLIN} specify the input line
# where the tuple occurs; and {KW} is the index of the tuple's first
# word in the input line.
#
# The strings {LEFT}, {MIDDLE}, and {RIGHT} consist of whole input
# words, separated by '.'. The string {MIDDLE} has at least one word, but 
# {LEFT} and {RIGHT} may be empty.


  if book == "bencao":
    assert sub == "fu"
    enc = "chu" if unit == "ch" else "pys" if unit == "ps" else None
  elif book == "starps":
    assert sub == "fu" or sub == "gd"
    enc = "eva"
  else:
    assert False, f"bad {book = }"
  assert enc is not None, f"bad combo {book = } {unit = }"


    m = re.match(pat_punc, text)
  # Cleans up {text} according to its encoding {enc} and the unit type
  # {unit}, adding to {num_ignored} the number of characters discarded.
  if enc == "utf":
    # Cleanup consists in deleting the Chinese punctuation,
    # collecting the other characters in {good_chars}:
    for ch in text:
      if debug: err.write(f"!! ch = '{ch}'")
      # Note that {re.fullmatch} takes the pattern first, then the string:
      if re.fullmatch(pat_punc, ch):
        num_ignored += 1
        if debug: err.write(" KO")
      else:
        good_chars.append(ch)
        if debug: err.write(" OK")
      if debug: err.write("\n")
    text = "".join(good_chars)
  elif enc == "eva":
    # Cleanup consists of deleting parag markers and ensuring simple EVA.
    if unit == "ec":
      # Remove all EVA punctuation:
      tlen = len(text)
      text = re.sub(r"[-,.]", "", text)
      num_ignored += tlen - len(text)
    elif unit == "wc":
      # Normalize all punc to single '.':
      tlen = len(text)
      text = re.sub(r"[-,]", ".", text)
      # Collapse repeated '.' and strip leading and trailing ones:
      text = re.sub(r"[.][.]+", ".", text)
      text = re.sub(r"^[.]+", "", text)
      text = re.sub(r"[.]+$", "", text)
      num_ignored += tlen - len(text)
    else:
      assert False, f"invalid combo {enc = } {unit = }"


  ???charset = None  # Sets of special hanzi characters (punct, blank, etc.)
  if unit == "ch":
   ???charset = dict()
    set_dir = "langbank/chin"
    ???charset['invalid'] = read_chinese_char_set(f"{set_dir}/utf8-invalid.tbl")
    ???charset['bullets'] = read_chinese_char_set(f"{set_dir}/utf8-bullets.tbl")
    ???charset['symbol']  = read_chinese_char_set(f"{set_dir}/utf8-symbol.tbl")
    ???charset['punct']   = read_chinese_char_set(f"{set_dir}/utf8-punct.tbl")
    ???charset['blank']   = read_chinese_char_set(f"{set_dir}/utf8-blank.tbl")


  # Read tables of chinese character sets:
  pat_line = None # Matches a pinyin line, with groups {LOC} and {TEXT}.
  pat_punc = None # Matches pinyin punctuation (excluding blanks).
  pat_word = None # Matches a pinyin word (syllable or compound).
  if unit == "ch":
  elif unit == "ps" or unit == "pj":
    # Patterns for parsing pinyin:
    pat_loc = r"b[1-3][.][1-6][.][0-9][0-9][0-9]"
    pat_line = f"<({pat_loc})>[ ]+(.*)\n"
    pat_punc = r"[.,;()*]"

  else:
    assert False


    pat_sec = r"s[0-2]"                # Section s-number, "s0" to "s2".
    pat_sub = r"[.][0-9]"              # Subsection number, 0 to 9, with '.'.
    pat_lseq = r"[.][0-9][0-9][0-9]"   # , with '.'.
    pat_locid = f"<({pat_sec})({pat_sub})({pat_lseq})>"
      # Is a data line:
      if m.lastindex != 4: prog_error("num fields = %d" % m.lastindex)
      sec = m.group(1)
      sub = m.group(2) 
      lseq = m.group(3)
      text = m.group(4).strip()  # {DATA} field.
    
      
      loc = f"{sec}{sub}{lseq}"

    else:
      # Non-parag data line - ignore:
      if re.search(r"([<][%$][>])", text):
        data_error(nline,line, f"spurious alignment marker '{m.group(1)}'")
      m = re.search(r"([^-,.a-z?]", text)
      if m != None:
        data_error(nline,line, f"invalid char '{m.group(1)}'")