# Last edited on 2026-04-11 22:07:55 by stolfi ???gsizes_ch[iv] = [ len(gs) for gs in gaps_ch[iv] ] ???hsizes_ch[iv] = [ len(hs) for hs in hits_ch[iv] ] err.write(f"!@ {???gsizes_ch[iv] = } {???hsizes_ch[iv] = }\n") # The hits must be the same in both long and trim versions: assert hits_ch[0] == hits_ch[1], "hits don't match" # Merge the two gap size list into a single list of gap size ranges: tsize_ch = tuple(tsize_ch); assert len(tsize_ch) == 2 ???gsizes_ch = list(zip(???gsizes_ch[0], ???gsizes_ch[1])) hits_ch = list(zip(hits_ch[0], hits_ch[1])) # Cleanup the SBJ hanzi text, just in case: text_ch = re.sub(r"<[^<>]*>", "", text_ch) text_ch = re.sub(r"[ \012]", "", text_ch) text_ch = re.sub(r"[:[](),。; ]", "", text_ch) # Split the SBJ text with the hanzi pattern: tsize_ch = len(text_ch) gaps_ch, hits_ch = \ mmf.find_multi_pattern_occurrences(text_ch, ???kwords_ch|kwords_list) score, loc_ec, size, gaps_ec, ???gsizes_ec, hits_ec, ???hsizes_ec = cand assert len(gaps_ec) == ng assert len(hits_ec) == nh assert len(???gsizes_ec) == ng assert len(???hsizes_ec) == nh exp_gsizes_str = spf.format_size_ranges(exp_gsizes_ec) exp_gsizes_ec = ??? exp_???tgsize_ec = ??? ??? exp_gsizes_wc = anf.compute_wc_gap_size_ranges(gsizes_ch) ??? exp_gsizes_ec = anf.compute_ec_gap_size_ranges(gsizes_ch) ??? ??? exp_???tgsize_wc = anf.compute_total_ec_size_range(exp_gsizes_wc) ??? exp_???tgsize_ec = anf.compute_total_wc_size_range(exp_gsizes_ec) ??? ??? exp_gsizes_wc_str = spf.format_size_ranges(exp_gsizes_wc) ??? exp_gsizes_ec_str = spf.format_size_ranges(exp_gsizes_ec) # The {score} will be zero if {tgsize_ec} is in the interval # {exp_tgsize_ec}, and every gap size {gsizes_ec[ig]} is in the interval # {exp_gsizes_ec[ig]}. The score increases as those numbers lie further # and further from those intervals. # # The parameter {exp_tgsize_ec} must be a range, and {exp_gsizes_ec} # must be a list of {ng = nh+1} ranges. Each range is a pair # {(min,max)}. # # The parameter {gsizes_ec} must be a list of {ng} integers # which are interpreted as the counts of EVA letters # before, between, and after some {nh = ng-1} instances of the # relevant keyword(s) in some EVA text. # # The parameter {exp_gsizes_ec???} must be a list of {ng} integer pairs # {(lo,hi)}which are interpreted as the min and max values of {gsizes_ec} # predicted based on the gap sizes in the SBJ entry. The range should # consider excluding or includling fields of the entry that may be # omitted in the SPS. exp_kwsize_ec = (4,5,) # Expected range of keyword size in EVA letters. for gaps_ec, hits_ec, exp_gsizes_ec??? in ( \ ( [ 'nessuno', 'solouno', 'ambedue', 'tretreno', 'quattro', 'quinqux', ], [ 'ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', ], [ (15,18), (6,8), (28,29), (6,8), (2,3), (6,7), ] ), ( [ 'nessuno', 'solouno', 'ambedue', ], [ 'ONE', 'TWO', ], [ (15,18), (6,8), (28,29), ], ), ( [ 'nessuno', 'soloBEHuno', ], [ 'ONE', ], [ (15,18), (6,8), ], ), ): nh = len(hits_ec); ng = nh + 1 kwords_ec = [ r"[A-Z]+" ] * nh gsizes_ec = [ len(g) for g in gaps_ec ] hsizes_ec = [ len(h) for h in hits_ec ] ???tgsize_ec = gsizes_ec[0] for hs, gs in zip(hsizes_ec[0:], gsizes_ec[1:]): ???tgsize_ec += hs + gs err.write(f"{???tgsize_ec = !r} {gsizes_ec = !r}\n") exp_gsize_ec = spf.estimate_total_ec_gap_size_range(gsizes_ch) err.write(f"{exp_???tgsize_ec??? = !r} {exp_gsizes_ec??? = !r}\n") score = compute_score(???tgsize_ec, gsizes_ec, exp_???tgsize_ec???, exp_gsizes_ec???) err.write(f"{score = :6.1f} (from {{opt_gsizes_ec}})\n") all: test-ana-cand test-ana-cand: \ analyze_candidate_matches.py analyze_candidate_matches.py testo def compute_total_gap_size_range(gsizes): # Computes a total size range # given the estimated integer ranges {gsizes[0..ng-1]} of gap sizes # between certain keywords and and the estimated size range # {hsize} for the separating . ng = len(gsizes); nh = ng - 1; ???tgsize_lo = nh*hsize[0]; ???tgsize_hi = nh*hsize[1] for egs in gsizes: ???tgsize_lo += egs[0] ???tgsize_hi += egs[1] return (???tgsize_lo, ???tgsize_hi,) # ---------------------------------------------------------------------- def write_cands_file(cev_file, gsizes_ch, cands, kwords_ec, data, exp_tsize_ec???, exp_gsizes_ec???): # {cev_file} name of output file with evaluated candidates, or "-". # {gsizes_ch} gap sizes in the SBJ entry (for documentation). # {cands} sorted list of candidates. # {kwords_ec} list of EVA keyword pattern. # {data} counts from parag parsing and evaluation. # {exp_gsizes_ec???} expected gaps sizes in EVA letters. # {exp_tsize_ec???} expected total cand size in EVA letters. # # The {exp_gsizes_ec???} is a list of integer pairs (interpreted as a # range), and {exp_tsize_ec???} is an integer pair (ditto). # # Takes a list of evaluated candidates {cands} as produced by # {analize_starps_cands}. Each element should be a tuple # {(score, loc_ec, tsize_ec, gaps_ec, gsizes_ec, hits_ec, hsizes_ec)} # The function writes each candidate to {cev_file}, formatted # as described in {format_cand}. # # Candidates with empty {hits_ec} equal to {None} are not written. ng = len(exp_gsizes_ec???); nh = ng - 1 gsizes_ch_str = spf.format_size_ranges(gsizes_ch) exp_gsizes_ec???_str = spf.format_size_ranges(exp_gsizes_ec???) exp_tsize_ec???_str = spf.format_range(exp_tsize_ec???) err.write(f"writing the file '{cev_file}' ...\n") wr = out if cev_file == "-" else open(cev_file, "w") wr.reconfigure(encoding='utf-8') wr.write("# -*- coding: utf-8 -*-\n") wr.write(f"# npar_read = {data['npar_read']}\n") wr.write(f"# npar_with = {data['npar_with']}\n") wr.write(f"# nh_min = {nh}\n") wr.write(f"# min_size = {data['min_size']}\n") wr.write(f"# max_size = {data['max_size']}\n") wr.write(f"# {kwords_ec = !r}\n") wr.write(f"# gsizes_ch = {gsizes_ch_str!r}\n") wr.write(f"# exp_gsizes_ec??? = {exp_gsizes_ec???_str!r}\n") wr.write(f"# exp_tsize_ec??? = {exp_tsize_ec???_str}\n") for cand in cands: score, loc_ec, tsize_ec, gaps_ec, gsizes_ec, hits_ec, hsizes_ec = cand if hits_ec != None: output_cand(wr, cand, kwords_ec, exp_tsize_ec???, exp_gsizes_ec???) wr.write("\n") wr.close() return # ---------------------------------------------------------------------- def cands_eval_summary_from_cev_file(st, entry_name, kw_num): # Summary of search for matching candidates. cev_file = f"cands/{entry_name}_{kw_num}.cev" rd = open(cev_file, "r") data = read_parms_from_file_header(rd) rd.close() kwords_ec = data['kwords_ec'] cands_eval_summary(st, data, kwords_ec) return # ---------------------------------------------------------------------- cev_file = f"cands/{code_ch}_{kwnum}.cev" anf.write_cands_file \ ( cev_file, gsizes_ch, cands, kwords_ec, data, exp_tsize_ec, exp_gsizes_ec ) # {exp_tsize_ec} range of expected total cand size, in EVA, sans puncts. # {exp_gsizes_ec} ranges of expected sizes of gap, ditto. # between those occurrences are consistent # with the respctive expected values,looking for matching occurrences of the {nh} # patterns in {kwords_ec} and # # exp_tsize_ec, exp_gsizes_ec {exp_tsize_ec???} range of expected total cand size, in EVA, sans puncts. # {exp_gsizes_ec???} ranges of expected sizes of gap, ditto. # {exp_tsize_ec???} range of expected total cand size, in EVA, sans puncts. # {exp_gsizes_ec???} ranges of expected sizes of gap, ditto. # between those occurrences are consistent # with the respctive expected values,looking for matching occurrences of the {nh} # patterns in {kwords_ec} and {exp_tsize_ec???}and # {exp_gsizes_ec???[0..ng-1} where {ng=nh+1}. # exp_tsize_ec???, exp_gsizes_ec??? # {exp_tsize_ec???} range of expected total cand size, in EVA, sans puncts. # {exp_gsizes_ec???} ranges of expected sizes of gap, ditto. # corresponding elements of {exp_gsizes_ec???}, and the # total size {tsize_ec} of {tclean_ec} with {exp_tsize_ec???}. def optimal_split(gsizes_ec, hsizes_ec, exp_gsizes_ec): # The parameter exp_gsizes_ec shoule be a list of {ng_ch} expected gap # size ranges in EVA letters. If {len(gsizes_ec) > ng_ch}, condenses # {gsizes_ec,hsizes_ec} to {ng_ch} gaps in the optimal way. Returns condensed # {fgsizes_ec,fhsizes_ec} ng_ec = len(gsizes_ec) nh_ec = len(hsizes_ec) assert ng_ec == nh_ec + 1 ng_ch = len(exp_gsizes_ec) nh_ch = ng_ch - 1 min_gaps_score = +inf # Score due to gaps only. if nh_ec < nh_ch: # Not enough hits. Returns a placeholder solution. opt_gsizes_ec = gsizes_ec.copy() opt_hsizes_ec = hsizes_ec.copy() return opt_gsizes_ec, opt_hsizes_ec opt_gsizes_ec = None opt_hsizes_ec = None debug = False if debug: err.write(f"!- {ng_ch = } {nh_ch = }\n") def opt_aux(gsz, hsz, k): nonlocal min_gaps_score, opt_gsizes_ec, opt_hsizes_ec # if {len(hsz) > nh_ch} tries to condense {gsz[k:]} with the intervening # hits in all possible ways that result line {len(hsz) == nh_ch}. # Remembers the lowest scoring one in {min_gaps_score,opt_gsizes_ec,opt_hsizes_ec}. mg = len(gsz) mh = len(hsz) assert mg == mh + 1 if debug: err.write(f"\n") if debug: err.write(f"!- {' '*k} {mg = } {mh = } {k = }\n") if debug: err.write(f"!- {' '*k} {gsz = !r}\n") if debug: err.write(f"!- {' '*k} {hsz = !r}\n") if mh == nh_ch: sc = compute_gaps_score(gsz, exp_gsizes_ec) if debug: err.write(f"!- {' '*k} {sc = }\n\n") if sc < min_gaps_score: min_gaps_score = sc opt_gsizes_ec = gsz.copy() opt_hsizes_ec = hsz.copy() return else: assert mh > nh_ch if k+1 < mg: # Try condensing {gsz[k]} with {gsz[k+1]} maybe more: gsz1 = gsz[0:k] + [gsz[k] + hsz[k] + gsz[k+1],] + gsz[k+2:] hsz1 = hsz[0:k] + hsz[k+1:] opt_aux(gsz1, hsz1, k) if k < mh: # Try keeping {gsz[k]} and condensing the rest: opt_aux(gsz, hsz, k+1) return # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ opt_aux(gsizes_ec, hsizes_ec, 0) return opt_gsizes_ec, opt_hsizes_ec # ---------------------------------------------------------------------- opt_gsizes_ec, opt_hsizes_ec = optimal_split(gsizes_ec, hsizes_ec, exp_gsizes_ec) err.write(f"{opt_gsizes_ec = !r}\n") err.write(f"{opt_hsizes_ec = !r}\n") fgaps_ec, fgsizes_ec, fhits_ec, fhsizes_ec = \ combine_gaps_and_hits(gaps_ec, gsizes_ec, opt_gsizes_ec, hits_ec, hsizes_ec, opt_hsizes_ec) err.write(f"{fgaps_ec = !r}\n") err.write(f"{fgsizes_ec = !r}\n") err.write(f"{fhits_ec = !r}\n") err.write(f"{fhsizes_ec = !r}\n") opt_gsizes_ec, opt_hsizes_ec = optimal_split(???gsizes_ec, ???hsizes_ec, exp_gsizes_ec) fgaps_ec, fgsizes_ec, fhits_ec, fhsizes_ec = \ spf.combine_gaps_and_hits(gaps_ec, ???gsizes_ec, opt_gsizes_ec, hits_ec, ???hsizes_ec, opt_hsizes_ec) assert len(fgaps_ec) <= ng?????? assert len(fhits_ec) <= nh?????? loc4, tags4, starps_items = h77.split_formatted_entry(""" (A) | fcheokair.oke» (A1) | - (A3) | ··daiin. (A31) | ····chedy.qokeed.okain.chdy. (A32) | ····laiin.ofar. (A33) | ····chedy.tedam. (A34) | ····okeedy.lkal.daiin. (A35) | ····ykchedy.qokol.chedy. (A36) | ····qokedy.lkedy. (A4) | ··okalo.l.chedl.y. (A41) | ····dchedy.okeedar. (A411) | ······shchy.okol. (A42) | ····kedy.okeedy. (A43) | ····chal.raiin. (A44) | ····otedy.chtal.am. (A45) | ····dain.chey.qokeedy. (A46) | ····chol.cheey.dalkar.okedy. (A61) | - (A62) | - (A63) | - (A7) | - """) assert loc4 == best_loc assert tags4 == tags1 if chosen_loc != None: h.parags(st, f"""Thus we will tentatively assign {entry_code} ({loc_ch}) to {chosen_loc}.""") chosen_cand = cands_1[0] chosen_score = chosen_cand[0]; assert chosen_cand[1] == chosen_loc else: h.parags(st, f"""Thus we will not assign {entry_code} ({loc_ch}) to any SPS parag.""") chosen_cand = None chosen_score = None; def select_best_cand(cands_1, cands_2, max_score): # Selects the best candidate from two candidare lists, # sorted by inceasing badness score. best_cand_1 = cands_1[0] if cands_1 != None else None best_cand_2 = cands_2[0] if cands_2 != None else None score_1 = best_cand_1[0] if best_cand_1 != None else +inf score_2 = best_cand_2[0] if best_cand_2 != None else +inf best_score = score_1 if score_1 <= score_2 else score_2 best_cand = best_cand_1 if score_1 <= score_2 else best_cand_2 if best_score > max_score: best_cand = None return best_cand # ---------------------------------------------------------------------- def compute_total_gap_size???(gsizes, egsizes): tsize = 0; # Total size of all gaps. for gs in gsizes: tsize += gs etsize_min = 0 # Expected max total size of gaps. for egs in egsizes: etsize_min += egs[0] etsize_max = 0 # Expecetd min total size of gaps. for egs in egsizes: etsize_max += egs[1] return tsize, (etsize_min, etsize_max,) # ---------------------------------------------------------------------- avg_deltas=() echo " computing average delta ..." 1>&2 avg_delta=$( \ gawk ' /^ *[a-z]/{ s += $2; s += $3; n += 2 } END { print s/n }' ${wpd_file} \ ) echo " avg_delta = ${avg_delta}" 1>&2 avg_deltas+=( ${avg_delta} ) ######################################################################## # Remove junk not counted as punctuation: if utype == "ch": # Nothing to remove besides punctuation. pass elif utype == "ps": # Nothing to remove besides punctuation. pass elif utype == "ec" or utype == "wc" or utype == "wp": # Delete all markup: text = re.sub(r"[<][!][^<>]*[>]", "", text) text = re.sub(r"[«=» ]", "", text) text = re.sub(r"[{]([^{}]*)[}]", r"\1", text) text = re.sub(r"^<[%]>", "", text) text = re.sub(r"<[$]>$", "", text) # Map weirdos to '?': text = re.sub(r"[&][0-9][0-9][0-9][;]?", "?", text) # Map all to lowercse: text = text.lower() else: arg_error(f"invalid unit type {utype = !r}") ###################################################################### ???if utype == "ec": m_bad = re.search(r"[^ac-fhik-tvxy?]") elif utype == "wc" or utype == "wp": m_bad = re.search(r"[^.ac-fhik-tvxy?]", text) else: arg_error(f"invalid {utype = !r}") pat_ipu = r"[:。,\[\]]" # Ideographic punctuation. pat_ann = r"[(][^()]*[)]" # Apocriphal annotations (ideographic parens). pat_ipu = r"[-,.;:'*]" # Pinyin punctuation. pat_ann = r"[(][^()]*[)]" # Apocriphal annotations (ascii parens). pat_junk???notneeded = f"{pat_ipu}|{pat_ann}" # Junk to be deleted. ###################################################################### def ttype_from_utype(utype): # Returns the text type {ttype} of an ".ivt" file required to extract # or count unts of type {utype}. ttype = None if utype == "ch": # Chinese characters: ttype = "chu" elif utype == "ps": # Isolated pinyin syllables: ttype = "pys" elif utype == "ec" or utype == "wc" or utype == "wp": # EVA characters or words: ttype = "eva" else: arg_error(f"invalid {utype = !r}") return ttype # ---------------------------------------------------------------------- ######################################################################## def add_all_word_pos_pos_plot_rules(pre, mak, tit): # ---------------------------------------------------------------------- # pos-pos-plots: res/bencao-fu-zhu3-starps-${SPS_TAG}-wpos.png # pos-pos-plots: res/bencao-fu-zhu3-starps-${SPS_TAG}-nwo-hist.png # res/starps-${SPS_TAG}.woc: \ # \ # res/starps-fu-par.ivt \ # ${MAKEFILE} # ./list_wpositions_in_parags.py voyn-eva '${SPS_WORD}' res/starps-fu-par.ivt \ # > res/starps-${SPS_TAG}.woc # # res/bencao-fu-zhu3.woc: \ # list_wpositions_in_parags.py \ # in/bencao-fu.chu \ # ${MAKEFILE} # ./list_wpositions_in_parags.py chin-chu '主' in/bencao-fu.chu \ # > res/bencao-fu-zhu3.woc # # res/bencao-fu-zhu3-starps-${SPS_TAG}-nwo-hist.png: \ # ${MAKEFILE} \ # \ # res/starps-${SPS_TAG}.woc \ # res/bencao-fu-zhu3.woc # ./plot_two_word_pos_histograms.sh \ # starps-${SPS_TAG} '${SPS_WORD}' \ # bencao-fu-zhu3 zhu3 # # res/bencao-fu-zhu3-starps-${SPS_TAG}-wpos.png: \ # ${MAKEFILE} \ # plot_two_word_pos_files.sh \ # res/starps-${SPS_TAG}.woc \ # res/bencao-fu-zhu3.woc # ./plot_two_word_pos_files.sh \ # starps-${SPS_TAG} '${SPS_WORD}' \ # bencao-fu-zhu3 zhu3 # return targets # ---------------------------------------------------------------------- ######################################################################## def add_all_ivt_rules(pre, mak, tit): # Rules to create the specialized "-lin.ivt" files, namely "in/starps-fu-lin.ivt" # and "in/starps-gd-lin.ivt" from the starred parags text of Note/074. targets = [] if True: # Complete "fu-eva-lin.ivt" file with all parags: target_full = f"starps-fu-eva-lin.ivt" source_full = "../074/st_files/str-parags.ivt" tit[target_full] = f"copying full SPS IVTFF file {target_full} from {source_full}" pre[target_full] = [ source_full ] mak[target_full] = ( f"cat {source_full} \\", f" | egrep -v -e '^]' \\", f" > res/{target_full}" ) targets.append(target_full) if True: # Subset "gd-eva-lin.ivt" file with good lines only: target_good = f"starps-gd-eva-lin.ivt" source_good = f"../074/st_files/str-parags.ivt" # The filtering script and its imported modules: filter_script = "remove_bad_lines_from_starps_ivt.gawk" erfn_gawk_lib = "work/error_funcs.gawk" tit[target_good] = f"extracting the good SPS source file {target_good} from {source_good}" pre[target_good] = [ source_good, filter_script, erfn_gawk_lib, ] mak[target_good] = ( f"cat {source_good} \\", f" | egrep -v -e '^]' \\", f" | {filter_script} \\", f" -i {erfn_gawk_lib} \\", f" > res/{target_good}", ) targets.append(target_good) return targets # ---------------------------------------------------------------------- add_all_ivt_rules(pre, mak, tit) + \ ######################################################################## ivt_target = f"bencao-fu-{ttype}-lin.ivt" tit[ivt_target] = f"making link {ivt_target} to {source_ivt}" pre[ivt_target] = [ source_ivt, ] mak[ivt_target] = ( f"( cd res && rm -f {ivt_target} ; ln -s ../{source_ivt} {ivt_target} )", ) targets.append(ivt_target) ######################################################################## def add_single_loc_word_pos_file_rules(pre, mak, tit, book, bsub, unit, sloc, word, tag): # Adds rules and commands to create a file "res/{book}-{bsub}-{unit}-{sloc}-{tag}.wpo" # with the positions of {word} in parag {sloc} of file {name} # with format {fmt} (either "voyn-eva" or "chin-chu"). source = f"res/{book}-{bsub}-{unit}-{tag}.wpo" target = f"{book}-{bsub}-{unit}-{sloc}-{tag}.wpo" tit[target] = f"making the single-parag word positions file {target}" pre[target] = [ source, ] mak[target] = ( f"cat {source} | egrep -e '^{sloc}[ ]' | cat > res/{target}", ) return target # ---------------------------------------------------------------------- ######################################################################## # Outputs a list of all tuples {tsize} consecutive words, # ignoring those that contain words that contain '*'. # # For each tuple of {tsize} consecutive words in the same line of the input, # writes {tsize+1} lines in the output with the format # # "«{LEFT}» «{MIDDLE}» «{RIGHT}» <{SEC}.{NLIN}> {KW} {SL} {SM} {SR}" # # where {LEFT}, {MIDDLE}, and {RIGHT} are the words of the tuple; {SL}, {SM}, {SR} # are the counts of words in each of these strings; {SEC} and {NLIN} specify the input line # where the tuple occurs; and {KW} is the index of the tuple's first # word in the input line. # # The strings {LEFT}, {MIDDLE}, and {RIGHT} consist of whole input # words, separated by '.'. The string {MIDDLE} has at least one word, but # {LEFT} and {RIGHT} may be empty. if book == "bencao": assert sub == "fu" enc = "chu" if unit == "ch" else "pys" if unit == "ps" else None elif book == "starps": assert sub == "fu" or sub == "gd" enc = "eva" else: assert False, f"bad {book = }" assert enc is not None, f"bad combo {book = } {unit = }" m = re.match(pat_punc, text) if enc == "utf": # Cleanup consists in deleting the Chinese punctuation: for ch in text: if debug: err.write(f"!! ch = '{ch}'") if re.fullmatch(ch, pat_punc): num_ignored += 1 if debug: err.write(" KO") else: good_chars.append(ch) if debug: err.write(" OK") if debug: err.write("\n") text = "".join(good_chars) elif enc == "eva": # Cleanup consists of deleting parag markers and ensuring simple EVA. if unit == "ec": # Remove all EVA punctuation: tlen = len(text) text = re.sub(r"[-,.]", "", text) num_ignored += tlen - len(text) elif unit == "wc": # Normalize all punc to single '.': tlen = len(text) text = re.sub(r"[-,]", ".", text) # Normalize punctuation: text = re.sub(r"[.][.]+", ".", text) text = re.sub(r"^[.]+", "", text) text = re.sub(r"[.]+$", "", text) num_ignored += tlen - len(text) else: assert False, f"invalid combo {enc = } {unit = }" ???charset = None # Sets of special hanzi characters (punct, blank, etc.) if unit == "ch": ???charset = dict() set_dir = "langbank/chin" ???charset['invalid'] = read_chinese_char_set(f"{set_dir}/utf8-invalid.tbl") ???charset['bullets'] = read_chinese_char_set(f"{set_dir}/utf8-bullets.tbl") ???charset['symbol'] = read_chinese_char_set(f"{set_dir}/utf8-symbol.tbl") ???charset['punct'] = read_chinese_char_set(f"{set_dir}/utf8-punct.tbl") ???charset['blank'] = read_chinese_char_set(f"{set_dir}/utf8-blank.tbl") # Read tables of chinese character sets: pat_line = None # Matches a pinyin line, with groups {LOC} and {TEXT}. pat_punc = None # Matches pinyin punctuation (excluding blanks). pat_word = None # Matches a pinyin word (syllable or compound). if unit == "ch": elif unit == "ps" or unit == "pj": # Patterns for parsing pinyin: pat_loc = r"b[1-3][.][1-6][.][0-9][0-9][0-9]" pat_line = f"<({pat_loc})>[ ]+(.*)\n" pat_punc = r"[.,;()*]" else: assert False pat_sec = r"s[0-2]" # Section s-number, "s0" to "s2". pat_sub = r"[.][0-9]" # Subsection number, 0 to 9, with '.'. pat_lseq = r"[.][0-9][0-9][0-9]" # , with '.'. pat_locid = f"<({pat_sec})({pat_sub})({pat_lseq})>" # Is a data line: if m.lastindex != 4: prog_error("num fields = %d" % m.lastindex) sec = m.group(1) sub = m.group(2) lseq = m.group(3) text = m.group(4).strip() # {DATA} field. loc = f"{sec}{sub}{lseq}" else: # Non-parag data line - ignore: if re.search(r"([<][%$][>])", text): data_error(nline,line, f"spurious alignment marker '{m.group(1)}'") m = re.search(r"([^-,.a-z?]", text) if m != None: data_error(nline,line, f"invalid char '{m.group(1)}'")