#! /usr/bin/python3
# Last edited on 2026-03-11 09:41:24 by stolfi

# The command line arguments are an input file name {raw_file}, a unit type
# name {utype} ("ch" or "ec"), a keyword pattern {kword}, and an
# output file name {opa_file}.
#
# Reads the file {raw_file}. The file should be in an IVTFF-like format,
# with lines in the format "<{LOC}> {TEXT}". Typically it should
# have one paragraph per line.
#
# Performs on the {TEXT} of each line the cleanup appropriate for the
# given {utype}, obtaining a cleaned text {TCLEAN}. See
# {clean_up_raw_text} in {size_position_funcs.py} for details. Then
# looks for the first occurrence of the given string or RE pattern {kword}
# in {TCLEAN}.
#
# Writes to {opa_file} a line for each input line that matches
# {kword}. Each output line has the format "{LOC} {WPOS} {PREF} {MIDF}
# {SUFF} {TEXT}" where {MIDF} is the substring of {TCLEAN} that matched
# {kword}; "{PREF}", "{MIDF}", and "{SUFF}" are a tripartition of
# {TCLEAN}; {WPOS} is the position of the match (that is, the length
# of {PREF} in EVA or hanzi characters); and {TEXT} is the original
# text before cleanup.
#
# Actually {PREF} is preceded by a period, {MIDF} is enclosed in
# brackets, and {SUFF} is ended by a period; where these delimiters are
# '.', '[', and ']' for "starps", and '。', '[', and ']' for "bencao".
# This way, the three fields will never be empty.
#
# If {utype} is "ec", the {kword} must match only the characters
# '[a-z?]'. If {utype} is "ch", it must match only simplified hanzi
# characters. In either case it must not contain any punctuation or the
# special patterns '^', '$', and '\b'.
#
# In any case the input file is assumed to be in Unicode UTF-8 encoding,
# and so will be the output file.
import sys, os, re
from sys import stderr as err
from process_funcs import bash, basic_line_loop
from error_funcs import arg_error, file_line_error, prog_error
from chinese_funcs import read_chinese_char_set
from note_077_funcs import compute_and_print_stats, name_for_tex_macro
import size_position_funcs as spf

# Matches an IVTFF page header line: a lone "<{PAGE}>" tag (letters and
# digits only, so no locus punctuation like '.' or ','), optionally
# followed by a "<! ... >" comment, and nothing else.
# NOTE(review): the pattern in the file as received was the empty string,
# which matches EVERY line and therefore discarded all data lines. This
# is a reconstruction from the IVTFF page header syntax -- confirm that
# no data line of the input files has this shape.
PAGE_HEADER_PAT = re.compile(r"<[A-Za-z][A-Za-z0-9]*>\s*(<!.*>)?\s*$")


def main(raw_file, utype, kword, opa_file):
    # Scans {raw_file} for paragraphs whose cleaned text matches {kword},
    # writes one line per matching paragraph to {opa_file}, prints
    # summary counts to {stderr}, and writes the companion TeX
    # parameters file.
    #
    # {raw_file}   Name of input file.
    # {utype}      Either "ec" or "ch".
    # {kword}      String or RE pattern to search.
    # {opa_file}   Name of output file.

    # Field delimiters used in the output lines:
    if utype == "ch":
        # Book is "bencao".
        dot_del = '。'; bra_del = '['; ket_del = ']'
    elif utype == "ec":
        # Book is "starps".
        dot_del = '.'; bra_del = '['; ket_del = ']'
    else:
        arg_error(f"invalid {utype = !r}")
        return  # Only reached if {arg_error} returns.

    # Compile the pattern once, so that a malformed {kword} is reported
    # up front instead of at the first data line:
    try:
        kword_re = re.compile(kword)
    except re.error as ex:
        arg_error(f"invalid pattern {kword = !r}: {ex}")
        return  # Only reached if {arg_error} returns.

    pat_line, _pat_unit, _pat_sepa, _clean_sepa = spf.get_parsing_patterns(utype)

    out_format = "%-8s %3d %s%s %s%s%s %s%s %s\n"

    tot_para = 0  # Count of input data lines (parags).
    tot_with = 0  # Total input lines that matched {kword}.

    def process_input_line(nline, line):
        nonlocal tot_para, tot_with
        #
        # Parses a line {line} assuming it is line {nline} of the file.
        # The {line} is always a string (never {None}), but may be "" if
        # the line is empty.
        #
        # Ignores the line if it is a blank, a #-comment, or an IVTFF
        # page header.
        #
        # Otherwise the line must be a data line, matching {pat_line}.
        #
        # Increments {tot_para} for each data line.
        #
        # Gets {tclean} by cleaning up {text}. Looks for the string or RE
        # pattern {kword} in {tclean}. If found, writes the corresponding
        # output line and increments {tot_with}.

        # Should we debug the line?
        debug = False

        def data_error(msg):
            # Reports {msg} for the current line; must not return.
            file_line_error(raw_file, nline, msg, line)
            raise AssertionError("{file_line_error} should not return")
            # ------------------------------------------------------------

        assert line is not None, "The {line} arg must not be {None}"

        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line): return

        # Just in case, ignore IVTFF page headers:
        if PAGE_HEADER_PAT.match(line): return

        tot_para += 1

        m = re.match(pat_line, line)
        if m is None:
            # Invalid line format.
            data_error("invalid line format")

        # Parse the line into locus ID {loc} and raw text:
        assert m.lastindex == 2, f"bug {m.lastindex = }"
        loc = m.group(1)
        raw_text = m.group(2)

        # Cleanup the raw text:
        tclean = spf.clean_up_raw_text(raw_text, utype, data_error)
        if debug: err.write(f"!~ {tclean = !r}\n")

        # Search for {kword} in the {tclean}:
        m = kword_re.search(tclean)
        if m is None: return

        wbeg = m.start()
        wend = m.end()
        if debug: err.write(f"!~ {loc:<12s} {wbeg = } {wend = }\n")
        tot_with += 1

        # Tripartition of {tclean} around the match:
        pref = tclean[0:wbeg]
        midf = tclean[wbeg:wend]
        suff = tclean[wend:]

        # Lightly stripped raw text, for reference:
        tdirt = strip_raw_text_for_reference(raw_text, utype)

        wr.write(
            out_format %
            (loc, wbeg,
             dot_del, pref,
             bra_del, midf, ket_del,
             suff, dot_del,
             tdirt)
        )
        return
        # ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

    # The {with} makes sure both files get closed even on errors:
    with open(raw_file, "r", encoding="utf-8") as rd, \
         open(opa_file, "w", encoding="utf-8") as wr:
        wr.write("# -*- coding: utf-8 -*-\n")
        err.write(f"reading file '{raw_file}' ...\n")
        nread = basic_line_loop(rd, process_input_line)

    # Ratio of parags with matches; zero if the file had no parags at all:
    frac_match = tot_with/tot_para if tot_para > 0 else 0.0

    err.write(f"{nread:6d} lines read\n")
    err.write(f"{tot_para:6d} parags read\n")
    err.write(f"{tot_with:6d} parags matched {kword}\n")
    err.write(f"{frac_match:8.2f} fraction of parags matching\n")

    tex_file = tex_parms_file_name(opa_file)
    write_TeX_parms_file(tex_file, utype, tot_para, tot_with)

    return
    # ----------------------------------------------------------------------


def strip_raw_text_for_reference(raw_text, utype):
    # Returns {raw_text} with inline comments and some punctuation
    # removed, for display in the last column of the output file.
    if utype == "ec":
        # Remove "<!...>" inline comments, then the markers '«', '=', '»'
        # and commas, and turn '-' into '.':
        tdirt = re.sub(r"[<][!][^<>]*[>]", "", raw_text)
        tdirt = re.sub(r"[«=»]", "", tdirt)
        tdirt = re.sub(r"[,]", "", tdirt)
        tdirt = re.sub(r"[-]", ".", tdirt)
    elif utype == "ch":
        # NOTE(review): in the file as received the first pattern was
        # "([^()]*)" (a capture group that deletes everything but the
        # parentheses) and the second was an unterminated character set
        # (raises {re.error}). Both look like fullwidth characters that
        # were mangled to ASCII. The patterns below are a reconstruction:
        # they accept the ASCII and the fullwidth forms, written as
        # escapes so they survive re-encoding. Confirm against the
        # punctuation actually used in the raw file.
        #
        # Remove parenthesized remarks, (...) or U+FF08 ... U+FF09:
        tdirt = re.sub(r"[(\uFF08][^()\uFF08\uFF09]*[)\uFF09]", "", raw_text)
        # Remove colons, semicolons (ASCII or fullwidth) and U+3002 '。':
        tdirt = re.sub(r"[:;\uFF1A\uFF1B\u3002]", "", tdirt)
    else:
        raise AssertionError(f"invalid {utype = !r}")
    return tdirt
    # ----------------------------------------------------------------------


def tex_parms_file_name(opa_file):
    # Returns the name of the TeX parameters file that goes with
    # {opa_file}: the trailing ".opa" is replaced by "-parms.tex". If
    # {opa_file} has no ".opa" extension, "-parms.tex" is appended
    # instead, so that the result never coincides with {opa_file} itself.
    if opa_file.endswith(".opa"):
        return opa_file[:-len(".opa")] + "-parms.tex"
    return opa_file + "-parms.tex"
    # ----------------------------------------------------------------------


def write_TeX_parms_file(tex_file, utype, tot_para, tot_with):
    # Writes the file {tex_file} with parameter definitions for LaTeX:
    # the number of parags {tot_para}, how many of them do and do not
    # match the keyword, and the nominal hanzi-per-unit factor of {utype}.
    # The macro names are prefixed with a string derived from the
    # base name of {tex_file}.

    err.write("\n")

    unit_size = spf.hanzi_per_unit(utype)  # Nominal avg num of Chinese chars per unit.
    err.write(f"assumed avg hanzi per unit = {unit_size:8.2f}\n")

    # Macro prefix: file name without directory and "-parms..." tail:
    tex_name = re.sub(r"^.*[/]", "", tex_file)
    tex_name = re.sub(r"-parms.*$", "", tex_name)
    txpref = name_for_tex_macro(tex_name)

    tot_sans = tot_para - tot_with

    with open(tex_file, "w", encoding="utf-8") as tex_wr:
        # These should match the defs from other TeX parms files:
        tex_wr.write(f"\\def\\{txpref}NumParags{{{tot_para}}}\n")
        tex_wr.write("\n")

        tex_wr.write(f"\\def\\{txpref}ParagsWith{{{tot_with}}}\n")
        tex_wr.write(f"\\def\\{txpref}ParagsSans{{{tot_sans}}}\n")
        tex_wr.write("\n")

        tex_wr.write(f"\\def\\{txpref}HanziPerUnit{{{unit_size:.3f}}}\n")
        tex_wr.write("\n")

    return
    # ----------------------------------------------------------------------


def test_stuff():
    # Placeholder for self-tests; currently just reports that none exist.
    arg_error("no tests yet\n")
    return
    # ----------------------------------------------------------------------


if __name__ == "__main__":
    if len(sys.argv) >= 2 and sys.argv[1] == "test":
        test_stuff()
    elif len(sys.argv) < 5:
        # Too few arguments: complain instead of dying with {IndexError}.
        arg_error(f"usage: {sys.argv[0]} {{raw_file}} {{utype}} {{kword}} {{opa_file}}")
    else:
        raw_file, utype, kword, opa_file = sys.argv[1:5]
        main(raw_file, utype, kword, opa_file)