#! /usr/bin/python3
# Last edited on 2026-03-11 10:02:16 by stolfi

import sys, os, re
from sys import stderr as err

from error_funcs import arg_error, prog_error
from process_funcs import bash
from math import fabs
import make_077_funcs as mfn
import size_position_funcs as spf

def add_all_rules(pre, mak, tit):
    # Adds rules to build all derived files that
    # depend only on the SPS sources.
    #
    # The {pre}, {mak}, {tit} dicts map each target file name to its
    # prerequisite list, its shell command list, and its title string,
    # respectively; they are filled in as a side effect.
    #
    # Returns the list of ultimate targets.
    targets = \
        add_all_bencao_par_ivt_rules(pre, mak, tit) + \
        add_all_starps_lin_ivt_rules(pre, mak, tit) + \
        add_all_starps_par_ivt_rules(pre, mak, tit) + \
        add_all_starps_pag_ivt_rules(pre, mak, tit) + \
        add_all_upp_rules(pre, mak, tit) + \
        add_all_word_tuple_rules(pre, mak, tit) + \
        add_all_lines_with_pattern_rules(pre, mak, tit) + \
        add_all_single_hist_rules(pre, mak, tit) + \
        add_all_double_hist_rules(pre, mak, tit) + \
        add_all_bencao_word_pos_plot_rules(pre, mak, tit) + \
        add_all_starps_word_pos_plot_rules(pre, mak, tit) + \
        add_all_word_delta_pairs_plot_rules(pre, mak, tit) + \
        add_all_coin_image_rules(pre, mak, tit)
    return targets
# ----------------------------------------------------------------------

def add_all_bencao_par_ivt_rules(pre, mak, tit):
    # Rules to create the "bencao-fu-{utype}-par.ivt" files
    # from the raw transcription files in the "in" folder,
    # for {utype} "ch" and "ps".
    #
    # If {utype} is "ch", the source is "in/bencao.uts", and
    # the {TEXT} will be hanzi chars.
    #
    # If {utype} is "ps", the source is "in/bencao.pys",
    # and the {TEXT} will be pinyin with separate syllables
    # (There should be one hanzi character for each pinyin
    # syllable).
    #
    # Beware that both raw source files have extemporaneous inline
    # comments in ideographic parentehses, sundry punctuation,
    # and fields that seem to be missing in the SPS.
    # All this fluff wil be removed in the conversion.
    targets = []
    for utype in ("ch", "ps"):
        # The raw source extension depends on the unit type.  A dict
        # lookup raises {KeyError} on an unexpected {utype}; the old
        # chained-conditional plus {assert} was silently disabled
        # under "python -O".
        source_raw_ext = { "ch": "uts", "ps": "pys" }[utype]
        source_raw = f"in/bencao.{source_raw_ext}"
        # The "-par.ivt" file is cleaned from annotations and other dirt:
        ivt_par_name = f"bencao-fu-{utype}-par"
        target = f"{ivt_par_name}.ivt"
        script = "convert_bencao_raw_to_par_ivt.py"
        tit[target] = f"making res/{target} from {source_raw}"
        pre[target] = [ source_raw, script, ]
        # A list, for consistency with the {mak} entries of every other rule
        # in this file (was a 1-element tuple):
        mak[target] = [ f"./{script} {utype} {source_raw} res/{target}", ]
        targets.append(target)
    return targets
# ----------------------------------------------------------------------

def add_all_starps_lin_ivt_rules(pre, mak, tit):
    # Rules to create the "-lin.ivt" files of the SPS from the raw ".ivt"
    # transcription files, without comments, weirdos, ligatures, etc.
    targets = []
    source_raw = "../074/st_files/str-parags.ivt"
    # The join script and its imported modules:
    raw_to_lin_script = "convert_starps_raw_to_lin_ivt.py"
    pfn_py_lib = "work/process_funcs.py"
    ivt_py_lib = "work/ivt_format.py"
    nfn_py_lib = "note_077_funcs.py"
    filter_script = "remove_bad_lines_from_starps_ivt.gawk"
    efn_gawk_lib = "work/error_funcs.gawk"
    for bsub in ("fu", "gd",):
        for utype in "ec", "wp", "wc":
            target = f"starps-{bsub}-{utype}-lin.ivt"
            tit[target] = f"creating cleaned '{bsub}' SPS transcripton file {target} from {source_raw}"
            pre[target] = [ source_raw, raw_to_lin_script, pfn_py_lib, ivt_py_lib, nfn_py_lib ]
            mak[target] = [
                f"cat {source_raw} \\",
                f" | egrep -v -e '^]' \\",
            ]
            if bsub == "gd":
                # Subset with good parags only:
                pre[target] += [ filter_script, efn_gawk_lib, ]
                mak[target] += [
                    f" | {filter_script} -i {efn_gawk_lib} \\",
                ]
            mak[target] += [
                f" | {raw_to_lin_script} {utype} \\",
                f" > res/{target}",
            ]
            targets.append(target)
    return targets
# ----------------------------------------------------------------------
def add_all_starps_par_ivt_rules(pre, mak, tit):
    # Rules to create the "-par.ivt" files of the SPS from the "-lin.ivt"
    # files, with each parag joined into a single line, without
    # comments, weirdos, ligatures, etc.
    targets = []
    # The join script and its imported modules:
    lin_to_par_script = "convert_starps_lin_to_par_ivt.py"
    pfn_py_lib = "work/process_funcs.py"
    ivt_py_lib = "work/ivt_format.py"
    nfn_py_lib = "note_077_funcs.py"
    for bsub in ("fu", "gd",):
        for utype in "ec", "wp", "wc":
            target = f"starps-{bsub}-{utype}-par.ivt"
            source = f"res/starps-{bsub}-{utype}-lin.ivt"
            tit[target] = f"creating '{bsub}' SPS line-per-parag file {target} from {source}"
            pre[target] = [ source, lin_to_par_script, pfn_py_lib, ivt_py_lib, nfn_py_lib ]
            mak[target] = [
                f"cat {source} \\",
                f" | {lin_to_par_script} {utype} \\",
                f" > res/{target}",
            ]
            targets.append(target)
    return targets
# ----------------------------------------------------------------------

def add_all_starps_pag_ivt_rules(pre, mak, tit):
    # Rules to create the "-pag.ivt" files for the SPS, like "-par.ivt"
    # but with all the parags on the same page joined into a single big
    # parag.
    # The join script and its imported modules:
    par_to_pag_script = "convert_starps_par_ivt_to_pag_ivt.py"
    pfn_py_lib = "work/process_funcs.py"
    ivt_py_lib = "work/ivt_format.py"
    nfn_py_lib = "note_077_funcs.py"
    targets = []
    # No sense making the "gd" version.
    for bsub in ("fu",):
        for utype in "ec", "wp", "wc":
            target = f"starps-{bsub}-{utype}-pag.ivt"
            source = f"res/starps-{bsub}-{utype}-par.ivt"
            tit[target] = f"creating '{bsub}' SPS line-per-page file {target} from {source}"
            pre[target] = [ source, par_to_pag_script, pfn_py_lib, ivt_py_lib, nfn_py_lib ]
            mak[target] = [
                f"cat {source} \\",
                f" | {par_to_pag_script} {utype} \\",
                f" > res/{target}",
            ]
            targets.append(target)
    return targets
# ----------------------------------------------------------------------

def add_all_upp_rules(pre, mak, tit):
    # Rules to create the "bencao-fu-{utype}.upp" files, with the size of
    # each recipe. The {utype} specifies the unit of size: "ch" for hanzi
    # chars, "ps" for pinyin words with one syllable per word. The latter
    # should match the "ch" version if the pinyin source is correct.
    targets = []
    for book, bsub, utype, ltype in (
        ( "bencao", "fu", "ch", "par", ), ( "bencao", "fu", "ps", "par", ),
        ( "starps", "fu", "wp", "par", ), ( "starps", "fu", "wc", "par", ), ( "starps", "fu", "ec", "par", ),
        ( "starps", "fu", "wp", "pag", ), ( "starps", "fu", "wc", "pag", ), ( "starps", "fu", "ec", "pag", ),
        ( "starps", "gd", "wp", "par", ), ( "starps", "gd", "wc", "par", ), ( "starps", "gd", "ec", "par", ),
    ):
        ivt_name = f"{book}-{bsub}-{utype}-{ltype}"
        target = mfn.add_rules_for_units_per_line_file(pre, mak, tit, ivt_name, utype)
        targets.append(target)
    return targets
# ----------------------------------------------------------------------

def add_all_single_hist_rules(pre, mak, tit):
    # Rules for the single-file size histogram plots, one per listed
    # {book}-{bsub}-{utype} combination.
    targets = []
    for book, bsub, utype in (
        ( "bencao", "fu", "ch", ),
        ( "starps", "fu", "wp", ), ( "starps", "fu", "wc", ), ( "starps", "fu", "ec", ),
        ( "starps", "gd", "wp", ), ( "starps", "gd", "wc", ), ( "starps", "gd", "ec", ),
    ):
        ivt_name = f"{book}-{bsub}-{utype}-par"
        usize = 1.000  # Plot sizes in raw units.
        color = mfn.hist_color(book, bsub, utype)
        bin_size = 5 if utype == "ec" else 1
        target = mfn.add_rules_for_single_size_hist_plot(pre, mak, tit, ivt_name, usize, color, bin_size)
        targets.append(target)
    return targets
# ----------------------------------------------------------------------

def add_all_double_hist_rules(pre, mak, tit):
    # Rules for the two-file overlaid size histogram plots.  If
    # {raw_size} is false, sizes are rescaled to hanzi-equivalent units
    # so that files with different unit types can be compared.
    targets = []
    for book0, bsub0, utype0, book1, bsub1, utype1, raw_size in (
        ("bencao", "fu", "ch", "bencao", "fu", "ps", True,  ),  # To check equality of hanzi and pinyin versions.
        ("starps", "fu", "ec", "starps", "gd", "ec", True,  ),  # Compare full vs good SPS in "ec" units.
        ("starps", "fu", "wc", "starps", "gd", "wc", True,  ),  # Compare full vs good SPS in "wc" units.
        ("starps", "gd", "wc", "starps", "gd", "wp", True,  ),  # Compare "wc" vs "wp" SPS in good subset.
        ("starps", "gd", "wc", "starps", "gd", "ec", False, ),  # Compare "wc" vs "ec" SPS in good subset.
        ("bencao", "fu", "ch", "starps", "fu", "wp", False, ),  # Compare SBJ with SPS "fu-wp".
        ("bencao", "fu", "ch", "starps", "fu", "wc", False, ),  # Compare SBJ with SPS "fu-wc".
        ("bencao", "fu", "ch", "starps", "fu", "ec", False, ),  # Compare SBJ with SPS "fu-ec".
        ("bencao", "fu", "ch", "starps", "gd", "wp", False, ),  # Compare SBJ with SPS "gd-wp".
        ("bencao", "fu", "ch", "starps", "gd", "wc", False, ),  # Compare SBJ with SPS "gd-wc".
        ("bencao", "fu", "ch", "starps", "gd", "ec", False, ),  # Compare SBJ with SPS "gd-ec".
    ):
        ivt_name0 = f"{book0}-{bsub0}-{utype0}-par"
        usize0 = 1 if raw_size else spf.hanzi_per_unit(utype0)
        color0 = mfn.hist_color(book0, bsub0, utype0)

        ivt_name1 = f"{book1}-{bsub1}-{utype1}-par"
        usize1 = 1 if raw_size else spf.hanzi_per_unit(utype1)
        color1 = mfn.hist_color(book1, bsub1, utype1)

        bin_size = 5 if utype0 == "ec" else 1
        target = mfn.add_rules_for_double_size_hist_plot \
            ( pre, mak, tit, ivt_name0, usize0, color0, ivt_name1, usize1, color1, bin_size )
        targets.append(target)
    return targets
# ----------------------------------------------------------------------

def add_all_bencao_word_pos_plot_rules(pre, mak, tit):
    # Word position plots for various words and parags in the SBJ.
    book = "bencao"
    bsub = "fu"
    ltype = "par"
    utype = "ch"
    ivt_name = f"{book}-{bsub}-{utype}-{ltype}"
    color = mfn.hist_color(book, bsub, utype)
    kwt_pairs = (
        ( "主治", "zhu3zhi4", ),
    )
    # NOTE(review): {kwt_pairs_not} and {slocs_not} are kept as research
    # notes but are not currently passed to the plot generator.
    kwt_pairs_not = (
        ( "主", "zhu3", ),
    )
    slocs_maybe = (
        "b1.4.100",  # 3 hits.
        "b2.5.118",  # 3 hits?.
        "b2.5.518",  # 3 hits?.
        "b3.3.080",  # 3 hits.
        "b2.1.013",  # 3 hits.
    )
    slocs_yes = (
        "b1.4.096",  # 7 hits - red rooster.
        "b2.4.094",  # 3 hits.
        "b3.3.088",  # 3 hits.
    )
    slocs_not = (
        "b1.1.014",  # 1 hit.
        "b1.2.061",  # 1 hit.
        "b1.1.007",  # 1 hit.
        "b1.1.014",  # 1 hit.  NOTE(review): duplicate of entry above.
        "b1.2.061",  # 1 hit.  NOTE(review): duplicate of entry above.
        "b1.4.090",  # 2 hits.
        "b1.5.106",  # 1 hit.
        "b1.5.109",  # 2 hits.
        "b2.1.001",  # 1 hit.
        "b2.2.066",  # 1 hit.
        "b2.3.090",  # 2 hits.
        "b3.3.077",  # 1 hit.
        "b3.5.119",  # 2 hits.
    )
    sloc_list = slocs_maybe + slocs_yes
    targets = mfn.add_rules_for_many_word_pos_plots(pre, mak, tit, ivt_name, utype, color, kwt_pairs, sloc_list)
    return targets
# ----------------------------------------------------------------------

def add_all_starps_word_pos_plot_rules(pre, mak, tit):
    # Word position plots for various words and parags in the SPS.
    book = "starps"
    bsub = "fu"
    ltype = "par"
    utype = "ec"
    ivt_name = f"{book}-{bsub}-{utype}-{ltype}"
    color = mfn.hist_color(book, bsub, utype)
    kwt_pairs = (
        ( r"daiin\b", "daiin", ),
        ( r"(daiin|laiin)\b", "DAIIN", ),
        ( r"[dlrspf]aiii?n", "XAIIN", ),
    )
    # NOTE(review): the tuples below other than {slocs_maybe} and
    # {slocs_yes} are kept as research notes and are not currently used.
    kwt_pairs_not = (
    )
    slocs_maybe = (
        "f105v.1",   # 4 hits.
        "f105v.14",  # 4 hits.
        "f106r.42",  # 4 hits.
        "f104v.22",  # 6 hits.
        "f114r.4",   # 6 hits.
        "f114r.8",   # 6 hits.
        "f114r.19",  # 6 hits.
        "f114r.24",  # 6 hits.
    )
    slocs_yes = (
        "f112v.11",  # 3 hits.
        "f105v.20",  # 4 hits.
        "f105v.32",  # 5 hits.
    )
    slocs_later = (
        "f104v.1",   # 7 hits.
    )
    slocs_no_match = (
        "f105r.42",  # 0 hits.
        "f105v.8",   # 2 hits.
        "f106v.42",  # 2 hits.
        "f115r.13",  # 2 hits.
        "f114r.14",  # 4 hits.
        "f104r.1",   # 3 hits.
        "f104r.12",  # 3 hits.
        "f105r.1",   # 3 hits.
        "f105v.5",   # 3 hits.
        "f106v.1",   # 3 hits.
        "f107r.21",  # 3 hits.
        "f113r.10",  # 3 hits.
    )
    slocs_messy = (
        "f105r.17",  # 5 hits.
        "f108v.20",  # 5 hits.
        "f108v.23",  # 4 hits.
        "f108v.23",  # 5 hits.  NOTE(review): duplicate locus with differing hit count.
        "f111r.10",  # 3 hits.
        "f111r.25",  # 4 hits.
        "f114r.32",  # 4 hits.
        "f114v.36",  # 4 hits.
    )
    sloc_list = slocs_maybe + slocs_yes
    targets = mfn.add_rules_for_many_word_pos_plots \
        ( pre, mak, tit, ivt_name, utype, color, kwt_pairs, sloc_list )
    return targets
# ----------------------------------------------------------------------

def add_all_lines_with_pattern_rules(pre, mak, tit):
    # Creates ".opa" files with SPS lines that contain certain patterns.
    debug = True
    targets = []
    for book, kword, ktag in (
        ('starps', 'daiin', 'daiin', ),
        ('starps', '[dl]ai(r|in)', 'XaiR', ),
        ('bencao', '主', 'zhu3', ),
    ):
        if book == "bencao":
            utype_raw = "ch"; utype_match = "ch"; ltype = "par"
        elif book == "starps":
            utype_raw = "wp"; utype_match = "ec"; ltype = "lin"
            # raw_file="../074/st_files/str-parags.ivt"
        else:
            # Explicit raise instead of {assert False}, which is stripped
            # under "python -O":
            raise ValueError(f"unknown book {book!r}")
        bsub = "fu"
        ivt_name = f"{book}-{bsub}-{utype_raw}-{ltype}"
        target = mfn.add_rules_for_lines_with_pattern_files \
            ( pre, mak, tit, ivt_name, utype_match, kword, ktag )
        if debug: err.write(f"!= {target = }\n")
        targets.append(target)
    return targets
# ----------------------------------------------------------------------

def add_bencao_starps_word_delta_pairs_plot_rules \
    ( pre, mak, tit, sbj_slocs, sbj_stag, sps_ltype, sps_pmag, sps_slocs, sps_stag ):
    # Adds rules to make a plot of points {(d1,d2)} where {d1} and {d2}
    # are distances between three occurrences of cerain patterns in
    # certain recipes of the SBJ and certain parags or pages of the SBJ
    # or SPS file.
    #
    # The SBJ loci will be those listed in {sbj_slocs}, identified by the tag {sbj_stag}.
    # The SPS loci will be those listed in {sps_slocs}, identified by the tag {sps_stag}.
    sbj_book = "bencao"; sbj_bsub = "fu"; sbj_utype = "ch"; sbj_ltype = "par"
    sbj_kword = "主治"; sbj_ktag = "zhu3zhi4"
    sbj_color = mfn.hist_color(sbj_book, sbj_bsub, sbj_utype)
    ivt_name0 = f"{sbj_book}-{sbj_bsub}-{sbj_utype}-{sbj_ltype}"
    sbj_pmag = 1.0

    sps_book = "starps"; sps_bsub = "fu"; sps_utype = "wp"
    sps_kword = r"daiin\b"; sps_ktag = "daiin"
    # sps_kword = r"(daiin|laiin)\b"; sps_ktag = "DAIIN"
    # sps_kword = r"[dlrspf]aiii?n"; sps_ktag = "XAIIN"
    sps_color = mfn.hist_color(sps_book, sps_bsub, sps_utype)
    ivt_name1 = f"{sps_book}-{sps_bsub}-{sps_utype}-{sps_ltype}"

    target = mfn.add_rules_for_double_word_delta_pairs_plot \
        ( pre, mak, tit,
          ivt_name0, sbj_utype, sbj_pmag, sbj_kword, sbj_ktag, sbj_slocs, sbj_stag, sbj_color,
          ivt_name1, sps_utype, sps_pmag, sps_kword, sps_ktag, sps_slocs, sps_stag, sps_color,
        )
    return [ target, ]
# ----------------------------------------------------------------------

def add_all_word_delta_pairs_plot_rules(pre, mak, tit):
    # Adds rules to create the file
    # "res/bencao-{ktag0}-{stag0}-starps-{ktag1}-{stag1}.wpd" with
    # the deltas of positions of selected words in selected lines of
    # of the SBJ and SPS files

    sbj_triple_locs = (  # All recipes with three "daiin":
        "b1.4.100",  # 3 hits.
        "b2.1.013",  # 3 hits.
        "b2.5.118",  # 3 hits?.
        "b2.5.518",  # 3 hits?.
        "b3.3.080",  # 3 hits.
        "b3.3.088",  # 3 hits.
    )
    sbj_ltype = "par"  # NOTE(review): currently unused; kept for symmetry with {sps_ltype}.
    sps_ltype = "par"

    sps_pmag = 0.85
    # sps_pmag = 0.90
    # sps_pmag = 1.00
    # sps_pmag = 1.07
    # sps_pmag = 1.15
    # sps_pmag = 1.20
    # sps_pmag = 1.30
    # sps_pmag = 1.35
    # sps_pmag = 1.40
    # sps_pmag = 1.60
    # sps_pmag = 2.20

    sbj_sps_loc_list_pairs_maybe = (
        ( sbj_triple_locs, "bA",
          ( "f114r.24",  # 6 hits. Maybe b3.3.080* (x1.30), b1.4.100 (x1.35), b3.3.088 (x1.15), b2.1.013 (x0.85).
          ), "fA",
        ),
    )
    sbj_sps_loc_list_pairs_matched = (
        ( # The "red rooster" recipe:
          ( "b1.4.096", ), "bR",  # 7 hits.
          ( "f105v.32", ), "fR",  # 5 hits "daiin", 2 "dair", 1 "laiin". (x1.40)
        ),
        ( # The "white horse penis" recipe:
          ( "b2.4.094", ), "bH",  # 3 hits.
          ( "f105v.14", ), "fH",  # 4 hits. Maybe b2.4.094* (x1.15).
        ),
    )
    # NOTE(review): {sps_locs_maybe} and {sps_locs_not} are kept as
    # research notes and are not currently used below.
    sps_locs_maybe = (
        "f104v.1",   # 7 hits. Maybe b2.1.013* (x1.40), b3.3.088*, b3.3.080 (x1.20).
        "f104v.22",  # 6 hits. Maybe b2.1.013*, b3.3.080 (x1.40).
        "f114r.8",   # 6 hits. Maybe b2.1.013*, b3.3.088* (x1.40).
        "f108v.20",  # 5 hits. Maybe b2.1.013* (x1.30) b3.3.088 (x1.07).
        "f114r.19",  # 6 hits. No close match (x1.40 or less)
        "f114r.4",   # 6 hits. Maybe b3.3.088*, b3.3.080* (x1.4).
        "f105v.32",  # 5 hits. No match (any mag).
        "f114r.39",  # 5 hits. Maybe b3.3.088* (x1.00).
        "f105v.1",   # 4 hits. No match (any mag).
        "f105r.17",  # 4 hits. Maybe b3.3.088* (x2.20!).
        "f105v.20",  # 4 hits. No match (any mag).
        "f106r.42",  # 4 hits. No match (any mag).
        "f108v.23",  # 4 hits. No match (any mag).
        "f111r.25",  # 4 hits. Maybe b3.3.088* (x0.90).
        "f114r.14",  # 4 hits. No match (any mag).
        "f114r.32",  # 4 hits. Maybe b3.3.088, poorly (x1.30).
        "f114r.42",  # 4 hits. No match (any mag).
        "f114v.36",  # 4 hits. Maybe b3.3.080 (x1.60).
        "f112v.11",  # 3 hits. No match (any mag).
        "f104r.1",   # 3 hits. No match (any mag).
        "f104r.12",  # 3 hits. Maybe b3.3.088, poorly (x1.15).
        "f105r.1",   # 3 hits. No match (any mag).
        "f105v.5",   # 3 hits. No match (any mag).
        "f106v.1",   # 3 hits. No match (any mag).
        "f107r.21",  # 3 hits. No match (any mag).
        "f111r.10",  # 3 hits. No match (any mag).
        "f112v.11",  # 3 hits. No match (any mag).  NOTE(review): duplicate of entry above.
        "f113r.10",  # 3 hits. No match (any mag).
        "f114r.22",  # 3 hits. No match (any mag).
        "f114v.1",   # 3 hits. No match (any mag).
        "f114v.20",  # 3 hits. No match (any mag).
        "f114v.8",   # 3 hits. No match (any mag).
    )
    sps_locs_not = (
    )

    sbj_sps_loc_list_pairs = sbj_sps_loc_list_pairs_matched + sbj_sps_loc_list_pairs_maybe
    targets = []
    for sbj_locs, sbj_stag, sps_locs, sps_stag in sbj_sps_loc_list_pairs:
        target = add_bencao_starps_word_delta_pairs_plot_rules \
            ( pre, mak, tit, sbj_locs, sbj_stag, sps_ltype, sps_pmag, sps_locs, sps_stag )
        # The helper returns a *list* of target names; extend (not append)
        # keeps {targets} a flat list of strings like every other
        # {add_all_*_rules} function in this file:
        targets += target
    return targets
# ----------------------------------------------------------------------

def add_all_coin_image_rules(pre, mak, tit):
    # Adds rules and commands to create files
    # "res/{ivt_name0}-{ivt_name1}-coin-map.png" that shows coincidences of sizes
    # etc between the parags of the SBJ "in/{ivt_name0}-par.ivt" and those of the
    # SPS "res/{ivt_name1}-par.ivt". Returns the ivt_names of the targets (sans "res/").
    targets = []
    # for sub1, utype1 in (("fu", "wp"), ("fu", "wc"), ("gd", "wp"), ("gd", "ec")):
    # for sub1, utype1 in (("gd", "ec",),):
    #     dir0 = "in"; ivt_name0 = f"bencao-fu-ch"
    #     dir1 = "res"; ivt_name1 = f"sps_-{sub1}-{utype1}"
    #     target = add_rules_for_parag_parag_coin_image(pre, mak, tit, dir0, ivt_name0, dir1, ivt_name1)
    #     targets.append(target)
    # BUG FIX: the {return} was commented out along with the loop above,
    # so the function returned {None} and made {add_all_rules} fail with
    # "TypeError: can only concatenate list (not NoneType) to list".
    return targets
# ----------------------------------------------------------------------

def add_all_word_tuple_rules(pre, mak, tit):
    # Rules for the word-tuple ({tsize}-gram) count files, optionally
    # restricted to tuples matching the pattern {kword} (tagged {ktag}).
    targets = []
    for ivt_name, utype, tsize, kword, ktag in (
        ( "bencao-fu-ch-par", "ch",  1, None, None, ),
        ( "bencao-fu-ch-par", "ch",  2, None, None, ),
        ( "bencao-fu-ch-par", "ch",  3, None, None, ),
        ( "bencao-fu-ch-par", "ch",  4, r'主', "zhu3", ),
        ( "starps-fu-ec-pag", "ec",  5, None, None, ),
        ( "starps-fu-ec-pag", "ec", 11, None, None, ),
        ( "starps-fu-wc-par", "wc",  1, None, None, ),
        ( "starps-fu-wc-par", "wc",  2, None, None, ),
        ( "starps-fu-wp-par", "wp",  1, None, None, ),
        ( "starps-fu-wp-pag", "wp",  2, r'(daiin|laiin)\b', "DAIIN", ),
    ):
        target = mfn.add_rules_for_word_tuple_file(pre, mak, tit, ivt_name, utype, tsize, kword, ktag)
        targets.append(target)
    return targets
# ----------------------------------------------------------------------