#! /usr/bin/python3
# Last edited on 2026-02-09 09:27:35 by stolfi

# Reads from {stdin} a file in pinyin (utf-8 encoding)
# possibly with joined compounds. Outputs to {stdout}
# same file with compounds split into single syllables.
#
# Data lines in the input must have the format "<{LOC}> {TEXT}"
# where {LOC} is a locus ID, which can be any string of [.a-z0-9];
# and {TEXT} is a line of pinyin text.
#
# The output will have the same format except that any compounds
# in {TEXT} will be split into individual syllables.
#
# Punctuation, spaces, blank lines, and #-comments are preserved.
#
# Warns of ambiguous compounds like "eru", "shanu" "shangu".

import sys, os, re
from sys import stdout as out, stderr as err, stdin as inp

from process_funcs import bash, basic_line_loop
from error_funcs import file_line_error, arg_error
from note_077_funcs import compute_and_print_stats

def main():
    # Reads pinyin text lines from {stdin}, splits compounds into single
    # syllables, writes the result to {stdout}, and prints token counts
    # to {stderr}.

    file_name = "stdin"
    inp.reconfigure(encoding='utf-8')
    out.reconfigure(encoding='utf-8')

    # Parsing patterns:
    pn_cons = r"b-df-hj-np-tv-z"  # Pinyin consonants (char-class ranges).
    pn_vows = r"aeiouü" + r"āēīōūǖ" + r"àèìòùǜ" + r"áéíóúǘ" + r"ǎěǐǒǔǚ"
    # A syllable: optional consonants, one or more vowels, optional coda.
    # Note "ng" is tried before "n" so e.g. "shang" keeps its full coda.
    pat_word = f"[{pn_cons}]*[{pn_vows}]+(r|ng|n|)"
    pat_punc = r"[ ]*[.,;'()*][ ]*|[ ]+"
    pat_loc = r"[a-z0-9.]+"
    pat_line = f"<({pat_loc})>[ ]+(.*)"

    # Global counters:
    ndata = 0     # Count of data lines.
    nword_in = 0  # Total count of input pinyin words or compounds.
    nword_ot = 0  # Total count of pinyin syllables.
    npunc = 0     # Total count of punctuation/space tokens.

    def process_bencao_line(nline, line):
        nonlocal ndata, nword_in, nword_ot, npunc
        # Parses a line {line} assuming it is line {nline} of the file.
        # Outputs its word count. Increments
        # {ndata,nword_in,nword_ot,npunc}.
        #
        # The {line} is always a string (never {None}), but may be "" if
        # the line is empty.
        #
        # Ignores the line if it is blank or a #-comment.
        #
        # Otherwise the line must be a data line, matching
        #
        #   "^<{sec}.{sub}.{lseq}> +{text}$"
        #
        # where
        #
        #   {sec} is a section number, "s0" to "s2";
        #   {sub} is a subsection number, "1" to "9";
        #   {lseq} is a 3-digit integer sequential through the whole
        #     file, with gaps;
        #   {text} is a Pinyin text in UTF-8.
        #
        # NOTE(review): an earlier comment said the intro section "s0"
        # is ignored, but no such filtering appears in this function --
        # confirm whether it is done elsewhere.
        #
        # Increments {ndata} for each data line.
        # Writes to stdout one output line for each input data line.

        def data_error(msg):
            # Reports {msg} as an error at line {nline} of {file_name}.
            nonlocal file_name, nline, line
            file_line_error(file_name, nline, msg, line)
        # ....................................................................

        def parse_token(text):
            # Parses the next token (pinyin word, punctuation, or space)
            # in {text}. Returns the token parsed, the kind ('w' for
            # syllable, 'p' for punct or space) and the rest of the text.

            # Try punctuation or spaces first:
            m = re.match(pat_punc, text)
            if m is not None:
                tok = m.group(0)
                return tok, 'p', text[len(tok):]
            # Try to parse a Pinyin word:
            m = re.match(pat_word, text)
            if m is not None:
                tok = m.group(0)
                return tok, 'w', text[len(tok):]
            data_error(f"invalid char «{text[0:6]}...»")
            assert False  # Ne plus ultra.
        # ..................................................................

        assert line is not None, "The {line} arg must not be {None}"
        line = line.strip()
        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line):
            return
        m = re.fullmatch(pat_line, line)
        if m is None:
            # Invalid line format.
            data_error("invalid line format")
        else:
            ndata += 1
            # Parse the line into locus ID and text:
            loc = m.group(1)
            text = m.group(2)
            # Should we debug the line?
            debug = False
            # Cleanup and count tokens:
            text = text.strip()
            text = re.sub(r'[ ]+', ' ', text)
            toks_ot = []
            prev_kind = None
            while text != "":
                tok, kind, text = parse_token(text)
                if debug:
                    err.write(f"!! token = '{tok}' type = {kind}\n")
                if kind == 'w':
                    nword_ot += 1
                    # err.write(f"!! {tok}\n")
                    if prev_kind != 'w':
                        # New compound:
                        nword_in += 1
                elif kind == 'p':
                    npunc += 1
                else:
                    assert False, "bug"
                toks_ot.append(tok)
                prev_kind = kind
            text_ot = "".join(toks_ot)
            out.write(f"<{loc}> {text_ot}\n")
    # ......................................................................

    nread = basic_line_loop(inp, process_bencao_line)
    out.flush()
    err.write(f"{nread:5d} total lines\n")
    err.write(f"{ndata:5d} total data lines\n")
    err.write(f"{nword_in:5d} total words/compounds in\n")
    err.write(f"{nword_ot:5d} total syllables out\n")
    err.write(f"{npunc:5d} total punctuation and spaces\n")
    return
# ----------------------------------------------------------------------

if __name__ == "__main__":
    main()