#! /usr/bin/python3
# Last edited on 2026-02-09 09:27:35 by stolfi

# Reads from {stdin} a file in pinyin (utf-8 encoding)
# possibly with joined compounds. Outputs to {stdout}
# same file with compounds split into single syllables.
#
# Data lines in the input must have the format "<{LOC}> {TEXT}"
# where {LOC} is a locus ID, which can be any string of [.a-z0-9];
# and {TEXT} is a line of pinyin text.
#
# The output will have the same format except that any compounds
# in {TEXT} will be split into individual syllables.
#
# Punctuation, spaces, blank lines, and #-comments are preserved.
#
# Warns of ambiguous compounds like "eru", "shanu" "shangu".

import sys, os, re
from sys import stdout as out, stderr as err, stdin as inp

from process_funcs import bash, basic_line_loop
from error_funcs import file_line_error, arg_error
from note_077_funcs import compute_and_print_stats

def main():
    # Reads pinyin text lines from {stdin}, splits compounds into single
    # syllables, writes the result to {stdout}, and prints token counts
    # to {stderr}.

    file_name = "stdin"
    inp.reconfigure(encoding='utf-8')
    out.reconfigure(encoding='utf-8')

    # Parsing patterns:
    pn_cons = r"b-df-hj-np-tv-z"  # Pinyin consonants (char-class ranges).
    pn_vows = r"aeiouü" + r"āēīōūǖ" + r"àèìòùǜ" + r"áéíóúǘ" + r"ǎěǐǒǔǚ"
    # A syllable: optional consonants, one or more vowels, optional coda.
    # Note "ng" is tried before "n" so e.g. "shang" keeps its full coda.
    pat_word = f"[{pn_cons}]*[{pn_vows}]+(r|ng|n|)"
    pat_punc = r"[ ]*[.,;'()*][ ]*|[ ]+"
    pat_loc = r"[a-z0-9.]+"
    pat_line = f"<({pat_loc})>[ ]+(.*)"

    # Global counters:
    ndata = 0     # Count of data lines.
    nword_in = 0  # Total count of input pinyin words or compounds.
    nword_ot = 0  # Total count of pinyin syllables.
    npunc = 0     # Total count of punctuation/space tokens.

    def process_bencao_line(nline, line):
        nonlocal ndata, nword_in, nword_ot, npunc
        # Parses a line {line} assuming it is line {nline} of the file.
        # Outputs its word count. Increments
        # {ndata,nword_in,nword_ot,npunc}.
        #
        # The {line} is always a string (never {None}), but may be "" if
        # the line is empty.
        #
        # Ignores the line if it is blank or a #-comment.
        #
        # Otherwise the line must be a data line, matching
        #
        #   "^<{sec}.{sub}.{lseq}> +{text}$"
        #
        # where
        #
        #   {sec} is a section number, "s0" to "s2";
        #   {sub} is a subsection number, "1" to "9";
        #   {lseq} is a 3-digit integer sequential through the whole
        #     file, with gaps;
        #   {text} is a Pinyin text in UTF-8.
        #
        # NOTE(review): an earlier comment said the intro section "s0"
        # is ignored, but no such filtering appears in this function --
        # confirm whether it is done elsewhere.
        #
        # Increments {ndata} for each data line.
        # Writes to stdout one output line for each input data line.

        def data_error(msg):
            # Reports {msg} as an error at line {nline} of {file_name}.
            nonlocal file_name, nline, line
            file_line_error(file_name, nline, msg, line)
        # ....................................................................

        def parse_token(text):
            # Parses the next token (pinyin word, punctuation, or space)
            # in {text}. Returns the token parsed, the kind ('w' for
            # syllable, 'p' for punct or space) and the rest of the text.

            # Try punctuation or spaces first:
            m = re.match(pat_punc, text)
            if m is not None:
                tok = m.group(0)
                return tok, 'p', text[len(tok):]
            # Try to parse a Pinyin word:
            m = re.match(pat_word, text)
            if m is not None:
                tok = m.group(0)
                return tok, 'w', text[len(tok):]
            data_error(f"invalid char «{text[0:6]}...»")
            assert False  # Ne plus ultra.
        # ..................................................................

        assert line is not None, "The {line} arg must not be {None}"
        line = line.strip()
        # Ignore comments and blank lines:
        if re.match(r" *([#]|$)", line):
            return
        m = re.fullmatch(pat_line, line)
        if m is None:
            # Invalid line format.
            data_error("invalid line format")
        else:
            ndata += 1
            # Parse the line into locus ID and text:
            loc = m.group(1)
            text = m.group(2)
            # Should we debug the line?
            debug = False
            # Cleanup and count tokens:
            text = text.strip()
            text = re.sub(r'[ ]+', ' ', text)
            toks_ot = []
            prev_kind = None
            while text != "":
                tok, kind, text = parse_token(text)
                if debug:
                    err.write(f"!! token = '{tok}' type = {kind}\n")
                if kind == 'w':
                    nword_ot += 1
                    # err.write(f"!! {tok}\n")
                    if prev_kind != 'w':
                        # New compound:
                        nword_in += 1
                elif kind == 'p':
                    npunc += 1
                else:
                    assert False, "bug"
                toks_ot.append(tok)
                prev_kind = kind
            text_ot = "".join(toks_ot)
            out.write(f"<{loc}> {text_ot}\n")
    # ......................................................................

    nread = basic_line_loop(inp, process_bencao_line)
    out.flush()
    err.write(f"{nread:5d} total lines\n")
    err.write(f"{ndata:5d} total data lines\n")
    err.write(f"{nword_in:5d} total words/compounds in\n")
    err.write(f"{nword_ot:5d} total syllables out\n")
    err.write(f"{npunc:5d} total punctuation and spaces\n")
    return
# ----------------------------------------------------------------------

if __name__ == "__main__":
    main()