#! /usr/bin/gawk -f
# Last edited on 2026-01-16 10:01:35 by stolfi

# To be included in gawk programs

function parse_word_into_elements(wd,  we,punct,ok,el) {
  # Consumes the word {wd} from left to right, one piece at a time, and
  # returns the concatenation of the pieces, where:
  #   * punctuation ([-,.«=»], "<%>" or "<$>") is copied through as-is;
  #   * every recognized element is emitted as "{" element "}";
  #   * a character that starts no recognized element is emitted
  #     alone as "!{" char "}".
  # Calls {prog_error} (defined elsewhere) if a match does not start
  # at position 1.  Leaves {RSTART} and {RLENGTH} modified.
  we = ""
  while (wd != "") {
    ok = 1; punct = 0

    # The tests below are tried strictly in this order; "||" stops at the
    # first successful {match}, so {RSTART,RLENGTH} describe that match.
    if (match(wd, /^[-,.«=»]/) || match(wd, /^[<][%$][>]/)) {
      punct = 1
    } else if (match(wd, /^[qoaydrl]/)) {
      # Single-letter element; take the match as it is.
    } else if (match(wd, /^[cs]h[e][e]/) || match(wd, /^ee[e][e]/)) {
      # @e ambiguity: keep only the two-letter head, so that the
      # trailing "ee..." is parsed as an element of its own.
      RSTART = 1; RLENGTH = 2
    } else if (match(wd, /^[kt][e][e]/)) {
      # @e ambiguity: keep only the gallows letter.
      RSTART = 1; RLENGTH = 1
    } else if (match(wd, /^[c][kt]h[e][e]/) || match(wd, /^[c][pfwz]h[e][e]/)) {
      # @e ambiguity: keep only the three-letter head.
      RSTART = 1; RLENGTH = 3
    } else if (match(wd, /^[cs]h[e]?/)           \
               || match(wd, /^ee[e]?/)           \
               || match(wd, /^[kt][e]?/)         \
               || match(wd, /^[pfwz]/)           \
               || match(wd, /^[c][kt]h[e]?/)     \
               || match(wd, /^[c][pfwz]h[e]?/)   \
               || match(wd, /^[s]/)              \
               || match(wd, /^[i]?[i]?[i]?n/)    \
               || match(wd, /^[i]?[i]?m/)        \
               || match(wd, /^[i][i]?[r]/)) {
      # Non-ambiguous element; take the match as it is.
    } else {
      # Nothing matched: peel off one character and flag it as bad.
      ok = 0; RSTART = 1; RLENGTH = 1
    }

    if (RSTART != 1) { prog_error("RSTART") }

    # Split off the piece just recognized.
    el = substr(wd, RSTART, RLENGTH)
    wd = substr(wd, RSTART + RLENGTH)

    if (! ok) {
      we = ( we "!{" el "}" )
    } else if (punct) {
      we = ( we el )
    } else {
      we = ( we "{" el "}" )
    }
  }
  return we
}

function parse_elword_into_okoko_pats(we,  wo,wr,omax_seq,omax_tot,onum_seq,onum_tot,class,ok,el) {
  # Assumes {we} is parsed into elements surrounded by "{}"
  # (if valid) or "[[]]" (if invalid).  Parses it according to the
  # OKOKO submodel.
  #
  # Punctuations [-,.«=»] and "<[%$]>" are passed through (as "." in
  # the class string), and each word they delimit is processed
  # independently.
  #
  # Element classes:
  #   "O" = one of "{a}", "{o}", "{y}";
  #   "K" = any other "{...}" made only of letters [cdefhiklmnpqrstwz];
  #   "*" = anything else (bad element).
  #
  # Returns "{wo}:{wr}" where {wo} is a string of [OK.] and {wr} is the
  # element string.  Bad elements are replaced by "*" in {wo} and are
  # surrounded by "[[]]" in {wr}.  Wherever the parsing fails (bad
  # element, or too many "O"s), a "!" is inserted in both {wo} and {wr}
  # before the offending element.
  #
  # Calls {prog_error} (defined elsewhere) on internal inconsistencies.
  #
  # All working variables are declared as locals in the parameter list,
  # so that the function does not leak the "O" counters and limits
  # into the global namespace.

  wo = ""; wr = ""
  omax_seq = 2 # Max number of consecutive "O"s.
  omax_tot = 3 # Max total number of "O"s in a word.
  onum_seq = 0; onum_tot = 0
  while (we != "") {
    ok = 1; class = "*"
    if (match(we, /^[-,.«=»]/)) { class = "." }
    else if (match(we, /^[<][%$][>]/)) { class = "." }
    else if (match(we, /^[{][aoy][}]/)) { class = "O" }
    else if (match(we, /^[{][cdefhiklmnpqrstwz]+[}]/)) { class = "K" }
    else if (match(we, /^[{][^{}]*[}]/)) { ok = 0 }
    # Brackets are excluded from the contents of a "[[...]]" element so
    # that the longest match cannot run on into a later "[[...]]" and
    # swallow whatever lies between them (including punctuation):
    else if (match(we, /^[[][[][^][¡!{}]*[]][]]/)) { ok = 0 }
    else { ok = 0; RSTART = 1; RLENGTH = 1 }

    if (RSTART != 1) { prog_error("RSTART") }
    el = substr(we, RSTART, RLENGTH)
    we = substr(we, RSTART + RLENGTH)

    # Rules for number of "O"s:
    if (class == "O") {
      # Test with "==" instead of ">=" to avoid excessive "!"s:
      if ((onum_seq == omax_seq) || (onum_tot == omax_tot)) { ok = 0 }
      onum_seq += 1
      onum_tot += 1
    } else {
      onum_seq = 0
    }

    if (ok) {
      # At this point {class} can only be ".", "O", or "K".
      if ((class != ".") && (class != "O") && (class != "K")) { prog_error("CLASS") }
      wo = ( wo class )
      wr = ( wr el )
      if (class == ".") {
        # Starting new word -- reset local and total "O" counters:
        onum_seq = 0; onum_tot = 0
      }
    } else {
      if (class == "*") {
        # Normalize the bad element to exactly one pair of "[[ ]]":
        gsub(/[][]/, "", el); el = ( "[[" el "]]" )
      }
      wo = ( wo "!" class )
      wr = ( wr "!" el "" )
      # Reset "O" counters as if started new word:
      onum_seq = 0; onum_tot = 0
    }
  }
  return ( wo ":" wr )
}

function parse_elword_into_cmc_pats(we,  wa,wr,state,dmax,dnum,xmax,xnum,class,ok,el) {
  # Assumes {we} is parsed into elements surrounded by "{}"
  # (if valid), or "[[]]" (if invalid).  Drops the "O" elements
  # ("{o}", "{a}", "{y}") from the class string, maps the others to
  # classes "Q", "D", "X", "G", "H", "N", and checks if the sequence
  # fits the CMC model.
  #
  # Punctuations [-,.«=»] and "<[%$]>" are passed through (as "." in
  # the class string), and each word they delimit is processed
  # independently.
  #
  # Returns "{wa}:{wr}" where {wa} is a string of [QDXGHN.] and {wr}
  # is the element string (which still includes the "O" elements).
  # Anything that is not punctuation or one of those elements is
  # mapped to "*" in {wa} and wrapped in "[[]]" in {wr}.  If the
  # parsing fails, inserts "!" in {wa} and {wr} where the parsing failed.
  #
  # Invalid elements are recognized on input both in the documented
  # "[[...]]" form and in the "¡...!" form, which is kept for
  # backward compatibility.
  #
  # Calls {prog_error} (defined elsewhere) on internal inconsistencies.
  #
  # All working variables, including {xmax} and {xnum}, are declared as
  # locals in the parameter list so that nothing leaks into the
  # global namespace.

  wa = ""; wr = ""
  # state is -1 = initial, 0 = after prefix "Q", 1 = after prefix "D", 2 = after prefix "X",
  # 3 = after the "G" or "H", 4 = after suffix "X", 5 = after one or more suffix "D"
  # 6 = after "N".
  state = -1
  dmax = 3  # Max number of "QDN" elements.
  xmax = 2  # Max number of "XH" elements.
  dnum = 0; xnum = 0
  while (we != "") {
    ok = 1; class = "*"
    if (match(we, /^[-,.«=»]/)) { class = "." }
    else if (match(we, /^[<][%$][>]/)) { class = "." }
    else if (match(we, /^[{][oay][}]/)) { class = "O" }
    else if (match(we, /^[{][q][}]/)) { class = "Q" }
    else if (match(we, /^[{][dlrs][}]/)) { class = "D" }
    else if (match(we, /^[{]([cs]h|ee)[e]?[}]/)) { class = "X" }
    else if (match(we, /^[{][ktpfwz][e]?[}]/)) { class = "G" }
    # NOTE(review): "[eh]?" also accepts a trailing "h" (e.g. "{ckhh}");
    # possibly meant to be "[e]?" -- confirm before changing.
    else if (match(we, /^[{]c[ktpfwz]h[eh]?[}]/)) { class = "H" }
    else if (match(we, /^[{][i]*[nm][}]/)) { class = "N" }
    else if (match(we, /^[{][i]+r[}]/)) { class = "N" }
    else if (match(we, /^[{][^{}]*[}]/)) { ok = 0 }
    # Invalid element in the documented "[[...]]" form.  Brackets are
    # excluded from the contents so that the longest match cannot run
    # on into a later "[[...]]" element:
    else if (match(we, /^[[][[][^][¡!{}]*[]][]]/)) { ok = 0 }
    # Invalid element in the "¡...!" form:
    else if (match(we, /^[¡][^¡!{}]*[!]/)) { ok = 0 }
    else { ok = 0; RSTART = 1; RLENGTH = 1 }

    if (RSTART != 1) { prog_error("RSTART") }
    el = substr(we, RSTART, RLENGTH)
    we = substr(we, RSTART + RLENGTH)

    if (ok) {
      if (class == "O") {
        # Ignored by the model: contributes nothing to {wa}.
        class = ""
      } else if (class == ".") {
        # Word boundary: restart the automaton and the counters.
        state = -1; dnum = 0; xnum = 0
      } else if (class == "Q") {
        # Test with "==" so that only the first excess element is flagged.
        if (dnum == dmax) { ok = 0 }
        dnum += 1
        # "Q" is only valid as the very first element of a word:
        if (state < 0) { state = 0 } else { ok = 0 }
      } else if (class == "D") {
        if (dnum == dmax) { ok = 0 }
        dnum += 1
        if (state <= 1) {
          # Prefix "D" (possibly repeated).
          state = 1
        } else if (state <= 5) {
          # Suffix "D" (possibly repeated).
          state = 5
        } else {
          ok = 0
        }
      } else if (class == "X") {
        if (xnum == xmax) { ok = 0 }
        xnum += 1
        if (state <= 2) {
          # Prefix "X" (possibly repeated).
          state = 2
        } else if (state <= 4) {
          # Suffix "X" (possibly repeated).
          state = 4
        } else {
          ok = 0
        }
      } else if (class == "G") {
        # "G" does not count towards {xmax}.
        if (state < 3) { state = 3 } else { ok = 0 }
      } else if (class == "H") {
        if (xnum == xmax) { ok = 0 }
        xnum += 1
        if (state < 3) { state = 3 } else { ok = 0 }
      } else if (class == "N") {
        if (dnum == dmax) { ok = 0 }
        dnum += 1
        if (state < 6) { state = 6 } else { ok = 0 }
      } else {
        prog_error("CLASS")
      }
    }

    if (ok) {
      wa = ( wa class )
      wr = ( wr el )
    } else {
      # {class} is emptied only for accepted "O" elements, so it
      # cannot be empty on the failure path:
      if (class == "") { prog_error("EMPTY CLASS") }
      if (class == "*") {
        # Normalize the bad element to exactly one pair of "[[ ]]":
        gsub(/[][]/, "", el); el = ( "[[" el "]]" )
      }
      wa = ( wa "!" class )
      wr = ( wr "!" el )
    }
  }
  return ( wa ":" wr )
}