#! /usr/bin/gawk -f
# Last edited on 2026-01-16 10:01:35 by stolfi

# To be included in gawk programs.
#
# The functions below call {prog_error}, which is not defined here and
# must be supplied by the including program.

function parse_word_into_elements(wd,    we,punct,ok,el)
{
  # Splits the word {wd}, left to right, into elements and returns the
  # result as a single string.
  #
  # Each recognized element is wrapped in "{}".  Punctuation characters
  # [-,.«=»] and the markers "<%>" and "<$>" are copied through
  # unwrapped.  A character that starts no recognized element is
  # emitted alone as "!{c}".
  #
  # An empty {wd} yields the empty string.  The remaining parameters
  # are local work variables; callers pass only {wd}.

  we = ""
  while (wd != "") {
    ok = 1; punct = 0

    if (match(wd, /^[-,.«=»]/)) {
      punct = 1
    } else if (match(wd, /^[<][%$][>]/)) {
      punct = 1
    } else if (match(wd, /^[qoaydrl]/)) {
      # Single-letter element; {RSTART,RLENGTH} already set by {match}.
    }
    # Resolving @e ambiguities: when the head is followed by two or
    # more "e"s, the head is taken WITHOUT any "e", so that the "e"s
    # are parsed on their own in the next iteration.
    else if (match(wd, /^[cs]h[e][e]/)) {
      RSTART = 1; RLENGTH = 2
    } else if (match(wd, /^ee[e][e]/)) {
      RSTART = 1; RLENGTH = 2
    } else if (match(wd, /^[kt][e][e]/)) {
      RSTART = 1; RLENGTH = 1
    } else if (match(wd, /^[c][kt]h[e][e]/)) {
      RSTART = 1; RLENGTH = 3
    } else if (match(wd, /^[c][pfwz]h[e][e]/)) {
      RSTART = 1; RLENGTH = 3
    }
    # Non-ambiguous cases: at most one trailing "e" is absorbed.
    else if (match(wd, /^[cs]h[e]?/)) {
    } else if (match(wd, /^ee[e]?/)) {
    } else if (match(wd, /^[kt][e]?/)) {
    } else if (match(wd, /^[pfwz]/)) {
    } else if (match(wd, /^[c][kt]h[e]?/)) {
    } else if (match(wd, /^[c][pfwz]h[e]?/)) {
    } else if (match(wd, /^[s]/)) {
    } else if (match(wd, /^[i]?[i]?[i]?n/)) {
    } else if (match(wd, /^[i]?[i]?m/)) {
    } else if (match(wd, /^[i][i]?[r]/)) {
    } else {
      # Nothing matched: consume one character and flag it as invalid.
      ok = 0; RSTART = 1; RLENGTH = 1
    }

    # All patterns are anchored, so a match can only start at 1.
    if (RSTART != 1) { prog_error("RSTART") }

    el = substr(wd, RSTART, RLENGTH)
    wd = substr(wd, RSTART + RLENGTH)

    if (! ok) {
      we = ( we "!{" el "}" )
    } else if (punct) {
      we = ( we el )
    } else {
      we = ( we "{" el "}" )
    }
  }
  return we
}

function parse_elword_into_okoko_pats(we,    wo,wr,omax_seq,omax_tot,onum_seq,onum_tot,class,ok,el)
{
  # Assumes {we} is a string of elements, each surrounded by "{}"
  # (if valid) or by "[[]]" (if invalid).  Parses it according to the
  # OKOKO submodel.
  #
  # Punctuations [-,.«=»] and "<[%$]>" are passed through as class ".",
  # and each word they delimit is processed independently.
  #
  # Returns "{wo}:{wr}", where {wo} is a string over [OK.] giving the
  # class of each token and {wr} is the corresponding token string.
  # Elements "{a}", "{o}", "{y}" are class "O"; elements made only of
  # letters [cdefhiklmnpqrstwz] are class "K".  Anything else is
  # class "*", and is re-wrapped as "[[...]]" in {wr} (after removing
  # any "[" and "]" it contained).  Whenever a token is rejected --
  # either because it is class "*" or because it exceeds the "O"
  # limits -- a "!" is inserted before it in both {wo} and {wr}.
  #
  # The parameters after {we} are local work variables; callers pass
  # only {we}.  (The "O" counters and limits used to be accidental
  # globals; they are now proper locals.)

  wo = ""; wr = ""
  omax_seq = 2  # Max number of consecutive "O"s.
  omax_tot = 3  # Max total number of "O"s in a word.
  onum_seq = 0; onum_tot = 0

  while (we != "") {
    ok = 1; class = "*"

    if (match(we, /^[-,.«=»]/)) {
      class = "."
    } else if (match(we, /^[<][%$][>]/)) {
      class = "."
    } else if (match(we, /^[{][aoy][}]/)) {
      class = "O"
    } else if (match(we, /^[{][cdefhiklmnpqrstwz]+[}]/)) {
      class = "K"
    } else if (match(we, /^[{][^{}]*[}]/)) {
      # Braced element with unexpected contents.
      ok = 0
    } else if (match(we, /^[[][[][^¡!{}]*[]][]]/)) {
      # Element already marked invalid with "[[...]]".
      ok = 0
    } else {
      # Stray character: consume it alone.
      ok = 0; RSTART = 1; RLENGTH = 1
    }

    # All patterns are anchored, so a match can only start at 1.
    if (RSTART != 1) { prog_error("RSTART") }

    el = substr(we, RSTART, RLENGTH)
    we = substr(we, RSTART + RLENGTH)

    # Rules for number of "O"s:
    if (class == "O") {
      # Test with "==" instead of ">=" to avoid excessive "!"s:
      if ((onum_seq == omax_seq) || (onum_tot == omax_tot)) { ok = 0 }
      onum_seq += 1
      onum_tot += 1
    } else {
      onum_seq = 0
    }

    if (ok) {
      if (class == ".") {
        # Starting new word -- reset local and total "O" counters:
        onum_seq = 0; onum_tot = 0
      } else if (! match(class, /^[OK]$/)) {
        prog_error("CLASS")
      }
      wo = ( wo class )
      wr = ( wr el )
    } else {
      if (class == "*") { gsub(/[][]/, "", el); el = ( "[[" el "]]" ) }
      wo = ( wo "!" class )
      wr = ( wr "!" el "" )
      # Reset "O" counters as if started new word:
      onum_seq = 0; onum_tot = 0
    }
  }
  return ( wo ":" wr )
}

function parse_elword_into_cmc_pats(we,    wa,wr,state,dmax,xmax,dnum,xnum,class,ok,el)
{
  # Assumes {we} is a string of elements, each surrounded by "{}"
  # (if valid) or by "[[]]" (if invalid).  Maps the elements to the
  # classes "Q", "D", "X", "G", "H", "N" and checks whether each word
  # fits the CMC model.  The "O" elements ("{o}", "{a}", "{y}") are
  # ignored by the model: they are copied to {wr} but contribute
  # nothing to {wa}.
  #
  # Punctuations [-,.«=»] and "<[%$]>" are passed through as ".", and
  # each word they delimit is processed independently.
  #
  # Returns "{wa}:{wr}" where {wa} is a string over [QDXGHN.] and {wr}
  # is the corresponding token string.  Anything that is not
  # punctuation or one of those elements is mapped to "*" in {wa} and
  # re-wrapped as "[[...]]" in {wr} (after removing any "[" and "]" it
  # contained).  Where the parsing fails, a "!" is inserted before the
  # offending token in both {wa} and {wr}.
  #
  # Invalid elements are recognized in the "[[...]]" form (same as
  # {parse_elword_into_okoko_pats}) and also in the older "¡...!" form.
  #
  # The parameters after {we} are local work variables; callers pass
  # only {we}.  (The "X" counter and limit used to be accidental
  # globals; they are now proper locals.)

  wa = ""; wr = ""

  # {state} is -1 = initial, 0 = after prefix "Q", 1 = after prefix "D",
  # 2 = after prefix "X", 3 = after the "G" or "H", 4 = after suffix "X",
  # 5 = after one or more suffix "D", 6 = after "N".
  state = -1
  dmax = 3  # Max number of "QDN" elements in a word.
  xmax = 2  # Max number of "XH" elements in a word ("G" is not counted).
  dnum = 0; xnum = 0

  while (we != "") {
    ok = 1; class = "*"

    if (match(we, /^[-,.«=»]/)) {
      class = "."
    } else if (match(we, /^[<][%$][>]/)) {
      class = "."
    } else if (match(we, /^[{][oay][}]/)) {
      class = "O"
    } else if (match(we, /^[{][q][}]/)) {
      class = "Q"
    } else if (match(we, /^[{][dlrs][}]/)) {
      class = "D"
    } else if (match(we, /^[{]([cs]h|ee)[e]?[}]/)) {
      class = "X"
    } else if (match(we, /^[{][ktpfwz][e]?[}]/)) {
      class = "G"
    } else if (match(we, /^[{]c[ktpfwz]h[eh]?[}]/)) {
      # NOTE(review): "[eh]?" also accepts a trailing "h"; possibly
      # meant "[e]?" -- confirm before changing.
      class = "H"
    } else if (match(we, /^[{][i]*[nm][}]/)) {
      class = "N"
    } else if (match(we, /^[{][i]+r[}]/)) {
      class = "N"
    } else if (match(we, /^[{][^{}]*[}]/)) {
      # Braced element with unexpected contents.
      ok = 0
    } else if (match(we, /^[[][[][^¡!{}]*[]][]]/)) {
      # Element already marked invalid with "[[...]]".
      ok = 0
    } else if (match(we, /^[¡][^¡!{}]*[!]/)) {
      # Element marked invalid with the older "¡...!" wrapping.
      ok = 0
    } else {
      # Stray character: consume it alone.
      ok = 0; RSTART = 1; RLENGTH = 1
    }

    # All patterns are anchored, so a match can only start at 1.
    if (RSTART != 1) { prog_error("RSTART") }

    el = substr(we, RSTART, RLENGTH)
    we = substr(we, RSTART + RLENGTH)

    if (ok) {
      # Counter tests use "==" rather than ">=" so that only the first
      # element past the limit is flagged.
      if (class == "O") {
        # Ignored by the model: emit the element but no class letter.
        class = ""
      } else if (class == ".") {
        # Starting new word -- reset state and counters.
        state = -1; dnum = 0; xnum = 0
      } else if (class == "Q") {
        if (dnum == dmax) { ok = 0 }
        dnum += 1
        # "Q" is valid only as the very first model element.
        if (state < 0) { state = 0 } else { ok = 0 }
      } else if (class == "D") {
        if (dnum == dmax) { ok = 0 }
        dnum += 1
        # Prefix "D"s may repeat before the core; suffix "D"s after it.
        if (state <= 1) {
          state = 1
        } else if (state <= 5) {
          state = 5
        } else {
          ok = 0
        }
      } else if (class == "X") {
        if (xnum == xmax) { ok = 0 }
        xnum += 1
        # Prefix "X"s may repeat before the core; suffix "X"s after it.
        if (state <= 2) {
          state = 2
        } else if (state <= 4) {
          state = 4
        } else {
          ok = 0
        }
      } else if (class == "G") {
        if (state < 3) { state = 3 } else { ok = 0 }
      } else if (class == "H") {
        if (xnum == xmax) { ok = 0 }
        xnum += 1
        if (state < 3) { state = 3 } else { ok = 0 }
      } else if (class == "N") {
        if (dnum == dmax) { ok = 0 }
        dnum += 1
        # "N" may appear once, and nothing may follow it.
        if (state < 6) { state = 6 } else { ok = 0 }
      } else {
        prog_error("CLASS")
      }
    }

    if (ok) {
      wa = ( wa class )
      wr = ( wr el )
    } else {
      if (class == "") { prog_error("EMPTY CLASS") }
      if (class == "*") { gsub(/[][]/, "", el); el = ( "[[" el "]]" ) }
      wa = ( wa "!" class )
      wr = ( wr "!" el )
    }
  }
  return ( wa ":" wr )
}