# Last edited on 2000-05-31 01:19:07 by stolfi

function factor_text(x,eelump,chshsmash,ktsmash,hicsmash,hicsplit,esplit,    y,e)
{
  #
  # Decomposes the EVA text "x" into its QOKOKOKOK elements, which are
  # delimited by "{}".  Any empty elements are represented by "{_}".
  # Assumes "x" is uncapitalized EVA.  EVA spaces and "/" are allowed.
  # Returns the factored text.  ("y" and "e" are local variables.)
  #
  # If "eelump" is set, the group "ee" is treated as single letter
  # similar to "ch" and "sh", independently of "esplit".  (Any "eee"
  # groups are parsed as "ee" followed by "e".)
  #
  # If "chshsmash" is set, the combination "sh" is mapped to "ch".
  # (If "eelump" and "chshsmash" are both set, "ee" too is mapped to "ch").
  #
  # If "ktsmash" is set, the letter "t" is mapped to "k",
  # and "f" to "p".
  #
  # If "hicsmash" is set, the letters [hc] except in "ch" and "sh",
  # and the letter "i" before a gallows, are converted to "e".
  #
  # If "hicsplit" is set, the letters [hic] are treated as independent
  # K elements (except for "i" before [dlrsxvmngj], and the digraphs
  # "ch" and "sh").  In particular, platform gallows, like "cth", are
  # split into three elements, like "{c}_{t}_{h}".  Otherwise the
  # letters [ic] are parsed as element pre-modifiers and "h" as a
  # post-modifier, whenever possible.
  #
  # If "esplit" is set, the letter "e" is treated as a separate K
  # element, otherwise it is treated as an element post-modifier
  # whenever possible.  (However, lumped "ee"s remain lumped.)
  #
  # Invalid characters are reported through error(), which is defined
  # elsewhere in the program.

  # Delete non-significant comments and fillers:
  gsub(/{[^{}]*}/, "", x);
  gsub(/[! ]/, "", x);

  # Check for invalid letters:
  if (match(x, /[^-=\/,. *?%a-z]/))
    { error(("invalid char in word \"" x "\"")); }

  # Map "sh" "ch" to single letters to simplify the parsing.
  gsub(/ch/, "C", x);
  gsub(/sh/, "S", x);
  if (eelump) { gsub(/ee/, "E", x); }

  # All the "smash" mappings below must act on the local text "x";
  # applying them to $0 would leave the result unchanged and would
  # clobber the caller's current input record.

  # Map [hic] to "e" if the user asked for it:
  if (hicsmash)
    { x = gensub(/i([ktpf])/, "e\\1", "g", x);
      gsub(/[ch]/, "e", x);
    }

  # Map [tf] to [kp] if the user asked for it:
  if (ktsmash)
    { gsub(/[t]/, "k", x);
      gsub(/[f]/, "p", x);
    }

  # Map "sh" (and lumped "ee") to "ch" if the user asked for it:
  if (chshsmash)
    { gsub(/[SE]/, "C", x); }

  # Main loop - consumes words from "x" and appends results to "y".
  y = "";
  while (x != "")
    {
      # printf "x = [%s]\n", x > "/dev/stderr";
      # Copy punctuation if any.  "%" is copied here too: it passes the
      # validity check above but is excluded from every element pattern
      # in match_O/match_K, so without this the loop would never
      # consume it and would not terminate.
      if (match(x, /^[-=\/,. %]+/))
        { e = substr(x, 1, RLENGTH);
          x = substr(x, RLENGTH+1);
          y = ( y e );
        }
      else
        {
          # split off initial <q> if any:
          if (match(x, /^[q]/))
            { e = substr(x, 1, RLENGTH);
              x = substr(x, RLENGTH+1);
            }
          else
            { e = "_"; }
          y = ( y "{" e "}" );

          # Secondary loop - splits elements from "x", appends them to "y"
          while (1)
            {
              # split off "[aoy]" group with eventual [ci] prefix and [he] suffix
              if (match_O(x, hicsplit, esplit))
                { e = substr(x, 1, RLENGTH);
                  x = substr(x, RLENGTH+1);
                }
              else
                { e = "_"; }
              y = ( y "{" e "}" );

              # copy next main letter with [ci] prefix and [he] suffix;
              # the word ends when no such letter follows.
              if (match_K(x, hicsplit, esplit))
                { e = substr(x, 1, RLENGTH);
                  x = substr(x, RLENGTH+1);
                }
              else
                { break; }
              y = ( y "{" e "}" );
            }
        }
    }

  # Unfold letter folding:
  gsub(/C/, "ch", y);
  gsub(/S/, "sh", y);
  gsub(/E/, "ee", y);
  return y;
}

function match_O(x,hicsplit,esplit)
{
  # Tries to match an "O" element (an [aoy] letter with its allowed
  # modifiers) at the start of "x".  Which modifiers are absorbed
  # depends on "hicsplit" and "esplit".  Returns RSTART of the last
  # match() attempt: 1 on success, 0 on failure.  On success RLENGTH
  # holds the element's length, which the caller uses to split it off.
  if (esplit && hicsplit)
    { match(x, /^[aoy]/); }
  else if (hicsplit)
    { match(x, /^[aoy][e]*/); }
  else if (esplit)
    { if (! match(x, /^[i][aoy][h][h]*/))
        { match(x, /^[c]*[aoy][h]*/); }
    }
  else
    { if (! match(x, /^[i][aoy][h][h]*/))
        { match(x, /^[c]*[aoy][h]*[e]*/); }
    }
  return(RSTART);
}

function match_K(x,hicsplit,esplit)
{
  # Tries to match a "K" element at the start of "x".  A run of "i"s
  # followed by one of [dlrsxvnmgj] is always taken as a unit, whatever
  # the flags.  Otherwise the element is any single character that is
  # not punctuation, a filler or [aoy], with the modifiers allowed by
  # "hicsplit" and "esplit".  Returns RSTART of the last match()
  # attempt: 1 on success, 0 on failure.  On success RLENGTH holds the
  # element's length, which the caller uses to split it off.
  if (! match(x, /^[i]*[dlrsxvnmgj]/))
    { if (esplit && hicsplit)
        { match(x, /^[^-=\/,. %!aoy]/); }
      else if (hicsplit)
        { match(x, /^[^-=\/,. %!aoy][e]*/); }
      else if (esplit)
        { if (! match(x, /^[i][^-=\/,. %!aoy][h][h]*/))
            { match(x, /^[c]*[^-=\/,. %!aoy][h]*/); }
        }
      else
        { if (! match(x, /^[i][^-=\/,. %!aoy][h][h]*[e]*/))
            { match(x, /^[c]*[^-=\/,. %!aoy][h]*[e]*/); }
        }
    }
  return(RSTART);
}