# Last edited on 2012-05-05 19:46:18 by stolfilocal

# To be included in factor-field-general etc.

# Factors a Chinese pinyin text with disambiguating suffixes
# by placing "{}" around each letter, plus a single "{}"
# around the tone and disambiguation suffix.
# If the tone is 4 it is omitted.  If the tone and suffix are
# omitted then the corresponding element is omitted too.
# Also parses the digraphs "yi" and "wu" as single letters.

# factor_text(x)
#
#   x   one pinyin syllable/word, optionally followed by a trailing
#       run of digits and periods of the form TONE[.SUFFIX],
#       e.g. "ma3", "nu:3", "yi1.2".
#
#   Returns {x} split into "{...}" elements, one per letter, where
#   "u:", "e^", "yi" and "wu" (in any letter case) each count as a
#   single letter, followed by one "{TONE[.SUFFIX]}" element when
#   there is a tone other than 4 or a non-empty suffix.  Examples:
#
#     "ma3"   -> "{m}{a}{3}"
#     "ma4"   -> "{m}{a}"
#     "nu:3"  -> "{n}{u:}{3}"
#     "Yi1"   -> "{Yi}{1}"
#
#   Calls data_error() (defined by the including program) when the
#   tone is not a single digit in 1..5 or when the letter part
#   contains anything other than the accepted pinyin letters.
#
#   The parameters after {x} are local variables ({e} is unused and
#   is kept only so that the parameter list stays unchanged).
#   Requires gawk, because of gensub().

function factor_text(x,   y,e,ts,t,s)
  {
    # Extract tone and disambiguating suffix.  The match starts at the
    # last character that is neither a digit nor a period, so everything
    # after RSTART is the "tone.suffix" tail:
    if (match(x, /[^0-9.][0-9.]*$/))
      { ts = substr(x, RSTART+1);
        x = substr(x, 1, RSTART);
        # Split the tail at its first period (this overwrites RSTART):
        if (match(ts, /[.]/))
          { t = substr(ts, 1, RSTART-1);
            s = substr(ts, RSTART+1);
          }
        else
          { t = ts; s = ""; }
      }
    else
      { # No character outside digits/periods (e.g. empty {x});
        # the pinyin check below rejects such input.
        ts = ""; t = ""; s = "";
      }

    # Format checks:
    if (t !~ /^[1-5]?$/)
      { data_error(("bad tone code \"" x ts "\"")); }
    if (x !~ /^([eE][\^]?|[uU][:]?|[a-df-tv-zA-DF-TV-Z]|ü|Ü|ê|Ê)+$/)
      { data_error(("bad pinyin \"" x ts "\"")); }

    # Omit tone 4 (the most common one):
    if (t == "4") { t = ""; }

    # Make each letter into one element,
    # but beware of the letters "u:" and "e^".
    # The digraph alternatives come first and accept either letter case
    # ("[yY]", not "[yy]", so that "Yi" is handled the same as "Wu"):
    y = gensub(/([yY][iI]|[wW][uU]|([a-zA-Z]|ü|Ü|ê|Ê)[:\^]?)/, "{\\1}", "g", x);

    # Append the tone/suffix element unless both parts are empty.
    # NOTE(review): the element is built from the original {ts}, so a
    # tone 4 that comes with a suffix (e.g. "ma4.1") is still emitted
    # as "{4.1}" -- confirm whether the "4" should be dropped there too.
    if ((t != "") || (s != ""))
      { y = ( y "{" ts "}" ); }
    return y;
  }