#! /usr/bin/gawk -f
# Last edited on 2001-01-02 04:10:49 by stolfi

BEGIN {
  abort = -1;
  usage = ( \
    "factor-word-oko \\\n" \
    " [ -v inField=NUM ] \\\n" \
    " [ -v erase=BOOL ] \\\n" \
    " [ -v outField=NUM ] \\\n" \
    " < INFILE > OUTFILE" \
  );

  # Factors the "inField"th field of INFILE into its OKOKOKO elements.
  # Assumes that the field is in EVA.  Ligature-capitalization is
  # ignored on input, and added to the output.
  #
  # If "erase" is set, the input field is erased, otherwise it is
  # preserved.  Then inserts the factored word as the "outField"th
  # field.
  #
  # Each QOKOKOKO element with its "I" and "E" complements
  # is wrapped with "{}".

  # Defaults: factor field 1, keep the input field, and insert the
  # result at the same position as the input field.
  if (inField == "") inField = 1;
  if (erase == "") erase = 0;
  if (outField == "") outField = inField;
}

# If an error was flagged earlier, stop with that status.
(abort >= 0) { exit abort; }

# Comment lines are copied through unchanged.
/^#/ { print; next; }

# Every other non-empty line: factor the selected field and print.
/./ {
  if (NF < inField) { data_error("not enough input fields"); }
  x = $(inField);
  x = uncapitalize_ligatures(x);
  y = factor_text_oko(x);
  y = capitalize_ligatures(y);
  printout(y, outField, inField, erase);
  next;
}

function factor_text_oko(x,   y,e)
{
  # Decomposes "x" into its OKOKOKO elements.
  # Assumes "x" is uncapitalized EVA without comments and fillers.
  # EVA spaces and "/" are allowed.  Output is uncapitalized and
  # has braces around each element.
  #
  # "y" and "e" are local work variables (result so far, current element).

  # Discard inline "{...}" comments and "!" fillers, then validate.
  gsub(/{[^{}]*}/, "", x);
  gsub(/[!]/, "", x);
  if (match(x, /[^-=\/,. *?%a-z]/))
    { data_error(("invalid char in word \"" x "\"")); }

  # Map "sh", "ch", and "ee" to single letters to simplify the parsing.
  # Note that "eee" groups are paired off from the left end.
  gsub(/ch/, "C", x);
  gsub(/sh/, "S", x);
  gsub(/ee/, "E", x);

  # Map platformed and half-platformed letters to capitals to simplify
  # the parsing:
  gsub(/ckh/, "K", x);
  gsub(/cth/, "T", x);
  gsub(/cfh/, "F", x);
  gsub(/cph/, "P", x);
  #
  gsub(/ikh/, "G", x);
  gsub(/ith/, "H", x);
  gsub(/ifh/, "M", x);
  gsub(/iph/, "N", x);
  #
  gsub(/ck/, "U", x);
  gsub(/ct/, "V", x);
  gsub(/cf/, "X", x);
  gsub(/cp/, "Y", x);

  y = "";
  while (x != "")
    {
      # printf "x = [%s]\n", x > "/dev/stderr";
      # NOTE(review): the reviewed copy had a line break inside the
      # bracket expression below, so a whitespace-like character may
      # have been lost from it -- confirm against a pristine copy.
      if (match(x, /^[-=\/,. ]+/))
        {
          # A run of separators is copied to the output without braces.
          e = substr(x,1,RLENGTH);
          x = substr(x, RLENGTH+1);
          y = ( y e );
        }
      else
        {
          # split off initial <q> if any:
          if (match(x, /^[q]/))
            {
              e = substr(x,1,RLENGTH);
              x = substr(x, RLENGTH+1);
              y = ( y "{" e "}");
            }
          # Peel off one element per iteration until a separator
          # (or the end of the string) is reached.
          while (1)
            {
              if (match(x, /^[aoy]/))
                {
                  # split off "[aoy]" group
                  e = substr(x,1,RLENGTH);
                  x = substr(x, RLENGTH+1);
                }
              else if (match(x, /^([i]+[dlrsxmgn]?|[^-,./=aoyehi][eh]?|[^-,./=aoy])/))
                {
                  # copy next main letter with "i" and "e" complements
                  e = substr(x,1,RLENGTH);
                  x = substr(x, RLENGTH+1);
                }
              else
                { break; }
              y = ( y "{" e "}");
            }
        }
    }

  # Unfold letter folding:
  gsub(/U/, "ck", y);
  gsub(/V/, "ct", y);
  gsub(/X/, "cf", y);
  gsub(/Y/, "cp", y);
  #
  gsub(/G/, "ikh", y);
  gsub(/H/, "ith", y);
  gsub(/M/, "ifh", y);
  gsub(/N/, "iph", y);
  #
  gsub(/K/, "ckh", y);
  gsub(/T/, "cth", y);
  gsub(/P/, "cph", y);
  gsub(/F/, "cfh", y);
  #
  gsub(/C/, "ch", y);
  gsub(/S/, "sh", y);
  gsub(/E/, "ee", y);
  return y;
}

function uncapitalize_ligatures(w)
{
  # Removes ligature-capitalization: returns "w" with the capitals
  # C S I H K T P F Y O A replaced by their lowercase forms.
  gsub(/C/, "c", w);
  gsub(/S/, "s", w);
  gsub(/I/, "i", w);
  gsub(/H/, "h", w);
  gsub(/K/, "k", w);
  gsub(/T/, "t", w);
  gsub(/P/, "p", w);
  gsub(/F/, "f", w);
  gsub(/Y/, "y", w);
  gsub(/O/, "o", w);
  gsub(/A/, "a", w);
  return w;
}

function capitalize_ligatures(w)
{
  # Capitalize ligatures: returns "w" with every letter of a ligature
  # except the last one in upper case (e.g. "ch" -> "Ch", "ckh" -> "CKh").
  # A "?" may stand in for a letter of a ligature.
  gsub(/ch/, "Ch", w);
  gsub(/sh/, "Sh", w);
  gsub(/ckh/, "CKh", w);
  gsub(/ikh/, "IKh", w);
  gsub(/[?]kh/, "?Kh", w);
  gsub(/cth/, "CTh", w);
  gsub(/ith/, "ITh", w);
  gsub(/[?]th/, "?Th", w);
  gsub(/cph/, "CPh", w);
  gsub(/iph/, "IPh", w);
  gsub(/[?]ph/, "?Ph", w);
  gsub(/cfh/, "CFh", w);
  gsub(/ifh/, "IFh", w);
  gsub(/[?]fh/, "?Fh", w);
  gsub(/c[?]h/, "C?h", w);
  gsub(/i[?]h/, "I?h", w);
  gsub(/c[?]/, "C?", w);
  gsub(/hh/, "Hh", w);
  return w;
}

function printout(mw, ofn, ifn, del,   i)
{
  # prints $0 with "mw" inserted as field "$(ofn)"
  # if "del" is true, deletes field "$(ifn)"
  #
  # NOTE(review): in the reviewed copy of this file the text between
  # the "<" of the loop condition below and the output redirection in
  # data_error was missing.  Everything from that loop condition to the
  # end of this function is a reconstruction based on the two comment
  # lines above -- confirm against a pristine copy.
  if (del)
    {
      if (NF < ifn)
        { data_error("not enough input fields\n"); }
      else
        {
          # Close the gap left by field "ifn", then drop the last field.
          for (i = ifn; i < NF; i++) { $(i) = $(i+1); }
          NF--;
        }
    }
  # Shift fields "ofn".."NF" one position right to make room for "mw".
  for (i = NF; i >= ofn; i--) { $(i+1) = $(i); }
  $(ofn) = mw;
  print;
}

function data_error(msg)
{
  # Reports an error in the input data on stderr and terminates with
  # status 1.
  #
  # NOTE(review): the header and message format of this function were
  # lost in the reviewed copy; they are reconstructed here by analogy
  # with arg_error, with the input line number added -- confirm.
  printf "line %d: %s\n", FNR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function arg_error(msg)
{
  # Reports a command-line argument error on stderr and terminates
  # with status 1.
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}