#! /usr/bin/gawk -f
# Last edited on 1999-01-31 23:10:38 by stolfi

BEGIN {
  abort = -1;
  usage = ( "factor-line-OK < INFILE > OUTFILE" );

  # Factors each line of the INFILE into its OKOKOKO
  # elements, and writes the result to standard output.
  #
  # Each "Q" and "K" element with its "I" and "E" complements is
  # wrapped with "{}".  A dummy element "{ø}" is provided if the word
  # does not start with "q".  The O elements (runs of "a", "o", "y")
  # are left undelimited, but a "_" is written wherever an O element
  # is empty, including at the end of the word.  The output for a word
  # always starts with the "{q}" or "{ø}" element, so no "_" ever
  # precedes it.
  #
  # Every run of word separators ("-", "=", "/", ",", "." and ASCII
  # space) is replaced by a single "."; leading and trailing
  # separators are dropped.  The above rules are applied independently
  # to each word; there is no interference between adjacent words.
  #
  # Lines starting with "#" are copied unchanged.  Empty lines match
  # no rule and therefore produce no output.
}

# Stop immediately if an error was flagged during start-up.
(abort >= 0) { exit abort; }

# Comment lines pass through untouched.
/^#/ { print; next; }

# Any other non-empty line is factored.
/./ { print factor_text($0); next; }

function factor_text(x,    y,n,wds,i,w,e)
{
  # Decomposes the text line "x" into its OKOKOKO elements and returns
  # the factored string.  The factored words are joined by ".".
  #
  # Inline "{...}" comments and "!" characters are removed first.
  # "&NNN" codes (with optional ";") and the characters "%", "?", "*"
  # are all mapped to "?".  Any remaining character other than
  # lowercase letters, "?" and the word separators "-=/,. " is
  # reported through error(), which terminates the program.
  #
  # Parameters after "x" are local variables.

  gsub(/[{][^{}]*[}]/, "", x);
  gsub(/[!]/, "", x);
  gsub(/[&][0-9][0-9][0-9][;]?/, "?", x);
  gsub(/[%?*]/, "?", x);
  if (match(x, /[^-=\/,. *?%a-z]/))
    { error(("invalid char in word \"" x "\"")); }

  # Map "sh", "ch", and "ee" to single letters to simplify the parsing.
  # Note that "eee" groups are paired off from the left end.
  # Capital letters are safe as placeholders because the check above
  # has already rejected any capital in the input.
  gsub(/ch/, "C", x);
  gsub(/sh/, "S", x);
  gsub(/ee/, "E", x);

  # Map platformed and half-platformed letters to capitals to simplify the parsing:
  gsub(/ckh/, "K", x);
  gsub(/cth/, "T", x);
  gsub(/cfh/, "F", x);
  gsub(/cph/, "P", x);
  gsub(/c[?]h/, "?", x);
  #
  gsub(/ikh/, "G", x);
  gsub(/ith/, "H", x);
  gsub(/ifh/, "M", x);
  gsub(/iph/, "N", x);
  gsub(/i[?]h/, "?", x);
  #
  gsub(/ck/, "U", x);
  gsub(/ct/, "V", x);
  gsub(/cf/, "X", x);
  gsub(/cp/, "Y", x);
  gsub(/c[?]/, "?", x);

  y = "";

  # Split into words.  Leading and trailing separators are trimmed
  # first so that split() never yields an empty word; all three
  # regexes use the same separator class.
  gsub(/^[-=\/,. ]+/, "", x);
  gsub(/[-=\/,. ]+$/, "", x);
  n = split(x, wds, /[-=\/,. ]+/);

  for (i = 1; i <= n; i++)
    {
      w = wds[i];
      if (y != "") { y = ( y "." ); }

      # Split off the initial "q", if any; otherwise use the dummy element:
      if (match(w, /^[q]/))
        { e = substr(w, 1, RLENGTH);
          w = substr(w, RLENGTH+1);
        }
      else
        { e = "ø"; }
      y = ( y "{" e "}");

      while (1)
        {
          # Split off the "[aoy]" group, or mark it as empty with "_":
          if (match(w, /^[aoy]+/))
            { e = substr(w, 1, RLENGTH);
              w = substr(w, RLENGTH+1);
            }
          else
            { e = "_"; }
          y = ( y e );

          # The word always ends on an O slot (possibly "_").
          if (w == "") { break; }

          # Copy the next main letter with its "i" and "e" complements:
          if (match(w, /^([i]+[^aoyi]|[^aoyehi][eh]?|[^aoy])/))
            { e = substr(w, 1, RLENGTH);
              w = substr(w, RLENGTH+1);
            }
          else
            { error(("got stuck with \"" w "\"")); }
          y = ( y "{" e "}");
        }
    }

  # Unfold letter folding (restore the multi-letter groups):
  gsub(/U/, "ck", y);
  gsub(/V/, "ct", y);
  gsub(/X/, "cf", y);
  gsub(/Y/, "cp", y);
  #
  gsub(/G/, "ikh", y);
  gsub(/H/, "ith", y);
  gsub(/M/, "ifh", y);
  gsub(/N/, "iph", y);
  #
  gsub(/K/, "ckh", y);
  gsub(/T/, "cth", y);
  gsub(/P/, "cph", y);
  gsub(/F/, "cfh", y);
  #
  gsub(/C/, "ch", y);
  gsub(/S/, "sh", y);
  gsub(/E/, "ee", y);

  return y;
}

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current input record
  # number, flags the abort status, and terminates with exit status 1.
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function arg_error(msg)
{
  # Reports "msg" on stderr (without a record number), flags the
  # abort status, and terminates with exit status 1.
  printf "%s\n", msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}