#! /usr/bin/gawk -f
# Last edited on 2000-05-23 16:06:41 by stolfi

BEGIN {
  # Exit status requested by error(); negative means "no error so far".
  abort = -1;

  # Command-line synopsis (kept for reference; not used elsewhere in this file).
  usage = ( ARGV[0] " [ -v omods=BOOL ] < INFILE.elt > OUTFILE.wsp" );

  # Reads a stream of element-factored words, outputs a stream of words
  # with nested parentheses/brackets around mantle and core components.
  #
  # A "core" component is defined as a single element containing a
  # gallows letter ("k", "t", "p" or "f").  Each core component gets
  # surrounded by angle brackets "<>".
  #
  # A "mantle" component is a maximal consecutive sequence of mantle
  # elements.  A mantle element is one that contains no gallows
  # but contains one of the letters "ch", "sh", or "ee".
  # The mantle components are marked off with parentheses "()".
  #
  # NOTE: as implemented, each pair of parentheses encloses a maximal
  # run of consecutive core *and* mantle elements; the core elements
  # inside the run are additionally wrapped in "<>", e.g.
  # "{q}{o}{k}{ee}{d}{y}" becomes "{q}{o}(<{k}>{ee}){d}{y}".
  #
  # Optionally, if "omods" is 1:
  #   * any "o" element that is immediately preceded by a core
  #     element and immediately followed by a core or mantle
  #     element is considered part of the core;
  #   * any "o" element that is immediately preceded by a
  #     mantle element and immediately followed by a
  #     core or mantle element is considered part of the mantle.
  #
  # The "crust" elements are what remains of the word after
  # removing the mantle and core components.  They are not marked.
  #
  # The presence of [ic] pre-modifiers and/or [ch] post-modifiers in an
  # element does not affect its classification.  (The patterns below
  # accept any run of "i"/"c" before, and any run of "e"/"h" after,
  # the main letter of a core or mantle element.)
}

# Stop with the recorded status if an error has been flagged.  In this
# file "abort" is only set by error(), which also exits immediately.
(abort >= 0) { exit abort; }

# Blank (empty or spaces-only) input lines produce no output.
/^ *$/ { next; }

# Every other line is treated as one element-factored word.
/./ { print mark_word($0, omods); }

# Returns "word" with its core and mantle components marked.
#
#   word       one element-factored word, e.g. "{q}{o}{k}{ee}{d}{y}".
#   use_omods  if true, "o" elements sandwiched between core/mantle
#              elements are absorbed as described in the header comment.
#
# Calls error() (which terminates the program with status 1) if the
# word cannot be split into crust and core/mantle elements.
# The names after the extra spaces in the parameter list are locals.
function mark_word(word, use_omods,    rest, out, ok)
{
  # Delete dummy (empty) factors, then any stray "_" fillers:
  gsub(/[{][_]*[}]/, "", word);
  gsub(/[_]+/, "", word);

  # Map "ch", "sh", and "ee" elements to "C"/"S"/"E" to simplify processing.
  # "ee" is only mapped at the start of an element (after [ic] modifiers):
  gsub(/ch/, "C", word);
  gsub(/sh/, "S", word);
  word = gensub(/([{][ic]*)ee/, "\\1E", "g", word);

  # Split and bracket core+mantle segments.  Each pass must consume
  # a crust prefix, a core/mantle run, or both; otherwise we are stuck.
  out = "";
  rest = word;
  while (rest != "") {
    ok = 0;

    # Split off crust prefix, if any (may span several crust elements):
    if (match(rest, /^[{][^CSEktpf]*[}]/)) {
      out = ( out substr(rest, 1, RLENGTH) );
      rest = substr(rest, RLENGTH + 1);
      ok = 1;
    }

    # Split off a maximal run of mantle/core elements, if any.
    # RLENGTH is still the one set by the successful match in the test.
    if (match_core_mantle_element(rest)) {
      out = ( out "(" );
      do {
        out = ( out substr(rest, 1, RLENGTH) );
        rest = substr(rest, RLENGTH + 1);
      } while (match_core_mantle_element(rest));
      out = ( out ")" );
      ok = 1;
    }

    if (! ok) {
      # The word shown is the one after the C/S/E mapping above.
      error(("bad word \"" word "\", got stuck with \"" rest "\""));
    }
  }

  # Mark off any core elements in "out":
  out = gensub(/([{][ic]*[ktpf][he]*[}])/, "<\\1>", "g", out);

  if (use_omods) {
    # Incorporate into the mantle any "o" elements that should go there
    # (this merges the two parenthesized runs around the "o"):
    out = gensub(/[)]([{][o][e]*[}])[(]/, "\\1", "g", out);
    # Incorporate into the core any "o" elements that should go there.
    # Only "o"s merged by the previous step can directly follow a ">":
    out = gensub(/[>]([{][o][e]*[}])/, "\\1>", "g", out);
  }

  # Restore mantle letters:
  gsub(/C/, "ch", out);
  gsub(/S/, "sh", out);
  gsub(/E/, "ee", out);

  return out;
}

# Tests whether "x" begins with a core or mantle element (in the
# C/S/E-mapped alphabet).  Returns RSTART from match(): 1 on success,
# 0 otherwise.  As a side effect RSTART and RLENGTH are set, and
# callers use RLENGTH as the length of the matched element.
function match_core_mantle_element(x)
{
  match(x, /^[{][ic]*[CSEktpf][eh]*[}]/);
  return RSTART;
}

# Reports "msg" on standard error, prefixed with the current input
# line number, records the failure in "abort", and exits with status 1.
function error(msg)
{
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}