#! /usr/bin/gawk -f
# Last edited on 2000-05-30 18:47:13 by stolfi

BEGIN {
  abort = -1;
  usage = ( ARGV[0] " -f get-component.gawk \\\n" \
    " -v select=CTAGS \\\n" \
    " < INFILE.fcp \\\n" \
    " > OUTFILE.pairs" \
  );

  # Reads a file of words factored into QOKOKOKO elements, with their
  # crust/mantle/core structure bracketed with "()<>" (see the script
  # "split-words").  Outputs a list of components selected by the
  # component tags CTAGS.  See pack_components for details.
  #
  # The following CTAGS values are treated specially.
  # They output zero or more pairs X-Y for each word, where
  # X and Y are given by the following table:
  #
  #   tag      X                    Y
  #   -------  -------------------  --------------------
  #   tc-y     type of component    coarse component.
  #   tf-z     type of component    fine component.
  #   tw-w     type of word         the word.
  #   k-w      number of "peaks"    the word.
  #
  # NOTE(review): the functions below call get_components(w), which is
  # not defined in this part of the file.  It is assumed to set the
  # globals maxlevel, crust, precrust, sufcrust, mantle, premantle,
  # sufmantle and core -- confirm against its definition.

  if (select == "") {
    # Show the synopsis too, so the user knows how to supply "select".
    printf "usage: %s\n", usage >> "/dev/stderr";
    error("must specify \"select\"");
  }
}

# Stop processing as soon as an error has been flagged.
(abort >= 0) { exit abort; }

# Skip blank lines.
/^ *$/ { next; }

# Every other line is one factored word: dispatch on "select".
/./ {
  if (select == "k-w") {
    d = pack_npeaks_word($0);
  } else if (select == "tc-y") {
    d = pack_type_and_coarse_comps($0);
  } else if (select == "tf-z") {
    d = pack_type_and_fine_comps($0);
  } else if (select == "tw-w") {
    d = pack_type_word($0);
  } else {
    d = pack_components($0, select, ".");
  }
  if (d != "") { print d; }
}

function error(msg)
{
  # Reports "msg" on stderr, prefixed with the current input line
  # number, sets the global "abort" flag and exits with status 1.
  printf "line %d: %s\n", NR, msg >> "/dev/stderr";
  abort = 1;
  exit 1;
}

function pack_components(w,tag,ety,   y,t)
{
  #
  # Assumes "w" is a simple word. Returns the component(s) of "w"
  # selected by the given "tag". Each letter of "tag" selects one
  # component, according to the following table:
  #
  #   p = crust prefix.
  #   m = mantle prefix
  #   c = core
  #   n = mantle suffix
  #   s = crust suffix
  #
  # The components selected by successive letters of the "tag" are
  # concatenated without separators in the output word.  The "<>" and
  # "()" delimiters are removed, but braces (if present) are retained.
  # Any "tag" character that is not one of the above is simply copied
  # to the output.  If the result is the empty string, "ety" is
  # returned instead.
  #
  # If a requested element is missing or empty in "w", it is simply
  # omitted from the output string.  On the other hand, if the
  # request is ambiguous, the procedure aborts and returns
  # the empty string.
  #
  # In particular, if "w" has no core, the codes "m" and "n" select
  # the entire mantle only if they occur next to each other in "tag",
  # or separated by "c".  Any other occurrences of "m" or "n" will
  # cause the procedure to abort.
  #
  # Analogously, if "w" has neither core nor mantle, the codes "p" and
  # "s" will select the entire word if they occur next to each other,
  # or separated by any of "m", "n", or "c"; otherwise the procedure
  # will abort.

  get_components(w);
  # debug_components(w);
  y = "";
  while (tag != "") {
    # Peel the next unambiguous group of tag letters off the front.
    t = pick_unambiguous_tag_group(tag, maxlevel);
    tag = substr(tag, length(t)+1);
    if (t == "") { return(""); }   # ambiguous request: give up
    t = reduce_tag_group(t, maxlevel);
    # printf "  tag group = %s ...", t;
    if (t == "") {
      y = y;
    } else if (t == "p") {
      y = ( y precrust );
    } else if (t == "ps") {
      y = ( y crust );
    } else if (t == "m") {
      y = ( y premantle );
    } else if (t == "mn") {
      y = ( y mantle );
    } else if (t == "c") {
      y = ( y core );
    } else if (t == "n") {
      y = ( y sufmantle );
    } else if (t == "s") {
      y = ( y sufcrust );
    } else {
      # Not a component code: copy the tag character(s) literally.
      y = ( y t );
    }
    # printf " y = [%s]\n", y;
  }
  return((y == "" ? ety : y));
}

function pick_unambiguous_tag_group(tag,maxlevel)
{
  # Returns the shortest non-empty prefix of "tag" that describes a
  # set of components that can be unambiguously identified in any word
  # with given "maxlevel".  If there is no such prefix, returns "".
  #
  # Thus, for example, "p" and "s" can be unambiguously identified if
  # "maxlevel != 1"; otherwise, one can only identify the
  # concatenation "p[mcn]*s".  Likewise, "m" and "n" can be
  # unambiguously identified only if "maxlevel != 2", otherwise one
  # can only identify the concatenation "mc*n".
  #
  # Both alternatives of each pattern are grouped under the "^" anchor:
  # alternation has the lowest precedence, so an ungrouped
  # /^A|B/ would let B match in the middle of "tag", while the code
  # below takes RLENGTH characters from position 1.

  if (tag == "") {
    return("");
  } else if (maxlevel == 1) {
    if (match(tag, /^([^ps]|[p][mcn]*[s])/)) {
      return(substr(tag, 1, RLENGTH));
    } else {
      return("");
    }
  } else if (maxlevel == 2) {
    if (match(tag, /^([^mn]|[m][c]*[n])/)) {
      return(substr(tag, 1, RLENGTH));
    } else {
      return("");
    }
  } else {
    return(substr(tag, 1, 1));
  }
}

function reduce_tag_group(t,maxlevel)
{
  # Given a minimal non-ambiguous tag group for the specified
  # maxlevel, reduces the group to the tags of non-empty
  # components: drops "c" when maxlevel < 3, "m"/"n" when
  # maxlevel < 2, and "p"/"s" when maxlevel < 1.

  if (maxlevel < 3) { gsub(/[c]/, "", t); }
  if (maxlevel < 2) { gsub(/[mn]/, "", t); }
  if (maxlevel < 1) { gsub(/[ps]/, "", t); }
  return(t);
}

function pack_type_and_coarse_comps(w,   y)
{
  # Splits "w" into its coarse components and returns them as
  # newline-separated "TYPE-COMPONENT" pairs, where TYPE is one of
  # "c", "pm", "ns" or "pmns".  Which pairs are produced depends on
  # the global "maxlevel" set for "w".

  get_components(w);
  # debug_components(w);
  if (maxlevel == 0) {
    y = "c-\npm-\nns-";
  } else if (maxlevel == 1) {
    y = sprintf("c-\npmns-%s", crust);
  } else if (maxlevel == 2) {
    y = sprintf("c-\npmns-%s%s%s", precrust,mantle,sufcrust);
  } else if (maxlevel == 3) {
    y = sprintf("c-%s\npm-%s%s\nns-%s%s",
      core, precrust, premantle, sufmantle, sufcrust);
  } else {
    error(("bad maxlevel = " maxlevel));
  }
  return(y);
}

function pack_type_and_fine_comps(w,   y)
{
  # Splits "w" into its fine components and returns them as
  # newline-separated "TYPE-COMPONENT" pairs, where TYPE is one of
  # "c", "m", "n", "mn", "p", "s" or "ps".  Which pairs are produced
  # depends on the global "maxlevel" set for "w".
  #
  # No branch ends with a newline: the caller's "print" supplies the
  # final one, so a trailing "\n" here would emit a blank line.

  get_components(w);
  # debug_components(w);
  if (maxlevel == 0) {
    y = "c-\nm-\nn-\np-\ns-";
  } else if (maxlevel == 1) {
    y = sprintf("m-\nn-\nc-\nps-%s", crust);
  } else if (maxlevel == 2) {
    y = sprintf("c-\nmn-%s\np-%s\ns-%s", mantle,precrust,sufcrust);
  } else if (maxlevel == 3) {
    y = sprintf("c-%s\nm-%s\nn-%s\np-%s\ns-%s",
      core, premantle, sufmantle, precrust, sufcrust);
  } else {
    error(("bad maxlevel = " maxlevel));
  }
  return(y);
}

function pack_type_word(w,   y)
{
  # Returns "TYPE-w", where TYPE lists the tags of the components
  # that are present in "w": a subset of "pmcns" when maxlevel is 3,
  # of "p", "mn", "s" when it is 2, "ps" or nothing when it is 1,
  # and nothing when it is 0.

  get_components(w);
  # debug_components(w);
  if (maxlevel == 0) {
    y = "";
  } else if (maxlevel == 1) {
    y = (crust == "" ? "" : "ps");
  } else if (maxlevel == 2) {
    y = ( \
      (precrust == "" ? "" : "p") \
      "mn" \
      (sufcrust == "" ? "" : "s") \
    );
  } else if (maxlevel == 3) {
    y = ( \
      (precrust == "" ? "" : "p") \
      (premantle == "" ? "" : "m") \
      "c" \
      (sufmantle == "" ? "" : "n") \
      (sufcrust == "" ? "" : "s") \
    );
  } else {
    error(("bad maxlevel = " maxlevel));
  }
  return ((y "-" w));
}

function pack_npeaks_word(w,   t)
{
  # Returns "K-w", where K is one plus the number of places in "w"
  # where a closing bracket (")" or ">") is followed, after any run of
  # non-bracket characters, by an opening bracket ("<" or "(").

  # printf "%s\n", w;
  # Count local minima of component bracketing:
  t = w;
  gsub(/[)>][^<>()]*[<(]/, "@", t);   # mark each close...open transition
  gsub(/[^@]/, "", t);                # keep only the marks
  return ((length(t)+1) "-" w);
}

function debug_components(w)
{
  # Prints "w" and its components, nested in braces according to
  # the global "maxlevel", on standard output.

  if (maxlevel == 0) {
    printf "%s = \n", w;
  } else if (maxlevel == 1) {
    printf "%s = {%s}\n", w, crust;
  } else if (maxlevel == 2) {
    printf "%s = {%s{%s}%s}\n", w, precrust,mantle,sufcrust;
  } else if (maxlevel == 3) {
    printf "%s = {%s{%s{%s}%s}%s}\n", w, \
      precrust,premantle,core,sufmantle,sufcrust;
  } else {
    error(("bad maxlevel = " maxlevel));
  }
}