#! /usr/bin/gawk -f
# Last edited on 2008-06-18 00:49:40 by stolfi

BEGIN {
  USAGE = ( \
    "extract_RefSeq_ranges \\\n" \
    "  [<] {INFILE}.gbk" \
  );

  # Reads from standard input an NCBI RefSeq genomic cDNA file in the
  # full GenBank flatfile format. Extracts from the FEATURES section
  # the nucleotide labeling information that is relevant to
  # coding/non-coding analysis.
   
  # INPUTS
   
  # The format of the RefSeq files appears to be similar or identical
  # to the EMBL format described in
  #    
  #    http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
        
  # OUTPUTS
   
  # The output begins with a perfunctory header with the format
  #   
  #   "> {ITEMVERS} /ini={SRCINI} /fin={SRCFIN} /circular={CIRC}"
  #   
  # Where {ITEMVERS} is the item's name, including the ".{V}" version
  # suffix, as specified in the input file's "VERSION" line;
  # {SRCINI,SRCFIN} are the range of valid positions specified in the
  # "source" feature (usually 1 and {N}, where {N} is the number of
  # base pairs in the input file); and {CIRC} is 0 if the
  # molecule is linear (open), 1 if circular (closed).
   
  # The header is followed by a list of /labeling ranges/,
  # one per line. Each line assigns functional labels for one
  # continuous range of positions on one of the two strands. The
  # format of each line is "{STRAND} {INI} {FIN} {STATUS} {LABELS}" where
  #   
  #   {STRAND} is "+1" for the primal strand, "-1" for the dual strand.
  #   
  #   {INI}    is the lowest-numbered position in the range.
  # 
  #   {FIN}    is the highest-numbered position in the range.
  #
  #   {STATUS} is a hint about the reliability of this feature.
  #   
  #   {LABELS} is the string of label letters assigned to those positions.
  #   
  # The /primal strand/ is the one whose nucleotide sequence is given
  # in the file; the /dual/ strand is the other one, which has the
  # complementary bases in the reverse sequence.
   
  # A /position/ is a positive integer that designates a base pair in
  # the double-stranded cDNA described in the input file. Positions
  # are always relative to the beginning of the primal strand, even
  # when the range refers to nucleotides on the dual strand, and every
  # range satisfies {INI <= FIN}. Moreover, {INI} and {FIN} are always
  # within the range {SRCINI..SRCFIN}.
   
  # The letters of the {LABELS} string are assigned
  # to the positions {INI..FIN} in the order that matches
  # the normal direction of DNA->RNA transcription: that is,
  # from {INI} up to {FIN} if {STRAND == +1} and from {FIN} down to {INI}
  # if {STRAND == -1}.  The {LABELS} string is implicitly
  # replicated or truncated as needed to fill the range. So, for
  # example (omitting the {STATUS} field for brevity), the output
  # line "+1 200 204 DEF" assigns letters
  # "D","E","F","D","E" to the primal-strand bases at positions
  # 200,201,202,203,204, respectively; while "-1 200 204 DEF" assigns
  # the same labels to the dual-strand bases at positions 
  # 204,203,202,201,200, respectively.
   
  # In the case of circular strands, any range {A..B} that may appear
  # in the input file with {A > B} is assumed to span the bond between
  # positions {SRCFIN} and {SRCINI}. Such a range will be split into
  # two separate ranges {A..SRCFIN} and {SRCINI..B}. (Hopefully no
  # such range will cover the same base more than once!)
  
  # LABEL LETTERS
   
  # The label letters record the function of the corresponding
  # nucleotide, as inferred from the "FEATURES" section of the input
  # file. Each label may be:
  #   
  #      'D', 'E', 'F'      The three codon positions in coding regions.
  #      'N'                Base that gets transcribed to RNA.
  #      'O'                Gap in the input data.
  #      'Z'                Unknown function.
  #       
  # The coding labels [DEF] come from "CDS" features. The 'N' labels
  # come from transcription features ("mRNA", "misc_RNA", "ncRNA",
  # "rRNA", "tRNA"). The label 'O' comes from "gap" features.
  # The label 'Z' is assigned to all positions in the "source"
  # range.  
  
  # Note that the same position may get labeled several times.
  # Every base on both strands gets a 'Z' label, and every base
  # that gets a [DEF] label should also get an 'N' label.
  
  # RELIABILITY STATUS
  
  # The {STATUS} field tries to summarize the reliability of the 
  # feature, as inferred by this program from the "/note" and
  # other qualifiers.  It is a digit that hopefully means
  #
  #   0  The feature is bogus, erroneous, or irrelevant.
  #   
  #   2  The feature was detected by a program only.
  #   
  #   4  This program was unable to determine the feature's reliability.
  #   
  #   6  The feature has some experimental evidence.
  #   
  #   8  The feature has been fully confirmed experimentally.
  #   
  #   9  The feature must be taken as is.
  #   
  # This program may generate features with status 0 internally, but
  # should never write them out. Status 9 is used for features like
  # "source" and "gap" that, even if they are not correct, must be
  # assumed to be in order to use the program's output.
  #
  # NOTE(review): {infer_feature_status} also assigns the intermediate
  # digits 3 and 5, which do reach the output; presumably they are
  # meant as in-between reliability levels -- confirm.
  
  # Paranoia mode:
  LINT = 0;

  # Flag to force termination on errors. A negative value means
  # "no error so far"; otherwise it is the exit status to use:
  abort = -1; 
  
  # Argument checking and defaulting:
  # (No args for now.)
  
  # Parsing state:
  #   0 before "FEATURES" line
  #   1 between "FEATURES" and "ORIGIN" lines
  #   2 between "ORIGIN" and "//" lines.
  #   3 after "//" line..
  state = 0;

  # Other global attributes:
  locus = "";       # NCBI locus identifier from the "LOCUS" line, or "".
  version = "";    # NCBI locus + version identifier from the "VERSION" line, or "". 
  GenBank_id = ""; # GenBank accession number from the "VERSION" line, or "". 

  source_ok = 0;   # TRUE iff "source" entry has been found.
}

# Once an error has been flagged, stop consuming input and leave
# with the recorded exit status:
abort >= 0 { exit abort }

# Lines that are empty or hold only blanks carry no information:
$0 ~ /^ *$/ { next }

# Refuse input lines containing the quote characters that this
# program uses to delimit text in its own diagnostics:
$0 ~ /[Ťť]/ {
  data_error("funny quotes in file");
}

# PARSING THE MAIN HEADER LINES

/^LOCUS[ ]/ {
  # Parses the "LOCUS" header line, which must occur exactly once and
  # before the "FEATURES" line. Sets the globals {locus}, {tot_length},
  # {length_unit}, {seq_type}, {seq_shape}, {locus_kind}, {date}
  # and {circular}. Any malformed field is a fatal data error.

  if (state != 0) { data_error(("misplaced ŤLOCUSť line")); }
  if (locus != "") { data_error(("duplicate ŤLOCUSť line")); }
  
  # Grab attributes from "LOCUS" line:
  locus = $2;       # Locus identifier
  tot_length = $3;  # Total length of sequence. 
  length_unit = $4; # Unit of {tot_length} (should be "bp").
  seq_type = $5;    # Type of sequence (should be "DNA").
  seq_shape = $6;   # "circular" or "linear".
  locus_kind = $7;  # NCBI classifier: "PRI", "CON", etc.
  date = $8;        # Date of file in {DD}-{MMM}-{YYYY} format.
  
  if (locus !~ /^[A-Z0-9_]+$/) { data_error(("invalid locus Ť" locus "ť")); }
  if (tot_length !~ /^[0-9]+$/) { data_error(("invalid length Ť" tot_length "ť")); }
  tot_length = tot_length + 0; # Force numeric value.
  if (length_unit != "bp") 
    { data_error(("expected size unit Ťbpť, found Ť" length_unit "ť")); }
  if (seq_type != "DNA")
    { data_error(("expected sequence type ŤDNAť, found Ť" seq_type "ť")); }
  if (seq_shape == "linear") 
    { circular = 0; }
  else if (seq_shape == "circular") 
    { circular = 1; }
  else 
    { # Report the offending shape field itself (the message used to
      # show the {date} field by mistake):
      data_error(("invalid sequence shape Ť" seq_shape "ť"));
    }
  if (date !~ /^[0-9]+[-][A-Z][A-Z][A-Z]-[12][09][0-9][0-9]$/) 
    { data_error(("invalid date format Ť" date "ť")); }
  next;
}

$0 ~ /^VERSION[ ]/ {
  # The "VERSION" line is only legal before "FEATURES", and only once.
  if (state != 0)
    { data_error("misplaced ŤVERSIONť line"); }
  if (version != "")
    { data_error("duplicate ŤVERSIONť line"); }

  # Field 2 is the versioned item name, field 3 the GenBank id:
  GenBank_id = $3;
  version = $2;
  next;
}

$0 ~ /^FEATURES([ ]|$)/ {
  # Start of the feature table: switch from state 0 to state 1.
  if (state != 0)
    { data_error("misplaced ŤFEATURESť line"); }
  state = 1;

  # Both header lines must have been seen by now:
  if (locus == "")
    { data_error("missing ŤLOCUSť line"); }
  if (version == "")
    { data_error("missing ŤVERSIONť line"); }

  # Report the global header data on stderr:
  printf "locus = %s  version = %s  date = %s\n", locus, version, date > "/dev/stderr";
  printf "  tot_length = %d\n", tot_length > "/dev/stderr";
  printf "  seq_type = %s  circular = %d \n", seq_type, circular > "/dev/stderr";

  # No feature is pending yet:
  arg = "";  # Argument string of the feature being collected.
  key = "";  # Keyword of the feature being collected.
  next;
}

$0 ~ /^ORIGIN([ ]|$)/ {
  # End of the feature table: switch from state 1 to state 2.
  if (state != 1)
    { data_error("misplaced ŤORIGINť line"); }
  state = 2;

  # Flush the feature that was still being collected, if there is one:
  if (key != "")
    { process_feature(key, arg); }

  # A "source" feature is mandatory:
  if (! source_ok)
    { data_error("missing Ťsourceť entry"); }

  # The sequence data itself is not needed; skip to the {END} block:
  exit;
}

$0 ~ /^[\/][\/]/ {
  # End-of-record marker: only legal in state 2, and only if the
  # base counter agrees with the end of the "source" range.
  # NOTE(review): the "ORIGIN" rule exits as soon as it sets state 2,
  # and {bas_next} is not assigned anywhere in the visible source, so
  # this rule looks unreachable as written -- confirm before relying on it.
  if (state != 2)
    { data_error("misplaced Ť//ť line"); }
  if (bas_next != seq_fin+1)
    { data_error("position of last base = " bas_next-1 " should be " seq_fin ""); }
  state = 3;
  next;
}

$0 ~ /^[A-Z]+([ ]|$)/ {
  # Any header keyword not handled by an earlier rule. Such lines
  # are tolerated (and skipped) only before the "FEATURES" line:
  if (state != 0)
    { data_error("misplaced Ť" $1 "ť header line"); }
  next;
}

# PARSING THE NON-HEADER LINES

state == 0 {
  # Body text of a header entry, before the "FEATURES" line: skip it.
  next;
}

state == 1 {
  # A line of the feature table (between "FEATURES" and "ORIGIN").
  # Columns 1--20 hold the feature key (blank on continuation lines),
  # columns 21--end hold the location and/or qualifiers.

  # Feature key, with every blank removed:
  new_key = substr($0, 1, 20);
  gsub(/[ ]/, "", new_key);

  # Argument text, trimmed on both sides:
  new_arg = substr($0, 21);
  sub(/^[ ]+/, "", new_arg);
  sub(/[ ]+$/, "", new_arg);

  if (new_key != "")
    { # A new feature begins; the pending one (if any) is now complete:
      if (key != "") { process_feature(key, arg); }
      arg = new_arg;
      key = new_key;
    }
  else
    { # Continuation of the pending feature; there must be one:
      if (key == "") { data_error("missing start of feature"); }
      arg = arg " " new_arg;
    }
  next;
}

state >= 2 {
  # Reading is supposed to stop at the "ORIGIN" line, so getting
  # here indicates a bug in this program:
  prog_error("program kept on reading after going to state 2");
  next;
}

END {
  # An error was already reported: just propagate its exit status.
  if (abort >= 0)
    { exit abort; }

  # A complete parse must have reached the "ORIGIN" line:
  if (state != 2)
    { data_error("missing ŤORIGINť line"); }

  # To keep {LINT} happy.
  # NOTE(review): {stdout}, {stderr} and {stdin} are awk variables that
  # are never assigned in the visible source, not stream names -- confirm
  # that these calls do what was intended.
  close(stdout);
  close(stderr);
  close(stdin);
}

# DATA PROCESSING FUNCTIONS

function process_feature(key,arg,    loc,qua,qmap,status,note)
{
  # Handles one complete entry of the "FEATURES" section, given its
  # keyword {key} and its full argument text {arg} (location followed
  # by qualifiers, continuation lines already joined).
  
  # Feature keys that produce labeling ranges, with the number of
  # occurrences seen in four sample data sets:
  # 
  #        at       ce       dr       dm  KEY
  #   -------  -------  -------  -------  ---
  #         7        7       26        7  source
  #   -------  -------  -------  -------  ---
  #         .        .     2458        8  gap
  #   -------  -------  -------  -------  ---
  #     32615    23208    24225    20221  mRNA
  #        14        2        2        2  rRNA
  #        84      204        .      296  ncRNA
  #       577      228      614      339  misc_RNA
  #       689      630       22      314  tRNA
  #         .        .        .        .  tmRNA
  #   -------  -------  -------  -------  ---
  #     32817    23221    24220    20234  CDS
  #   -------  -------  -------  -------  ---
  # 
  # Other feature keys that have been found in NCBI RefSeq files
  # but are of no interest here:
  # 
  #        at       ce       dr       dm  KEY
  #   -------  -------  -------  -------  ---
  #        14        .        .        .  promoter
  #     33263    21052    25205    14690  gene
  #       716    16980       12      975  STS
  #      5228        2        .        7  misc_feature
  #        32        .     3482        .  exon
  #        21        .        .        .  intron
  #         .        .        9        .  C_region
  #         .        .        1        .  D-loop
  #         .        .        2        .  V_segment
  #         .        .        1        1  rep_origin
  #         3        .        .     5395  repeat_region
  #         3        .        .        .  repeat_unit
  # 
  
  # Keys are compared case-insensitively:
  key = tolower(key);
  
  # Everything before the first "/" is the location {loc}; the rest
  # (starting at that "/") is the qualifier text {qua}:
  loc = arg;
  qua = "";
  if (match(arg, /[\/]/))
    { qua = substr(arg, RSTART);
      loc = substr(arg, 1, RSTART-1);
    }
    
  # Normalize the location: no blanks, lower case only.
  gsub(/[ ]/, "", loc);
  loc = tolower(loc);
  
  # Build the table {qmap} of qualifier values, indexed by qualifier name:
  split("", qmap);
  split_qualifiers(qua, qmap);
  
  # Select the handler for this {key}. The "source" feature must be
  # unique and must precede every other feature:
  if (key == "source")
    { if (source_ok) { data_error("more than one Ťsourceť feature"); }
      source_ok = 1;
      status = process_source(key, loc, qmap);
    }
  else if (! source_ok)
    { data_error("missing Ťsourceť feature"); }
  else if (key == "gap")
    { status = process_gap(key, loc, qmap); }
  else if (key ~ /rna$/)
    { status = process_rna(key, loc, qmap); }
  else if (key == "cds")
    { status = process_cds(key, loc, qmap); }
  else if (key ~ /^(gene|sts|promoter|misc_feature|exon|intron|repeat_region|repeat_unit)$/)
    { status = process_useless_feature(key, loc, qmap); }
  else 
    { data_warning("ignored feature key = Ť" key "ť");
      status = 1;
    }

  # Uncomment for debugging:
  # printf "%s:%d: key = Ť%sť loc = Ť%sť\n", FILENAME, FNR, key, loc > "/dev/stderr";
  
  # Log the decision, for later analysis, whenever the feature was
  # not discarded and carries a "/note" (shown truncated to 200 chars):
  if ((status == 0) || (! ("/note" in qmap))) { return; }
  note = substr(qmap["/note"], 1, 200);
  printf "%s:%d: # status = %d key = Ť%sť /note=Ť%sť\n", FILENAME, FNR, status, key, note > "/dev/stderr"; 
}

function process_source(key,loc,qmap,   i,status)
{
  # Handles the "source" feature: sets the globals {seq_ini} and
  # {seq_fin}, writes the output header line, and labels the whole
  # source range on both strands with 'Z'. Returns the status (9).
   
  # Only a plain "{INI}..{FIN}" location is acceptable here:
  if (loc !~ /^[0-9]+[.][.][0-9]+$/)
    { data_error("funny source location = Ť" loc "ť"); }
  
  # Record and validate the extent of the sequence:
  seq_ini = get_range_ini(loc);
  seq_fin = get_range_fin(loc);
  if (seq_ini != 1)
    { data_error("source range Ť" loc "ť does not start at 1"); }
  if (seq_fin < seq_ini)
    { data_error("source range Ť" loc "ť is empty"); }
  
  # Output header:
  printf "> %s /ini=%d /fin=%d /circular=%d\n", version, seq_ini, seq_fin, circular;
  
  # 'Z' on every position, primal strand (+1) first, then dual (-1):
  status = 9;
  for (i = +1; i >= -1; i -= 2)
    { label_range(i, seq_ini, seq_fin, status, "Z"); }
  
  return status;
}

function process_gap(key,loc,qmap,   gini,gfin,status)
{
  # Handles a "gap" feature: labels its positions with 'O' on both
  # strands, with status 9, and returns that status.
  
  # Only "{INI}..{FIN}" or a lone "{POS}" is acceptable:
  if (loc !~ /^[0-9]+([.][.][0-9]+|)$/)
    { data_error("funny Ťgapť feature"); }
  
  status = 9;
  gini = get_range_ini(loc);
  gfin = get_range_fin(loc);

  # Primal strand, then dual strand:
  label_range(+1, gini, gfin, status, "O");
  label_range(-1, gini, gfin, status, "O");
  
  return status;
}

function process_rna(key,loc,qmap,  k,ini,fin,i,status)
{
  # Handles any "*RNA" (transcribed region) feature: every position
  # of {loc} gets the label 'N'. Returns the reliability status used.
  
  # Reliability hint, guessed from the qualifiers:
  # !!! Should parse the "/note" qualifier to know whether 
  # the feature is experimental or hypothetical.
  status = infer_feature_status(qmap);
  
  label_location(loc, status, "N");
  
  return status;
}

function process_cds(key,loc,qmap,  frame,labels,status)
{
  # Handles a "CDS" (coding region) feature: labels the positions of
  # {loc} with the codon-position letters 'D','E','F', phased by the
  # "/codon_start" qualifier. Returns the reliability status used.
  
  # Essential qualifiers: 
  #   
  #   /codon_start={1|2|3}
  #  
  # Qualifiers relevant for consistency checking: 
  #  
  #  /translation="<aaletter_string>"
  #  /transl_table=<number>
  #  /codon=(seq:"codon-sequence",aa:<amino_acid>)
  #  /exception="text"
  #  /transl_except=(pos:<location>,aa:<amino_acid>)
  #     transl_except=(pos:213..215,aa:Trp)
  #     transl_except=(pos:213..214,aa:TERM) #-- incomplete stop codon
  
  # Reliability hint, guessed from the qualifiers:
  # !!! Should analyze the "/note" qualifier to know whether 
  # the feature is experimental or hypothetical.
  status = infer_feature_status(qmap);
  
  # Reading frame; defaults to 1 when "/codon_start" is absent:
  frame = get_qualifier("/codon_start", qmap, 1);
  if (frame !~ /^[123]$/)
    { data_error("invalid Ť/codon-startť value = Ť" frame "ť"); }

  # Frame 1 gives "DEF", frame 2 gives "FDE", frame 3 gives "EFD":
  labels = substr("DEFDEF", 5 - frame, 3);
  if (frame != 1)
    { data_warning("codon start = " frame ""); }

  label_location(loc, status, labels);

  # !!! Should check the length of the "/translation"
  
  return status;
}

function process_useless_feature(key,loc,qmap,   status)
{
  # Handles a feature that is common but of no use to this program.
  # Nothing is labeled; the returned status 0 marks it as discarded.
  return 0;
}

function infer_feature_status(qmap,   note,status)
{
  # Guesses the reliability status of a feature from its qualifier
  # table {qmap}. Only the "/note" text is examined; the first
  # matching pattern below decides, and 4 is returned if none matches.
  
  note = get_qualifier("/note", qmap, "");

  if (note ~ /[Ii]dentical to /) { return 3; }
  if (note ~ /^[Ss]imilar to/) { return 2; }
  if (note ~ /[Ss]upported by cDNAs/) { return 2; }
  if (note ~ /[Ss]upported by ESTs/) { return 5; }
  if (note ~ / by automated computational /) { return 2; }

  # Nothing recognized: reliability undetermined.
  return 4;
}

function label_location(loc,status,labels,   nrt,rt,j,rng,len,strand)
{
  # Writes out one or more lines that
  # label the location {loc} with repeated copies of the string
  # {labels}.  These lines are marked with reliability hint {status}.
  
  # The location {loc} specifies a sequence of positions on a specific
  # strand. According to the documentation, {loc} must be either
  #   
  #   * a single position "{POS}",
  #   * a range of consecutive positions "{INI}..{FIN}",
  #   * a call "complement({LOC})", or
  #   * a call "join({LOC[1]},{LOC[2]},..{LOC[n]})"
  #   
  # where each {LOC} or {LOC[i]} is a location. However, there cannot
  # be two nested calls to the same operator ("complement" or "join").
  # So the formula has maximum depth 2. Moreover one cannot join
  # positions belonging to opposite strands.

  # The "join" operator appends the sequences described by its
  # operands in the order they appear in the argument list. The
  # "complement" operator flips each position to the other strand and
  # reverses the order of the sequence. Thus,
  # "complement(join({A},{B}))" is equivalent to
  # "join(complement({B}),complement({A}))".
  
  # The procedure applies letters 1,2,... of {labels} (cyclically) to
  # the positions specified by {loc}, in the order specified by {loc}.
  # Thus, for example, the call
  #   
  #   {label_location("complement(join(50..60,30..40))",status,"DEF")}
  #   
  # will start labeling "complement(40)" with 'D', "complement(39)"
  # with 'E', and so on.
  
  # The procedure fails with error if {loc} is malformed or names any
  # invalid position.
   
  # If {circular} is true, any range with {INI > FIN} appearing in
  # {loc} is assumed to mean
  # "join({INI}..{seq_fin},{seq_ini}..{FIN})". If {circular} is false,
  # all ranges in {loc} must have {INI <= FIN}.

  split("", rt); # Temp array for arguments of "join".
  if (loc ~ /^complement[(].*[)]$/)
    { # Strip the "complement(" prefix (11 chars) and the final ")": 
      loc = substr(loc,12,length(loc)-12);
      # Extract the ranges:
      nrt = split_join_of_ranges(loc, rt);
      # Label them in reverse order, all on the dual strand:
      for (j = nrt; j > 0; j--)
        { len = parse_and_label_range(-1, rt[j], status, labels);
          # Cyclically shift the labels, so that the next range
          # continues where this one stopped:
          labels = cycle_labels(labels, len);
        }
    }
  else if (loc ~ /^join[(].*[)]$/) 
    { # Get the joined ranges (or complements thereof): 
      nrt = split_join_of_ranges(loc, rt);
      # Label them in the order given:
      for (j = 1; j <= nrt; j++)
        { rng = rt[j];
          if (rng ~ /^complement[(].*[)]$/) 
            { # Strip the "complement" operator; the range lies on
              # the dual strand:
              rng = substr(rng,12,length(rng)-12);
              strand = -1;
            }
          else
            { # Plain range, on the primal strand:
              strand = +1;
            }
          len = parse_and_label_range(strand, rng, status, labels);
          # Cyclically shift the labels:
          labels = cycle_labels(labels, len);
        }
    }
  else
    { # Location should be a simple range or position:
      len = parse_and_label_range(+1, loc, status, labels);
    }
}

function parse_and_label_range(strand,rng,status,labels,  len,rini,rfin)
{ 
  # Validates and tidies the text {rng}, which must be a simple range
  # "{INI}..{FIN}" or a lone position "{POS}" (see {cleanup_range}),
  # then hands its endpoints to
  # {label_range(strand,INI,FIN,status,labels)} (q.v.).
  # Returns the number of positions in the range.
  
  rng = cleanup_range(rng);

  # Extract the endpoints, low end first:
  rini = get_range_ini(rng);
  rfin = get_range_fin(rng);

  return label_range(strand, rini, rfin, status, labels);
}

function cleanup_range(rng   )
{
  # Returns {rng} stripped of any '<' or '>' markers (a warning is
  # printed if there were any), after checking that what remains is
  # a well-formed range or single position. Anything else is a fatal
  # error -- in particular the constructs "{POS1}.{POS2}" and
  # "{POS1}^{POS2}".
  
  if ((index(rng, "<") > 0) || (index(rng, ">") > 0))
    { data_warning("uncertain range Ť" rng "ť ignoring the [<>]"); 
      gsub(/[<>]/, "", rng);
    }

  if (rng ~ /^[0-9]+([.][.][0-9]+|)$/) { return rng; }

  data_error("expected position or range, found Ť" rng "ť");
  return rng;
}

function split_join_of_ranges(loc,rt,  nrt)
{
  # Breaks the location {loc} into the operands of its "join(...)"
  # wrapper, stores them in {rt[1..nrt]} and returns their count
  # {nrt}. A {loc} that is not a "join(...)" construct is returned
  # whole as {rt[1]}, with count 1. The caller must supply {rt} as
  # an empty array.
  
  # Anything but a "join(...)" is a single operand:
  if (loc !~ /^join[(].*[)]$/)
    { rt[1] = loc;
      return 1;
    }

  # Drop the "join(" prefix and the final ")", then cut at the commas:
  nrt = split(substr(loc, 6, length(loc)-6), rt, ",");
  return nrt;
}

function label_range(strand,rini,rfin,status,labels,   len1,len2)
{
  
  # Like {label_location}, specialized for a simple range on the
  # indicated {strand} (+1 for primal, -1 for dual). Writes the
  # labeling line(s) to standard output and returns the number of
  # positions labeled.
  
  # If {rini > rfin}, then {circular} must be true, and the range is
  # split into two ranges {rini..seq_fin} and {seq_ini..rfin}. In any
  # case, if {strand} is -1, the range(s) are reversed.

  if (rini > rfin) 
    { if (circular)
        { # The range apparently spans the {seq_fin->seq_ini} bond, split it in two.
          # The piece that comes first in transcription order is written first,
          # and the labels are shifted so that the second piece continues the cycle:
          if (strand + 0 > 0)
            { len1 = label_range(strand, rini, seq_fin, status, labels);
              labels = cycle_labels(labels, len1);
              len2 = label_range(strand, seq_ini, rfin, status, labels);
            }
          else
            { len1 = label_range(strand, seq_ini, rfin, status, labels);
              labels = cycle_labels(labels, len1);
              len2 = label_range(strand, rini, seq_fin, status, labels);
            }
          # Sanity check: warn if the range covers more than half of
          # the molecule, as given by the "source" range {seq_ini..seq_fin}:
          if (len1 + len2 > 0.5*(seq_fin - seq_ini + 1)) 
            { data_warning(("extra long range = " rini ".." rfin " (" len1+len2 " positions)")); }
          return len1 + len2;
        }
      else
        { data_error(("reverse range = " rini ".." rfin " in linear molecule")); }
    }
  else
    { # Simple case:
      printf "%+2d %10d %10d %d %s\n", strand, rini, rfin, status, labels;
      return (rfin - rini + 1);
    }
}

function get_range_ini(r)
{
  # Given a simple range "{X}..{Y}" or a lone position "{X}", returns
  # {X} as a number. Any other format is a fatal error, and so is
  # {X} being zero. Whether {X} lies in {seq_ini..seq_fin} is not checked.
  if (r !~ /^[0-9]+([.][.][0-9]+|)$/)
    { data_error("bad range format Ť" r "ť"); }

  # Keep only what precedes the "..", if there is one:
  if (index(r, "..") > 0)
    { r = substr(r, 1, index(r, "..") - 1); }
  r = r + 0;

  if (r < 1)
    { data_error("invalid position Ť" r "ť"); }
  return r;
}

function get_range_fin(r)
{
  # Given a simple range "{X}..{Y}" or a lone position "{Y}", returns
  # {Y} as a number. Any other format is a fatal error, and so is
  # {Y} being zero. Whether {Y} lies in {seq_ini..seq_fin} is not checked.
  if (r !~ /^[0-9]+([.][.][0-9]+|)$/)
    { data_error("bad range format Ť" r "ť"); }

  # Keep only what follows the "..", if there is one:
  if (index(r, "..") > 0)
    { r = substr(r, index(r, "..") + 2); }
  r = r + 0;

  if (r < 1)
    { data_error("invalid position Ť" r "ť"); }
  return r;
}

function split_qualifiers(qua,qmap,   name,val,sep)
{
  # Splits the qualifiers contained in the string {qua} into a table
  # {qmap} such that {qmap[name]} is the value of the qualifier called
  # {name}. The {qmap} parameter must be an empty array created by the caller.
  # 
  # The {name} will include the "/" but not the "=".
  # Quoted text values have their enclosing quotes stripped off, and
  # consecutive quoted pieces are glued together with a single '"'.
  # Unquoted values extend up to the next "/" or the end of {qua}.
  # A qualifier without "=" gets the value 1. All values have their
  # blanks trimmed and squeezed. If the same {name} occurs more than
  # once, the values are appended, separated by a blank.
  #
  # Fails with a data error if {qua} does not start with "/{NAME}"
  # where a qualifier is expected, or if a quoted value is malformed.
  
  while (1)
    { # Remove any leading blanks from {qua}:
      gsub(/^[ ]+/, "", qua);
      # Check for termination: 
      if (qua == "") { return; }
      # Check for presence of a name: 
      if (! match(qua, /^[\/][-_A-Za-z0-9]+/))
        { data_error((" no Ť/{NAME}ť in Ť" qua "ť")); }
      name = substr(qua, RSTART, RLENGTH);
      qua = substr(qua, RSTART+RLENGTH);
      # Check and consume the "=":
      if (! match(qua, /^[ ]*[=][ ]*/))
        { # Missing "=" after "/{name}"; may be a presence/absence qualifier.
          # Set its value to "1" (value "" might be more logical, but is too subtle).
          val = 1;
        }
      else
        { qua = substr(qua, RSTART+RLENGTH);
          # Check for presence and type of value: 
          if (qua ~ /^["]/)
            { # Text value, grab and remove extra double quotes: 
              val = ""; sep = "";
              do
                { if (! match(qua, /^["][^"]*["]/)) { data_error(("malformed value of Ť" name "ť qualifier")); }
                  val = ( val sep substr(qua,2,RLENGTH-2) );
                  qua = substr(qua, RSTART+RLENGTH);
                  # Allow for blanks between quote pairs: 
                  if (match(qua, /^[ ]+/)) { qua = substr(qua, RSTART+RLENGTH); }
                  sep = "\"";
                }
              while (qua ~ /^["]/);
            }
          else
            { # Integer or keyword value, fetch to next "/" or end of string.
              # The pattern can match the empty string, so a failure here
              # can only mean a bug; report it through {prog_error}:
              if (! match(qua, /^[^\/]*/)) { prog_error(("duh?")); }
              val = substr(qua, RSTART, RLENGTH);
              qua = substr(qua, RSTART+RLENGTH);
            }

          # Regularize spaces in {val}:
          gsub(/^[ ]+/, "", val);
          gsub(/[ ]+$/, "", val);
          gsub(/[ ][ ]+/, " ", val);
        }
        
      # Save {val} in {qmap}:
      if (name in qmap)
        { # Multiple qualifiers with same {name}, append them:
          qmap[name] = ( qmap[name] " " val );
        }
      else
        { qmap[name] = val; }
    }
}

function get_qualifier(name,qmap,def,  val,res,sep)
{ 
  # Looks up the qualifier {name} (written with the leading "/" but
  # without the "=") in the qualifier table {qmap}. Returns its value
  # if present, and the fallback {def} otherwise. The table is never
  # modified: a missing entry is not created by the lookup.
  
  res = ((name in qmap) ? qmap[name] : def);
  return res;
}

function cycle_labels(str,k,    n)
{
  # Shifts the string {str} cyclically to the left by {k} characters
  # and returns the result. The shift {k} is taken modulo the length
  # of {str}; a negative {k} shifts to the right. An empty {str} is
  # returned unchanged (instead of dying on a division by zero).
  n = length(str);
  if (n == 0) { return str; }
  # awk's "%" keeps the sign of the dividend, so fold the remainder
  # into {0..n-1}:
  k = ((k % n) + n) % n;
  if (k == 0) 
    { return str; }
  else
    { return (substr(str, 1+k) substr(str,1,k)); }
}

# AUXILIARY PROCEDURES

function arg_error(msg)
{
  # Reports a command-line argument error: prints {msg} and the
  # {USAGE} text to "/dev/stderr", then sets {abort} to 1 and exits
  # with that status.
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", USAGE > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_warning(msg)
{
  # Reports a non-fatal problem in the input: prints {msg} to
  # "/dev/stderr", prefixed with the current file name and line
  # number, followed by the current input line. Execution continues.
  printf "%s:%s: warning: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  Ť%sť\n", $0 > "/dev/stderr";
}

function data_error(msg)
{
  # Reports a fatal problem in the input: prints {msg} to
  # "/dev/stderr", prefixed with the current file name and line
  # number, followed by the current input line. Then sets {abort}
  # to 1 and exits with that status, so this call does not return.
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf "  Ť%sť\n", $0 > "/dev/stderr";
  abort = 1;
  exit abort;
}

function prog_error(msg)
{
  # Reports an internal inconsistency (a bug in this program):
  # prints {msg} to "/dev/stderr", prefixed with the current file
  # name and line number. Then sets {abort} to 1 and exits with
  # that status, so this call does not return.
  printf "%s:%s: ** program error: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}