#! /usr/bin/gawk -f
# Last edited on 2008-06-18 00:49:40 by stolfi

BEGIN {
  USAGE = ( \
    "extract_RefSeq_ranges \\\n" \
    " [<] {INFILE}.gbk" \
  );

  # Reads from standard input an NCBI RefSeq genomic cDNA file in the
  # full GenBank flatfile format. Extracts from the FEATURES section
  # the nucleotide labeling information that is relevant to
  # coding/non-coding analysis.

  # INPUTS

  # The format of the RefSeq files appears to be similar or identical
  # to the EMBL format described in
  #
  #   http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html

  # OUTPUTS

  # The output begins with a perfunctory header with the format
  #
  #   "> {ITEMVERS} /ini={SRCINI} /fin={SRCFIN} /circular={CIRC}"
  #
  # Where {ITEMVERS} is the item's name, including the ".{V}" version
  # suffix, as specified in the input file's "VERSION" line;
  # {SRCINI,SRCFIN} are the range of valid positions specified in the
  # "source" feature (usually 1 and {N}, where {N} is the number of
  # base pairs in the input file); and {CIRC} is 0 if the
  # molecule is linear (open), 1 if circular (closed).

  # The header is followed by a list of /labeling ranges/,
  # one per line. Each line assigns functional labels for one
  # continuous range of positions on one of the two strands. The
  # format of each line is "{STRAND} {INI} {FIN} {STATUS} {LABELS}" where
  #
  #   {STRAND} is "+1" for the primal strand, "-1" for the dual strand.
  #
  #   {INI} is the lowest-numbered position in the range.
  #
  #   {FIN} is the highest-numbered position in the range.
  #
  #   {STATUS} is a hint about the reliability of this feature.
  #
  #   {LABELS} is the string of label letters assigned to those positions.
  #
  # The /primal strand/ is the one whose nucleotide sequence is given
  # in the file; the /dual/ strand is the other one, which has the
  # complementary bases in the reverse sequence.

  # A /position/ is a positive integer that designates a base pair in
  # the double-stranded cDNA described in the input file. Positions
  # are always relative to the beginning of the primal strand, even
  # when the range refers to nucleotides on the dual strand, and every
  # range satisfies {INI <= FIN}. Moreover, {INI} and {FIN} are expected
  # to lie within the range {SRCINI..SRCFIN} (this program does not
  # verify that).

  # The letters of the {LABELS} string are assigned
  # to the positions {INI..FIN} in the order that matches
  # the normal direction of DNA->RNA transcription: that is,
  # from {INI} up to {FIN} if {STRAND == +1} and from {FIN} down to {INI}
  # if {STRAND == -1}. The {LABELS} string is implicitly
  # replicated or truncated as needed to fill the range. So, for
  # example, the output line "+1 200 204 4 DEF" assigns letters
  # "D","E","F","D","E" to the primal-strand bases at positions
  # 200,201,202,203,204, respectively; while "-1 200 204 4 DEF" assigns
  # the same labels to the dual-strand bases at positions
  # 204,203,202,201,200, respectively.

  # In the case of circular strands, any range {A..B} that may appear
  # in the input file with {A > B} is assumed to span the bond between
  # positions {SRCFIN} and {SRCINI}. Such a range will be split into
  # two separate ranges {A..SRCFIN} and {SRCINI..B}. (Hopefully no
  # such range will cover the same base more than once!)

  # LABEL LETTERS

  # The label letters record the function of the corresponding
  # nucleotide, as inferred from the "FEATURES" section of the input
  # file. Each label may be:
  #
  #   'D', 'E', 'F'  The three codon positions in coding regions.
  #   'N'            Base that gets transcribed to RNA.
  #   'O'            Gap in the input data.
  #   'Z'            Unknown function.
  #
  # The coding labels [DEF] come from "CDS" features. The 'N' labels
  # come from transcription features ("mRNA", "misc_RNA", "ncRNA",
  # "rRNA", "tRNA"). The label 'O' comes from "gap" features.
  # The label 'Z' is assigned to all positions in the "source"
  # range.

  # Note that the same position may get labeled several times.
  # Every base on both strands gets a 'Z' label, and every base
  # that gets a [DEF] label should also get an 'N' label.

  # RELIABILITY STATUS

  # The {STATUS} field tries to summarize the reliability of the
  # feature, as inferred by this program from the "/note" and
  # other qualifiers. It is a digit that hopefully means
  #
  #   0 The feature is bogus, erroneous, or irrelevant.
  #
  #   2 The feature was detected by a program only.
  #
  #   4 This program was unable to determine the feature's reliability.
  #
  #   6 The feature has some experimental evidence.
  #
  #   8 The feature has been fully confirmed experimentally.
  #
  #   9 The feature must be taken as is.
  #
  # This program may generate features with status 0 internally, but
  # should never write them out. Status 9 is used for features like
  # "source" and "gap" that, even if they are not correct, must be
  # assumed to be in order to use the program's output.

  # Paranoia mode:
  LINT = 0;

  # Flag to force termination on errors (negative means "no error"):
  abort = -1;

  # Argument checking and defaulting:
  # (No args for now.)

  # Parsing state:
  #   0 before "FEATURES" line
  #   1 between "FEATURES" and "ORIGIN" lines
  #   2 between "ORIGIN" and "//" lines.
  #   3 after "//" line..
  state = 0;

  # Other global attributes:
  locus = "";      # NCBI locus identifier from the "LOCUS" line, or "".
  version = "";    # NCBI locus + version identifier from the "VERSION" line, or "".
  GenBank_id = ""; # GenBank accession number from the "VERSION" line, or "".
  source_ok = 0;   # TRUE iff "source" entry has been found.
}

(abort >= 0) { exit abort; }

# Ignore blank lines:
/^ *$/ { next; }

/[«»]/ { data_error(("funny quotes in file")); }

# PARSING THE MAIN HEADER LINES

/^LOCUS[ ]/ {
  if (state != 0) { data_error(("misplaced «LOCUS» line")); }
  if (locus != "") { data_error(("duplicate «LOCUS» line")); }

  # Grab attributes from "LOCUS" line:
  locus = $2;       # Locus identifier
  tot_length = $3;  # Total length of sequence.
  length_unit = $4; # Unit of {tot_length} (should be "bp").
  seq_type = $5;    # Type of sequence (should be "DNA").
  seq_shape = $6;   # "circular" or "linear".
  locus_kind = $7;  # NCBI classifier: "PRI", "CON", etc.
  date = $8;        # Date of file in {DD}-{MMM}-{YYYY} format.

  if (locus !~ /^[A-Z0-9_]+$/) { data_error(("invalid locus «" locus "»")); }
  if (tot_length !~ /^[0-9]+$/) { data_error(("invalid length «" tot_length "»")); }
  tot_length = tot_length + 0;
  if (length_unit != "bp") { data_error(("expected size unit «bp», found «" length_unit "»")); }
  if (seq_type != "DNA") { data_error(("expected sequence type «DNA», found «" seq_type "»")); }
  if (seq_shape == "linear") {
    circular = 0;
  } else if (seq_shape == "circular") {
    circular = 1;
  } else {
    # Report the offending shape field itself (the old message showed {date}):
    data_error(("invalid sequence shape «" seq_shape "»"));
  }
  if (date !~ /^[0-9]+[-][A-Z][A-Z][A-Z]-[12][09][0-9][0-9]$/) { data_error(("invalid date format «" date "»")); }
  next;
}

/^VERSION[ ]/ {
  if (state != 0) { data_error(("misplaced «VERSION» line")); }
  if (version != "") { data_error(("duplicate «VERSION» line")); }
  version = $2;
  GenBank_id = $3;
  next;
}

/^FEATURES([ ]|$)/ {
  if (state != 0) { data_error(("misplaced «FEATURES» line")); }
  state = 1;

  # Print global header data to stderr:
  if (locus == "") { data_error(("missing «LOCUS» line")); }
  if (version == "") { data_error(("missing «VERSION» line")); }
  printf "locus = %s version = %s date = %s\n", locus, version, date > "/dev/stderr";
  printf " tot_length = %d\n", tot_length > "/dev/stderr";
  printf " seq_type = %s circular = %d \n", seq_type, circular > "/dev/stderr";

  # Prepare to parse the features:
  key = ""; # Keyword of current feature.
  arg = ""; # Argument string of current feature.
  next;
}

/^ORIGIN([ ]|$)/ {
  if (state != 1) { data_error(("misplaced «ORIGIN» line")); }
  state = 2;

  # Finish processing the last feature, if any:
  if (key != "") { process_feature(key, arg); }

  # Check for required features:
  if (! source_ok) { data_error(("missing «source» entry")); }

  # Stop reading and go to the {END} block:
  exit;
}

/^[\/][\/]/ {
  # NOTE(review): this rule looks unreachable, because the "ORIGIN" rule
  # exits before any later line is read; {bas_next} is never set anywhere
  # in this file. Kept as is.
  if (state != 2) { data_error(("misplaced «//» line")); }
  if (bas_next != seq_fin+1) { data_error(("position of last base = " bas_next-1 " should be " seq_fin "")); }
  state = 3;
  next;
}

/^[A-Z]+([ ]|$)/ {
  # Any other header line:
  if (state != 0) { data_error(("misplaced «" $1 "» header line")); }
  # Ignore extra headers in state 0:
  next;
}

# PARSING THE NON-HEADER LINES

(state == 0) {
  # Non-header line before the "FEATURES" lines:
  next;
}

(state == 1) {
  # Non-header line between "FEATURES" and "ORIGIN" lines:

  # Get keyword in columns 1--20:
  new_key = substr($0, 1, 20);
  gsub(/[ ]/, "", new_key);

  # Get argument in columns 21--end:
  new_arg = substr($0, 21);
  gsub(/^[ ]+/, "", new_arg);
  gsub(/[ ]+$/, "", new_arg);

  if (new_key == "") {
    # Continuation line:
    if (key == "") { data_error(("missing start of feature")); }
    arg = (arg " " new_arg);
  } else {
    # Start of a new feature:
    if (key != "") { process_feature(key, arg); }
    key = new_key;
    arg = new_arg;
  }
  next;
}

(state >= 2) {
  # This should not happen:
  prog_error(("program kept on reading after going to state 2"));
  next;
}

END {
  if (abort >= 0) { exit abort; }

  # Check whether all necessary information has been parsed:
  if (state != 2) { data_error(("missing «ORIGIN» line")); }

  # To keep {LINT} happy:
  close(stdout);
  close(stderr);
  close(stdin);
}

# DATA PROCESSING FUNCTIONS

function process_feature(key,arg,   loc,qua,qmap,status,note)
{
  # Processes a feature from the "FEATURES" section,
  # with the given {key} and arguments {arg}.

  # The interesting feature keys (which generate
  # labeling ranges) are:
  #
  #       at      ce      dr      dm  KEY
  #  ------- ------- ------- -------  ---
  #        7       7      26       7  source
  #  ------- ------- ------- -------  ---
  #        .       .    2458       8  gap
  #  ------- ------- ------- -------  ---
  #    32615   23208   24225   20221  mRNA
  #       14       2       2       2  rRNA
  #       84     204       .     296  ncRNA
  #      577     228     614     339  misc_RNA
  #      689     630      22     314  tRNA
  #        .       .       .       .  tmRNA
  #  ------- ------- ------- -------  ---
  #    32817   23221   24220   20234  CDS
  #  ------- ------- ------- -------  ---
  #
  # Here are some other uninteresting feature keys that
  # have been found to occur in the NCBI RefSeq files:
  #
  #       at      ce      dr      dm  KEY
  #  ------- ------- ------- -------  ---
  #       14       .       .       .  promoter
  #    33263   21052   25205   14690  gene
  #      716   16980      12     975  STS
  #     5228       2       .       7  misc_feature
  #       32       .    3482       .  exon
  #       21       .       .       .  intron
  #        .       .       9       .  C_region
  #        .       .       1       .  D-loop
  #        .       .       2       .  V_segment
  #        .       .       1       1  rep_origin
  #        3       .       .    5395  repeat_region
  #        3       .       .       .  repeat_unit
  #

  # Case is not significant for keys, so:
  key = tolower(key);

  # Split the argument {arg} into a location {loc}
  # and a qualifier string {qua} (which starts at the first "/"):
  if (match(arg, /[\/]/)) {
    loc = substr(arg,1,RSTART-1);
    qua = substr(arg,RSTART);
  } else {
    loc = arg;
    qua = "";
  }

  # General cleanup of location:
  loc = tolower(loc);
  gsub(/[ ]/, "", loc);

  # Split qualifiers {qua} into a table {qmap} indexed by the qualif. name:
  split("", qmap);
  split_qualifiers(qua, qmap);

  # Dispatch on {key}:
  if (key == "source") {
    if (source_ok) { data_error(("more than one «source» feature")); }
    source_ok = 1;
    status = process_source(key, loc, qmap);
  } else {
    # The "source" feature must come first:
    if (! source_ok) { data_error(("missing «source» feature")); }
    if (key == "gap") {
      status = process_gap(key, loc, qmap);
    } else if (key ~ /rna$/) {
      status = process_rna(key, loc, qmap);
    } else if (key == "cds") {
      status = process_cds(key, loc, qmap);
    } else if (key ~ /^(gene|sts|promoter|misc_feature|exon|intron|repeat_region|repeat_unit)$/) {
      status = process_useless_feature(key, loc, qmap);
    } else {
      data_warning(("ignored feature key = «" key "»"));
      # NOTE(review): status 1 is not in the documented status table.
      status = 1;
    }
  }

  # Print for debugging:
  # printf "%s:%d: key = «%s» loc = «%s»\n", FILENAME, FNR, key, loc > "/dev/stderr";

  # Print decision for analysis:
  if ((status != 0) && ("/note" in qmap)) {
    note = qmap["/note"];
    if (length(note) > 200) { note = substr(note,1,200); }
    printf "%s:%d: # status = %d key = «%s» /note=«%s»\n", FILENAME, FNR, status, key, note > "/dev/stderr";
  }
}

function process_source(key,loc,qmap,   status)
{
  # Processes the "source" feature.
  # Defines the global variables {seq_ini} and {seq_fin}, writes the
  # output header line, and returns the feature's status (9).

  # The "source" location should be a simple range:
  if (loc !~ /^[0-9]+[.][.][0-9]+$/) { data_error(("funny source location = «" loc "»")); }

  # Set {seq_ini,seq_fin}:
  seq_ini = get_range_ini(loc);
  seq_fin = get_range_fin(loc);
  if (seq_ini != 1) { data_error(("source range «" loc "» does not start at 1")); }
  if (seq_fin < seq_ini) { data_error(("source range «" loc "» is empty")); }

  # Print the header:
  printf "> %s /ini=%d /fin=%d /circular=%d\n", version, seq_ini, seq_fin, circular;

  # Label all positions with 'Z':
  status = 9;
  label_range(+1, seq_ini, seq_fin, status, "Z");
  label_range(-1, seq_ini, seq_fin, status, "Z");
  return status;
}

function process_gap(key,loc,qmap,   gini,gfin,status)
{
  # Processes the "gap" feature. Returns the feature's status (9).

  # The location should be a simple range (or a single position):
  if (loc !~ /^[0-9]+([.][.][0-9]+|)$/) { data_error(("funny «gap» feature")); }

  # Label both strands with 'O':
  gini = get_range_ini(loc);
  gfin = get_range_fin(loc);
  status = 9;
  label_range(+1, gini, gfin, status, "O");
  label_range(-1, gini, gfin, status, "O");
  return status;
}

function process_rna(key,loc,qmap,   status)
{
  # Processes the "*RNA" (transcribed region) features.
  # Returns the inferred status.

  # Try to determine whether the sequence has experimental status:
  status = infer_feature_status(qmap);

  # !!! Should parse the "/note" qualifier to know whether
  # the feature is experimental or hypothetical.

  # Just label all bases in {loc} with 'N':
  label_location(loc, status, "N");
  return status;
}

function process_cds(key,loc,qmap,   frame,labels,status)
{
  # Processes the "CDS" (coding region) feature.
  # Returns the inferred status.

  # Essential qualifiers:
  #
  #   /codon_start={1|2|3}
  #
  # Qualifiers relevant for consistency checking:
  #
  #   /translation=""
  #   /transl_table=
  #   /codon=(seq:"codon-sequence",aa:)
  #   /exception="text"
  #   /transl_except=(pos:,aa:)
  #     transl_except=(pos:213..215,aa:Trp)
  #     transl_except=(pos:213..214,aa:TERM) #-- incomplete stop codon

  # Try to determine whether the sequence has experimental status:
  status = infer_feature_status(qmap);

  # !!! Should analyze the "/note" qualifier to know whether
  # the feature is experimental or hypothetical.

  # Get the starting offset from the "/codon_start" feature:
  frame = get_qualifier("/codon_start", qmap, 1);
  if (frame !~ /^[123]$/) { data_error(("invalid «/codon-start» value = «" frame "»")); }

  # Rotate "DEF" so that 'D' falls on position {frame} of the feature
  # (frame 1 -> "DEF", frame 2 -> "FDE", frame 3 -> "EFD"):
  labels = substr("DEFDEF", 5 - frame, 3);
  if (frame != 1) { data_warning(("codon start = " frame "")); }
  label_location(loc, status, labels);

  # !!! Should check the length of the "/translation"
  return status;
}

function process_useless_feature(key,loc,qmap,   status)
{
  # Processes a common but useless feature.
  # Just ignore it:
  status = 0;
  return status;
  # Done. (That was easy...)
}

function infer_feature_status(qmap,   note,status)
{
  # Tries to determine the reliability status of a feature
  # from its qualifiers {qmap}. Only the "/note" text is examined.
  # NOTE(review): the values 3 and 5 returned below are not listed in
  # the documented status table; kept unchanged.

  status = 4; # Default.

  # Get the "/note" qualifier:
  note = get_qualifier("/note", qmap, "");
  if (note ~ /[Ii]dentical to /) {
    status = 3;
  } else if (note ~ /^[Ss]imilar to/) {
    status = 2;
  } else if (note ~ /[Ss]upported by cDNAs/) {
    status = 2;
  } else if (note ~ /[Ss]upported by ESTs/) {
    status = 5;
  } else if (note ~ / by automated computational /) {
    status = 2;
  }
  return status;
}

function label_location(loc,status,labels,   nrt,rt,j,rng,len,strand)
{
  # Writes out one or more lines that
  # label the location {loc} with repeated copies of the string
  # {labels}. These lines are marked with reliability hint {status}.

  # The location {loc} specifies a sequence of positions on a specific
  # strand. According to the documentation, {loc} must be either
  #
  #   * a single position "{POS}",
  #   * a range of consecutive positions "{INI}..{FIN}",
  #   * a call "complement({LOC})", or
  #   * a call "join({LOC[1]},{LOC[2]},..{LOC[n]})"
  #
  # where each {LOC} or {LOC[i]} is a location. However, there cannot be
  # two nested calls to the same operator ("complement" or "join"). So
  # the formula has maximum depth 2. Moreover one cannot join positions
  # belonging to opposite strands.

  # The "join" operator appends the sequences described by its
  # operands in the order they appear in the argument list. The
  # "complement" operator flips each position to the other strand and
  # reverses the order of the sequence. Thus,
  # "complement(join({A},{B}))" is equivalent to
  # "join(complement({B}),complement({A}))".

  # The procedure applies letters 1,2,... of {labels} (cyclically) to
  # the positions specified by {loc}, in the order specified by {loc}.
  # Thus, for example, the call
  #
  #   {label_location("complement(join(50..60,30..40))",status,"DEF")}
  #
  # will start labeling "complement(40)" with 'D', "complement(39)"
  # with 'E', and so on.

  # The procedure fails with error if {loc} is malformed or names any
  # invalid position.

  # If {circular} is true, any range with {INI > FIN} appearing in
  # {loc} is assumed to mean
  # "join({INI}..{seq_fin},{seq_ini}..{FIN})". If {circular} is false,
  # all ranges in {loc} must have {INI <= FIN}.

  split("", rt); # Temp array for arguments of "join".

  if (loc ~ /^complement[(].*[)]$/) {
    # Strip the "complement(" prefix (11 chars) and the final ")":
    loc = substr(loc,12,length(loc)-12);
    # Extract the ranges:
    nrt = split_join_of_ranges(loc, rt);
    # Label them in reverse order:
    for (j = nrt; j > 0; j--) {
      len = parse_and_label_range(-1, rt[j], status, labels);
      # Cyclically shift the labels:
      labels = cycle_labels(labels, len);
    }
  } else if (loc ~ /^join[(].*[)]$/) {
    # Get the joined ranges (or complements thereof):
    nrt = split_join_of_ranges(loc, rt);
    # Label them in increasing order:
    for (j = 1; j <= nrt; j++) {
      rng = rt[j];
      if (rng ~ /^complement[(].*[)]$/) {
        # Strip the "complement" operator; the range is on the dual strand:
        rng = substr(rng,12,length(rng)-12);
        strand = -1;
      } else {
        # Plain range, on the primal strand:
        strand = +1;
      }
      len = parse_and_label_range(strand, rng, status, labels);
      # Cyclically shift the labels:
      labels = cycle_labels(labels, len);
    }
  } else {
    # Location should be a simple range or position:
    len = parse_and_label_range(+1, loc, status, labels);
  }
}

function parse_and_label_range(strand,rng,status,labels,   len,rini,rfin)
{
  # Performs syntax checking and cleanup of the range {rng} which must
  # be a simple range "{INI}..{FIN}" or a single position "{POS}",
  # then labels it with the given labels by calling
  # {label_range(strand,INI,FIN,status,labels)} (q.v.). Returns the number of
  # positions in the range.

  rng = cleanup_range(rng);
  rini = get_range_ini(rng);
  rfin = get_range_fin(rng);
  len = label_range(strand, rini, rfin, status, labels);
  return len;
}

function cleanup_range(rng)
{
  # Checks whether {rng} is a well-formed range or single position. If
  # it contains '<' or '>' constructs, prints a warning and removes
  # them. Returns the cleaned-up range. Fails, in particular, if the
  # range contains the constructs "{POS1}.{POS2}" or "{POS1}^{POS2}".

  if (rng ~ /[<>]/) {
    data_warning(("uncertain range «" rng "» ignoring the [<>]"));
    gsub(/[<>]/, "", rng);
  }
  if (rng !~ /^[0-9]+([.][.][0-9]+|)$/) { data_error(("expected position or range, found «" rng "»")); }
  return rng;
}

function split_join_of_ranges(loc,rt,   nrt)
{
  # Given a location {loc} that is a join of locations, splits
  # its argument list. Returns the number {nrt} of arguments
  # found, and puts the arguments in {rt[1..nrt]}. If {loc} is not
  # a "join(...)" construct, sets {rt[1]=loc} and returns 1.
  # The parameter {rt} must be an empty array created by the caller.

  if (loc ~ /^join[(].*[)]$/) {
    # Remove the "join(...)" wrapper:
    loc = substr(loc, 6, length(loc)-6);
    # Break at commas:
    nrt = split(loc, rt, ",");
    return nrt;
  } else {
    # Not a "join(...)":
    rt[1] = loc;
    return 1;
  }
}

function label_range(strand,rini,rfin,status,labels,   len1,len2)
{
  # Like {label_location}, specialized for a simple range on the
  # indicated {strand}. Returns the number of positions labeled.

  # If {rini > rfin}, then {circular} must be true, and the range is
  # split into two ranges {rini..seq_fin} and {seq_ini..rfin}. In any
  # case, if {strand} is -1, the range(s) are reversed.

  if (rini > rfin) {
    if (circular) {
      # The range apparently spans the {seq_fin->seq_ini} bond, split it in two:
      if (strand + 0 > 0) {
        len1 = label_range(strand, rini, seq_fin, status, labels);
        labels = cycle_labels(labels, len1);
        len2 = label_range(strand, seq_ini, rfin, status, labels);
      } else {
        len1 = label_range(strand, seq_ini, rfin, status, labels);
        labels = cycle_labels(labels, len1);
        len2 = label_range(strand, rini, seq_fin, status, labels);
      }
      # Sanity check: a wrapped range longer than half the molecule is suspicious.
      if (len1 + len2 > 0.5*(seq_fin - seq_ini + 1)) {
        data_warning(("extra long range = " rini ".." rfin " (" len1+len2 " positions)"));
      }
      return len1 + len2;
    } else {
      data_error(("reverse range = " rini ".." rfin " in linear molecule"));
    }
  } else {
    # Simple case:
    printf "%+2d %10d %10d %d %s\n", strand, rini, rfin, status, labels;
    return (rfin - rini + 1);
  }
}

function get_range_ini(r)
{
  # Requires {r} to be a simple range "{X}..{Y}" or single position "{X}".
  # Returns {X} as a numeric value. Does not check whether {X} is in {seq_ini..seq_fin},
  # but fails if {X} is zero.

  if (r !~ /^[0-9]+([.][.][0-9]+|)$/) { data_error(("bad range format «" r "»")); }
  gsub(/[.][.][0-9]+$/, "", r);
  r += 0;
  if (r < 1) { data_error(("invalid position «" r "»")); }
  return r;
}

function get_range_fin(r)
{
  # Requires {r} to be a simple range "{X}..{Y}" or single position "{Y}".
  # Returns {Y} as a numeric value. Does not check whether {Y} is in {seq_ini..seq_fin},
  # but fails if {Y} is zero.

  if (r !~ /^[0-9]+([.][.][0-9]+|)$/) { data_error(("bad range format «" r "»")); }
  gsub(/^[0-9]+[.][.]/, "", r);
  r += 0;
  if (r < 1) { data_error(("invalid position «" r "»")); }
  return r;
}

function split_qualifiers(qua,qmap,   name,val,sep)
{
  # Splits the qualifiers contained in the string {qua} into a table
  # {qmap} such that {qmap[name]} is the value of the qualifier called
  # {name}. The {qmap} parameter must be an empty array created by the caller.
  #
  # The {name} will include the "/" but not the "=".
  # Text values have all their extra quotes stripped off.
  # Other values are unchanged (apart from blank regularization).
  # A qualifier with no "=" gets the value 1; repeated qualifiers
  # have their values joined with a single blank.

  while (1) {
    # Remove any leading blanks from {qua}:
    gsub(/^[ ]+/, "", qua);

    # Check for termination:
    if (qua == "") { return; }

    # Check for presence of a name:
    if (! match(qua, /^[\/][-_A-Za-z0-9]+/)) { data_error((" no «/{NAME}» in «" qua "»")); }
    name = substr(qua, RSTART, RLENGTH);
    qua = substr(qua, RSTART+RLENGTH);

    # Check and consume the "=":
    if (! match(qua, /^[ ]*[=][ ]*/)) {
      # Missing "=" after "/{name}"; may be a presence/absence qualifier.
      # Set its value to "1" (value "" might be more logical, but is too subtle).
      val = 1;
    } else {
      qua = substr(qua, RSTART+RLENGTH);

      # Check for presence and type of value:
      if (qua ~ /^["]/) {
        # Text value, grab and remove extra double quotes:
        val = "";
        sep = "";
        do {
          if (! match(qua, /^["][^"]*["]/)) { data_error(("malformed value of «" name "» qualifier")); }
          val = ( val sep substr(qua,2,RLENGTH-2) );
          qua = substr(qua, RSTART+RLENGTH);
          # Allow for blanks between quote pairs:
          if (match(qua, /^[ ]+/)) { qua = substr(qua, RSTART+RLENGTH); }
          # Adjacent quoted pieces stand for one embedded double quote:
          sep = "\"";
        } while (qua ~ /^["]/);
      } else {
        # Integer or keyword value, fetch to next "/" or end of string;
        # the pattern can match the empty string, so this should never fail:
        if (! match(qua, /^[^\/]*/)) { prog_error(("duh?")); }
        val = substr(qua, RSTART, RLENGTH);
        qua = substr(qua, RSTART+RLENGTH);
      }

      # Regularize spaces in {val}:
      gsub(/^[ ]+/, "", val);
      gsub(/[ ]+$/, "", val);
      gsub(/[ ][ ]+/, " ", val);
    }

    # Save {val} in {qmap}:
    if (name in qmap) {
      # Multiple qualifiers with same {name}, append them:
      qmap[name] = ( qmap[name] " " val );
    } else {
      qmap[name] = val;
    }
  }
}

function get_qualifier(name,qmap,def)
{
  # Extracts from the qualifier table {qmap} the value of the
  # text-valued qualifier named {name}. The {name} should include the
  # "/" but not the "=". If there is no such qualifier in {qmap},
  # returns the {def} value.

  if (name in qmap) {
    return qmap[name];
  } else {
    return def;
  }
}

function cycle_labels(str,k,   n)
{
  # Shifts the string {str} cyclically to the left by {k} characters.
  # Requires {str} to be non-empty.

  n = length(str);
  k = k % n;
  if (k == 0) {
    return str;
  } else {
    return (substr(str, 1+k) substr(str,1,k));
  }
}

# AUXILIARY PROCEDURES

function arg_error(msg)
{
  # Reports a command-line error {msg}, prints the usage, and aborts.
  printf "** %s\n", msg > "/dev/stderr";
  printf "usage: %s\n", USAGE > "/dev/stderr";
  abort = 1;
  exit abort;
}

function data_warning(msg)
{
  # Reports a non-fatal problem {msg} with the current input line.
  printf "%s:%s: warning: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " «%s»\n", $0 > "/dev/stderr";
}

function data_error(msg)
{
  # Reports a fatal problem {msg} with the current input line and aborts.
  printf "%s:%s: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " «%s»\n", $0 > "/dev/stderr";
  abort = 1;
  exit abort;
}

function prog_error(msg)
{
  # Reports an internal inconsistency {msg} and aborts.
  printf "%s:%s: ** program error: %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}