#! /usr/bin/gawk -f
# Last edited on 2011-12-15 02:45:54 by stolfi

# Reads text records and writes one normalized block per record to stdout.
#
# Input: lines of the form "Tag: value value ...", with records separated
# by a line consisting of exactly "===".  Judging from the patterns below,
# the input is text extracted from HTML "Gene Card" pages (presumably from
# the Aniseed database -- confirm against the producer of the input).
#
# Processing of each line:
#   1. Cleanup: blanks are normalized, known junk lines are blanked out,
#      and the verbose field titles are rewritten to one-word tags.
#   2. Parsing: each recognized tag appends its values to the current
#      record; any unrecognized non-blank line is a fatal data error.
#
# Output, for every record that has an AniseedGeneID:
#   AniseedGeneID: ...
#   GeneNameManual: ...
#   GeneNameInferred: ...
#   CiproID: ...
#   FullORFClone: ...
#   TranscriptModels_EN: ...
#   TranscriptModels_KH: ...
#   TranscriptModels_KY: ...
#   TranscriptModels_CI: ...
#   RCI: ...
#   Notice: ...
#   (blank line)
# Records without an AniseedGeneID are not printed.
#
# Diagnostics go to /dev/stderr.  A data error stops the run with exit
# status 1 and the record being read is not printed.

BEGIN {
  abort = -1;   # Exit status requested by data_error; negative means "none".
  clear_it();
}

(abort >= 0) { exit abort; }

# Cleanup:
// {
  gsub(/^[ ]*/, "");
  gsub(/[ ]*$/, "");
  gsub(/[ ][ ]+/, " ");
  gsub(/[ ]+[:]/, ":");
  gsub(/[ ]+[,\/][ ]+/, " ");

  # Junk lines that got through the HTML splitter:
  gsub(/^[ ]*mouse, human and drosophila.*$/, "");
  gsub(/^[ ]*Ortholog name.*$/, "");
  gsub(/^[ ]*Find Cis-regulatory regions.*$/, "");
  gsub(/^[ ]*Cis-regulatory regions: .*$/, "");
  gsub(/^[ ]*Genome Browser .*Orhtolog prediction /, "");
  gsub(/^[ ]*ID[*]?[ ]*$/, "");
  gsub(/^[ ]*IP ID[*]?[ ]*$/, "");
  gsub(/^[ ]*GO ID[*]?[ ]*$/, "");
  gsub(/^[ ]*Name[*]?[ ]*$/, "");
  gsub(/^[ ]*Score[*]?[ ]*$/, "");
  gsub(/^[ ]*Type[*]?[ ]*$/, "");
  gsub(/^[ ]*Term[*]? No match[*]?[ ]*$/, "");
  gsub(/^[ ]*Term[*]?[ ]*$/, "");
  gsub(/^[ ]*Origin[*]?[ ]*$/, "");
  gsub(/^[ ]*Protein familly[*]?[ ]*$/, "");
  gsub(/^[ ]*Species[*]?[ ].*$/, "");
  gsub(/^[ ]*Database[*]?[ ].*$/, "");
  gsub(/^[ ]*Gene Ontology[*]?[ ].*$/, "");
  gsub(/^[ ]*Mus musculus [*]?annotations.*$/, "");
  gsub(/^[ ]*Homo sapiens [*]?annotations.*$/, "");
  gsub(/^[ ]*Drosophila melanogaster [*]?annotations.*$/, "");
  gsub(/^[ ]*Interpro[*]?[ ].*$/, "");
  # gsub(/(HIGHLY |WEAKLY |)SIMILAR TO [A-Z0-9_]+ [(][A-Z0-9]+[)]/, "");
  # gsub(/[(]Eval: [-0-9.e]+[)]/, "");

  # Normalize blanks again, since the deletions above may leave some behind:
  gsub(/^[ ]*/, "");
  gsub(/[ ]*$/, "");
  gsub(/[ ][ ]+/, " ");

  # Rewrite the field titles as single-word tags:
  gsub(/Gene Name [(]Manual Annotation[)]/, "GeneNameManual");
  gsub(/Gene Name [(]Inferred[)]/, "GeneNameInferred");
  gsub(/Aniseed Gene id/, "AniseedGeneID");
  gsub(/Cipro id/, "CiproID");
  gsub(/Transcript models/, "TranscriptModels");
  gsub(/Corresponding clone in gene collection 1 [(]N. Satoh[)][ ]*/, "");
  gsub(/Full ORF Gateway-compatible clone[ ]*/, "FullORFClone: ");
}

/Gene Card/ { next; }

/^ *$/ { next; }

# Parsing:

# Record separator: print the record collected so far and start a new one.
/^===$/ {
  dump_it();
  clear_it();
  next;
}

/^AniseedGeneID:([ ]|$)/ {
  if (NF != 2) { data_error(("bad NF = " NF)); }
  AGI = $2;
  if (AGI !~ /^aniseedV3_[0-9]+$/) { data_error(("bad AniseedGeneID = " AGI)); }
  next;
}

# Repeated notices are reported and then joined with a single blank.
/^Notice:([ ]|$)/ {
  if (OBS != "") { data_warning(("multiple \"Notice:\" fields")); }
  lin = $0;
  gsub(/^Notice:[ ]*/, "", lin);
  OBS = (OBS == "" ? lin : (OBS " " lin));
  next;
}

/^GeneNameManual:([ ]|$)/ {
  nGNM = get_field(nGNM, GNM);
  next;
}

/^GeneNameInferred:([ ]|$)/ {
  nGNI = get_field(nGNI, GNI);
  next;
}

/^CiproID:([ ]|$)/ {
  nCID = get_field(nCID, CID);
  next;
}

/^FullORFClone:([ ]|$)/ {
  nORF = get_field(nORF, ORF);
  next;
}

/^TranscriptModels:([ ]|$)/ {
  # Remember where the new values start, so that a repeated
  # "TranscriptModels:" line does not classify the earlier values again
  # (which would duplicate them in the per-class lists).
  oTRM = nTRM;
  nTRM = get_field(nTRM, TRM);
  classify_transcripts(oTRM);
  next;
}

# Clone pair line: "<ghost clone code> (R1CiGC<plate+well>)".
# Stored as "<plate+well>=<ghost clone code>".
/R1Ci/ {
  if (NF != 2) { data_error(("bad NF = " NF)); }
  ac = $1;
  bc = $2;
  if (ac !~ /^ci[a-z0-9]+$/) { data_error(("bad ghost clone code = " ac)); }
  if (bc !~ /^[(]R1CiGC[0-9][0-9][a-z][0-9][0-9][)]$/) { data_error(("bad InSitu clone code = " bc)); }
  gsub(/^[(]R1CiGC/, "", bc);
  gsub(/[)]$/, "", bc);
  RCI[nRCI] = (bc "=" ac);
  nRCI++;
  next;
}

# Anything that reaches this point was not recognized by any rule above.
// { data_error(("bad line format")); }

END {
  # After a data error, just propagate the status; do not print the
  # incomplete record.
  if (abort >= 0) { exit abort; }
  dump_it();
}

function clear_it()
{
  # Resets all global per-record variables to the "empty record" state.
  # Each list is an array indexed from 0, paired with its element count.
  AGI = "";
  split("", GNM); nGNM = 0;
  split("", GNI); nGNI = 0;
  split("", CID); nCID = 0;
  split("", ORF); nORF = 0;
  split("", TRM); nTRM = 0;
  split("", TRM_EN); nTRM_EN = 0;
  split("", TRM_KH); nTRM_KH = 0;
  split("", TRM_KY); nTRM_KY = 0;
  split("", TRM_CI); nTRM_CI = 0;
  split("", RCI); nRCI = 0;
  OBS = "";
}

function get_field(nARR,ARR,   n,i,v)
{
  # Appends fields 2..NF of the current line to {ARR}, starting at index
  # {nARR}, and returns the new element count.  Issues a warning (but
  # still appends) if {ARR} already had elements, i.e. if the same tag
  # appeared before in the current record.
  if (nARR != 0) { data_warning(("multiple \"" $1 "\" fields")); }
  n = nARR;
  for (i = 2; i <= NF; i++) {
    v = $(i);
    # printf "a[%d] = %s\n", n, v > "/dev/stderr";
    ARR[n] = v;
    n++;
  }
  return n;
}

function classify_transcripts(first,   i,v)
{
  # Distributes the entries {TRM[first..nTRM-1]} among the per-class lists
  # {TRM_EN,TRM_KH,TRM_KY,TRM_CI}, according to their prefix.  An entry
  # with an unknown prefix is a fatal data error.
  for (i = first; i < nTRM; i++) {
    v = TRM[i];
    if (v ~ /^ENS/) {
      TRM_EN[nTRM_EN] = v; nTRM_EN++;
    } else if (v ~ /^KH/) {
      TRM_KH[nTRM_KH] = v; nTRM_KH++;
    } else if (v ~ /^KYOTOGRAIL/) {
      TRM_KY[nTRM_KY] = v; nTRM_KY++;
    } else if (v ~ /^ci/) {
      TRM_CI[nTRM_CI] = v; nTRM_CI++;
    } else {
      data_error(("unknown TRM class = " v));
    }
  }
}

function dump_it()
{
  # Writes the current record to stdout, followed by a blank line.
  # Does nothing if no AniseedGeneID was seen for this record.
  if (AGI != "") {
    printf "AniseedGeneID: %s\n", AGI;
    print_field("GeneNameManual", nGNM, GNM);
    print_field("GeneNameInferred", nGNI, GNI);
    print_field("CiproID", nCID, CID);
    print_field("FullORFClone", nORF, ORF);
    print_field("TranscriptModels_EN", nTRM_EN, TRM_EN);
    print_field("TranscriptModels_KH", nTRM_KH, TRM_KH);
    print_field("TranscriptModels_KY", nTRM_KY, TRM_KY);
    print_field("TranscriptModels_CI", nTRM_CI, TRM_CI);
    print_field("RCI", nRCI, RCI);
    printf "Notice: %s\n", OBS;
    printf "\n";
  }
}

function print_field(tag,n,v,   i)
{
  # Writes "{tag}:" followed by the elements {v[0..n-1]}, each preceded
  # by a blank, and a newline.
  printf "%s:", tag;
  for (i = 0; i < n; i++) { printf " %s", v[i]; }
  printf "\n";
}

function data_warning(msg)
{
  # Reports a non-fatal problem with the current (cleaned-up) line.
  printf "%s:%d: !! %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " %s\n", $0 > "/dev/stderr";
}

function data_error(msg)
{
  # Reports a fatal problem with the current (cleaned-up) line and stops
  # reading input.  Sets {abort} so that the END rule skips the final dump.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " %s\n", $0 > "/dev/stderr";
  abort = 1;
  exit abort;
}