#! /usr/bin/gawk -f
# Last edited on 2011-12-15 03:57:46 by stolfi

# Usage: describe-wells.gawk {DATFILE}...
#
# Each {DATFILE} is treated as one extracted Aniseed card. The first
# line of every input file triggers saving of the previous card and
# the reset of the per-card variables.

BEGIN {
  abort = -1;  # Exit status requested by an error routine; negative means "none".

  # Read from the extracted Aniseed card files "NN/NNN.dat":
  split("", ci_anis); # Aniseed card id, indexed by InSitu plate/well code.
  split("", ci_orfs); # Full ORFs, indexed by InSitu plate/well code.
  split("", ci_khns); # KH gene names, indexed by InSitu plate/well code.
  split("", ci_gman); # Manual gene names, indexed by InSitu plate/well code.
  split("", ci_ginf); # Inferred gene names, indexed by InSitu plate/well code.

  alpha = "abcdefghijklmnopqrstuvwxyz"; # Plate row letters.

  ncards = 0;     # Number of ANISEED cards read.
  nnoci = 0;      # Number of ANISEED cards without InSitu codes.
  card_file = ""; # Name of the file that holds the card being collected.
}

# Stop reading input as soon as some error routine has requested an abort:
(abort >= 0) { exit abort; }

# Check for start of new card:
(FNR == 1) {
  if (ncards > 0) { save_card(); }
  start_card();
  ncards++;
}

function start_card() {
  # Called at the beginning of a new card.
  # Clears all card data and remembers which file the card comes from,
  # so that {save_card} can report problems against the right file.
  printf "start of card %s\n", FILENAME > "/dev/stderr";
  card_file = FILENAME;
  anis = ""; orfs = ""; khns = ""; rcis = "";
  gman = ""; ginf = "";
}

function card_warning(msg) {
  # Reports a non-fatal problem with the card currently being saved.
  # Uses {card_file} instead of {FILENAME,FNR}, because {save_card} may
  # run while the first line of the NEXT card file is being processed.
  printf "%s: !! %s\n", card_file, msg > "/dev/stderr";
}

function card_error(msg) {
  # Reports a fatal problem with the card currently being saved
  # and aborts the program with status 1.
  printf "%s: ** %s\n", card_file, msg > "/dev/stderr";
  abort = 1; exit abort;
}

function save_card(   nci,fld,k,rcik,ci_code) {
  # Called at the end of every card.
  # Saves the card data into the {ci_XXX} tables, once for each InSitu
  # plate/row/column code listed in the card's {rcis} entry.
  # The parameters are only local work variables.

  # Drop a trailing comma from the list-valued entries:
  sub(/[,]$/, "", khns);
  sub(/[,]$/, "", rcis);

  # Scan the InSitu plate/row/column codes:
  nci = split(rcis, fld, /[ ]+/);
  if (nci == 0) {
    card_warning(("aniseed card \"" anis "\" without InSitu codes"));
    nnoci++;
  }
  for (k = 1; k <= nci; k++) {
    rcik = fld[k];
    if (rcik !~ /^[0-9][0-9][a-p][0-9][0-9][=][a-z0-9]+$/) {
      card_error(("invalid InSitu clone id \"" rcik "\""));
    }

    # Extract the InSitu plate/row/column code {ci_code}
    # (the part of the clone id before the "="):
    ci_code = rcik;
    sub(/[=].*$/, "", ci_code);
    if (ci_anis[ci_code] != "") {
      card_warning(("previous aniseed card \"" ci_anis[ci_code] "\" for same InSitu code \"" ci_code "\""));
    }

    # Save data for this well, appending to whatever an earlier card
    # may have stored for the same code:
    ci_anis[ci_code] = clean_blanks(ci_anis[ci_code] " " anis);
    ci_orfs[ci_code] = clean_blanks(ci_orfs[ci_code] " " orfs);
    ci_khns[ci_code] = clean_blanks(ci_khns[ci_code] " " khns);
    ci_gman[ci_code] = clean_blanks(ci_gman[ci_code] " " gman);
    ci_ginf[ci_code] = clean_blanks(ci_ginf[ci_code] " " ginf);

    # printf " %-30s A: %s O: %s K: %s M: %s I: %s\n", \
    #   ci_code, \
    #   ci_anis[ci_code], ci_orfs[ci_code], ci_khns[ci_code], ci_gman[ci_code], ci_ginf[ci_code] \
    #   > "/dev/stderr";
  }
}

function clean_blanks(x) {
  # Returns {x} without leading or trailing blanks and with every run
  # of blanks squeezed to a single blank. A result that is exactly
  # "(NONE)" is turned into the empty string.
  sub(/^[ ]+/, "", x);
  sub(/[ ]+$/, "", x);
  gsub(/[ ][ ]+/, " ", x);
  sub(/^[(]NONE[)]$/, "", x);
  return x;
}

# Process lines of a card.
# Discard comment lines and blank lines. A DOS carriage return counts
# as blank here, so that an empty line of a CRLF file is skipped instead
# of being rejected as a badly formatted entry further down:
/^[ \011\015]*([#]|$)/ { next; }

# Cleanup: remove carriage returns, turn tabs into blanks, normalize blanks.
{
  gsub(/[\015]/, "");
  gsub(/[\011]/, " ");
  $0 = clean_blanks($0);
}

/AniseedGeneID:($|[ ])/ {
  if (NF != 2) { data_error(("wrong number of fields")); }
  if (anis != "") { data_warning(("repeated \"" $1 "\" entry")); }
  anis = $2;
  if (anis !~ /^aniseedV3_[0-9]+$/) { data_error(("bad aniseed id = \"" anis "\"")); }
  next;
}

/GeneNameManual:($|[ ])/ {
  if (gman != "") { data_warning(("repeated \"" $1 "\" entry")); }
  gman = grab_card_entry_value($0);
  next;
}

/GeneNameInferred:($|[ ])/ {
  if (ginf != "") { data_warning(("repeated \"" $1 "\" entry")); }
  ginf = grab_card_entry_value($0);
  next;
}

/FullORFClone:($|[ ])/ {
  if (orfs != "") { data_warning(("repeated \"" $1 "\" entry")); }
  orfs = grab_card_entry_value($0);
  next;
}

/TranscriptModels_KH:($|[ ])/ {
  if (khns != "") { data_warning(("repeated \"" $1 "\" entry")); }
  khns = grab_card_entry_value($0);
  next;
}

/RCI:($|[ ])/ {
  if (rcis != "") { data_warning(("repeated \"" $1 "\" entry")); }
  rcis = grab_card_entry_value($0);
  next;
}

# Entries that are recognized but not used:
/CiproID:($|[ ])/ { next; }
/TranscriptModels_EN:($|[ ])/ { next; }
/TranscriptModels_KY:($|[ ])/ { next; }
/TranscriptModels_CI:($|[ ])/ { next; }
/Notice:($|[ ])/ { next; }

# Anything that reaches this point matched none of the known entries:
{ data_error(("bad line format")); }

function grab_card_entry_value(lin,   v) {
  # Grabs the value field of an aniseed card entry, which
  # may be zero or more fields.
  # Checks for quotes, turns them into apostrophes.
  # {v} is only a local work variable.
  v = lin;
  sub(/^[A-Za-z0-9_]+[:][ ]*/, "", v);  # Remove the leading "Key:" part.
  v = clean_blanks(v);
  if (v ~ /["]/) {
    data_warning(("quotes in value field \"" v "\" turned into apostrophes"));
    gsub(/["]/, "'", v);
  }
  return v;
}

END {
  if (abort >= 0) { exit(abort); }

  # A card is normally saved when the first line of the next file is
  # seen; the last card has no successor, so it must be saved here.
  if (ncards > 0) { save_card(); }

  printf "read %d cards\n", ncards > "/dev/stderr";
  output_csv_header();

  split("", ci_here); # Well presence either "1" or undef; indexed by ints {plt,row,col}.
  max_plt = 0;
  max_row = 16;
  max_col = 24;

  ncis = 0;   # Number of InSitu wells present in the cards.
  nnogn = 0;  # InSitu wells without any manual or inferred gene names.
  nnokh = 0;  # InSitu wells without any KH gene names.
  nnoor = 0;  # InSitu wells without any full ORFs.

  for (ci_name in ci_anis) {
    ncis++;
    output_csv( \
      ci_name, ci_anis[ci_name], ci_orfs[ci_name], ci_khns[ci_name], \
      ci_gman[ci_name], ci_ginf[ci_name] );

    # Extract integer well indices {plt,row,col} from {ci_name}:
    plt = ci_name;
    gsub(/[a-z][0-9][0-9]$/, "", plt);
    if (plt !~ /^[0-9][0-9]$/) { prog_error(("plt extraction \"" ci_name " " plt "\"")); }
    plt = plt + 0;
    if ((plt < 1) || (plt > 99)) { prog_error(("plt conversion \"" ci_name " " plt "\"")); }
    if (plt > max_plt) { max_plt = plt; }

    row = ci_name;
    gsub(/^[0-9][0-9]/, "", row);
    gsub(/[0-9][0-9]$/, "", row);
    if (row !~ /^[a-z]$/) { prog_error(("row extraction \"" ci_name " " row "\"")); }
    row = index(alpha, row);  # Row letter to 1-based row number.
    if ((row < 1) || (row > 26)) { prog_error(("row conversion \"" ci_name " " row "\"")); }
    if (row > max_row) { max_row = row; }

    col = ci_name;
    gsub(/^[0-9][0-9][a-z]/, "", col);
    if (col !~ /^[0-9][0-9]$/) { prog_error(("col extraction \"" ci_name " " col "\"")); }
    col = col + 0;
    if ((col < 1) || (col > 26)) { prog_error(("col conversion \"" ci_name " " col "\"")); }
    if (col > max_col) { max_col = col; }

    ci_here[plt,row,col] = 1;

    if ((ci_gman[ci_name] == "") && (ci_ginf[ci_name] == "")) {
      printf "!! well %-5s (A: %s) has no gene names\n", \
        ci_name, ci_anis[ci_name] \
        > "/dev/stderr";
      nnogn++;
    }
    if (ci_orfs[ci_name] == "") { nnoor++; }
    if (ci_khns[ci_name] == "") { nnokh++; }
  }

  printf "there were %d InSitu clone codes.\n", ncis > "/dev/stderr";
  printf "there were %d cards without InSitu clone codes.\n", nnoci > "/dev/stderr";
  printf "there were %d InSitu clones without manual or inferred gene names.\n", nnogn > "/dev/stderr";
  printf "there were %d InSitu clones without KH gene names.\n", nnokh > "/dev/stderr";
  printf "there were %d InSitu clones without full ORFs.\n", nnoor > "/dev/stderr";
  printf "\n" > "/dev/stderr";
  printf "max plate = %d\n", max_plt > "/dev/stderr";
  printf "max row = %d (%s)\n", max_row, substr(alpha,max_row,1) > "/dev/stderr";
  printf "max col = %d\n", max_col > "/dev/stderr";

  show_plates();
}

function show_plates(   plt,col,row) {
  # Writes to stderr, for each plate from 1 to {max_plt}, a grid of
  # {max_row} rows by {max_col} columns showing "@" for wells recorded
  # in {ci_here} and a middle dot for the others.
  # The parameters are only local work variables.
  printf "PLATES\n" > "/dev/stderr";
  for (plt = 1; plt <= max_plt; plt++) {
    printf "plate %02d\n", plt > "/dev/stderr";

    # Column number header, tens digit then units digit. The two-blank
    # indent matches the "letter + blank" prefix of the grid rows below.
    printf "  " > "/dev/stderr";
    for (col = 1; col <= max_col; col++) { printf "%d", int(col/10) > "/dev/stderr"; }
    printf "\n" > "/dev/stderr";
    printf "  " > "/dev/stderr";
    for (col = 1; col <= max_col; col++) { printf "%d", col % 10 > "/dev/stderr"; }
    printf "\n" > "/dev/stderr";
    printf "\n" > "/dev/stderr";

    for (row = 1; row <= max_row; row++) {
      printf "%s ", substr(alpha,row,1) > "/dev/stderr";
      for (col = 1; col <= max_col; col++) {
        printf "%s", (((plt,row,col) in ci_here) ? "@" : "·") > "/dev/stderr";
      }
      printf "\n" > "/dev/stderr";
    }
    printf "\n" > "/dev/stderr";
  }
}

function output_csv_header() {
  # Writes the CSV header record (record type 0) to standard output.
  printf "0,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"\n", \
    "InSitu clone", "ANISEED IDs", "Full ORFs", "Genes (KH)", "Genes (manual)", "Genes (inferred)";
}

function output_csv(ciname,anis,orfs,khns,gman,ginf) {
  # Writes one CSV data record (record type 1) to standard output for
  # the well {ciname}. The "aniseedV3_" prefix of the card ids is
  # abbreviated to "A_" in the output only; the parameters shadow the
  # global variables of the same names.
  if (ciname !~ /^[0-9][0-9][a-p][0-9][0-9]$/) { prog_error(("bad InSitu clone code")); }
  gsub(/aniseedV3_/, "A_", anis);
  printf "1,\"[%s]\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"\n", \
    ciname, anis, orfs, khns, gman, ginf;
}

function data_warning(msg) {
  # Reports a non-fatal input problem at the current file, line and record.
  printf "%s:%d: !! %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " %s\n", $0 > "/dev/stderr";
}

function data_error(msg) {
  # Reports a fatal input problem at the current file, line and record,
  # then aborts with status 1.
  printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr";
  printf " %s\n", $0 > "/dev/stderr";
  abort = 1; exit abort;
}

function tbl_error(f,n,lin,msg) {
  # Reports a fatal problem at line {n} of file {f}, whose text is {lin},
  # then aborts with status 1.
  printf "%s:%d: %s\n", f, n, msg > "/dev/stderr";
  printf " %s\n", lin > "/dev/stderr";
  abort = 1; exit 1;
}

function prog_error(msg) {
  # Reports an internal inconsistency of the program and aborts with status 1.
  printf "%s:%d: ** PROGRAM ERROR - %s\n", FILENAME, FNR, msg > "/dev/stderr";
  abort = 1; exit abort;
}