#! /usr/bin/gawk -f # Last edited on 2011-12-15 04:03:29 by stolfi # Usage: match-ps-KH-to-cards.gawk -v collapse={BOOL} {DATFILE}... # Requires file "pb-KH-dir.txt" in current directory. BEGIN{ abort = -1; if (collapse == "") { arg_error(("must define {collapse}")); } if (collapse !~ /^[01]$/) { arg_error(("invalid {collapse} = \"" collapse "\"")); } collaps = collapse + 0; # Read from the probeset-to-KH table: split("", ps_scaff); # Scaffold of probeset, indexed by probeset name. split("", ps_start); # Start of probeset, indexed by probeset name. split("", ps_end); # End of probeset, indexed by probeset name. split("", ps_khnames); # KH gen names names, indexed by probeset name. np = read_ps_KH_table("ps-KH-dir.txt",ps_scaff,ps_start,ps_end,ps_khnames); # Read from the extracted Aniseed card files "NN/NNN.dat": split("", kh_anis); # Aniseed card id, indexed by KH gene name. split("", kh_orfs); # Full ORFs, indexed by KH gene name. split("", kh_rcis); # InSitu clones, indexed by KH gene name. split("", kh_gman); # Manual gene names, indexed by KH gene name. split("", kh_ginf); # Inferred gene names, indexed by KH gene name. ncards = 0; } function read_ps_KH_table(fname,ps_scaff,ps_start,ps_end,ps_khnames, ntbl,nlin,lin,fld,nfld,ps_name,tmp) { ntbl=0; # Number of entries in table. nlin=0; # Number of lines read. while((getline lin < fname) > 0) { nlin++; if (! match(lin, /^[ \011]*([\#]|$)/)) { gsub(/[\011]/, " ", lin); lin = clean_blanks(lin); nfld = split(lin, fld, /[ ]+/); if (nfld != 5) { tbl_error(fname, nlin, lin, "bad number of fields in table"); } if (fld[2] !~ /^Scaffold_[0-9]+$/) { tbl_error(fname, nlin, lin, "bad scaffold name"); } if ((fld[3] !~ /^[0-9]+$/) || (fld[4] !~ /^[0-9]+$/)) { tbl_error(fname, nlin, lin, ("bad position range = \"" fld[3] " " fld[4] "\"")); } ps_name = clean_blanks(fld[1]); if (ps_name in ps_start) { tbl_error(fname, nlin, lin, ("repeated probeset name = \"" ps_name "\"")); } ps_scaff[ps_name] = clean_blanks(fld[2]); ps_start[ps_name] = clean_blanks(fld[3]); ps_end[ps_name] = clean_blanks(fld[4]); ps_khnames[ps_name] = clean_blanks(fld[5]); # printf "%s %s %d %d %s\n", ps_name, \ # ps_scaff[ps_name], ps_start[ps_name], ps_end[ps_name], ps_khnames[ps_name] > "/dev/stderr"; ntbl++; } } if (ERRNO != "0") { tbl_error(fname, nlin, ERRNO); } close (fname); if (nlin == 0) { arg_error(("file \"" fname "\" empty or missing")); } printf "loaded %6d map pairs from %s\n", ntbl, fname > "/dev/stderr" return ntbl; } (abort >= 0) { exit abort; } # Check for start of new card: (FNR == 1) { if (ncards > 0) { save_card(); } start_card(); ncards++; } function start_card( ) { # Called at the beginning of a new card. # Clears all card data. printf "start of card %s\n", FILENAME > "/dev/stderr"; anis = ""; orfs = ""; khns = ""; rcis = ""; gman = ""; ginf = ""; } function save_card( nkh,fld,k,kh_name) { # Called at the end of every card. # Saves the card data into the {kh_XXX} tables. gsub(/[,]$/, "", khns); nkh = split(khns,fld,/[ ]+/); for (k = 1; k <= nkh; k++) { kh_name = fld[k]; if (kh_anis[kh_name] != "") { data_warning(("previous aniseed card \"" kh_anis[kh_name] "\" for same KH name \"" kh_name "\"")); } kh_anis[kh_name] = clean_blanks(kh_anis[kh_name] " " anis); kh_orfs[kh_name] = clean_blanks(kh_orfs[kh_name] " " orfs); kh_rcis[kh_name] = clean_blanks(kh_rcis[kh_name] " " rcis); kh_gman[kh_name] = clean_blanks(kh_gman[kh_name] " " gman); kh_ginf[kh_name] = clean_blanks(kh_ginf[kh_name] " " ginf); # printf " %-30s A: %s O: %s R: %s M: %s I: %s\n", \ # kh_name, \ # kh_anis[kh_name], kh_orfs[kh_name], kh_rcis[kh_name], kh_gman[kh_name], kh_ginf[kh_name] \ # > "/dev/stderr"; } } function clean_blanks(x) { gsub(/^[ ]+/, "", x); gsub(/[ ]+$/, "", x); gsub(/[ ][ ]+/, " ", x); gsub(/^[(]NONE[)]$/, "", x); return x; } # Process lines of a card. # Discard comment lines: /^[ \011]*([\#]|$)/ { next; } # Cleanup: // { gsub(/[\015]/, ""); gsub(/[\011]/, " "); $0 = clean_blanks($0); } /AniseedGeneID:($|[ ])/ { if (NF != 2) { data_error(("wrong number of fields")); } if (anis != "") { data_warning(("repeated \"" $1 "\" entry")); } anis = $2; if (anis !~ /^aniseedV3_[0-9]+$/) { data_error(("bad aniseed id = \"" anis "\"")); } next; } /GeneNameManual:($|[ ])/ { if (gman != "") { data_warning(("repeated \"" $1 "\" entry")); } gman = grab_card_entry_value($0); next; } /GeneNameInferred:($|[ ])/ { if (ginf != "") { data_warning(("repeated \"" $1 "\" entry")); } ginf = grab_card_entry_value($0); next; } /FullORFClone:($|[ ])/ { if (orfs != "") { data_warning(("repeated \"" $1 "\" entry")); } orfs = grab_card_entry_value($0); next; } /TranscriptModels_KH:($|[ ])/ { if (khns != "") { data_warning(("repeated \"" $1 "\" entry")); } khns = grab_card_entry_value($0); next; } /RCI:($|[ ])/ { if (rcis != "") { data_warning(("repeated \"" $1 "\" entry")); } rcis = grab_card_entry_value($0); next; } /CiproID:($|[ ])/ { next; } /TranscriptModels_EN:($|[ ])/ { next; } /TranscriptModels_KY:($|[ ])/ { next; } /TranscriptModels_CI:($|[ ])/ { next; } /Notice:($|[ ])/ { next; } // { data_error(("bad line format")); } function grab_card_entry_value(lin, v) { # Grabs the value field of an aniseed card entry, which # may be zero or more fields. # Checks for quotes, turns them into apostrophes. v = lin; gsub(/^[A-Za-z0-9_]+[:][ ]*/, "", v); v = clean_blanks(v); if (v ~ /["]/) { data_warning(("quotes in value field \"" v "\" turned into apostrophes")); gsub(/["]/, "'", v); } return v; } END{ if (abort >= 0) { exit(abort); } printf "read %d cards\n", ncards > "/dev/stderr"; output_csv_header(); nps = 0; # Number of probesets. npskh = 0; # Number of (KH model,probeset) pairs. nnokh = 0; # Probesets without any KH models. nnocd = 0; # Probesets without ANISEED cards. nknocd = 0; # Probeset/KH pairs without ANISEED cards. for (ps_name in ps_scaff) { nps++; tmp = ps_khnames[ps_name]; # Comma-separated. gsub(/^[,]+/, "", tmp); gsub(/[,]+$/, "", tmp); nkh = split(tmp,fld,/[,]/); npskh += nkh; if (nkh == 0) { output_csv( \ ps_name, ps_scaff[ps_name], ps_start[ps_name], ps_end[ps_name], \ "", "", "", "", "", "" ); printf "probeset %-30s (%-20s %12d %12d) has no KH gene models\n", \ ps_name, ps_scaff[ps_name], ps_start[ps_name], ps_end[ps_name] \ > "/dev/stderr"; nnokh ++; } else if (collapse) { # Consolidate data for all KH models of this probeset: khns = ""; anis = ""; orfs = ""; rcis = ""; gman = ""; ginf = ""; for (k = 1; k <= nkh; k++) { kh_name = fld[k]; khns = clean_blanks(khns " " kh_name); anis = clean_blanks(anis " " kh_anis[kh_name]); orfs = clean_blanks(orfs " " kh_orfs[kh_name]); rcis = clean_blanks(rcis " " kh_rcis[kh_name]); gman = clean_blanks(gman " " kh_gman[kh_name]); ginf = clean_blanks(ginf " " kh_ginf[kh_name]); if (kh_anis[kh_name] == "") { nknocd++; } } if (anis == "") { nnocd++; } output_csv( \ ps_name, ps_scaff[ps_name], ps_start[ps_name], ps_end[ps_name], \ khns, anis, orfs, rcis, gman, ginf ); } else { # Write a separate entry for each (KH model,probeset) pair: some_anis = 0; for (k = 1; k <= nkh; k++) { kh_name = fld[k]; khns = kh_name; anis = kh_anis[kh_name]; orfs = kh_orfs[kh_name]; rcis = kh_rcis[kh_name]; gman = kh_gman[kh_name]; ginf = kh_ginf[kh_name]; if (anis == "") { nknocd++; } else { some_anis = 1; } output_csv( \ ps_name, ps_scaff[ps_name], ps_start[ps_name], ps_end[ps_name], \ khns, anis, orfs, rcis, gman, ginf ); } if (some_anis == 0) { nnocd++; } } } printf "there were %d probesets.\n", nps > "/dev/stderr"; printf "there were %d probesets without KH models.\n", nnokh > "/dev/stderr"; printf "there were %d probesets without ANISEED cards.\n", nnocd > "/dev/stderr"; printf "there were %d (probesets,KH model) pairs.\n", npskh > "/dev/stderr"; printf "there were %d (probesets,KH model) pairs without ANISEED cards.\n", nknocd > "/dev/stderr"; } function output_csv_header() { if (collapse == 0) { # Separate line for each (Probeset,KH model) pair: printf "0,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"\n", \ "KH model", "Probeset", \ "Scaffold", "Start", "End", \ "ANISEED IDs", "Full ORFs", "InSitu clones", \ "Genes (manual)", "Genes (inferred)"; } else { # One line for each probeset, all KH models: printf "0,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"\n", \ "Probeset", \ "Scaffold", "Start", "End", \ "ANISEED IDs", "KH models", "Full ORFs", "InSitu clones", \ "Genes (manual)", "Genes (inferred)"; } } function output_csv(psname,psscaff,psstart,psend,khns,anis,orfs,rcis,gman,ginf ) { if (psscaff !~ /^Scaffold_[0-9]+$/) { prog_error(("bad probeset scaffold")); } if (psstart !~ /^[0-9]+$/) { prog_error(("bad probeset start")); } if (psend !~ /^[0-9]+$/) { prog_error(("bad probeset end")); } gsub(/Scaffold_/, "S_", psscaff); gsub(/aniseedV3_/, "A_", anis); if (collapse == 0) { # Separate line for each (Probeset,KH model) pair: printf "1,\"%s\",\"%s\",\"%s\",%d,%d,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"\n", \ khns, psname, \ psscaff, psstart, psend, \ anis, orfs, rcis, gman, ginf; } else { # One line for each probeset, all KH models: printf "1,\"%s\",\"%s\",%d,%d,\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"\n", \ psname, \ psscaff, psstart, psend, \ anis, khns, orfs, rcis, gman, ginf; } } function arg_error(msg) { printf "** %s\n", msg > "/dev/stderr"; abort = 1; exit abort; } function data_warning(msg) { printf "%s:%d: !! %s\n", FILENAME, FNR, msg > "/dev/stderr"; printf " %s\n", $0 > "/dev/stderr"; } function data_error(msg) { printf "%s:%d: ** %s\n", FILENAME, FNR, msg > "/dev/stderr"; printf " %s\n", $0 > "/dev/stderr"; abort = 1; exit abort; } function tbl_error(f,n,lin,msg) { printf "%s:%d: %s\n", f, n, msg > "/dev/stderr"; printf " %s\n", lin > "/dev/stderr"; abort = 1; exit 1 } function prog_error(msg) { printf "%s:%d: ** PROGRAM ERROR - %s\n", FILENAME, FNR, msg > "/dev/stderr"; abort = 1; exit abort; }