diff tools/ncbi_blast_plus/blastxml_to_tabular.py @ 11:4c4a0da938ff draft

Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25. Supports $GALAXY_SLOTS. Includes more tests and heavy use of macros.
author peterjc
date Thu, 05 Dec 2013 06:55:59 -0500
parents 70e7dcbf6573
children 623f727cdff1
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py	Mon Sep 23 06:14:13 2013 -0400
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py	Thu Dec 05 06:55:59 2013 -0500
@@ -31,7 +31,7 @@
 ====== ============= ===========================================
 Column NCBI name     Description
 ------ ------------- -------------------------------------------
-    13 sallseqid     All subject Seq-id(s), separated by a ';'
+    13 sallseqid     All subject Seq-id(s), separated by ';'
     14 score         Raw score
     15 nident        Number of identical matches
     16 positive      Number of positive-scoring matches
@@ -43,6 +43,7 @@
     22 sseq          Aligned part of subject sequence
     23 qlen          Query sequence length
     24 slen          Subject sequence length
+    25 salltitles    All subject titles, separated by '<>'
 ====== ============= ===========================================
 
 Most of these fields are given explicitly in the XML file, others some like
@@ -63,7 +64,7 @@
 import re
 
 if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.12"
+    print "v0.0.22"
     sys.exit(0)
 
 if sys.version_info[:2] >= ( 2, 5 ):
@@ -89,11 +90,11 @@
 if out_fmt == "std":
     extended = False
 elif out_fmt == "x22":
-    stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
+    stop_err("Format argument x22 has been replaced with ext (extended 25 columns)")
 elif out_fmt == "ext":
     extended = True
 else:
-    stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")
+    stop_err("Format argument should be std (12 column) or ext (extended 25 columns), not: %r" % out_fmt)
 
 
 # get an iterable
@@ -157,6 +158,11 @@
             # <Hit_accession>Subject_1</Hit_accession>
             #
             #apparently depending on the parse_deflines switch
+            #
+            #Or, with BLAST 2.2.28+ can get this,
+            # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
+            # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
+            # <Hit_accession>2</Hit_accession>
             sseqid = hit.findtext("Hit_id").split(None,1)[0]
             hit_def = sseqid + " " + hit.findtext("Hit_def")
             if re_default_subject_id.match(sseqid) \
@@ -164,6 +170,11 @@
                 #Place holder ID, take the first word of the subject definition
                 hit_def = hit.findtext("Hit_def")
                 sseqid = hit_def.split(None,1)[0]
+            if sseqid.startswith("gnl|BL_ORD_ID|") \
+            and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
+                #Alternative place holder ID, again take the first word of hit_def
+                hit_def = hit.findtext("Hit_def")
+                sseqid = hit_def.split(None,1)[0]
             # for every <Hsp> within <Hit>
             for hsp in hit.findall("Hit_hsps/Hsp"):
                 nident = hsp.findtext("Hsp_identity")
@@ -228,7 +239,11 @@
                           ]
 
                 if extended:
-                    sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
+                    try:
+                        sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(" >"))
+                        salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >"))
+                    except IndexError as e:
+                        stop_err("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
                     #print hit_def, "-->", sallseqid
                     positive = hsp.findtext("Hsp_positive")
                     ppos = "%0.2f" % (100*float(positive)/float(length))
@@ -252,6 +267,7 @@
                                    h_seq,
                                    str(qlen),
                                    str(slen),
+                                   salltitles,
                                    ])
                 #print "\t".join(values) 
                 outfile.write("\t".join(values) + "\n")