ncbi_blast_plus: tools/ncbi_blast_plus/blastxml_to

comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 11:4c4a0da938ff draft

Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25. Supports $GALAXY_SLOTS. Includes more tests and heavy use of macros.

author	peterjc
date	Thu, 05 Dec 2013 06:55:59 -0500
parents	70e7dcbf6573
children	623f727cdff1

comparison

equal deleted inserted replaced

-:70e7dcbf6573
+:4c4a0da938ff
 The additional columns offered in the Galaxy BLAST+ wrappers are:
 ====== ============= ===========================================
 Column NCBI name     Description
 ------ ------------- -------------------------------------------
-13 sallseqid     All subject Seq-id(s), separated by a ';'
+13 sallseqid     All subject Seq-id(s), separated by ';'
 14 score         Raw score
 15 nident        Number of identical matches
 16 positive      Number of positive-scoring matches
 17 gaps          Total number of gaps
 18 ppos          Percentage of positive-scoring matches
 20 sframe        Subject frame
 21 qseq          Aligned part of query sequence
 22 sseq          Aligned part of subject sequence
 23 qlen          Query sequence length
 24 slen          Subject sequence length
+25 salltitles    All subject titles, separated by '<>'
 ====== ============= ===========================================
 Most of these fields are given explicitly in the XML file, while others, such
 as the percentage identity and the number of gap openings, must be calculated.
 """
 import sys
 import re
 if "-v" in sys.argv or "--version" in sys.argv:
-print "v0.0.12"
+print "v0.0.22"
 sys.exit(0)
 if sys.version_info[:2] >= ( 2, 5 ):
 try:
 from xml.etree import cElementTree as ElementTree
 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
 if out_fmt == "std":
 extended = False
 elif out_fmt == "x22":
-stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
+stop_err("Format argument x22 has been replaced with ext (extended 25 columns)")
 elif out_fmt == "ext":
 extended = True
 else:
-stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")
+stop_err("Format argument should be std (12 column) or ext (extended 25 columns), not: %r" % out_fmt)
 # get an iterable
 try:
 context = ElementTree.iterparse(in_file, events=("start", "end"))
 # <Hit_id>Subject_1</Hit_id>
 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
 # <Hit_accession>Subject_1</Hit_accession>
 #
 #apparently depending on the parse_deflines switch
+#
+#Or, with BLAST 2.2.28+ can get this,
+# <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
+# <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
+# <Hit_accession>2</Hit_accession>
 sseqid = hit.findtext("Hit_id").split(None,1)[0]
 hit_def = sseqid + " " + hit.findtext("Hit_def")
 if re_default_subject_id.match(sseqid) \
 and sseqid == hit.findtext("Hit_accession"):
 #Place holder ID, take the first word of the subject definition
+hit_def = hit.findtext("Hit_def")
+sseqid = hit_def.split(None,1)[0]
+if sseqid.startswith("gnl|BL_ORD_ID|") \
+and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
+#Alternative place holder ID, again take the first word of hit_def
 hit_def = hit.findtext("Hit_def")
 sseqid = hit_def.split(None,1)[0]
 # for every <Hsp> within <Hit>
 for hsp in hit.findall("Hit_hsps/Hsp"):
 nident = hsp.findtext("Hsp_identity")
 evalue, #hsp.findtext("Hsp_evalue") in scientific notation
 bitscore, #hsp.findtext("Hsp_bit-score") rounded
 ]
 if extended:
-sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
+try:
+sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(" >"))
+salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >"))
+except IndexError as e:
+stop_err("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
 #print hit_def, "-->", sallseqid
 positive = hsp.findtext("Hsp_positive")
 ppos = "%0.2f" % (100*float(positive)/float(length))
 qframe = hsp.findtext("Hsp_query-frame")
 sframe = hsp.findtext("Hsp_hit-frame")
 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
 q_seq,
 h_seq,
 str(qlen),
 str(slen),
+salltitles,
 ])
 #print "\t".join(values)
 outfile.write("\t".join(values) + "\n")
 # prevents ElementTree from growing large datastructure
 root.clear()

Mercurial > repos > devteam > ncbi_blast_plus

comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 11:4c4a0da938ff draft