comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 11:4c4a0da938ff draft

Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25. Supports $GALAXY_SLOTS. Includes more tests and heavy use of macros.
author peterjc
date Thu, 05 Dec 2013 06:55:59 -0500
parents 70e7dcbf6573
children 623f727cdff1
comparison
equal deleted inserted replaced
10:70e7dcbf6573 11:4c4a0da938ff
29 The additional columns offered in the Galaxy BLAST+ wrappers are: 29 The additional columns offered in the Galaxy BLAST+ wrappers are:
30 30
31 ====== ============= =========================================== 31 ====== ============= ===========================================
32 Column NCBI name Description 32 Column NCBI name Description
33 ------ ------------- ------------------------------------------- 33 ------ ------------- -------------------------------------------
34 13 sallseqid All subject Seq-id(s), separated by a ';' 34 13 sallseqid All subject Seq-id(s), separated by ';'
35 14 score Raw score 35 14 score Raw score
36 15 nident Number of identical matches 36 15 nident Number of identical matches
37 16 positive Number of positive-scoring matches 37 16 positive Number of positive-scoring matches
38 17 gaps Total number of gaps 38 17 gaps Total number of gaps
39 18 ppos Percentage of positive-scoring matches 39 18 ppos Percentage of positive-scoring matches
41 20 sframe Subject frame 41 20 sframe Subject frame
42 21 qseq Aligned part of query sequence 42 21 qseq Aligned part of query sequence
43 22 sseq Aligned part of subject sequence 43 22 sseq Aligned part of subject sequence
44 23 qlen Query sequence length 44 23 qlen Query sequence length
45 24 slen Subject sequence length 45 24 slen Subject sequence length
46 25 salltitles All subject titles, separated by '<>'
46 ====== ============= =========================================== 47 ====== ============= ===========================================
47 48
48 Most of these fields are given explicitly in the XML file, others some like 49 Most of these fields are given explicitly in the XML file, others some like
49 the percentage identity and the number of gap openings must be calculated. 50 the percentage identity and the number of gap openings must be calculated.
50 51
61 """ 62 """
62 import sys 63 import sys
63 import re 64 import re
64 65
65 if "-v" in sys.argv or "--version" in sys.argv: 66 if "-v" in sys.argv or "--version" in sys.argv:
66 print "v0.0.12" 67 print "v0.0.22"
67 sys.exit(0) 68 sys.exit(0)
68 69
69 if sys.version_info[:2] >= ( 2, 5 ): 70 if sys.version_info[:2] >= ( 2, 5 ):
70 try: 71 try:
71 from xml.etree import cElementTree as ElementTree 72 from xml.etree import cElementTree as ElementTree
87 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)") 88 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
88 89
89 if out_fmt == "std": 90 if out_fmt == "std":
90 extended = False 91 extended = False
91 elif out_fmt == "x22": 92 elif out_fmt == "x22":
92 stop_err("Format argument x22 has been replaced with ext (extended 24 columns)") 93 stop_err("Format argument x22 has been replaced with ext (extended 25 columns)")
93 elif out_fmt == "ext": 94 elif out_fmt == "ext":
94 extended = True 95 extended = True
95 else: 96 else:
96 stop_err("Format argument should be std (12 column) or ext (extended 24 columns)") 97 stop_err("Format argument should be std (12 column) or ext (extended 25 columns), not: %r" % out_fmt)
97 98
98 99
99 # get an iterable 100 # get an iterable
100 try: 101 try:
101 context = ElementTree.iterparse(in_file, events=("start", "end")) 102 context = ElementTree.iterparse(in_file, events=("start", "end"))
155 # <Hit_id>Subject_1</Hit_id> 156 # <Hit_id>Subject_1</Hit_id>
156 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> 157 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
157 # <Hit_accession>Subject_1</Hit_accession> 158 # <Hit_accession>Subject_1</Hit_accession>
158 # 159 #
159 #apparently depending on the parse_deflines switch 160 #apparently depending on the parse_deflines switch
161 #
162 #Or, with BLAST 2.2.28+ can get this,
163 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
164 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
165 # <Hit_accession>2</Hit_accession>
160 sseqid = hit.findtext("Hit_id").split(None,1)[0] 166 sseqid = hit.findtext("Hit_id").split(None,1)[0]
161 hit_def = sseqid + " " + hit.findtext("Hit_def") 167 hit_def = sseqid + " " + hit.findtext("Hit_def")
162 if re_default_subject_id.match(sseqid) \ 168 if re_default_subject_id.match(sseqid) \
163 and sseqid == hit.findtext("Hit_accession"): 169 and sseqid == hit.findtext("Hit_accession"):
164 #Place holder ID, take the first word of the subject definition 170 #Place holder ID, take the first word of the subject definition
171 hit_def = hit.findtext("Hit_def")
172 sseqid = hit_def.split(None,1)[0]
173 if sseqid.startswith("gnl|BL_ORD_ID|") \
174 and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
175 #Alternative place holder ID, again take the first word of hit_def
165 hit_def = hit.findtext("Hit_def") 176 hit_def = hit.findtext("Hit_def")
166 sseqid = hit_def.split(None,1)[0] 177 sseqid = hit_def.split(None,1)[0]
167 # for every <Hsp> within <Hit> 178 # for every <Hsp> within <Hit>
168 for hsp in hit.findall("Hit_hsps/Hsp"): 179 for hsp in hit.findall("Hit_hsps/Hsp"):
169 nident = hsp.findtext("Hsp_identity") 180 nident = hsp.findtext("Hsp_identity")
226 evalue, #hsp.findtext("Hsp_evalue") in scientific notation 237 evalue, #hsp.findtext("Hsp_evalue") in scientific notation
227 bitscore, #hsp.findtext("Hsp_bit-score") rounded 238 bitscore, #hsp.findtext("Hsp_bit-score") rounded
228 ] 239 ]
229 240
230 if extended: 241 if extended:
231 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">")) 242 try:
243 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(" >"))
244 salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >"))
245 except IndexError as e:
246 stop_err("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
232 #print hit_def, "-->", sallseqid 247 #print hit_def, "-->", sallseqid
233 positive = hsp.findtext("Hsp_positive") 248 positive = hsp.findtext("Hsp_positive")
234 ppos = "%0.2f" % (100*float(positive)/float(length)) 249 ppos = "%0.2f" % (100*float(positive)/float(length))
235 qframe = hsp.findtext("Hsp_query-frame") 250 qframe = hsp.findtext("Hsp_query-frame")
236 sframe = hsp.findtext("Hsp_hit-frame") 251 sframe = hsp.findtext("Hsp_hit-frame")
250 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking 265 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
251 q_seq, 266 q_seq,
252 h_seq, 267 h_seq,
253 str(qlen), 268 str(qlen),
254 str(slen), 269 str(slen),
270 salltitles,
255 ]) 271 ])
256 #print "\t".join(values) 272 #print "\t".join(values)
257 outfile.write("\t".join(values) + "\n") 273 outfile.write("\t".join(values) + "\n")
258 # prevents ElementTree from growing large datastructure 274 # prevents ElementTree from growing large datastructure
259 root.clear() 275 root.clear()