comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 20:3034ce97dd33 draft

Uploaded v0.1.08, can search multiple local databases, fixes a pipe problem in blastdbcmd, and minor internal changes.
author peterjc
date Mon, 07 Nov 2016 11:31:37 -0500
parents c16c30e9ad5b
children 7538e2bfcd41
comparison
equal deleted inserted replaced
19:7f3c448e119b 20:3034ce97dd33
6 in the Galaxy BLAST+ wrappers). 6 in the Galaxy BLAST+ wrappers).
7 7
8 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart 8 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
9 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which 9 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
10 mean: 10 mean:
11 11
12 ====== ========= ============================================ 12 ====== ========= ============================================
13 Column NCBI name Description 13 Column NCBI name Description
14 ------ --------- -------------------------------------------- 14 ------ --------- --------------------------------------------
15 1 qseqid Query Seq-id (ID of your sequence) 15 1 qseqid Query Seq-id (ID of your sequence)
16 2 sseqid Subject Seq-id (ID of the database hit) 16 2 sseqid Subject Seq-id (ID of the database hit)
64 import re 64 import re
65 import os 65 import os
66 from optparse import OptionParser 66 from optparse import OptionParser
67 67
68 if "-v" in sys.argv or "--version" in sys.argv: 68 if "-v" in sys.argv or "--version" in sys.argv:
69 print "v0.1.04" 69 print "v0.1.08"
70 sys.exit(0) 70 sys.exit(0)
71 71
72 if sys.version_info[:2] >= ( 2, 5 ): 72 if sys.version_info[:2] >= (2, 5):
73 try: 73 try:
74 from xml.etree import cElementTree as ElementTree 74 from xml.etree import cElementTree as ElementTree
75 except ImportError: 75 except ImportError:
76 from xml.etree import ElementTree as ElementTree 76 from xml.etree import ElementTree as ElementTree
77 else: 77 else:
78 from galaxy import eggs 78 from galaxy import eggs
79 import pkg_resources; pkg_resources.require( "elementtree" ) 79 import pkg_resources
80 pkg_resources.require("elementtree")
80 from elementtree import ElementTree 81 from elementtree import ElementTree
81 82
82 def stop_err( msg ):
83 sys.stderr.write("%s\n" % msg)
84 sys.exit(1)
85
86 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]: 83 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]:
87 #False positive if user really has a BLAST XML file called 'std' or 'ext'... 84 # False positive if user really has a BLAST XML file called 'std' or 'ext'...
88 stop_err("""ERROR: The script API has changed, sorry. 85 sys.exit("""ERROR: The script API has changed, sorry.
89 86
90 Instead of the old style: 87 Instead of the old style:
91 88
92 $ python blastxml_to_tabular.py input.xml output.tabular std 89 $ python blastxml_to_tabular.py input.xml output.tabular std
93 90
114 (options, args) = parser.parse_args() 111 (options, args) = parser.parse_args()
115 112
116 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',') 113 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',')
117 114
118 if len(args) < 1: 115 if len(args) < 1:
119 stop_err("ERROR: No BLASTXML input files given; run with --help to see options.") 116 sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.")
120 117
121 out_fmt = options.columns 118 out_fmt = options.columns
122 if out_fmt == "std": 119 if out_fmt == "std":
123 extended = False 120 extended = False
124 cols = None 121 cols = None
125 elif out_fmt == "x22": 122 elif out_fmt == "x22":
126 stop_err("Format argument x22 has been replaced with ext (extended 25 columns)") 123 sys.exit("Format argument x22 has been replaced with ext (extended 25 columns)")
127 elif out_fmt == "ext": 124 elif out_fmt == "ext":
128 extended = True 125 extended = True
129 cols = None 126 cols = None
130 else: 127 else:
131 cols = out_fmt.replace(" ", ",").split(",") #Allow space or comma separated 128 cols = out_fmt.replace(" ", ",").split(",") # Allow space or comma separated
132 #Remove any blank entries due to trailing comma, 129 # Remove any blank entries due to trailing comma,
133 #or annoying "None" dummy value from Galaxy if no columns 130 # or annoying "None" dummy value from Galaxy if no columns
134 cols = [c for c in cols if c and c != "None"] 131 cols = [c for c in cols if c and c != "None"]
135 extra = set(cols).difference(colnames) 132 extra = set(cols).difference(colnames)
136 if extra: 133 if extra:
137 stop_err("These are not recognised column names: %s" % ",".join(sorted(extra))) 134 sys.exit("These are not recognised column names: %s" % ",".join(sorted(extra)))
138 del extra 135 del extra
139 assert set(colnames).issuperset(cols), cols 136 assert set(colnames).issuperset(cols), cols
140 if not cols: 137 if not cols:
141 stop_err("No columns selected!") 138 sys.exit("No columns selected!")
142 extended = max(colnames.index(c) for c in cols) >= 12 #Do we need any higher columns? 139 extended = max(colnames.index(c) for c in cols) >= 12 # Do we need any higher columns?
143 del out_fmt 140 del out_fmt
144 141
145 for in_file in args: 142 for in_file in args:
146 if not os.path.isfile(in_file): 143 if not os.path.isfile(in_file):
147 stop_err("Input BLAST XML file not found: %s" % in_file) 144 sys.exit("Input BLAST XML file not found: %s" % in_file)
148 145
149 146
150 re_default_query_id = re.compile("^Query_\d+$") 147 re_default_query_id = re.compile("^Query_\d+$")
151 assert re_default_query_id.match("Query_101") 148 assert re_default_query_id.match("Query_101")
152 assert not re_default_query_id.match("Query_101a") 149 assert not re_default_query_id.match("Query_101a")
159 156
160 157
161 def convert(blastxml_filename, output_handle): 158 def convert(blastxml_filename, output_handle):
162 blast_program = None 159 blast_program = None
163 # get an iterable 160 # get an iterable
164 try: 161 try:
165 context = ElementTree.iterparse(blastxml_filename, events=("start", "end")) 162 context = ElementTree.iterparse(blastxml_filename, events=("start", "end"))
166 except: 163 except Exception:
167 stop_err("Invalid data format.") 164 sys.exit("Invalid data format.")
168 # turn it into an iterator 165 # turn it into an iterator
169 context = iter(context) 166 context = iter(context)
170 # get the root element 167 # get the root element
171 try: 168 try:
172 event, root = context.next() 169 event, root = context.next()
173 except: 170 except Exception:
174 stop_err( "Invalid data format." ) 171 sys.exit("Invalid data format.")
175 for event, elem in context: 172 for event, elem in context:
176 if event == "end" and elem.tag == "BlastOutput_program": 173 if event == "end" and elem.tag == "BlastOutput_program":
177 blast_program = elem.text 174 blast_program = elem.text
178 # for every <Iteration> tag 175 # for every <Iteration> tag
179 if event == "end" and elem.tag == "Iteration": 176 if event == "end" and elem.tag == "Iteration":
180 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA 177 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
181 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> 178 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
182 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> 179 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
183 # <Iteration_query-len>406</Iteration_query-len> 180 # <Iteration_query-len>406</Iteration_query-len>
184 # <Iteration_hits></Iteration_hits> 181 # <Iteration_hits></Iteration_hits>
185 # 182 #
186 #Or, from BLAST 2.2.24+ run online 183 # Or, from BLAST 2.2.24+ run online
187 # <Iteration_query-ID>Query_1</Iteration_query-ID> 184 # <Iteration_query-ID>Query_1</Iteration_query-ID>
188 # <Iteration_query-def>Sample</Iteration_query-def> 185 # <Iteration_query-def>Sample</Iteration_query-def>
189 # <Iteration_query-len>516</Iteration_query-len> 186 # <Iteration_query-len>516</Iteration_query-len>
190 # <Iteration_hits>... 187 # <Iteration_hits>...
191 qseqid = elem.findtext("Iteration_query-ID") 188 qseqid = elem.findtext("Iteration_query-ID")
192 if re_default_query_id.match(qseqid): 189 if re_default_query_id.match(qseqid):
193 #Place holder ID, take the first word of the query definition 190 # Place holder ID, take the first word of the query definition
194 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] 191 qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0]
195 qlen = int(elem.findtext("Iteration_query-len")) 192 qlen = int(elem.findtext("Iteration_query-len"))
196 193
197 # for every <Hit> within <Iteration> 194 # for every <Hit> within <Iteration>
198 for hit in elem.findall("Iteration_hits/Hit"): 195 for hit in elem.findall("Iteration_hits/Hit"):
199 #Expecting either this, 196 # Expecting either this,
200 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> 197 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
201 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> 198 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
202 # <Hit_accession>P56514</Hit_accession> 199 # <Hit_accession>P56514</Hit_accession>
203 #or, 200 # or,
204 # <Hit_id>Subject_1</Hit_id> 201 # <Hit_id>Subject_1</Hit_id>
205 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> 202 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
206 # <Hit_accession>Subject_1</Hit_accession> 203 # <Hit_accession>Subject_1</Hit_accession>
207 # 204 #
208 #apparently depending on the parse_deflines switch 205 # apparently depending on the parse_deflines switch
209 # 206 #
210 #Or, with a local database not using -parse_seqids can get this, 207 # Or, with a local database not using -parse_seqids can get this,
211 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> 208 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
212 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> 209 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
213 # <Hit_accession>2</Hit_accession> 210 # <Hit_accession>2</Hit_accession>
214 sseqid = hit.findtext("Hit_id").split(None,1)[0] 211 sseqid = hit.findtext("Hit_id").split(None, 1)[0]
215 hit_def = sseqid + " " + hit.findtext("Hit_def") 212 hit_def = sseqid + " " + hit.findtext("Hit_def")
216 if re_default_subject_id.match(sseqid) \ 213 if re_default_subject_id.match(sseqid) \
217 and sseqid == hit.findtext("Hit_accession"): 214 and sseqid == hit.findtext("Hit_accession"):
218 #Place holder ID, take the first word of the subject definition 215 # Place holder ID, take the first word of the subject definition
219 hit_def = hit.findtext("Hit_def") 216 hit_def = hit.findtext("Hit_def")
220 sseqid = hit_def.split(None,1)[0] 217 sseqid = hit_def.split(None, 1)[0]
221 if sseqid.startswith("gnl|BL_ORD_ID|") \ 218 if sseqid.startswith("gnl|BL_ORD_ID|") \
222 and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): 219 and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
223 #Alternative place holder ID, again take the first word of hit_def 220 # Alternative place holder ID, again take the first word of hit_def
224 hit_def = hit.findtext("Hit_def") 221 hit_def = hit.findtext("Hit_def")
225 sseqid = hit_def.split(None,1)[0] 222 sseqid = hit_def.split(None, 1)[0]
226 # for every <Hsp> within <Hit> 223 # for every <Hsp> within <Hit>
227 for hsp in hit.findall("Hit_hsps/Hsp"): 224 for hsp in hit.findall("Hit_hsps/Hsp"):
228 nident = hsp.findtext("Hsp_identity") 225 nident = hsp.findtext("Hsp_identity")
229 length = hsp.findtext("Hsp_align-len") 226 length = hsp.findtext("Hsp_align-len")
230 pident = "%0.2f" % (100*float(nident)/float(length)) 227 pident = "%0.2f" % (100 * float(nident) / float(length))
231 228
232 q_seq = hsp.findtext("Hsp_qseq") 229 q_seq = hsp.findtext("Hsp_qseq")
233 h_seq = hsp.findtext("Hsp_hseq") 230 h_seq = hsp.findtext("Hsp_hseq")
234 m_seq = hsp.findtext("Hsp_midline") 231 m_seq = hsp.findtext("Hsp_midline")
235 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) 232 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
236 gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \ 233 gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 +
237 len(h_seq.replace('-', ' ').split())-1) 234 len(h_seq.replace('-', ' ').split()) - 1)
238 235
239 mismatch = m_seq.count(' ') + m_seq.count('+') \ 236 mismatch = m_seq.count(' ') + m_seq.count('+') \
240 - q_seq.count('-') - h_seq.count('-') 237 - q_seq.count('-') - h_seq.count('-')
241 #TODO - Remove this alternative mismatch calculation and test 238 # TODO - Remove this alternative mismatch calculation and test
242 #once satisifed there are no problems 239 # once satisifed there are no problems
243 expected_mismatch = len(q_seq) \ 240 expected_mismatch = len(q_seq) \
244 - sum(1 for q,h in zip(q_seq, h_seq) \ 241 - sum(1 for q, h in zip(q_seq, h_seq)
245 if q == h or q == "-" or h == "-") 242 if q == h or q == "-" or h == "-")
246 xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X") 243 xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X")
247 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): 244 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
248 stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \ 245 sys.exit("%s vs %s mismatches, expected %i <= %i <= %i"
249 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), 246 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
250 int(mismatch), expected_mismatch)) 247 int(mismatch), expected_mismatch))
251 248
252 #TODO - Remove this alternative identity calculation and test 249 # TODO - Remove this alternative identity calculation and test
253 #once satisifed there are no problems 250 # once satisifed there are no problems
254 expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h) 251 expected_identity = sum(1 for q, h in zip(q_seq, h_seq) if q == h)
255 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): 252 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
256 stop_err("%s vs %s identities, expected %i <= %i <= %i" \ 253 sys.exit("%s vs %s identities, expected %i <= %i <= %i"
257 % (qseqid, sseqid, expected_identity, int(nident), 254 % (qseqid, sseqid, expected_identity, int(nident),
258 expected_identity + q_seq.count("X"))) 255 expected_identity + q_seq.count("X")))
259
260 256
261 evalue = hsp.findtext("Hsp_evalue") 257 evalue = hsp.findtext("Hsp_evalue")
262 if evalue == "0": 258 if evalue == "0":
263 evalue = "0.0" 259 evalue = "0.0"
264 else: 260 else:
265 evalue = "%0.0e" % float(evalue) 261 evalue = "%0.0e" % float(evalue)
266 262
267 bitscore = float(hsp.findtext("Hsp_bit-score")) 263 bitscore = float(hsp.findtext("Hsp_bit-score"))
268 if bitscore < 100: 264 if bitscore < 100:
269 #Seems to show one decimal place for lower scores 265 # Seems to show one decimal place for lower scores
270 bitscore = "%0.1f" % bitscore 266 bitscore = "%0.1f" % bitscore
271 else: 267 else:
272 #Note BLAST does not round to nearest int, it truncates 268 # Note BLAST does not round to nearest int, it truncates
273 bitscore = "%i" % bitscore 269 bitscore = "%i" % bitscore
274 270
275 values = [qseqid, 271 values = [qseqid,
276 sseqid, 272 sseqid,
277 pident, 273 pident,
278 length, #hsp.findtext("Hsp_align-len") 274 length, # hsp.findtext("Hsp_align-len")
279 str(mismatch), 275 str(mismatch),
280 gapopen, 276 gapopen,
281 hsp.findtext("Hsp_query-from"), #qstart, 277 hsp.findtext("Hsp_query-from"), # qstart,
282 hsp.findtext("Hsp_query-to"), #qend, 278 hsp.findtext("Hsp_query-to"), # qend,
283 hsp.findtext("Hsp_hit-from"), #sstart, 279 hsp.findtext("Hsp_hit-from"), # sstart,
284 hsp.findtext("Hsp_hit-to"), #send, 280 hsp.findtext("Hsp_hit-to"), # send,
285 evalue, #hsp.findtext("Hsp_evalue") in scientific notation 281 evalue, # hsp.findtext("Hsp_evalue") in scientific notation
286 bitscore, #hsp.findtext("Hsp_bit-score") rounded 282 bitscore, # hsp.findtext("Hsp_bit-score") rounded
287 ] 283 ]
288 284
289 if extended: 285 if extended:
290 try: 286 try:
291 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(" >")) 287 sallseqid = ";".join(name.split(None, 1)[0] for name in hit_def.split(" >"))
292 salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >")) 288 salltitles = "<>".join(name.split(None, 1)[1] for name in hit_def.split(" >"))
293 except IndexError as e: 289 except IndexError as e:
294 stop_err("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e)) 290 sys.exit("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
295 #print hit_def, "-->", sallseqid 291 # print hit_def, "-->", sallseqid
296 positive = hsp.findtext("Hsp_positive") 292 positive = hsp.findtext("Hsp_positive")
297 ppos = "%0.2f" % (100*float(positive)/float(length)) 293 ppos = "%0.2f" % (100 * float(positive) / float(length))
298 qframe = hsp.findtext("Hsp_query-frame") 294 qframe = hsp.findtext("Hsp_query-frame")
299 sframe = hsp.findtext("Hsp_hit-frame") 295 sframe = hsp.findtext("Hsp_hit-frame")
300 if blast_program == "blastp": 296 if blast_program == "blastp":
301 #Probably a bug in BLASTP that they use 0 or 1 depending on format 297 # Probably a bug in BLASTP that they use 0 or 1 depending on format
302 if qframe == "0": qframe = "1" 298 if qframe == "0":
303 if sframe == "0": sframe = "1" 299 qframe = "1"
300 if sframe == "0":
301 sframe = "1"
304 slen = int(hit.findtext("Hit_len")) 302 slen = int(hit.findtext("Hit_len"))
305 values.extend([sallseqid, 303 values.extend([sallseqid,
306 hsp.findtext("Hsp_score"), #score, 304 hsp.findtext("Hsp_score"), # score,
307 nident, 305 nident,
308 positive, 306 positive,
309 hsp.findtext("Hsp_gaps"), #gaps, 307 hsp.findtext("Hsp_gaps"), # gaps,
310 ppos, 308 ppos,
311 qframe, 309 qframe,
312 sframe, 310 sframe,
313 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking 311 # NOTE - for blastp, XML shows original seq, tabular uses XXX masking
314 q_seq, 312 q_seq,
315 h_seq, 313 h_seq,
316 str(qlen), 314 str(qlen),
317 str(slen), 315 str(slen),
318 salltitles, 316 salltitles,
319 ]) 317 ])
320 if cols: 318 if cols:
321 #Only a subset of the columns are needed 319 # Only a subset of the columns are needed
322 values = [values[colnames.index(c)] for c in cols] 320 values = [values[colnames.index(c)] for c in cols]
323 #print "\t".join(values) 321 # print "\t".join(values)
324 output_handle.write("\t".join(values) + "\n") 322 output_handle.write("\t".join(values) + "\n")
325 # prevents ElementTree from growing large datastructure 323 # prevents ElementTree from growing large datastructure
326 root.clear() 324 root.clear()
327 elem.clear() 325 elem.clear()
328 326
337 convert(in_file, outfile) 335 convert(in_file, outfile)
338 336
339 if options.output: 337 if options.output:
340 outfile.close() 338 outfile.close()
341 else: 339 else:
342 #Using stdout 340 # Using stdout
343 pass 341 pass
344