Mercurial > repos > devteam > ncbi_blast_plus
comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 20:3034ce97dd33 draft
Uploaded v0.1.08, can search multiple local databases, fixes a pipe problem in blastdbcmd, and minor internal changes.
author | peterjc |
---|---|
date | Mon, 07 Nov 2016 11:31:37 -0500 |
parents | c16c30e9ad5b |
children | 7538e2bfcd41 |
comparison
equal
deleted
inserted
replaced
19:7f3c448e119b | 20:3034ce97dd33 |
---|---|
6 in the Galaxy BLAST+ wrappers). | 6 in the Galaxy BLAST+ wrappers). |
7 | 7 |
8 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart | 8 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart |
9 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which | 9 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which |
10 mean: | 10 mean: |
11 | 11 |
12 ====== ========= ============================================ | 12 ====== ========= ============================================ |
13 Column NCBI name Description | 13 Column NCBI name Description |
14 ------ --------- -------------------------------------------- | 14 ------ --------- -------------------------------------------- |
15 1 qseqid Query Seq-id (ID of your sequence) | 15 1 qseqid Query Seq-id (ID of your sequence) |
16 2 sseqid Subject Seq-id (ID of the database hit) | 16 2 sseqid Subject Seq-id (ID of the database hit) |
64 import re | 64 import re |
65 import os | 65 import os |
66 from optparse import OptionParser | 66 from optparse import OptionParser |
67 | 67 |
68 if "-v" in sys.argv or "--version" in sys.argv: | 68 if "-v" in sys.argv or "--version" in sys.argv: |
69 print "v0.1.04" | 69 print "v0.1.08" |
70 sys.exit(0) | 70 sys.exit(0) |
71 | 71 |
72 if sys.version_info[:2] >= ( 2, 5 ): | 72 if sys.version_info[:2] >= (2, 5): |
73 try: | 73 try: |
74 from xml.etree import cElementTree as ElementTree | 74 from xml.etree import cElementTree as ElementTree |
75 except ImportError: | 75 except ImportError: |
76 from xml.etree import ElementTree as ElementTree | 76 from xml.etree import ElementTree as ElementTree |
77 else: | 77 else: |
78 from galaxy import eggs | 78 from galaxy import eggs |
79 import pkg_resources; pkg_resources.require( "elementtree" ) | 79 import pkg_resources |
80 pkg_resources.require("elementtree") | |
80 from elementtree import ElementTree | 81 from elementtree import ElementTree |
81 | 82 |
82 def stop_err( msg ): | |
83 sys.stderr.write("%s\n" % msg) | |
84 sys.exit(1) | |
85 | |
86 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]: | 83 if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]: |
87 #False positive if user really has a BLAST XML file called 'std' or 'ext'... | 84 # False positive if user really has a BLAST XML file called 'std' or 'ext'... |
88 stop_err("""ERROR: The script API has changed, sorry. | 85 sys.exit("""ERROR: The script API has changed, sorry. |
89 | 86 |
90 Instead of the old style: | 87 Instead of the old style: |
91 | 88 |
92 $ python blastxml_to_tabular.py input.xml output.tabular std | 89 $ python blastxml_to_tabular.py input.xml output.tabular std |
93 | 90 |
114 (options, args) = parser.parse_args() | 111 (options, args) = parser.parse_args() |
115 | 112 |
116 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',') | 113 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',') |
117 | 114 |
118 if len(args) < 1: | 115 if len(args) < 1: |
119 stop_err("ERROR: No BLASTXML input files given; run with --help to see options.") | 116 sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.") |
120 | 117 |
121 out_fmt = options.columns | 118 out_fmt = options.columns |
122 if out_fmt == "std": | 119 if out_fmt == "std": |
123 extended = False | 120 extended = False |
124 cols = None | 121 cols = None |
125 elif out_fmt == "x22": | 122 elif out_fmt == "x22": |
126 stop_err("Format argument x22 has been replaced with ext (extended 25 columns)") | 123 sys.exit("Format argument x22 has been replaced with ext (extended 25 columns)") |
127 elif out_fmt == "ext": | 124 elif out_fmt == "ext": |
128 extended = True | 125 extended = True |
129 cols = None | 126 cols = None |
130 else: | 127 else: |
131 cols = out_fmt.replace(" ", ",").split(",") #Allow space or comma separated | 128 cols = out_fmt.replace(" ", ",").split(",") # Allow space or comma separated |
132 #Remove any blank entries due to trailing comma, | 129 # Remove any blank entries due to trailing comma, |
133 #or annoying "None" dummy value from Galaxy if no columns | 130 # or annoying "None" dummy value from Galaxy if no columns |
134 cols = [c for c in cols if c and c != "None"] | 131 cols = [c for c in cols if c and c != "None"] |
135 extra = set(cols).difference(colnames) | 132 extra = set(cols).difference(colnames) |
136 if extra: | 133 if extra: |
137 stop_err("These are not recognised column names: %s" % ",".join(sorted(extra))) | 134 sys.exit("These are not recognised column names: %s" % ",".join(sorted(extra))) |
138 del extra | 135 del extra |
139 assert set(colnames).issuperset(cols), cols | 136 assert set(colnames).issuperset(cols), cols |
140 if not cols: | 137 if not cols: |
141 stop_err("No columns selected!") | 138 sys.exit("No columns selected!") |
142 extended = max(colnames.index(c) for c in cols) >= 12 #Do we need any higher columns? | 139 extended = max(colnames.index(c) for c in cols) >= 12 # Do we need any higher columns? |
143 del out_fmt | 140 del out_fmt |
144 | 141 |
145 for in_file in args: | 142 for in_file in args: |
146 if not os.path.isfile(in_file): | 143 if not os.path.isfile(in_file): |
147 stop_err("Input BLAST XML file not found: %s" % in_file) | 144 sys.exit("Input BLAST XML file not found: %s" % in_file) |
148 | 145 |
149 | 146 |
150 re_default_query_id = re.compile("^Query_\d+$") | 147 re_default_query_id = re.compile("^Query_\d+$") |
151 assert re_default_query_id.match("Query_101") | 148 assert re_default_query_id.match("Query_101") |
152 assert not re_default_query_id.match("Query_101a") | 149 assert not re_default_query_id.match("Query_101a") |
159 | 156 |
160 | 157 |
161 def convert(blastxml_filename, output_handle): | 158 def convert(blastxml_filename, output_handle): |
162 blast_program = None | 159 blast_program = None |
163 # get an iterable | 160 # get an iterable |
164 try: | 161 try: |
165 context = ElementTree.iterparse(blastxml_filename, events=("start", "end")) | 162 context = ElementTree.iterparse(blastxml_filename, events=("start", "end")) |
166 except: | 163 except Exception: |
167 stop_err("Invalid data format.") | 164 sys.exit("Invalid data format.") |
168 # turn it into an iterator | 165 # turn it into an iterator |
169 context = iter(context) | 166 context = iter(context) |
170 # get the root element | 167 # get the root element |
171 try: | 168 try: |
172 event, root = context.next() | 169 event, root = context.next() |
173 except: | 170 except Exception: |
174 stop_err( "Invalid data format." ) | 171 sys.exit("Invalid data format.") |
175 for event, elem in context: | 172 for event, elem in context: |
176 if event == "end" and elem.tag == "BlastOutput_program": | 173 if event == "end" and elem.tag == "BlastOutput_program": |
177 blast_program = elem.text | 174 blast_program = elem.text |
178 # for every <Iteration> tag | 175 # for every <Iteration> tag |
179 if event == "end" and elem.tag == "Iteration": | 176 if event == "end" and elem.tag == "Iteration": |
180 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA | 177 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA |
181 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> | 178 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> |
182 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | 179 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> |
183 # <Iteration_query-len>406</Iteration_query-len> | 180 # <Iteration_query-len>406</Iteration_query-len> |
184 # <Iteration_hits></Iteration_hits> | 181 # <Iteration_hits></Iteration_hits> |
185 # | 182 # |
186 #Or, from BLAST 2.2.24+ run online | 183 # Or, from BLAST 2.2.24+ run online |
187 # <Iteration_query-ID>Query_1</Iteration_query-ID> | 184 # <Iteration_query-ID>Query_1</Iteration_query-ID> |
188 # <Iteration_query-def>Sample</Iteration_query-def> | 185 # <Iteration_query-def>Sample</Iteration_query-def> |
189 # <Iteration_query-len>516</Iteration_query-len> | 186 # <Iteration_query-len>516</Iteration_query-len> |
190 # <Iteration_hits>... | 187 # <Iteration_hits>... |
191 qseqid = elem.findtext("Iteration_query-ID") | 188 qseqid = elem.findtext("Iteration_query-ID") |
192 if re_default_query_id.match(qseqid): | 189 if re_default_query_id.match(qseqid): |
193 #Place holder ID, take the first word of the query definition | 190 # Place holder ID, take the first word of the query definition |
194 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] | 191 qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0] |
195 qlen = int(elem.findtext("Iteration_query-len")) | 192 qlen = int(elem.findtext("Iteration_query-len")) |
196 | 193 |
197 # for every <Hit> within <Iteration> | 194 # for every <Hit> within <Iteration> |
198 for hit in elem.findall("Iteration_hits/Hit"): | 195 for hit in elem.findall("Iteration_hits/Hit"): |
199 #Expecting either this, | 196 # Expecting either this, |
200 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> | 197 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> |
201 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> | 198 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> |
202 # <Hit_accession>P56514</Hit_accession> | 199 # <Hit_accession>P56514</Hit_accession> |
203 #or, | 200 # or, |
204 # <Hit_id>Subject_1</Hit_id> | 201 # <Hit_id>Subject_1</Hit_id> |
205 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> | 202 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> |
206 # <Hit_accession>Subject_1</Hit_accession> | 203 # <Hit_accession>Subject_1</Hit_accession> |
207 # | 204 # |
208 #apparently depending on the parse_deflines switch | 205 # apparently depending on the parse_deflines switch |
209 # | 206 # |
210 #Or, with a local database not using -parse_seqids can get this, | 207 # Or, with a local database not using -parse_seqids can get this, |
211 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> | 208 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> |
212 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> | 209 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> |
213 # <Hit_accession>2</Hit_accession> | 210 # <Hit_accession>2</Hit_accession> |
214 sseqid = hit.findtext("Hit_id").split(None,1)[0] | 211 sseqid = hit.findtext("Hit_id").split(None, 1)[0] |
215 hit_def = sseqid + " " + hit.findtext("Hit_def") | 212 hit_def = sseqid + " " + hit.findtext("Hit_def") |
216 if re_default_subject_id.match(sseqid) \ | 213 if re_default_subject_id.match(sseqid) \ |
217 and sseqid == hit.findtext("Hit_accession"): | 214 and sseqid == hit.findtext("Hit_accession"): |
218 #Place holder ID, take the first word of the subject definition | 215 # Place holder ID, take the first word of the subject definition |
219 hit_def = hit.findtext("Hit_def") | 216 hit_def = hit.findtext("Hit_def") |
220 sseqid = hit_def.split(None,1)[0] | 217 sseqid = hit_def.split(None, 1)[0] |
221 if sseqid.startswith("gnl|BL_ORD_ID|") \ | 218 if sseqid.startswith("gnl|BL_ORD_ID|") \ |
222 and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): | 219 and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): |
223 #Alternative place holder ID, again take the first word of hit_def | 220 # Alternative place holder ID, again take the first word of hit_def |
224 hit_def = hit.findtext("Hit_def") | 221 hit_def = hit.findtext("Hit_def") |
225 sseqid = hit_def.split(None,1)[0] | 222 sseqid = hit_def.split(None, 1)[0] |
226 # for every <Hsp> within <Hit> | 223 # for every <Hsp> within <Hit> |
227 for hsp in hit.findall("Hit_hsps/Hsp"): | 224 for hsp in hit.findall("Hit_hsps/Hsp"): |
228 nident = hsp.findtext("Hsp_identity") | 225 nident = hsp.findtext("Hsp_identity") |
229 length = hsp.findtext("Hsp_align-len") | 226 length = hsp.findtext("Hsp_align-len") |
230 pident = "%0.2f" % (100*float(nident)/float(length)) | 227 pident = "%0.2f" % (100 * float(nident) / float(length)) |
231 | 228 |
232 q_seq = hsp.findtext("Hsp_qseq") | 229 q_seq = hsp.findtext("Hsp_qseq") |
233 h_seq = hsp.findtext("Hsp_hseq") | 230 h_seq = hsp.findtext("Hsp_hseq") |
234 m_seq = hsp.findtext("Hsp_midline") | 231 m_seq = hsp.findtext("Hsp_midline") |
235 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) | 232 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) |
236 gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \ | 233 gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 + |
237 len(h_seq.replace('-', ' ').split())-1) | 234 len(h_seq.replace('-', ' ').split()) - 1) |
238 | 235 |
239 mismatch = m_seq.count(' ') + m_seq.count('+') \ | 236 mismatch = m_seq.count(' ') + m_seq.count('+') \ |
240 - q_seq.count('-') - h_seq.count('-') | 237 - q_seq.count('-') - h_seq.count('-') |
241 #TODO - Remove this alternative mismatch calculation and test | 238 # TODO - Remove this alternative mismatch calculation and test |
242 #once satisifed there are no problems | 239 # once satisifed there are no problems |
243 expected_mismatch = len(q_seq) \ | 240 expected_mismatch = len(q_seq) \ |
244 - sum(1 for q,h in zip(q_seq, h_seq) \ | 241 - sum(1 for q, h in zip(q_seq, h_seq) |
245 if q == h or q == "-" or h == "-") | 242 if q == h or q == "-" or h == "-") |
246 xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X") | 243 xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X") |
247 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): | 244 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): |
248 stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \ | 245 sys.exit("%s vs %s mismatches, expected %i <= %i <= %i" |
249 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), | 246 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), |
250 int(mismatch), expected_mismatch)) | 247 int(mismatch), expected_mismatch)) |
251 | 248 |
252 #TODO - Remove this alternative identity calculation and test | 249 # TODO - Remove this alternative identity calculation and test |
253 #once satisifed there are no problems | 250 # once satisifed there are no problems |
254 expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h) | 251 expected_identity = sum(1 for q, h in zip(q_seq, h_seq) if q == h) |
255 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): | 252 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): |
256 stop_err("%s vs %s identities, expected %i <= %i <= %i" \ | 253 sys.exit("%s vs %s identities, expected %i <= %i <= %i" |
257 % (qseqid, sseqid, expected_identity, int(nident), | 254 % (qseqid, sseqid, expected_identity, int(nident), |
258 expected_identity + q_seq.count("X"))) | 255 expected_identity + q_seq.count("X"))) |
259 | |
260 | 256 |
261 evalue = hsp.findtext("Hsp_evalue") | 257 evalue = hsp.findtext("Hsp_evalue") |
262 if evalue == "0": | 258 if evalue == "0": |
263 evalue = "0.0" | 259 evalue = "0.0" |
264 else: | 260 else: |
265 evalue = "%0.0e" % float(evalue) | 261 evalue = "%0.0e" % float(evalue) |
266 | 262 |
267 bitscore = float(hsp.findtext("Hsp_bit-score")) | 263 bitscore = float(hsp.findtext("Hsp_bit-score")) |
268 if bitscore < 100: | 264 if bitscore < 100: |
269 #Seems to show one decimal place for lower scores | 265 # Seems to show one decimal place for lower scores |
270 bitscore = "%0.1f" % bitscore | 266 bitscore = "%0.1f" % bitscore |
271 else: | 267 else: |
272 #Note BLAST does not round to nearest int, it truncates | 268 # Note BLAST does not round to nearest int, it truncates |
273 bitscore = "%i" % bitscore | 269 bitscore = "%i" % bitscore |
274 | 270 |
275 values = [qseqid, | 271 values = [qseqid, |
276 sseqid, | 272 sseqid, |
277 pident, | 273 pident, |
278 length, #hsp.findtext("Hsp_align-len") | 274 length, # hsp.findtext("Hsp_align-len") |
279 str(mismatch), | 275 str(mismatch), |
280 gapopen, | 276 gapopen, |
281 hsp.findtext("Hsp_query-from"), #qstart, | 277 hsp.findtext("Hsp_query-from"), # qstart, |
282 hsp.findtext("Hsp_query-to"), #qend, | 278 hsp.findtext("Hsp_query-to"), # qend, |
283 hsp.findtext("Hsp_hit-from"), #sstart, | 279 hsp.findtext("Hsp_hit-from"), # sstart, |
284 hsp.findtext("Hsp_hit-to"), #send, | 280 hsp.findtext("Hsp_hit-to"), # send, |
285 evalue, #hsp.findtext("Hsp_evalue") in scientific notation | 281 evalue, # hsp.findtext("Hsp_evalue") in scientific notation |
286 bitscore, #hsp.findtext("Hsp_bit-score") rounded | 282 bitscore, # hsp.findtext("Hsp_bit-score") rounded |
287 ] | 283 ] |
288 | 284 |
289 if extended: | 285 if extended: |
290 try: | 286 try: |
291 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(" >")) | 287 sallseqid = ";".join(name.split(None, 1)[0] for name in hit_def.split(" >")) |
292 salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >")) | 288 salltitles = "<>".join(name.split(None, 1)[1] for name in hit_def.split(" >")) |
293 except IndexError as e: | 289 except IndexError as e: |
294 stop_err("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e)) | 290 sys.exit("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e)) |
295 #print hit_def, "-->", sallseqid | 291 # print hit_def, "-->", sallseqid |
296 positive = hsp.findtext("Hsp_positive") | 292 positive = hsp.findtext("Hsp_positive") |
297 ppos = "%0.2f" % (100*float(positive)/float(length)) | 293 ppos = "%0.2f" % (100 * float(positive) / float(length)) |
298 qframe = hsp.findtext("Hsp_query-frame") | 294 qframe = hsp.findtext("Hsp_query-frame") |
299 sframe = hsp.findtext("Hsp_hit-frame") | 295 sframe = hsp.findtext("Hsp_hit-frame") |
300 if blast_program == "blastp": | 296 if blast_program == "blastp": |
301 #Probably a bug in BLASTP that they use 0 or 1 depending on format | 297 # Probably a bug in BLASTP that they use 0 or 1 depending on format |
302 if qframe == "0": qframe = "1" | 298 if qframe == "0": |
303 if sframe == "0": sframe = "1" | 299 qframe = "1" |
300 if sframe == "0": | |
301 sframe = "1" | |
304 slen = int(hit.findtext("Hit_len")) | 302 slen = int(hit.findtext("Hit_len")) |
305 values.extend([sallseqid, | 303 values.extend([sallseqid, |
306 hsp.findtext("Hsp_score"), #score, | 304 hsp.findtext("Hsp_score"), # score, |
307 nident, | 305 nident, |
308 positive, | 306 positive, |
309 hsp.findtext("Hsp_gaps"), #gaps, | 307 hsp.findtext("Hsp_gaps"), # gaps, |
310 ppos, | 308 ppos, |
311 qframe, | 309 qframe, |
312 sframe, | 310 sframe, |
313 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking | 311 # NOTE - for blastp, XML shows original seq, tabular uses XXX masking |
314 q_seq, | 312 q_seq, |
315 h_seq, | 313 h_seq, |
316 str(qlen), | 314 str(qlen), |
317 str(slen), | 315 str(slen), |
318 salltitles, | 316 salltitles, |
319 ]) | 317 ]) |
320 if cols: | 318 if cols: |
321 #Only a subset of the columns are needed | 319 # Only a subset of the columns are needed |
322 values = [values[colnames.index(c)] for c in cols] | 320 values = [values[colnames.index(c)] for c in cols] |
323 #print "\t".join(values) | 321 # print "\t".join(values) |
324 output_handle.write("\t".join(values) + "\n") | 322 output_handle.write("\t".join(values) + "\n") |
325 # prevents ElementTree from growing large datastructure | 323 # prevents ElementTree from growing large datastructure |
326 root.clear() | 324 root.clear() |
327 elem.clear() | 325 elem.clear() |
328 | 326 |
337 convert(in_file, outfile) | 335 convert(in_file, outfile) |
338 | 336 |
339 if options.output: | 337 if options.output: |
340 outfile.close() | 338 outfile.close() |
341 else: | 339 else: |
342 #Using stdout | 340 # Using stdout |
343 pass | 341 pass |
344 |