Mercurial > repos > devteam > ncbi_blast_plus
comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 11:4c4a0da938ff draft
Uploaded v0.0.22, now wraps BLAST+ 2.2.28 allowing extended tabular output to include the hit descriptions as column 25.
Supports $GALAXY_SLOTS.
Includes more tests and heavy use of macros.
author | peterjc |
---|---|
date | Thu, 05 Dec 2013 06:55:59 -0500 |
parents | 70e7dcbf6573 |
children | 623f727cdff1 |
comparison
equal
deleted
inserted
replaced
10:70e7dcbf6573 | 11:4c4a0da938ff |
---|---|
29 The additional columns offered in the Galaxy BLAST+ wrappers are: | 29 The additional columns offered in the Galaxy BLAST+ wrappers are: |
30 | 30 |
31 ====== ============= =========================================== | 31 ====== ============= =========================================== |
32 Column NCBI name Description | 32 Column NCBI name Description |
33 ------ ------------- ------------------------------------------- | 33 ------ ------------- ------------------------------------------- |
34 13 sallseqid All subject Seq-id(s), separated by a ';' | 34 13 sallseqid All subject Seq-id(s), separated by ';' |
35 14 score Raw score | 35 14 score Raw score |
36 15 nident Number of identical matches | 36 15 nident Number of identical matches |
37 16 positive Number of positive-scoring matches | 37 16 positive Number of positive-scoring matches |
38 17 gaps Total number of gaps | 38 17 gaps Total number of gaps |
39 18 ppos Percentage of positive-scoring matches | 39 18 ppos Percentage of positive-scoring matches |
41 20 sframe Subject frame | 41 20 sframe Subject frame |
42 21 qseq Aligned part of query sequence | 42 21 qseq Aligned part of query sequence |
43 22 sseq Aligned part of subject sequence | 43 22 sseq Aligned part of subject sequence |
44 23 qlen Query sequence length | 44 23 qlen Query sequence length |
45 24 slen Subject sequence length | 45 24 slen Subject sequence length |
46 25 salltitles All subject titles, separated by '<>' | |
46 ====== ============= =========================================== | 47 ====== ============= =========================================== |
47 | 48 |
48 Most of these fields are given explicitly in the XML file, others some like | 49 Most of these fields are given explicitly in the XML file, others some like |
49 the percentage identity and the number of gap openings must be calculated. | 50 the percentage identity and the number of gap openings must be calculated. |
50 | 51 |
61 """ | 62 """ |
62 import sys | 63 import sys |
63 import re | 64 import re |
64 | 65 |
65 if "-v" in sys.argv or "--version" in sys.argv: | 66 if "-v" in sys.argv or "--version" in sys.argv: |
66 print "v0.0.12" | 67 print "v0.0.22" |
67 sys.exit(0) | 68 sys.exit(0) |
68 | 69 |
69 if sys.version_info[:2] >= ( 2, 5 ): | 70 if sys.version_info[:2] >= ( 2, 5 ): |
70 try: | 71 try: |
71 from xml.etree import cElementTree as ElementTree | 72 from xml.etree import cElementTree as ElementTree |
87 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)") | 88 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)") |
88 | 89 |
89 if out_fmt == "std": | 90 if out_fmt == "std": |
90 extended = False | 91 extended = False |
91 elif out_fmt == "x22": | 92 elif out_fmt == "x22": |
92 stop_err("Format argument x22 has been replaced with ext (extended 24 columns)") | 93 stop_err("Format argument x22 has been replaced with ext (extended 25 columns)") |
93 elif out_fmt == "ext": | 94 elif out_fmt == "ext": |
94 extended = True | 95 extended = True |
95 else: | 96 else: |
96 stop_err("Format argument should be std (12 column) or ext (extended 24 columns)") | 97 stop_err("Format argument should be std (12 column) or ext (extended 25 columns), not: %r" % out_fmt) |
97 | 98 |
98 | 99 |
99 # get an iterable | 100 # get an iterable |
100 try: | 101 try: |
101 context = ElementTree.iterparse(in_file, events=("start", "end")) | 102 context = ElementTree.iterparse(in_file, events=("start", "end")) |
155 # <Hit_id>Subject_1</Hit_id> | 156 # <Hit_id>Subject_1</Hit_id> |
156 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> | 157 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> |
157 # <Hit_accession>Subject_1</Hit_accession> | 158 # <Hit_accession>Subject_1</Hit_accession> |
158 # | 159 # |
159 #apparently depending on the parse_deflines switch | 160 #apparently depending on the parse_deflines switch |
161 # | |
162 #Or, with BLAST 2.2.28+ can get this, | |
163 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> | |
164 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> | |
165 # <Hit_accession>2</Hit_accession> | |
160 sseqid = hit.findtext("Hit_id").split(None,1)[0] | 166 sseqid = hit.findtext("Hit_id").split(None,1)[0] |
161 hit_def = sseqid + " " + hit.findtext("Hit_def") | 167 hit_def = sseqid + " " + hit.findtext("Hit_def") |
162 if re_default_subject_id.match(sseqid) \ | 168 if re_default_subject_id.match(sseqid) \ |
163 and sseqid == hit.findtext("Hit_accession"): | 169 and sseqid == hit.findtext("Hit_accession"): |
164 #Place holder ID, take the first word of the subject definition | 170 #Place holder ID, take the first word of the subject definition |
171 hit_def = hit.findtext("Hit_def") | |
172 sseqid = hit_def.split(None,1)[0] | |
173 if sseqid.startswith("gnl|BL_ORD_ID|") \ | |
174 and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): | |
175 #Alternative place holder ID, again take the first word of hit_def | |
165 hit_def = hit.findtext("Hit_def") | 176 hit_def = hit.findtext("Hit_def") |
166 sseqid = hit_def.split(None,1)[0] | 177 sseqid = hit_def.split(None,1)[0] |
167 # for every <Hsp> within <Hit> | 178 # for every <Hsp> within <Hit> |
168 for hsp in hit.findall("Hit_hsps/Hsp"): | 179 for hsp in hit.findall("Hit_hsps/Hsp"): |
169 nident = hsp.findtext("Hsp_identity") | 180 nident = hsp.findtext("Hsp_identity") |
226 evalue, #hsp.findtext("Hsp_evalue") in scientific notation | 237 evalue, #hsp.findtext("Hsp_evalue") in scientific notation |
227 bitscore, #hsp.findtext("Hsp_bit-score") rounded | 238 bitscore, #hsp.findtext("Hsp_bit-score") rounded |
228 ] | 239 ] |
229 | 240 |
230 if extended: | 241 if extended: |
231 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">")) | 242 try: |
243 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(" >")) | |
244 salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >")) | |
245 except IndexError as e: | |
246 stop_err("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e)) | |
232 #print hit_def, "-->", sallseqid | 247 #print hit_def, "-->", sallseqid |
233 positive = hsp.findtext("Hsp_positive") | 248 positive = hsp.findtext("Hsp_positive") |
234 ppos = "%0.2f" % (100*float(positive)/float(length)) | 249 ppos = "%0.2f" % (100*float(positive)/float(length)) |
235 qframe = hsp.findtext("Hsp_query-frame") | 250 qframe = hsp.findtext("Hsp_query-frame") |
236 sframe = hsp.findtext("Hsp_hit-frame") | 251 sframe = hsp.findtext("Hsp_hit-frame") |
250 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking | 265 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking |
251 q_seq, | 266 q_seq, |
252 h_seq, | 267 h_seq, |
253 str(qlen), | 268 str(qlen), |
254 str(slen), | 269 str(slen), |
270 salltitles, | |
255 ]) | 271 ]) |
256 #print "\t".join(values) | 272 #print "\t".join(values) |
257 outfile.write("\t".join(values) + "\n") | 273 outfile.write("\t".join(values) + "\n") |
258 # prevents ElementTree from growing large datastructure | 274 # prevents ElementTree from growing large datastructure |
259 root.clear() | 275 root.clear() |