Mercurial > repos > peterjc > blastxml_to_top_descr
comparison tools/blastxml_to_top_descr/blastxml_to_top_descr.py @ 11:98f8431dab44 draft
Uploaded v0.1.0, now also handles extended tabular BLAST output.
author | peterjc |
---|---|
date | Fri, 13 Jun 2014 07:07:35 -0400 |
parents | |
children | 8dc4ba7eba5d |
comparison
equal
deleted
inserted
replaced
10:09a68a90d552 | 11:98f8431dab44 |
---|---|
1 #!/usr/bin/env python | |
2 """Convert a BLAST XML file to a top hits description table. | |
3 | |
4 Takes three command line options, input BLAST XML filename, output tabular | |
5 BLAST filename, number of hits to collect the descriptions of. | |
6 | |
7 Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits. | |
8 """ | |
9 import os | |
10 import sys | |
11 import re | |
12 from optparse import OptionParser | |
13 | |
14 if "-v" in sys.argv or "--version" in sys.argv: | |
15 print "v0.1.0" | |
16 sys.exit(0) | |
17 | |
18 if sys.version_info[:2] >= ( 2, 5 ): | |
19 import xml.etree.cElementTree as ElementTree | |
20 else: | |
21 from galaxy import eggs | |
22 import pkg_resources; pkg_resources.require( "elementtree" ) | |
23 from elementtree import ElementTree | |
24 | |
25 def stop_err( msg ): | |
26 sys.stderr.write("%s\n" % msg) | |
27 sys.exit(1) | |
28 | |
29 usage = """Use as follows: | |
30 | |
31 $ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml | |
32 | |
33 Or, | |
34 | |
35 $ blastxml_to_top_descr.py [--topN 3] --output example.tabular input.xml | |
36 | |
37 This will take the top 3 BLAST descriptions from the input BLAST XML file, | |
38 writing them to the specified output file in tabular format. | |
39 """ | |
40 | |
41 parser = OptionParser(usage=usage) | |
42 parser.add_option("-t", "--topN", dest="topN", default=3, | |
43 help="Number of descriptions to collect (in order from file)") | |
44 parser.add_option("-o", "--output", dest="out_file", default=None, | |
45 help="Output filename for tabular file", | |
46 metavar="FILE") | |
47 parser.add_option("-f", "--format", dest="format", default="blastxml", | |
48 help="Input format (blastxml or tabular)") | |
49 parser.add_option("-q", "--qseqid", dest="qseqid", default="1", | |
50 help="Column for query 'qseqid' (for tabular input; default 1)") | |
51 parser.add_option("-s", "--sseqid", dest="sseqid", default="2", | |
52 help="Column for subject 'sseqid' (for tabular input; default 2)") | |
53 parser.add_option("-d", "--salltitles", dest="salltitles", default="25", | |
54 help="Column for descriptions 'salltitles' (for tabular input; default 25)") | |
55 (options, args) = parser.parse_args() | |
56 | |
57 if len(sys.argv) == 4 and len(args) == 3 and not options.out_file: | |
58 stop_err("""The API has changed, replace this: | |
59 | |
60 $ python blastxml_to_top_descr.py input.xml output.tab 3 | |
61 | |
62 with: | |
63 | |
64 $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml | |
65 | |
66 Sorry. | |
67 """) | |
68 | |
69 if not args: | |
70 stop_err("Input filename missing, try -h") | |
71 if len(args) > 1: | |
72 stop_err("Expects a single argument, one input filename") | |
73 in_file = args[0] | |
74 out_file = options.out_file | |
75 topN = options.topN | |
76 | |
77 try: | |
78 topN = int(topN) | |
79 except ValueError: | |
80 stop_err("Number of hits argument should be an integer (at least 1)") | |
81 if topN < 1: | |
82 stop_err("Number of hits argument should be an integer (at least 1)") | |
83 | |
84 if not os.path.isfile(in_file): | |
85 stop_err("Missing input file: %r" % in_file) | |
86 | |
87 | |
88 def get_column(value): | |
89 """Convert column number on command line to Python index.""" | |
90 if value.startswith("c"): | |
91 # Ignore c prefix, e.g. "c1" for "1" | |
92 value = value[1:] | |
93 try: | |
94 col = int(value) | |
95 except: | |
96 stop_err("Expected an integer column number, not %r" % value) | |
97 if col < 1: | |
98 stop_err("Expect column numbers to be at least one, not %r" % value) | |
99 return col - 1 # Python counting! | |
100 | |
101 def tabular_hits(in_file, qseqid, sseqid, salltitles): | |
102 """Parse key data from tabular BLAST output. | |
103 | |
104 Iterator returning tuples (qseqid, list_of_subject_description) | |
105 """ | |
106 current_query = None | |
107 current_hits = [] | |
108 with open(in_file) as input: | |
109 for line in input: | |
110 parts = line.rstrip("\n").split("\t") | |
111 query = parts[qseqid] | |
112 descr = "%s %s" % (parts[sseqid], parts[salltitles]) | |
113 if current_query is None: | |
114 # First hit | |
115 current_query = query | |
116 current_hits = [descr] | |
117 elif current_query == query: | |
118 # Another hit | |
119 current_hits.append(descr) | |
120 else: | |
121 # New query | |
122 yield current_query, current_hits | |
123 current_query = query | |
124 current_hits = [descr] | |
125 if current_query is not None: | |
126 # Final query | |
127 yield current_query, current_hits | |
128 | |
129 def blastxml_hits(in_file): | |
130 """Parse key data from BLAST XML output. | |
131 | |
132 Iterator returning tuples (qseqid, list_of_subject_description) | |
133 """ | |
134 try: | |
135 context = ElementTree.iterparse(in_file, events=("start", "end")) | |
136 except: | |
137 with open(in_file) as handle: | |
138 header = handle.read(100) | |
139 stop_err("Invalid data format in XML file %r which starts: %r" % (in_file, header)) | |
140 # turn it into an iterator | |
141 context = iter(context) | |
142 # get the root element | |
143 try: | |
144 event, root = context.next() | |
145 except: | |
146 with open(in_file) as handle: | |
147 header = handle.read(100) | |
148 stop_err("Unable to get root element from XML file %r which starts: %r" % (in_file, header)) | |
149 | |
150 re_default_query_id = re.compile("^Query_\d+$") | |
151 assert re_default_query_id.match("Query_101") | |
152 assert not re_default_query_id.match("Query_101a") | |
153 assert not re_default_query_id.match("MyQuery_101") | |
154 re_default_subject_id = re.compile("^Subject_\d+$") | |
155 assert re_default_subject_id.match("Subject_1") | |
156 assert not re_default_subject_id.match("Subject_") | |
157 assert not re_default_subject_id.match("Subject_12a") | |
158 assert not re_default_subject_id.match("TheSubject_1") | |
159 | |
160 count = 0 | |
161 pos_count = 0 | |
162 current_query = None | |
163 hit_descrs = [] | |
164 for event, elem in context: | |
165 # for every <Iteration> tag | |
166 if event == "end" and elem.tag == "Iteration": | |
167 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA | |
168 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> | |
169 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | |
170 # <Iteration_query-len>406</Iteration_query-len> | |
171 # <Iteration_hits></Iteration_hits> | |
172 # | |
173 # Or, from BLAST 2.2.24+ run online | |
174 # <Iteration_query-ID>Query_1</Iteration_query-ID> | |
175 # <Iteration_query-def>Sample</Iteration_query-def> | |
176 # <Iteration_query-len>516</Iteration_query-len> | |
177 # <Iteration_hits>... | |
178 qseqid = elem.findtext("Iteration_query-ID") | |
179 if qseqid is None: | |
180 stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)") | |
181 if re_default_query_id.match(qseqid): | |
182 #Place holder ID, take the first word of the query definition | |
183 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] | |
184 if current_query is None: | |
185 # First hit | |
186 current_query = qseqid | |
187 hit_descrs = [] | |
188 elif current_query != qseqid: | |
189 # New hit | |
190 yield current_query, hit_descrs | |
191 current_query = qseqid | |
192 hit_descrs = [] | |
193 else: | |
194 # Continuation of previous query | |
195 # i.e. This BLAST XML did not use one <Iteration> per query | |
196 # sys.stderr.write("Multiple <Iteration> blocks for %s\n" % qseqid) | |
197 pass | |
198 # for every <Hit> within <Iteration> | |
199 for hit in elem.findall("Iteration_hits/Hit"): | |
200 # Expecting either this, | |
201 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> | |
202 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> | |
203 # <Hit_accession>P56514</Hit_accession> | |
204 # or, | |
205 # <Hit_id>Subject_1</Hit_id> | |
206 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> | |
207 # <Hit_accession>Subject_1</Hit_accession> | |
208 # | |
209 #apparently depending on the parse_deflines switch | |
210 sseqid = hit.findtext("Hit_id").split(None,1)[0] | |
211 hit_def = sseqid + " " + hit.findtext("Hit_def") | |
212 if re_default_subject_id.match(sseqid) \ | |
213 and sseqid == hit.findtext("Hit_accession"): | |
214 #Place holder ID, take the first word of the subject definition | |
215 hit_def = hit.findtext("Hit_def") | |
216 sseqid = hit_def.split(None,1)[0] | |
217 assert hit_def not in hit_descrs | |
218 hit_descrs.append(hit_def) | |
219 # prevents ElementTree from growing large datastructure | |
220 root.clear() | |
221 elem.clear() | |
222 if current_query is not None: | |
223 # Final query | |
224 yield current_query, hit_descrs | |
225 | |
226 if options.format == "blastxml": | |
227 hits = blastxml_hits(in_file) | |
228 elif options.format == "tabular": | |
229 qseqid = get_column(options.qseqid) | |
230 sseqid = get_column(options.sseqid) | |
231 salltitles = get_column(options.salltitles) | |
232 hits = tabular_hits(in_file, qseqid, sseqid, salltitles) | |
233 else: | |
234 stop_err("Unsupported format: %r" % options.format) | |
235 | |
236 | |
237 def best_hits(descriptions, topN): | |
238 if len(descriptions) < topN: | |
239 return descriptions + [""] * (topN - len(descriptions)) | |
240 else: | |
241 return descriptions[:topN] | |
242 | |
243 count = 0 | |
244 if out_file is None: | |
245 outfile = sys.stdout | |
246 else: | |
247 outfile = open(out_file, 'w') | |
248 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN))) | |
249 for query, descrs in hits: | |
250 count += 1 | |
251 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN)))) | |
252 if out_file is not None: | |
253 outfile.close() | |
254 # Queries with no hits are not present in tabular BLAST output | |
255 print("%i queries with BLAST results" % count) |