comparison tools/blastxml_to_top_descr/blastxml_to_top_descr.py @ 11:98f8431dab44 draft

Uploaded v0.1.0, now also handles extended tabular BLAST output.
author peterjc
date Fri, 13 Jun 2014 07:07:35 -0400
parents
children 8dc4ba7eba5d
comparison
equal deleted inserted replaced
10:09a68a90d552 11:98f8431dab44
1 #!/usr/bin/env python
2 """Convert a BLAST XML file to a top hits description table.
3
4 Takes three command line options, input BLAST XML filename, output tabular
5 BLAST filename, number of hits to collect the descriptions of.
6
7 Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits.
8 """
9 import os
10 import sys
11 import re
12 from optparse import OptionParser
13
14 if "-v" in sys.argv or "--version" in sys.argv:
15 print "v0.1.0"
16 sys.exit(0)
17
18 if sys.version_info[:2] >= ( 2, 5 ):
19 import xml.etree.cElementTree as ElementTree
20 else:
21 from galaxy import eggs
22 import pkg_resources; pkg_resources.require( "elementtree" )
23 from elementtree import ElementTree
24
25 def stop_err( msg ):
26 sys.stderr.write("%s\n" % msg)
27 sys.exit(1)
28
29 usage = """Use as follows:
30
31 $ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml
32
33 Or,
34
35 $ blastxml_to_top_descr.py [--topN 3] --output example.tabular input.xml
36
37 This will take the top 3 BLAST descriptions from the input BLAST XML file,
38 writing them to the specified output file in tabular format.
39 """
40
41 parser = OptionParser(usage=usage)
42 parser.add_option("-t", "--topN", dest="topN", default=3,
43 help="Number of descriptions to collect (in order from file)")
44 parser.add_option("-o", "--output", dest="out_file", default=None,
45 help="Output filename for tabular file",
46 metavar="FILE")
47 parser.add_option("-f", "--format", dest="format", default="blastxml",
48 help="Input format (blastxml or tabular)")
49 parser.add_option("-q", "--qseqid", dest="qseqid", default="1",
50 help="Column for query 'qseqid' (for tabular input; default 1)")
51 parser.add_option("-s", "--sseqid", dest="sseqid", default="2",
52 help="Column for subject 'sseqid' (for tabular input; default 2)")
53 parser.add_option("-d", "--salltitles", dest="salltitles", default="25",
54 help="Column for descriptions 'salltitles' (for tabular input; default 25)")
55 (options, args) = parser.parse_args()
56
57 if len(sys.argv) == 4 and len(args) == 3 and not options.out_file:
58 stop_err("""The API has changed, replace this:
59
60 $ python blastxml_to_top_descr.py input.xml output.tab 3
61
62 with:
63
64 $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml
65
66 Sorry.
67 """)
68
69 if not args:
70 stop_err("Input filename missing, try -h")
71 if len(args) > 1:
72 stop_err("Expects a single argument, one input filename")
73 in_file = args[0]
74 out_file = options.out_file
75 topN = options.topN
76
77 try:
78 topN = int(topN)
79 except ValueError:
80 stop_err("Number of hits argument should be an integer (at least 1)")
81 if topN < 1:
82 stop_err("Number of hits argument should be an integer (at least 1)")
83
84 if not os.path.isfile(in_file):
85 stop_err("Missing input file: %r" % in_file)
86
87
88 def get_column(value):
89 """Convert column number on command line to Python index."""
90 if value.startswith("c"):
91 # Ignore c prefix, e.g. "c1" for "1"
92 value = value[1:]
93 try:
94 col = int(value)
95 except:
96 stop_err("Expected an integer column number, not %r" % value)
97 if col < 1:
98 stop_err("Expect column numbers to be at least one, not %r" % value)
99 return col - 1 # Python counting!
100
101 def tabular_hits(in_file, qseqid, sseqid, salltitles):
102 """Parse key data from tabular BLAST output.
103
104 Iterator returning tuples (qseqid, list_of_subject_description)
105 """
106 current_query = None
107 current_hits = []
108 with open(in_file) as input:
109 for line in input:
110 parts = line.rstrip("\n").split("\t")
111 query = parts[qseqid]
112 descr = "%s %s" % (parts[sseqid], parts[salltitles])
113 if current_query is None:
114 # First hit
115 current_query = query
116 current_hits = [descr]
117 elif current_query == query:
118 # Another hit
119 current_hits.append(descr)
120 else:
121 # New query
122 yield current_query, current_hits
123 current_query = query
124 current_hits = [descr]
125 if current_query is not None:
126 # Final query
127 yield current_query, current_hits
128
129 def blastxml_hits(in_file):
130 """Parse key data from BLAST XML output.
131
132 Iterator returning tuples (qseqid, list_of_subject_description)
133 """
134 try:
135 context = ElementTree.iterparse(in_file, events=("start", "end"))
136 except:
137 with open(in_file) as handle:
138 header = handle.read(100)
139 stop_err("Invalid data format in XML file %r which starts: %r" % (in_file, header))
140 # turn it into an iterator
141 context = iter(context)
142 # get the root element
143 try:
144 event, root = context.next()
145 except:
146 with open(in_file) as handle:
147 header = handle.read(100)
148 stop_err("Unable to get root element from XML file %r which starts: %r" % (in_file, header))
149
150 re_default_query_id = re.compile("^Query_\d+$")
151 assert re_default_query_id.match("Query_101")
152 assert not re_default_query_id.match("Query_101a")
153 assert not re_default_query_id.match("MyQuery_101")
154 re_default_subject_id = re.compile("^Subject_\d+$")
155 assert re_default_subject_id.match("Subject_1")
156 assert not re_default_subject_id.match("Subject_")
157 assert not re_default_subject_id.match("Subject_12a")
158 assert not re_default_subject_id.match("TheSubject_1")
159
160 count = 0
161 pos_count = 0
162 current_query = None
163 hit_descrs = []
164 for event, elem in context:
165 # for every <Iteration> tag
166 if event == "end" and elem.tag == "Iteration":
167 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
168 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
169 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
170 # <Iteration_query-len>406</Iteration_query-len>
171 # <Iteration_hits></Iteration_hits>
172 #
173 # Or, from BLAST 2.2.24+ run online
174 # <Iteration_query-ID>Query_1</Iteration_query-ID>
175 # <Iteration_query-def>Sample</Iteration_query-def>
176 # <Iteration_query-len>516</Iteration_query-len>
177 # <Iteration_hits>...
178 qseqid = elem.findtext("Iteration_query-ID")
179 if qseqid is None:
180 stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)")
181 if re_default_query_id.match(qseqid):
182 #Place holder ID, take the first word of the query definition
183 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
184 if current_query is None:
185 # First hit
186 current_query = qseqid
187 hit_descrs = []
188 elif current_query != qseqid:
189 # New hit
190 yield current_query, hit_descrs
191 current_query = qseqid
192 hit_descrs = []
193 else:
194 # Continuation of previous query
195 # i.e. This BLAST XML did not use one <Iteration> per query
196 # sys.stderr.write("Multiple <Iteration> blocks for %s\n" % qseqid)
197 pass
198 # for every <Hit> within <Iteration>
199 for hit in elem.findall("Iteration_hits/Hit"):
200 # Expecting either this,
201 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
202 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
203 # <Hit_accession>P56514</Hit_accession>
204 # or,
205 # <Hit_id>Subject_1</Hit_id>
206 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
207 # <Hit_accession>Subject_1</Hit_accession>
208 #
209 #apparently depending on the parse_deflines switch
210 sseqid = hit.findtext("Hit_id").split(None,1)[0]
211 hit_def = sseqid + " " + hit.findtext("Hit_def")
212 if re_default_subject_id.match(sseqid) \
213 and sseqid == hit.findtext("Hit_accession"):
214 #Place holder ID, take the first word of the subject definition
215 hit_def = hit.findtext("Hit_def")
216 sseqid = hit_def.split(None,1)[0]
217 assert hit_def not in hit_descrs
218 hit_descrs.append(hit_def)
219 # prevents ElementTree from growing large datastructure
220 root.clear()
221 elem.clear()
222 if current_query is not None:
223 # Final query
224 yield current_query, hit_descrs
225
226 if options.format == "blastxml":
227 hits = blastxml_hits(in_file)
228 elif options.format == "tabular":
229 qseqid = get_column(options.qseqid)
230 sseqid = get_column(options.sseqid)
231 salltitles = get_column(options.salltitles)
232 hits = tabular_hits(in_file, qseqid, sseqid, salltitles)
233 else:
234 stop_err("Unsupported format: %r" % options.format)
235
236
237 def best_hits(descriptions, topN):
238 if len(descriptions) < topN:
239 return descriptions + [""] * (topN - len(descriptions))
240 else:
241 return descriptions[:topN]
242
243 count = 0
244 if out_file is None:
245 outfile = sys.stdout
246 else:
247 outfile = open(out_file, 'w')
248 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN)))
249 for query, descrs in hits:
250 count += 1
251 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN))))
252 if out_file is not None:
253 outfile.close()
254 # Queries with no hits are not present in tabular BLAST output
255 print("%i queries with BLAST results" % count)