comparison blast2html.py @ 22:efce16c98650

rename blast2html
author Jan Kanis <jan.code@jankanis.nl>
date Thu, 15 May 2014 10:48:09 +0200
parents blast_html.py@53cd304c5f26
children 6995a6f34f3f
comparison
equal deleted inserted replaced
21:9596fea636bb 22:efce16c98650
1 #!/usr/bin/env python3
2
3 # Copyright The Hyve B.V. 2014
4 # License: GPL version 3 or higher
5
6 import sys
7 import math
8 import warnings
9 from os import path
10 from itertools import repeat
11 import argparse
12 from lxml import objectify
13 import jinja2
14
15
16
# Maps jinja filter name -> name of the Python function implementing it.
_filters = {}


def filter(func_or_name):
    """Decorator to register a function as filter in the current jinja environment.

    Usable bare (``@filter``), which registers the function under its own
    ``__name__``, or with a string argument (``@filter('name')``), which
    registers the function under that name instead.  Only the function's
    *name* is stored in ``_filters``; the decorated function itself is
    returned unchanged.
    """
    if not isinstance(func_or_name, str):
        # Bare usage: the argument is the function being decorated.
        _filters[func_or_name.__name__] = func_or_name.__name__
        return func_or_name

    def register(func):
        _filters[func_or_name] = func.__name__
        return func

    return register
28
29
def color_idx(length):
    """Return the colour bucket (0-4) for an alignment of the given length.

    Buckets are: below 40 -> 0, below 50 -> 1, below 80 -> 2,
    below 200 -> 3, everything else -> 4.
    """
    upper_bounds = (40, 50, 80, 200)
    for bucket, bound in enumerate(upper_bounds):
        if length < bound:
            return bucket
    return len(upper_bounds)
40
@filter
def fmt(val, fmt):
    """Convert *val* to a float and format it with the format spec *fmt*."""
    number = float(val)
    return format(number, fmt)
44
@filter
def firsttitle(hit):
    """Return the part of ``hit.Hit_def`` that precedes the first '>'."""
    definition = hit.Hit_def.text
    first, _, _ = definition.partition('>')
    return first
48
@filter
def othertitles(hit):
    """Split a hit.Hit_def that contains multiple titles up, splitting out the hit ids from the titles.

    The definition text is split on '>'; the first piece (the one returned
    by ``firsttitle``) is skipped.  Each remaining piece is split at its
    first space into a full id and a title, and the full id is split on '|'.

    Returns a list of dicts with the keys ``id``, ``hitid``, ``fullid`` and
    ``title``.  A piece without a space yields an empty ``title``, and a
    full id with fewer than three '|'-separated fields yields empty strings
    for the missing parts, instead of raising ValueError.
    """
    id_titles = hit.Hit_def.text.split('>')

    titles = []
    for entry in id_titles[1:]:
        # partition() never fails: a missing description gives title == ''.
        fullid, _, title = entry.partition(' ')
        # Pad so that fields 1 and 2 always exist.
        hit_part, seq_part = (fullid.split('|', 2) + ['', ''])[1:3]
        titles.append(dict(id=seq_part,
                           hitid=hit_part,
                           fullid=fullid,
                           title=title))
    return titles
63
@filter
def hitid(hit):
    """Return the second '|'-separated field of ``hit.Hit_id``."""
    fields = hit.Hit_id.text.split('|', 2)
    return fields[1]
67
@filter
def seqid(hit):
    """Return everything after the second '|' of ``hit.Hit_id``."""
    fields = hit.Hit_id.text.split('|', 2)
    return fields[2]
71
@filter
def alignment_pre(hsp):
    """Return the three-line text rendering of one Hsp alignment.

    The lines are the query sequence, the midline and the subject (hit)
    sequence; the sequence lines carry their from/to positions.  The last
    line has no trailing newline.
    """
    query_line = "Query {:>7s} {} {}\n".format(
        hsp['Hsp_query-from'].text, hsp.Hsp_qseq, hsp['Hsp_query-to'])
    middle_line = " {:7s} {}\n".format('', hsp.Hsp_midline)
    subject_line = "Subject{:>7s} {} {}".format(
        hsp['Hsp_hit-from'].text, hsp.Hsp_hseq, hsp['Hsp_hit-to'])
    return query_line + middle_line + subject_line
79
@filter('len')
def blastxml_len(node):
    """Return the length recorded on an 'Hsp' or 'Iteration' node as an int.

    For an Hsp this is its 'Hsp_align-len' child, for an Iteration its
    'Iteration_query-len' child.  Any other tag raises Exception.
    Registered as the jinja filter 'len'.
    """
    length_child = {
        'Hsp': 'Hsp_align-len',
        'Iteration': 'Iteration_query-len',
    }.get(node.tag)
    if length_child is None:
        raise Exception("Unknown XML node type: "+node.tag)
    return int(node[length_child])
87
88
@filter
def asframe(frame):
    """Translate a frame value of 1 / -1 into 'Plus' / 'Minus'.

    Any other value raises Exception.
    """
    for value, label in ((1, 'Plus'), (-1, 'Minus')):
        if frame == value:
            return label
    raise Exception("frame should be either +1 or -1")
96
def genelink(hit, type='genbank', hsp=None):
    """Build the NCBI nucleotide URL for a hit.

    hit  -- either the hit id as a string, or a Hit node from which the id
            is extracted with ``hitid``.
    type -- value of the ``report`` query parameter.
    hsp  -- optional Hsp node (anything indexable by 'Hsp_hit-from' and
            'Hsp_hit-to'); when given, the link is restricted to that range
            through the ``from``/``to`` query parameters.

    Returns the URL as a string.
    """
    if not isinstance(hit, str):
        hit = hitid(hit)
    link = "http://www.ncbi.nlm.nih.gov/nucleotide/{}?report={}&log$=nuclalign".format(hit, type)
    # Identity test: `!= None` would dispatch to the argument's own __ne__,
    # which element-like objects are free to overload.
    if hsp is not None:
        link += "&from={}&to={}".format(hsp['Hsp_hit-from'], hsp['Hsp_hit-to'])
    return link
104
105
106 # javascript escape filter based on Django's, from https://github.com/dsissitka/khan-website/blob/master/templatefilters.py#L112-139
107 # I've removed the html escapes, since html escaping is already being performed by the template engine.
108
# (character, replacement) pairs for the characters that must never appear
# literally inside a javascript string literal.  The commented-out entries
# are the html-related escapes that were dropped because the template engine
# already performs html escaping.
_base_js_escapes = (
    ('\\', r'\u005C'),
    ('\'', r'\u0027'),
    ('"', r'\u0022'),
    # ('>', r'\u003E'),
    # ('<', r'\u003C'),
    # ('&', r'\u0026'),
    # ('=', r'\u003D'),
    # ('-', r'\u002D'),
    # (';', r'\u003B'),
    # (u'\u2028', r'\u2028'),
    # (u'\u2029', r'\u2029')
)

# Escape every ASCII character with a value less than 32. Among other
# things, this is needed to prevent html parsers from jumping out of
# javascript parsing mode.
_control_js_escapes = tuple(
    (chr(code), '\\u{:04X}'.format(code)) for code in range(32)
)
_js_escapes = _base_js_escapes + _control_js_escapes
128
@filter
def js_string_escape(value):
    """Escape *value* for use inside a javascript string literal.

    The value is converted with ``str()`` and every character listed in
    ``_js_escapes`` is replaced by its ``\\uXXXX`` form.  The result is only
    safe within javascript string literals, not in general javascript
    snippets.
    """
    escaped = str(value)
    # The backslash pair comes first in the table, so the backslashes
    # introduced by later replacements are not escaped a second time.
    for raw, replacement in _js_escapes:
        escaped = escaped.replace(raw, replacement)
    return escaped
141
@filter
def hits(result):
    """Return the Hit nodes of an iteration, the one with the longest Hsp first.

    A hit is ranked by the largest ``blastxml_len`` among its Hsp children.
    """
    def longest_hsp(hit):
        return max(blastxml_len(hsp) for hsp in hit.Hit_hsps.Hsp)

    found = result.Iteration_hits.findall('Hit')
    return sorted(found, key=longest_hsp, reverse=True)
148
149
150
class BlastVisualize:
    """Render a BLAST XML result as an html page through a jinja2 template.

    The input is parsed with ``lxml.objectify``.  Every filter recorded in
    the module-level ``_filters`` table is installed on the jinja
    environment: as a bound method when this class has an attribute of that
    name, otherwise as the module-level function of that name.
    """

    # Colour names, indexed by the bucket number returned by color_idx().
    colors = ('black', 'blue', 'green', 'magenta', 'red')

    # Divisor used by queryscale() to derive the spacing between labels.
    max_scale_labels = 10

    def __init__(self, input, templatedir, templatename):
        """Parse the BLAST XML and set up the jinja environment.

        input        -- file (or anything ``objectify.parse`` accepts)
                        holding the BLAST XML.
        templatedir  -- directory the jinja loader searches for templates.
        templatename -- name of the template inside ``templatedir``.
        """
        self.input = input
        self.templatename = templatename

        self.blast = objectify.parse(self.input).getroot()
        self.loader = jinja2.FileSystemLoader(searchpath=templatedir)
        self.environment = jinja2.Environment(loader=self.loader,
                                              lstrip_blocks=True,
                                              trim_blocks=True,
                                              autoescape=True)

        self._addfilters(self.environment)

    def _addfilters(self, environment):
        """Install every registered filter on *environment*."""
        for filtername, funcname in _filters.items():
            try:
                # Filters defined as methods must be bound to this instance.
                environment.filters[filtername] = getattr(self, funcname)
            except AttributeError:
                environment.filters[filtername] = globals()[funcname]

    def render(self, output):
        """Render the template and write the resulting text to *output*.

        The template receives ``blast`` (the parsed root), ``iterations``,
        ``colors``, ``genelink`` and ``params`` (label/value pairs describing
        the query and the search).
        """
        template = self.environment.get_template(self.templatename)

        params = (('Query ID', self.blast["BlastOutput_query-ID"]),
                  ('Query definition', self.blast["BlastOutput_query-def"]),
                  ('Query length', self.blast["BlastOutput_query-len"]),
                  ('Program', self.blast.BlastOutput_version),
                  ('Database', self.blast.BlastOutput_db),
                  )

        output.write(template.render(blast=self.blast,
                                     iterations=self.blast.BlastOutput_iterations.Iteration,
                                     colors=self.colors,
                                     genelink=genelink,
                                     params=params))

    @filter
    def match_colors(self, result):
        """
        An iterator that yields, per hit, a dict with the keys ``colors``,
        ``link`` and ``defline``.

        ``colors`` is a list of (width, colour) pairs describing the query
        from start to end: width is a percentage of the query length and
        colour is either an entry of ``self.colors`` or 'transparent' for
        stretches no Hsp covers.
        """
        uncovered = 255  # sentinel stored for positions no Hsp covers

        query_length = blastxml_len(result)
        percent_multiplier = 100 / query_length

        def color_name(index):
            return self.colors[index] if index != uncovered else 'transparent'

        for hit in hits(result):
            # sort hotspots from short to long, so we can overwrite index colors of
            # short matches with those of long ones.
            hotspots = sorted(hit.Hit_hsps.Hsp, key=blastxml_len)
            table = bytearray([uncovered]) * query_length
            for hsp in hotspots:
                frm = int(hsp['Hsp_query-from']) - 1
                to = int(hsp['Hsp_query-to'])
                table[frm:to] = repeat(color_idx(blastxml_len(hsp)), to - frm)

            # Run-length encode the per-position colour table.
            matches = []
            last = table[0]
            count = 0
            for current in table:
                if current == last:
                    count += 1
                    continue
                matches.append((count * percent_multiplier, color_name(last)))
                last = current
                count = 1
            matches.append((count * percent_multiplier, color_name(last)))

            yield dict(colors=matches, link="#hit"+hit.Hit_num.text, defline=firsttitle(hit))

    @filter
    def queryscale(self, result):
        """Yield the labels of the scale drawn along the query.

        Each item is a dict with ``label`` (a query position), ``width``
        (percentage of the query length) and ``shorter``.  Labels are placed
        every ``ceil(query_length / max_scale_labels)`` positions; when the
        query length is not a multiple of that step, a final narrower item
        labelled with the query length is yielded with ``shorter`` True.
        """
        query_length = blastxml_len(result)
        skip = math.ceil(query_length / self.max_scale_labels)
        percent_multiplier = 100 / query_length
        # Step straight from label to label instead of testing every position.
        for i in range(skip, query_length + 1, skip):
            yield dict(label=i, width=skip * percent_multiplier, shorter=False)
        if query_length % skip != 0:
            yield dict(label=query_length,
                       width=(query_length % skip) * percent_multiplier,
                       shorter=True)

    @filter
    def hit_info(self, result):
        """Yield one summary dict per hit of the iteration *result*.

        Keys: ``hit``, ``title``, ``link_id``, ``maxscore``, ``totalscore``,
        ``cover``, ``e_value``, ``ident`` and ``accession``.  The score,
        coverage, e-value and identity entries are pre-formatted strings.
        """
        query_length = blastxml_len(result)

        for hit in hits(result):
            hsps = hit.Hit_hsps.Hsp

            # Mark every query position covered by at least one Hsp.
            cover = [False] * query_length
            for hsp in hsps:
                frm = int(hsp['Hsp_query-from']) - 1
                to = int(hsp['Hsp_query-to'])
                # Fill with exactly the slice width.  The alignment length
                # can differ from the query span, and a list slice
                # assignment of another length would resize `cover`.
                cover[frm:to] = repeat(True, to - frm)
            cover_count = cover.count(True)

            def hsp_val(path):
                return (float(hsp[path]) for hsp in hsps)

            yield dict(hit=hit,
                       title=firsttitle(hit),
                       link_id=hit.Hit_num,
                       maxscore="{:.1f}".format(max(hsp_val('Hsp_bit-score'))),
                       totalscore="{:.1f}".format(sum(hsp_val('Hsp_bit-score'))),
                       cover="{:.0%}".format(cover_count / query_length),
                       e_value="{:.4g}".format(min(hsp_val('Hsp_evalue'))),
                       # FIXME: is this the correct formula vv?
                       ident="{:.0%}".format(float(min(hsp.Hsp_identity / blastxml_len(hsp) for hsp in hsps))),
                       accession=hit.Hit_accession)
267
def main():
    """Command line entry point: convert a BLAST XML file to html.

    The input may be given positionally or with -i/--input; the output
    defaults to stdout and the template to 'blast2html.html.jinja'.
    """
    parser = argparse.ArgumentParser(description="Convert a BLAST XML result into a nicely readable html page",
                                     usage="{} [-i] INPUT [-o OUTPUT]".format(sys.argv[0]))
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument('positional_arg', metavar='INPUT', nargs='?', type=argparse.FileType(mode='r'),
                             help='The input Blast XML file, same as -i/--input')
    input_group.add_argument('-i', '--input', type=argparse.FileType(mode='r'),
                             help='The input Blast XML file')
    parser.add_argument('-o', '--output', type=argparse.FileType(mode='w'), default=sys.stdout,
                        help='The output html file')
    # We just want the file name here, so jinja can open the file
    # itself. But it is easier to just use a FileType so argparse can
    # handle the errors. This introduces a small race condition when
    # jinja later tries to re-open the template file, but we don't
    # care too much.
    parser.add_argument('--template', type=argparse.FileType(mode='r'), default='blast2html.html.jinja',
                        help='The template file to use. Defaults to blast2html.html.jinja')

    args = parser.parse_args()
    if args.input is None:
        args.input = args.positional_arg
    if args.input is None:
        parser.error('no input specified')

    # Only the template's location is needed; jinja re-opens the file itself.
    templatedir, templatename = path.split(args.template.name)
    args.template.close()
    if not templatedir:
        templatedir = '.'

    b = BlastVisualize(args.input, templatedir, templatename)
    b.render(args.output)


if __name__ == '__main__':
    main()
304