Mercurial > repos > jankanis > blast2html
comparison blast2html.py @ 95:4378d11f0ed7
implement configurable gene bank links
| field | value |
|---|---|
| author | Jan Kanis <jan.code@jankanis.nl> |
| date | Mon, 30 Jun 2014 16:49:45 +0200 |
| parents | 9fb1a7d67317 |
| children | 02b795b784e1 |
comparison
Legend: equal | deleted | inserted | replaced
94:df9fd5f35967 | 95:4378d11f0ed7 |
---|---|
13 import warnings | 13 import warnings |
14 import six, codecs | 14 import six, codecs |
15 from six.moves import builtins | 15 from six.moves import builtins |
16 from os import path | 16 from os import path |
17 from itertools import repeat | 17 from itertools import repeat |
18 from collections import defaultdict | |
18 import argparse | 19 import argparse |
19 from lxml import objectify | 20 from lxml import objectify |
20 import jinja2 | 21 import jinja2 |
22 | |
23 builtin_str = str | |
24 str = six.text_type | |
21 | 25 |
22 | 26 |
23 | 27 |
24 _filters = dict(float='float') | 28 _filters = dict(float='float') |
25 def filter(func_or_name): | 29 def filter(func_or_name): |
73 """Split a hit.Hit_def that contains multiple titles up, splitting out the hit ids from the titles.""" | 77 """Split a hit.Hit_def that contains multiple titles up, splitting out the hit ids from the titles.""" |
74 id_titles = hit.Hit_def.text.split('>') | 78 id_titles = hit.Hit_def.text.split('>') |
75 | 79 |
76 titles = [] | 80 titles = [] |
77 for t in id_titles[1:]: | 81 for t in id_titles[1:]: |
78 fullid, title = t.split(' ', 1) | 82 id, title = t.split(' ', 1) |
79 hitid, id = fullid.split('|', 2)[1:3] | 83 titles.append(argparse.Namespace(Hit_id = id, |
80 titles.append(dict(id = id, | 84 Hit_def = title, |
81 hitid = hitid, | 85 Hit_accession = '', |
82 fullid = fullid, | 86 getroottree = hit.getroottree)) |
83 title = title)) | |
84 return titles | 87 return titles |
85 | 88 |
86 @filter | 89 @filter |
87 def hitid(hit): | 90 def hitid(hit): |
88 hitid = hit.Hit_id.text | 91 return str(hit.Hit_id) |
89 s = hitid.split('|', 2) | |
90 if len(s) >= 2: | |
91 return s[1] | |
92 return hitid | |
93 | |
94 @filter | |
95 def seqid(hit): | |
96 hitid = hit.Hit_id.text | |
97 s = hitid.split('|', 2) | |
98 if len(s) >= 3: | |
99 return s[2] | |
100 return hitid | |
101 | 92 |
102 | 93 |
103 @filter | 94 @filter |
104 def alignment_pre(hsp): | 95 def alignment_pre(hsp): |
105 """Create the preformatted alignment blocks""" | 96 """Create the preformatted alignment blocks""" |
175 return 'Plus' | 166 return 'Plus' |
176 elif frame == -1: | 167 elif frame == -1: |
177 return 'Minus' | 168 return 'Minus' |
178 raise Exception("frame should be either +1 or -1") | 169 raise Exception("frame should be either +1 or -1") |
179 | 170 |
180 def genelink(hit, type='genbank', hsp=None): | 171 # def genelink(hit, type='genbank', hsp=None): |
181 if not isinstance(hit, six.string_types): | 172 # if not isinstance(hit, six.string_types): |
182 hit = hitid(hit) | 173 # hit = hitid(hit) |
183 link = "http://www.ncbi.nlm.nih.gov/nucleotide/{0}?report={1}&log$=nuclalign".format(hit, type) | 174 # link = "http://www.ncbi.nlm.nih.gov/nucleotide/{0}?report={1}&log$=nuclalign".format(hit, type) |
184 if hsp != None: | 175 # if hsp != None: |
185 link += "&from={0}&to={1}".format(hsp['Hsp_hit-from'], hsp['Hsp_hit-to']) | 176 # link += "&from={0}&to={1}".format(hsp['Hsp_hit-from'], hsp['Hsp_hit-to']) |
186 return link | 177 # return link |
187 | 178 |
188 | 179 |
189 # javascript escape filter based on Django's, from https://github.com/dsissitka/khan-website/blob/master/templatefilters.py#L112-139 | 180 # javascript escape filter based on Django's, from https://github.com/dsissitka/khan-website/blob/master/templatefilters.py#L112-139 |
190 # I've removed the html escapes, since html escaping is already being performed by the template engine. | 181 # I've removed the html escapes, since html escaping is already being performed by the template engine. |
191 | 182 |
216 Javascript string literal escape. Note that this only escapes data | 207 Javascript string literal escape. Note that this only escapes data |
217 for embedding within javascript string literals, not in general | 208 for embedding within javascript string literals, not in general |
218 javascript snippets. | 209 javascript snippets. |
219 """ | 210 """ |
220 | 211 |
221 value = six.text_type(value) | 212 value = str(value) |
222 | 213 |
223 for bad, good in _js_escapes: | 214 for bad, good in _js_escapes: |
224 value = value.replace(bad, good) | 215 value = value.replace(bad, good) |
225 | 216 |
226 return value | 217 return value |
238 | 229 |
239 colors = ('black', 'blue', 'green', 'magenta', 'red') | 230 colors = ('black', 'blue', 'green', 'magenta', 'red') |
240 | 231 |
241 max_scale_labels = 10 | 232 max_scale_labels = 10 |
242 | 233 |
243 def __init__(self, input, templatedir, templatename): | 234 def __init__(self, input, templatedir, templatename, genelinks={}): |
244 self.input = input | 235 self.input = input |
245 self.templatename = templatename | 236 self.templatename = templatename |
237 self.genelinks = genelinks | |
246 | 238 |
247 self.blast = objectify.parse(self.input).getroot() | 239 self.blast = objectify.parse(self.input).getroot() |
248 self.loader = jinja2.FileSystemLoader(searchpath=templatedir) | 240 self.loader = jinja2.FileSystemLoader(searchpath=templatedir) |
249 self.environment = jinja2.Environment(loader=self.loader, | 241 self.environment = jinja2.Environment(loader=self.loader, |
250 lstrip_blocks=True, trim_blocks=True, autoescape=True) | 242 lstrip_blocks=True, trim_blocks=True, autoescape=True) |
273 ) | 265 ) |
274 | 266 |
275 result = template.render(blast=self.blast, | 267 result = template.render(blast=self.blast, |
276 iterations=self.blast.BlastOutput_iterations.Iteration, | 268 iterations=self.blast.BlastOutput_iterations.Iteration, |
277 colors=self.colors, | 269 colors=self.colors, |
278 genelink=genelink, | |
279 params=params) | 270 params=params) |
280 if six.PY2: | 271 if six.PY2: |
281 result = result.encode('utf-8') | 272 result = result.encode('utf-8') |
282 output.write(result) | 273 output.write(result) |
283 | 274 |
349 totalscore = "{0:.1f}".format(sum(hsp_val('Hsp_bit-score'))), | 340 totalscore = "{0:.1f}".format(sum(hsp_val('Hsp_bit-score'))), |
350 cover = "{0:.0%}".format(cover_count / query_length), | 341 cover = "{0:.0%}".format(cover_count / query_length), |
351 e_value = "{0:.4g}".format(min(hsp_val('Hsp_evalue'))), | 342 e_value = "{0:.4g}".format(min(hsp_val('Hsp_evalue'))), |
352 # FIXME: is this the correct formula vv? | 343 # FIXME: is this the correct formula vv? |
353 # float(...) because non-flooring division doesn't work with lxml elements in python 2.6 | 344 # float(...) because non-flooring division doesn't work with lxml elements in python 2.6 |
354 ident = "{0:.0%}".format(float(min(float(hsp.Hsp_identity) / blastxml_len(hsp) for hsp in hsps))), | 345 ident = "{0:.0%}".format(float(min(float(hsp.Hsp_identity) / blastxml_len(hsp) for hsp in hsps)))) |
355 accession = hit.Hit_accession) | 346 |
347 @filter | |
348 def genelink(self, hit, text=None, clas=None, display_nolink=True): | |
349 if text is None: | |
350 text = hitid(hit) | |
351 db = hit.getroottree().getroot().BlastOutput_db | |
352 if isinstance(self.genelinks, six.string_types): | |
353 template = self.genelinks | |
354 else: | |
355 template = self.genelinks.get(db) | |
356 if template is None: | |
357 return text if display_nolink else '' | |
358 args = dict(id=hitid(hit).split('|'), | |
359 fullid=hitid(hit), | |
360 defline=str(hit.Hit_def).split('|'), | |
361 fulldefline=str(hit.Hit_def), | |
362 accession=str(hit.Hit_accession)) | |
363 try: | |
364 link = template.format(**args) | |
365 except Exception as e: | |
366 warnings.warn('Error in formatting gene bank link {} with {}: {}'.format(template, args, e)) | |
367 return text if display_nolink else '' | |
368 classattr = 'class="{}" '.format(jinja2.escape(clas)) if clas is not None else '' | |
369 return jinja2.Markup("<a {}href=\"{}\">{}</a>".format(classattr, jinja2.escape(link), jinja2.escape(text))) | |
370 | |
371 | |
372 def read_genelinks(dir): | |
373 links = {} | |
374 for f in ('blastdb.loc', 'blastdb_p.loc', 'blastdb_d.loc'): | |
375 try: | |
376 f = open(path.join(dir, f)) | |
377 for l in f.readlines(): | |
378 if l.strip().startswith('#'): | |
379 continue | |
380 line = l.split('\t') | |
381 try: | |
382 links[line[2]] = line[3] | |
383 except IndexError: | |
384 continue | |
385 f.close() | |
386 except OSError: | |
387 continue | |
388 if not links: | |
389 warnings.warn("No gene bank link templates found") | |
390 return links | |
356 | 391 |
357 | 392 |
358 def main(): | 393 def main(): |
359 default_template = path.join(path.dirname(__file__), 'blast2html.html.jinja') | 394 default_template = path.join(path.dirname(__file__), 'blast2html.html.jinja') |
360 | 395 |
372 # handle the errors. This introduces a small race condition when | 407 # handle the errors. This introduces a small race condition when |
373 # jinja later tries to re-open the template file, but we don't | 408 # jinja later tries to re-open the template file, but we don't |
374 # care too much. | 409 # care too much. |
375 parser.add_argument('--template', type=argparse.FileType(mode='r'), default=default_template, | 410 parser.add_argument('--template', type=argparse.FileType(mode='r'), default=default_template, |
376 help='The template file to use. Defaults to blast_html.html.jinja') | 411 help='The template file to use. Defaults to blast_html.html.jinja') |
377 | 412 |
413 dblink_group = parser.add_mutually_exclusive_group() | |
414 dblink_group.add_argument('--genelink-template', default='http://www.ncbi.nlm.nih.gov/nucleotide/{accession}?report=genbank&log$=nuclalign', | |
415 help="""A link template to link hits to a gene bank webpage. The template string is a | |
416 Python format string. It can contain the following replacement elements: {id[N]}, {fullid}, | |
417 {defline[N]}, {fulldefline}, {accession}, where N is a number. id[N] and defline[N] will be | |
418 replaced by the Nth element of the id or defline, where '|' is the field separator. | |
419 | |
420 The default is 'http://www.ncbi.nlm.nih.gov/nucleotide/{accession}?report=genbank&log$=nuclalign', | |
421 which is a link to the NCBI nucleotide database.""") | |
422 | |
423 dblink_group.add_argument('--db-config-dir', | |
424 help="""The directory where databases are configured in blastdb*.loc files. These files | |
425 are consulted for creating a gene bank link. The files should be tab-separated tables (with lines | |
426 starting with '#' ignored), where the third field of a line should be a database path and the fourth | |
427 a genebank link template conforming to the --genelink-template option syntax. | |
428 | |
429 This option is incompatible with --genelink-template.""") | |
430 | |
378 args = parser.parse_args() | 431 args = parser.parse_args() |
379 if args.input == None: | 432 if args.input == None: |
380 args.input = args.positional_arg | 433 args.input = args.positional_arg |
381 if args.input == None: | 434 if args.input == None: |
382 parser.error('no input specified') | 435 parser.error('no input specified') |
384 templatedir, templatename = path.split(args.template.name) | 437 templatedir, templatename = path.split(args.template.name) |
385 args.template.close() | 438 args.template.close() |
386 if not templatedir: | 439 if not templatedir: |
387 templatedir = '.' | 440 templatedir = '.' |
388 | 441 |
389 b = BlastVisualize(args.input, templatedir, templatename) | 442 if args.db_config_dir is None: |
443 genelinks = args.genelink_template | |
444 elif not path.isdir(args.db_config_dir): | |
445 parser.error('db-config-dir does not exist or is not a directory') | |
446 else: | |
447 genelinks = read_genelinks(args.db_config_dir) | |
448 | |
449 b = BlastVisualize(args.input, templatedir, templatename, genelinks) | |
390 b.render(args.output) | 450 b.render(args.output) |
391 | 451 |
392 | 452 |
393 if __name__ == '__main__': | 453 if __name__ == '__main__': |
394 main() | 454 main() |