comparison blast2html.py @ 95:4378d11f0ed7

implement configurable gene bank links
author Jan Kanis <jan.code@jankanis.nl>
date Mon, 30 Jun 2014 16:49:45 +0200
parents 9fb1a7d67317
children 02b795b784e1
comparison
equal deleted inserted replaced
94:df9fd5f35967 95:4378d11f0ed7
13 import warnings 13 import warnings
14 import six, codecs 14 import six, codecs
15 from six.moves import builtins 15 from six.moves import builtins
16 from os import path 16 from os import path
17 from itertools import repeat 17 from itertools import repeat
18 from collections import defaultdict
18 import argparse 19 import argparse
19 from lxml import objectify 20 from lxml import objectify
20 import jinja2 21 import jinja2
22
23 builtin_str = str
24 str = six.text_type
21 25
22 26
23 27
24 _filters = dict(float='float') 28 _filters = dict(float='float')
25 def filter(func_or_name): 29 def filter(func_or_name):
73 """Split a hit.Hit_def that contains multiple titles up, splitting out the hit ids from the titles.""" 77 """Split a hit.Hit_def that contains multiple titles up, splitting out the hit ids from the titles."""
74 id_titles = hit.Hit_def.text.split('>') 78 id_titles = hit.Hit_def.text.split('>')
75 79
76 titles = [] 80 titles = []
77 for t in id_titles[1:]: 81 for t in id_titles[1:]:
78 fullid, title = t.split(' ', 1) 82 id, title = t.split(' ', 1)
79 hitid, id = fullid.split('|', 2)[1:3] 83 titles.append(argparse.Namespace(Hit_id = id,
80 titles.append(dict(id = id, 84 Hit_def = title,
81 hitid = hitid, 85 Hit_accession = '',
82 fullid = fullid, 86 getroottree = hit.getroottree))
83 title = title))
84 return titles 87 return titles
85 88
86 @filter 89 @filter
87 def hitid(hit): 90 def hitid(hit):
88 hitid = hit.Hit_id.text 91 return str(hit.Hit_id)
89 s = hitid.split('|', 2)
90 if len(s) >= 2:
91 return s[1]
92 return hitid
93
94 @filter
95 def seqid(hit):
96 hitid = hit.Hit_id.text
97 s = hitid.split('|', 2)
98 if len(s) >= 3:
99 return s[2]
100 return hitid
101 92
102 93
103 @filter 94 @filter
104 def alignment_pre(hsp): 95 def alignment_pre(hsp):
105 """Create the preformatted alignment blocks""" 96 """Create the preformatted alignment blocks"""
175 return 'Plus' 166 return 'Plus'
176 elif frame == -1: 167 elif frame == -1:
177 return 'Minus' 168 return 'Minus'
178 raise Exception("frame should be either +1 or -1") 169 raise Exception("frame should be either +1 or -1")
179 170
180 def genelink(hit, type='genbank', hsp=None): 171 # def genelink(hit, type='genbank', hsp=None):
181 if not isinstance(hit, six.string_types): 172 # if not isinstance(hit, six.string_types):
182 hit = hitid(hit) 173 # hit = hitid(hit)
183 link = "http://www.ncbi.nlm.nih.gov/nucleotide/{0}?report={1}&log$=nuclalign".format(hit, type) 174 # link = "http://www.ncbi.nlm.nih.gov/nucleotide/{0}?report={1}&log$=nuclalign".format(hit, type)
184 if hsp != None: 175 # if hsp != None:
185 link += "&from={0}&to={1}".format(hsp['Hsp_hit-from'], hsp['Hsp_hit-to']) 176 # link += "&from={0}&to={1}".format(hsp['Hsp_hit-from'], hsp['Hsp_hit-to'])
186 return link 177 # return link
187 178
188 179
189 # javascript escape filter based on Django's, from https://github.com/dsissitka/khan-website/blob/master/templatefilters.py#L112-139 180 # javascript escape filter based on Django's, from https://github.com/dsissitka/khan-website/blob/master/templatefilters.py#L112-139
190 # I've removed the html escapes, since html escaping is already being performed by the template engine. 181 # I've removed the html escapes, since html escaping is already being performed by the template engine.
191 182
216 Javascript string literal escape. Note that this only escapes data 207 Javascript string literal escape. Note that this only escapes data
217 for embedding within javascript string literals, not in general 208 for embedding within javascript string literals, not in general
218 javascript snippets. 209 javascript snippets.
219 """ 210 """
220 211
221 value = six.text_type(value) 212 value = str(value)
222 213
223 for bad, good in _js_escapes: 214 for bad, good in _js_escapes:
224 value = value.replace(bad, good) 215 value = value.replace(bad, good)
225 216
226 return value 217 return value
238 229
239 colors = ('black', 'blue', 'green', 'magenta', 'red') 230 colors = ('black', 'blue', 'green', 'magenta', 'red')
240 231
241 max_scale_labels = 10 232 max_scale_labels = 10
242 233
243 def __init__(self, input, templatedir, templatename): 234 def __init__(self, input, templatedir, templatename, genelinks={}):
244 self.input = input 235 self.input = input
245 self.templatename = templatename 236 self.templatename = templatename
237 self.genelinks = genelinks
246 238
247 self.blast = objectify.parse(self.input).getroot() 239 self.blast = objectify.parse(self.input).getroot()
248 self.loader = jinja2.FileSystemLoader(searchpath=templatedir) 240 self.loader = jinja2.FileSystemLoader(searchpath=templatedir)
249 self.environment = jinja2.Environment(loader=self.loader, 241 self.environment = jinja2.Environment(loader=self.loader,
250 lstrip_blocks=True, trim_blocks=True, autoescape=True) 242 lstrip_blocks=True, trim_blocks=True, autoescape=True)
273 ) 265 )
274 266
275 result = template.render(blast=self.blast, 267 result = template.render(blast=self.blast,
276 iterations=self.blast.BlastOutput_iterations.Iteration, 268 iterations=self.blast.BlastOutput_iterations.Iteration,
277 colors=self.colors, 269 colors=self.colors,
278 genelink=genelink,
279 params=params) 270 params=params)
280 if six.PY2: 271 if six.PY2:
281 result = result.encode('utf-8') 272 result = result.encode('utf-8')
282 output.write(result) 273 output.write(result)
283 274
349 totalscore = "{0:.1f}".format(sum(hsp_val('Hsp_bit-score'))), 340 totalscore = "{0:.1f}".format(sum(hsp_val('Hsp_bit-score'))),
350 cover = "{0:.0%}".format(cover_count / query_length), 341 cover = "{0:.0%}".format(cover_count / query_length),
351 e_value = "{0:.4g}".format(min(hsp_val('Hsp_evalue'))), 342 e_value = "{0:.4g}".format(min(hsp_val('Hsp_evalue'))),
352 # FIXME: is this the correct formula vv? 343 # FIXME: is this the correct formula vv?
353 # float(...) because non-flooring division doesn't work with lxml elements in python 2.6 344 # float(...) because non-flooring division doesn't work with lxml elements in python 2.6
354 ident = "{0:.0%}".format(float(min(float(hsp.Hsp_identity) / blastxml_len(hsp) for hsp in hsps))), 345 ident = "{0:.0%}".format(float(min(float(hsp.Hsp_identity) / blastxml_len(hsp) for hsp in hsps))))
355 accession = hit.Hit_accession) 346
347 @filter
348 def genelink(self, hit, text=None, clas=None, display_nolink=True):
349 if text is None:
350 text = hitid(hit)
351 db = hit.getroottree().getroot().BlastOutput_db
352 if isinstance(self.genelinks, six.string_types):
353 template = self.genelinks
354 else:
355 template = self.genelinks.get(db)
356 if template is None:
357 return text if display_nolink else ''
358 args = dict(id=hitid(hit).split('|'),
359 fullid=hitid(hit),
360 defline=str(hit.Hit_def).split('|'),
361 fulldefline=str(hit.Hit_def),
362 accession=str(hit.Hit_accession))
363 try:
364 link = template.format(**args)
365 except Exception as e:
366 warnings.warn('Error in formatting gene bank link {} with {}: {}'.format(template, args, e))
367 return text if display_nolink else ''
368 classattr = 'class="{}" '.format(jinja2.escape(clas)) if clas is not None else ''
369 return jinja2.Markup("<a {}href=\"{}\">{}</a>".format(classattr, jinja2.escape(link), jinja2.escape(text)))
370
371
372 def read_genelinks(dir):
373 links = {}
374 for f in ('blastdb.loc', 'blastdb_p.loc', 'blastdb_d.loc'):
375 try:
376 f = open(path.join(dir, f))
377 for l in f.readlines():
378 if l.strip().startswith('#'):
379 continue
380 line = l.split('\t')
381 try:
382 links[line[2]] = line[3]
383 except IndexError:
384 continue
385 f.close()
386 except OSError:
387 continue
388 if not links:
389 warnings.warn("No gene bank link templates found")
390 return links
356 391
357 392
358 def main(): 393 def main():
359 default_template = path.join(path.dirname(__file__), 'blast2html.html.jinja') 394 default_template = path.join(path.dirname(__file__), 'blast2html.html.jinja')
360 395
372 # handle the errors. This introduces a small race condition when 407 # handle the errors. This introduces a small race condition when
373 # jinja later tries to re-open the template file, but we don't 408 # jinja later tries to re-open the template file, but we don't
374 # care too much. 409 # care too much.
375 parser.add_argument('--template', type=argparse.FileType(mode='r'), default=default_template, 410 parser.add_argument('--template', type=argparse.FileType(mode='r'), default=default_template,
376 help='The template file to use. Defaults to blast_html.html.jinja') 411 help='The template file to use. Defaults to blast_html.html.jinja')
377 412
413 dblink_group = parser.add_mutually_exclusive_group()
414 dblink_group.add_argument('--genelink-template', default='http://www.ncbi.nlm.nih.gov/nucleotide/{accession}?report=genbank&log$=nuclalign',
415 help="""A link template to link hits to a gene bank webpage. The template string is a
416 Python format string. It can contain the following replacement elements: {id[N]}, {fullid},
417 {defline[N]}, {fulldefline}, {accession}, where N is a number. id[N] and defline[N] will be
418 replaced by the Nth element of the id or defline, where '|' is the field separator.
419
420 The default is 'http://www.ncbi.nlm.nih.gov/nucleotide/{accession}?report=genbank&log$=nuclalign',
421 which is a link to the NCBI nucleotide database.""")
422
423 dblink_group.add_argument('--db-config-dir',
424 help="""The directory where databases are configured in blastdb*.loc files. These files
425 are consulted for creating a gene bank link. The files should be tab-separated tables (with lines
426 starting with '#' ignored), where the third field of a line should be a database path and the fourth
427 a genebank link template conforming to the --genelink-template option syntax.
428
429 This option is incompatible with --genelink-template.""")
430
378 args = parser.parse_args() 431 args = parser.parse_args()
379 if args.input == None: 432 if args.input == None:
380 args.input = args.positional_arg 433 args.input = args.positional_arg
381 if args.input == None: 434 if args.input == None:
382 parser.error('no input specified') 435 parser.error('no input specified')
384 templatedir, templatename = path.split(args.template.name) 437 templatedir, templatename = path.split(args.template.name)
385 args.template.close() 438 args.template.close()
386 if not templatedir: 439 if not templatedir:
387 templatedir = '.' 440 templatedir = '.'
388 441
389 b = BlastVisualize(args.input, templatedir, templatename) 442 if args.db_config_dir is None:
443 genelinks = args.genelink_template
444 elif not path.isdir(args.db_config_dir):
445 parser.error('db-config-dir does not exist or is not a directory')
446 else:
447 genelinks = read_genelinks(args.db_config_dir)
448
449 b = BlastVisualize(args.input, templatedir, templatename, genelinks)
390 b.render(args.output) 450 b.render(args.output)
391 451
392 452
393 if __name__ == '__main__': 453 if __name__ == '__main__':
394 main() 454 main()