comparison weblogolib/__init__.py @ 4:4d47ab2b7bcc

Uploaded
author davidmurphy
date Fri, 13 Jan 2012 07:18:19 -0500
parents
children 5149eb3a89c2
comparison
equal deleted inserted replaced
3:09d2dac9ef73 4:4d47ab2b7bcc
1 #!/usr/bin/env python
2
3 # -------------------------------- WebLogo --------------------------------
4
5 # Copyright (c) 2003-2004 The Regents of the University of California.
6 # Copyright (c) 2005 Gavin E. Crooks
7 # Copyright (c) 2006, The Regents of the University of California, through print
8 # Lawrence Berkeley National Laboratory (subject to receipt of any required
9 # approvals from the U.S. Dept. of Energy). All rights reserved.
10
11 # This software is distributed under the new BSD Open Source License.
12 # <http://www.opensource.org/licenses/bsd-license.html>
13 #
14 # Redistribution and use in source and binary forms, with or without
15 # modification, are permitted provided that the following conditions are met:
16 #
17 # (1) Redistributions of source code must retain the above copyright notice,
18 # this list of conditions and the following disclaimer.
19 #
20 # (2) Redistributions in binary form must reproduce the above copyright
21 # notice, this list of conditions and the following disclaimer in the
22 # documentation and or other materials provided with the distribution.
23 #
24 # (3) Neither the name of the University of California, Lawrence Berkeley
25 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors
26 # may be used to endorse or promote products derived from this software
27 # without specific prior written permission.
28 #
29 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 # POSSIBILITY OF SUCH DAMAGE.
40
41 # Replicates README.txt
42
43 """
44 WebLogo (http://code.google.com/p/weblogo/) is a tool for creating sequence
45 logos from biological sequence alignments. It can be run on the command line,
46 as a standalone webserver, as a CGI webapp, or as a python library.
47
48 The main WebLogo webserver is located at http://bespoke.lbl.gov/weblogo/
49
50
51 Codonlogo is based on Weblogo.
52
53 Please consult the manual for installation instructions and more information:
54 (Also located in the weblogolib/htdocs subdirectory.)
55
56 For help on the command line interface run
57 ./codonlogo --help
58
59 To build a simple logo run
60 ./codonlogo < cap.fa > logo0.eps
61
62 To run as a standalone webserver at localhost:8080
63 ./codonlogo --server
64
65 To create a logo in python code:
66 >>> from weblogolib import *
67 >>> fin = open('cap.fa')
68 >>> seqs = read_seq_data(fin)
69 >>> data = LogoData.from_seqs(seqs)
70 >>> options = LogoOptions()
71 >>> options.title = "A Logo Title"
72 >>> format = LogoFormat(data, options)
73 >>> fout = open('cap.eps', 'w')
74 >>> eps_formatter( data, format, fout)
75
76
77 -- Distribution and Modification --
78 This package is distributed under the new BSD Open Source License.
79 Please see the LICENSE.txt file for details on copyright and licensing.
80 The WebLogo source code can be downloaded from http://code.google.com/p/weblogo/
81 WebLogo requires Python 2.3, 2.4 or 2.5, the corebio python toolkit for computational
82 biology (http://code.google.com/p/corebio), and the python array package
83 'numpy' (http://www.scipy.org/Download)
84
85 # -------------------------------- CodonLogo --------------------------------
86
87
88 """
89
90 from math import *
91 import random
92 from itertools import izip, count
93 import sys
94 import copy
95 import os
96 from itertools import product
97 from datetime import datetime
98 from StringIO import StringIO
99
100
101
102 from corebio.data import rna_letters, dna_letters, amino_acid_letters
103 import random
104
105 # python2.3 compatability
106 from corebio._future import Template
107 from corebio._future.subprocess import *
108 from corebio._future import resource_string, resource_filename
109
110
111 from math import log, sqrt
112
113 # Avoid 'from numpy import *' since numpy has lots of names defined
114 from numpy import array, asarray, float64, ones, zeros, int32,all,any, shape
115 import numpy as na
116
117 from corebio.utils.deoptparse import DeOptionParser
118 from optparse import OptionGroup
119
120 from color import *
121 from colorscheme import *
122 from corebio.seq import Alphabet, Seq, SeqList
123 from corebio import seq_io
124 from corebio.utils import isfloat, find_command, ArgumentError
125 from corebio.moremath import *
126 from corebio.data import amino_acid_composition
127 from corebio.seq import unambiguous_rna_alphabet, unambiguous_dna_alphabet, unambiguous_protein_alphabet
128
129 codon_alphabetU=['AAA', 'AAC', 'AAG', 'AAU', 'ACA', 'ACC', 'ACG', 'ACU', 'AGA', 'AGC', 'AGG', 'AGU', 'AUA', 'AUC', 'AUG', 'AUU', 'CAA', 'CAC', 'CAG', 'CAU', 'CCA', 'CCC', 'CCG', 'CCU', 'CGA', 'CGC', 'CGG', 'CGU', 'CUA', 'CUC', 'CUG', 'CUU', 'GAA', 'GAC', 'GAG', 'GAU', 'GCA', 'GCC', 'GCG', 'GCU', 'GGA', 'GGC', 'GGG', 'GGU', 'GUA', 'GUC', 'GUG', 'GUU', 'UAA', 'UAC', 'UAG', 'UAU', 'UCA', 'UCC', 'UCG', 'UCU', 'UGA', 'UGC', 'UGG', 'UGU', 'UUA', 'UUC', 'UUG', 'UUU']
130 codon_alphabetT=['AAA', 'AAC', 'AAG', 'AAT', 'ACA', 'ACC', 'ACG', 'ACT', 'AGA', 'AGC', 'AGG', 'AGT', 'ATA', 'ATC', 'ATG', 'ATT', 'CAA', 'CAC', 'CAG', 'CAT', 'CCA', 'CCC', 'CCG', 'CCT', 'CGA', 'CGC', 'CGG', 'CGT', 'CTA', 'CTC', 'CTG', 'CTT', 'GAA', 'GAC', 'GAG', 'GAT', 'GCA', 'GCC', 'GCG', 'GCT', 'GGA', 'GGC', 'GGG', 'GGT', 'GTA', 'GTC', 'GTG', 'GTT', 'TAA', 'TAC', 'TAG', 'TAT', 'TCA', 'TCC', 'TCG', 'TCT', 'TGA', 'TGC', 'TGG', 'TGT', 'TTA', 'TTC', 'TTG', 'TTT']
131
132 altype="codonsT"
133 offset=0
134 isreversed=False
135 col=[]
136
137
138
139 __all__ = ['LogoSize',
140 'LogoOptions',
141 'description',
142 '__version__',
143 'LogoFormat',
144 'LogoData',
145 'Dirichlet',
146 'GhostscriptAPI',
147 'std_color_schemes',
148 'default_color_schemes',
149 'classic',
150 'std_units',
151 'std_sizes',
152 'std_alphabets',
153 'std_percentCG',
154 'pdf_formatter',
155 'jpeg_formatter',
156 'png_formatter',
157 'png_print_formatter',
158 'txt_formatter',
159 'eps_formatter',
160 'formatters',
161 'default_formatter',
162 'base_distribution',
163 'equiprobable_distribution',
164 'read_seq_data',
165 'which_alphabet',
166 'color',
167 'colorscheme',
168 ]
169
170 description = "Create sequence logos from biological sequence alignments."
171
172 __version__ = "1.0"
173
174 # These keywords are subsituted by subversion.
175 # The date and revision will only tell the truth after a branch or tag,
176 # since different files in trunk will have been changed at different times
177 release_date ="$Date: 2011-09-17 16:30:00 -0700 (Tue, 14 Oct 2008) $".split()[1]
178 release_build = "$Revision: 53 $".split()[1]
179 release_description = "CodonLogo %s (%s)" % (__version__, release_date)
180
181
182
183 def cgi(htdocs_directory) :
184 import weblogolib._cgi
185 weblogolib._cgi.main(htdocs_directory)
186
187 class GhostscriptAPI(object) :
188 """Interface to the command line program Ghostscript ('gs')"""
189
190 formats = ('png', 'pdf', 'jpeg')
191
192 def __init__(self, path=None) :
193 try:
194 command = find_command('gs', path=path)
195 except EnvironmentError:
196 try:
197 command = find_command('gswin32c.exe', path=path)
198 except EnvironmentError:
199 raise EnvironmentError("Could not find Ghostscript on path."
200 " There should be either a gs executable or a gswin32c.exe on your system's path")
201
202 self.command = command
203
204 def version(self) :
205 args = [self.command, '--version']
206 try :
207 p = Popen(args, stdout=PIPE)
208 (out,err) = p.communicate()
209 except OSError :
210 raise RuntimeError("Cannot communicate with ghostscript.")
211 return out.strip()
212
213 def convert(self, format, fin, fout, width, height, resolution=300) :
214 device_map = { 'png':'png16m', 'pdf':'pdfwrite', 'jpeg':'jpeg'}
215
216 try :
217 device = device_map[format]
218 except KeyError:
219 raise ValueError("Unsupported format.")
220
221 args = [self.command,
222 "-sDEVICE=%s" % device,
223 "-dPDFSETTINGS=/printer",
224 #"-q", # Quite: Do not dump messages to stdout.
225 "-sstdout=%stderr", # Redirect messages and errors to stderr
226 "-sOutputFile=-", # Stdout
227 "-dDEVICEWIDTHPOINTS=%s" % str(width),
228 "-dDEVICEHEIGHTPOINTS=%s" % str(height),
229 "-dSAFER", # For added security
230 "-dNOPAUSE",]
231
232 if device != 'pdf' :
233 args.append("-r%s" % str(resolution) )
234 if resolution < 300 : # Antialias if resolution is Less than 300 DPI
235 args.append("-dGraphicsAlphaBits=4")
236 args.append("-dTextAlphaBits=4")
237 args.append("-dAlignToPixels=0")
238
239 args.append("-") # Read from stdin. Must be last argument.
240
241 error_msg = "Unrecoverable error : Ghostscript conversion failed " \
242 "(Invalid postscript?). %s" % " ".join(args)
243
244 source = fin.read()
245
246 try :
247 p = Popen(args, stdin=PIPE, stdout = PIPE, stderr= PIPE)
248 (out,err) = p.communicate(source)
249 except OSError :
250 raise RuntimeError(error_msg)
251
252 if p.returncode != 0 :
253 error_msg += '\nReturn code: %i\n' % p.returncode
254 if err is not None : error_msg += err
255 raise RuntimeError(error_msg)
256
257 print >>fout, out
258 # end class Ghostscript
259
260
261 aa_composition = [ amino_acid_composition[_k] for _k in
262 unambiguous_protein_alphabet]
263
264
265
266 # ------ DATA ------
267
268 classic = ColorScheme([
269 ColorGroup("G", "orange" ),
270 ColorGroup("TU", "red"),
271 ColorGroup("C", "blue"),
272 ColorGroup("A", "green")
273 ] )
274
275 std_color_schemes = {"auto": None, # Depends on sequence type
276 "monochrome": monochrome,
277 "base pairing": base_pairing,
278 "classic": classic,
279 "hydrophobicity" : hydrophobicity,
280 "chemistry" : chemistry,
281 "charge" : charge,
282 }#
283
284 default_color_schemes = {
285 unambiguous_protein_alphabet: hydrophobicity,
286 unambiguous_rna_alphabet: base_pairing,
287 unambiguous_dna_alphabet: base_pairing
288 #codon_alphabet:codonsU
289 }
290
291
292 std_units = {
293 "bits" : 1./log(2),
294 "nats" : 1.,
295 "digits" : 1./log(10),
296 "kT" : 1.,
297 "kJ/mol" : 8.314472 *298.15 /1000.,
298 "kcal/mol": 1.987 *298.15 /1000.,
299 "probability" : None,
300 }
301
302 class LogoSize(object) :
303 def __init__(self, stack_width, stack_height) :
304 self.stack_width = stack_width
305 self.stack_height = stack_height
306
307 def __repr__(self):
308 return stdrepr(self)
309
310 # The base stack width is set equal to 9pt Courier.
311 # (Courier has a width equal to 3/5 of the point size.)
312 # Check that can get 80 characters in journal page @small
313 # 40 chacaters in a journal column
314 std_sizes = {
315 "small" : LogoSize( stack_width = 10, stack_height = 10*1*5),
316 "medium" : LogoSize( stack_width = 10*2, stack_height = 10*2*5),
317 "large" : LogoSize( stack_width = 10*3, stack_height = 10*3*5),
318 }
319
320
321 std_alphabets = {
322 'protein': unambiguous_protein_alphabet,
323 'rna': unambiguous_rna_alphabet,
324 'dna': unambiguous_dna_alphabet,
325 'codonsU':codon_alphabetU,
326 'codonsT':codon_alphabetT
327 }
328
329 std_percentCG = {
330 'H. sapiens' : 40.,
331 'E. coli' : 50.5,
332 'S. cerevisiae' : 38.,
333 'C. elegans' : 36.,
334 'D. melanogaster': 43.,
335 'M. musculus' : 42.,
336 'T. thermophilus' : 69.4,
337 }
338
339 # Thermus thermophilus: Henne A, Bruggemann H, Raasch C, Wiezer A, Hartsch T,
340 # Liesegang H, Johann A, Lienard T, Gohl O, Martinez-Arias R, Jacobi C,
341 # Starkuviene V, Schlenczeck S, Dencker S, Huber R, Klenk HP, Kramer W,
342 # Merkl R, Gottschalk G, Fritz HJ: The genome sequence of the extreme
343 # thermophile Thermus thermophilus.
344 # Nat Biotechnol 2004, 22:547-53
345
346 def stdrepr(obj) :
347 attr = vars(obj).items()
348
349
350 attr.sort()
351 args = []
352 for a in attr :
353 if a[0][0]=='_' : continue
354 args.append( '%s=%s' % ( a[0], repr(a[1])) )
355 args = ',\n'.join(args).replace('\n', '\n ')
356 return '%s(\n %s\n)' % (obj.__class__.__name__, args)
357
358
359 class LogoOptions(object) :
360 """ A container for all logo formating options. Not all of these
361 are directly accessible through the CLI or web interfaces.
362
363 To display LogoOption defaults:
364 >>> from weblogolib import *
365 >>> LogoOptions()
366
367
368 Attributes:
369 o alphabet
370 o creator_text -- Embedded as comment in figures.
371 o logo_title
372 o logo_label
373 o stacks_per_line
374 o unit_name
375 o show_yaxis
376 o yaxis_label -- Default depends on other settings.
377 o yaxis_tic_interval
378 o yaxis_minor_tic_ratio
379 o yaxis_scale
380 o show_xaxis
381 o xaxis_label
382 o xaxis_tic_interval
383 o rotate_numbers
384 o number_interval
385 o show_ends
386 o show_fineprint
387 o fineprint
388 o show_boxes
389 o shrink_fraction
390 o show_errorbars
391 o errorbar_fraction
392 o errorbar_width_fraction
393 o errorbar_gray
394 o resolution -- Dots per inch
395 o default_color
396 o color_scheme
397 o debug
398 o logo_margin
399 o stroke_width
400 o tic_length
401 o size
402 o stack_margin
403 o pad_right
404 o small_fontsize
405 o fontsize
406 o title_fontsize
407 o number_fontsize
408 o text_font
409 o logo_font
410 o title_font
411 o first_index
412 o logo_start
413 o logo_end
414 o scale_width
415 """
416
417 def __init__(self, **kwargs) :
418 """ Create a new LogoOptions instance.
419
420 >>> L = LogoOptions(logo_title = "Some Title String")
421 >>> L.show_yaxis = False
422 >>> repr(L)
423 """
424
425 self.creator_text = release_description,
426 self.alphabet = None
427
428 self.logo_title = ""
429 self.logo_label = ""
430 self.stacks_per_line = 20
431
432 self.unit_name = "bits"
433
434 self.show_yaxis = True
435 # yaxis_lable default depends on other settings. See LogoFormat
436 self.yaxis_label = None
437 self.yaxis_tic_interval = 1.
438 self.yaxis_minor_tic_ratio = 5
439 self.yaxis_scale = None
440
441 self.show_xaxis = True
442 self.xaxis_label = ""
443 self.xaxis_tic_interval =1
444 self.rotate_numbers = False
445 self.number_interval = 5
446 self.show_ends = False
447
448 self.show_fineprint = True
449 self.fineprint = "CodonLogo "+__version__
450
451 self.show_boxes = False
452 self.shrink_fraction = 0.5
453
454 self.show_errorbars = True
455 self.altype = True
456
457 self.errorbar_fraction = 0.90
458 self.errorbar_width_fraction = 0.25
459 self.errorbar_gray = 0.50
460
461 self.resolution = 96. # Dots per inch
462
463 self.default_color = Color.by_name("black")
464 self.color_scheme = None
465 #self.show_color_key = False # NOT yet implemented
466
467 self.debug = False
468
469 self.logo_margin = 2
470 self.stroke_width = 0.5
471 self.tic_length = 5
472
473 self.size = std_sizes["medium"]
474
475 self.stack_margin = 0.5
476 self.pad_right = False
477
478 self.small_fontsize = 6
479 self.fontsize = 10
480 self.title_fontsize = 12
481 self.number_fontsize = 8
482
483 self.text_font = "ArialMT"
484 self.logo_font = "Arial-BoldMT"
485 self.title_font = "ArialMT"
486
487 self.first_index = 1
488 self.logo_start = None
489 self.logo_end=None
490
491 # Scale width of characters proportional to gaps
492 self.scale_width = True
493
494 from corebio.utils import update
495 update(self, **kwargs)
496
497 def __repr__(self) :
498 attr = vars(self).items()
499 attr.sort()
500 args = []
501 for a in attr :
502 if a[0][0]=='_' : continue
503 args.append( '%s=%s' % ( a[0], repr(a[1])) )
504 args = ',\n'.join(args).replace('\n', '\n ')
505 return '%s(\n %s\n)' % (self.__class__.__name__, args)
506 # End class LogoOptions
507
508
509
510
511 class LogoFormat(LogoOptions) :
512 """ Specifies the format of the logo. Requires a LogoData and LogoOptions
513 objects.
514
515 >>> data = LogoData.from_seqs(seqs )
516 >>> options = LogoOptions()
517 >>> options.title = "A Logo Title"
518 >>> format = LogoFormat(data, options)
519 """
520 # TODO: Raise ArgumentErrors instead of ValueError and document
521 def __init__(self, data, options= None) :
522
523 LogoOptions.__init__(self)
524 #global offset
525 if options is not None :
526 self.__dict__.update(options.__dict__)
527
528 #offset=options.frame
529
530 self.alphabet = data.alphabet
531 self.seqlen = data.length
532 self.altype = True
533 self.show_title = False
534 self.show_xaxis_label = False
535 self.yaxis_minor_tic_interval = None
536 self.lines_per_logo = None
537 self.char_width = None
538 self.line_margin_left = None
539 self.line_margin_right = None
540 self.line_margin_bottom = None
541 self.line_margin_top = None
542 self.title_height = None
543 self.xaxis_label_height = None
544 self.line_height = None
545 self.line_width = None
546 self.logo_height = None
547 self.logo_width = None
548 self.creation_date = None
549 self.end_type = None
550
551 if self.stacks_per_line< 1 :
552 raise ArgumentError("Stacks per line should be greater than zero.",
553 "stacks_per_line" )
554
555 if self.size.stack_height<=0.0 :
556 raise ArgumentError(
557 "Stack height must be greater than zero.", "stack_height")
558 if (self.small_fontsize <= 0 or self.fontsize <=0 or
559 self.title_fontsize<=0 ):
560 raise ValueError("Font sizes must be positive.")
561
562 if self.errorbar_fraction<0.0 or self.errorbar_fraction>1.0 :
563 raise ValueError(
564 "The visible fraction of the error bar must be between zero and one.")
565
566 if self.yaxis_tic_interval<=0.0 :
567 raise ArgumentError( "The yaxis tic interval cannot be negative.",
568 'yaxis_tic_interval')
569
570 if self.size.stack_width <= 0.0 :
571 raise ValueError(
572 "The width of a stack should be a positive number.")
573
574 if self.yaxis_minor_tic_interval and \
575 self.yaxis_minor_tic_interval<=0.0 :
576 raise ValueError("Distances cannot be negative.")
577
578 if self.xaxis_tic_interval<=0 :
579 raise ValueError("Tic interval must be greater than zero.")
580
581 if self.number_interval<=0 :
582 raise ValueError("Invalid interval between numbers.")
583
584 if self.shrink_fraction<0.0 or self.shrink_fraction>1.0 :
585 raise ValueError("Invalid shrink fraction.")
586
587 if self.stack_margin<=0.0 :
588 raise ValueError("Invalid stack margin." )
589
590 if self.logo_margin<=0.0 :
591 raise ValueError("Invalid logo margin." )
592
593 if self.stroke_width<=0.0 :
594 raise ValueError("Invalid stroke width.")
595
596 if self.tic_length<=0.0 :
597 raise ValueError("Invalid tic length.")
598
599 # FIXME: More validation
600
601 # Inclusive upper and lower bounds
602 # FIXME: Validate here. Move from eps_formatter
603 if self.logo_start is None: self.logo_start = self.first_index
604
605 if self.logo_end is None :
606 self.logo_end = self.seqlen + self.first_index -1
607
608 self.total_stacks = self.logo_end - self.logo_start +1
609
610 if self.logo_start - self.first_index <0 :
611 raise ArgumentError(
612 "Logo range extends before start of available sequence.",
613 'logo_range')
614
615 if self.logo_end - self.first_index >= self.seqlen :
616 raise ArgumentError(
617 "Logo range extends beyond end of available sequence.",
618 'logo_range')
619
620 if self.logo_title : self.show_title = True
621 if not self.fineprint : self.show_fineprint = False
622 if self.xaxis_label : self.show_xaxis_label = True
623
624 if self.yaxis_label is None :
625 self.yaxis_label = self.unit_name
626
627 if self.yaxis_label :
628 self.show_yaxis_label = True
629 else :
630 self.show_yaxis_label = False
631 self.show_ends = False
632
633 if not self.yaxis_scale :
634 conversion_factor = std_units[self.unit_name]
635 if conversion_factor :
636 self.yaxis_scale=log(len(self.alphabet))*conversion_factor
637 #self.yaxis_scale=max(data.entropy)*conversion_factor
638 #marker# this is where I handle the max height. needs revision.
639 else :
640 self.yaxis_scale=1.0 # probability units
641
642 if self.yaxis_scale<=0.0 :
643 raise ValueError(('yaxis_scale', "Invalid yaxis scale"))
644 if self.yaxis_tic_interval >= self.yaxis_scale:
645 self.yaxis_tic_interval /= 2.
646
647 self.yaxis_minor_tic_interval \
648 = float(self.yaxis_tic_interval)/self.yaxis_minor_tic_ratio
649
650 if self.color_scheme is None :
651 #if self.alphabet in default_color_schemes :
652 #self.color_scheme = default_color_schemes[self.alphabet]
653 #else :
654 self.color_scheme = codonsT
655 #else:
656 #for color, symbols, desc in options.colors:
657 #try :
658 #self.color_scheme.append( ColorGroup(symbols, color, desc) )
659 #print >>sys.stderr, color_scheme.groups[2]
660 #except ValueError :
661 #raise ValueError(
662 #"error: option --color: invalid value: '%s'" % color )
663
664
665 self.lines_per_logo = 1+ ( (self.total_stacks-1) / self.stacks_per_line)
666
667 if self.lines_per_logo==1 and not self.pad_right:
668 self.stacks_per_line = min(self.stacks_per_line, self.total_stacks)
669
670 self.char_width = self.size.stack_width - 2* self.stack_margin
671
672
673 if self.show_yaxis :
674 self.line_margin_left = self.fontsize * 3.0
675 else :
676 self.line_margin_left = 0
677
678 if self.show_ends :
679 self.line_margin_right = self.fontsize *1.5
680 else :
681 self.line_margin_right = self.fontsize
682
683 if self.show_xaxis :
684 if self.rotate_numbers :
685 self.line_margin_bottom = self.number_fontsize *2.5
686 else:
687 self.line_margin_bottom = self.number_fontsize *1.5
688 else :
689 self.line_margin_bottom = 4
690
691 self.line_margin_top = 4
692
693 if self.show_title :
694 self.title_height = self.title_fontsize
695 else :
696 self.title_height = 0
697
698 self.xaxis_label_height =0.
699 if self.show_xaxis_label :
700 self.xaxis_label_height += self.fontsize
701 if self.show_fineprint :
702 self.xaxis_label_height += self.small_fontsize
703
704 self.line_height = (self.size.stack_height + self.line_margin_top +
705 self.line_margin_bottom )
706 self.line_width = (self.size.stack_width*self.stacks_per_line +
707 self.line_margin_left + self.line_margin_right )
708
709 self.logo_height = int(2*self.logo_margin + self.title_height \
710 + self.xaxis_label_height + self.line_height*self.lines_per_logo)
711 self.logo_width = int(2*self.logo_margin + self.line_width )
712
713
714 self.creation_date = datetime.now().isoformat(' ')
715
716 end_type = '-'
717 end_types = {
718 unambiguous_protein_alphabet: 'p',
719 unambiguous_rna_alphabet: '-',
720 unambiguous_dna_alphabet: 'd'
721 }
722 if self.show_ends and self.alphabet in end_types:
723 end_type = end_types[self.alphabet]
724 self.end_type = end_type
725 # End __init__
726 # End class LogoFormat
727
728
729
730 # ------ Logo Formaters ------
731 # Each formatter is a function f(LogoData, LogoFormat, output file).
732 # that draws a represntation of the logo into the given file.
733 # The main graphical formatter is eps_formatter. A mapping 'formatters'
734 # containing all available formatters is located after the formatter
735 # definitions.
736
737 def pdf_formatter(data, format, fout) :
738 """ Generate a logo in PDF format."""
739
740 feps = StringIO()
741 eps_formatter(data, format, feps)
742 feps.seek(0)
743
744 gs = GhostscriptAPI()
745 gs.convert('pdf', feps, fout, format.logo_width, format.logo_height)
746
747
748 def _bitmap_formatter(data, format, fout, device) :
749 feps = StringIO()
750 eps_formatter(data, format, feps)
751 feps.seek(0)
752
753 gs = GhostscriptAPI()
754 gs.convert(device, feps, fout,
755 format.logo_width, format.logo_height, format.resolution)
756
757
758 def jpeg_formatter(data, format, fout) :
759 """ Generate a logo in JPEG format."""
760 _bitmap_formatter(data, format, fout, device="jpeg")
761
762
763 def png_formatter(data, format, fout) :
764 """ Generate a logo in PNG format."""
765
766 _bitmap_formatter(data, format, fout, device="png")
767
768
769 def png_print_formatter(data, format, fout) :
770 """ Generate a logo in PNG format with print quality (600 DPI) resolution."""
771 format.resolution = 600
772 _bitmap_formatter(data, format, fout, device="png")
773
774
775 def txt_formatter( logodata, format, fout) :
776 """ Create a text representation of the logo data.
777 """
778 print >>fout, str(logodata)
779
780
781
782
783 def eps_formatter( logodata, format, fout) :
784 """ Generate a logo in Encapsulated Postscript (EPS)"""
785
786 subsitutions = {}
787 from_format =[
788 "creation_date", "logo_width", "logo_height",
789 "lines_per_logo", "line_width", "line_height",
790 "line_margin_right","line_margin_left", "line_margin_bottom",
791 "line_margin_top", "title_height", "xaxis_label_height",
792 "creator_text", "logo_title", "logo_margin",
793 "stroke_width", "tic_length",
794 "stacks_per_line", "stack_margin",
795 "yaxis_label", "yaxis_tic_interval", "yaxis_minor_tic_interval",
796 "xaxis_label", "xaxis_tic_interval", "number_interval",
797 "fineprint", "shrink_fraction", "errorbar_fraction",
798 "errorbar_width_fraction",
799 "errorbar_gray", "small_fontsize", "fontsize",
800 "title_fontsize", "number_fontsize", "text_font",
801 "logo_font", "title_font",
802 "logo_label", "yaxis_scale", "end_type",
803 "debug", "show_title", "show_xaxis",
804 "show_xaxis_label", "show_yaxis", "show_yaxis_label",
805 "show_boxes", "show_errorbars", "show_fineprint",
806 "rotate_numbers", "show_ends", "altype",
807
808 ]
809
810 for s in from_format :
811 subsitutions[s] = getattr(format,s)
812
813
814 from_format_size = ["stack_height", "stack_width"]
815 for s in from_format_size :
816 subsitutions[s] = getattr(format.size,s)
817
818 subsitutions["shrink"] = str(format.show_boxes).lower()
819
820
821 # --------- COLORS --------------
822 def format_color(color):
823 return " ".join( ("[",str(color.red) , str(color.green),
824 str(color.blue), "]"))
825
826 subsitutions["default_color"] = format_color(format.default_color)
827 global col
828 colors = []
829
830 if altype=="codonsT" or altype=="codonsU":
831 for group in col:
832 cf = format_color(group.color)
833 colors.append( " ("+group.symbols+") " + cf )
834 for group in format.color_scheme.groups :
835 cf = format_color(group.color)
836
837 colors.append( " ("+group.symbols+") " + cf )
838 #print >>sys.stderr,opts.colors
839 #print >>sys.stderr,logodata.options
840 #print >>sys.stderr, group.symbols
841 #print >>sys.stderr, cf
842
843
844
845 else:
846 for group in format.color_scheme.groups :
847 cf = format_color(group.color)
848 for s in group.symbols :
849 colors.append( " ("+s+") " + cf )
850
851 subsitutions["color_dict"] = "\n".join(colors)
852 data = []
853
854 # Unit conversion. 'None' for probability units
855 conv_factor = std_units[format.unit_name]
856
857 data.append("StartLine")
858
859
860 seq_from = format.logo_start- format.first_index
861 seq_to = format.logo_end - format.first_index +1
862
863 # seq_index : zero based index into sequence data
864 # logo_index : User visible coordinate, first_index based
865 # stack_index : zero based index of visible stacks
866 for seq_index in range(seq_from, seq_to) :
867 logo_index = seq_index + format.first_index
868 stack_index = seq_index - seq_from
869
870 if stack_index!=0 and (stack_index % format.stacks_per_line) ==0 :
871 data.append("")
872 data.append("EndLine")
873 data.append("StartLine")
874 data.append("")
875
876
877 if isreversed :
878 data.append("(%d) StartStack" % logo_index)
879 else :
880 data.append("(%d) StartStack" % logo_index)
881
882
883
884 if conv_factor:
885 stack_height = logodata.entropy[seq_index] * std_units[format.unit_name]
886 else :
887 stack_height = 1.0 # Probability
888
889 # if logodata.entropy_interval is not None and conv_factor:
890 # Draw Error bars
891 # low, high = logodata.entropy_interval[seq_index]
892 # center = logodata.entropy[seq_index]
893
894
895 # down = (center - low) * conv_factor
896 # up = (high - center) * conv_factor
897 # data.append(" %f %f %f DrawErrorbarFirst" % (down, up, stack_height) )
898
899 s = zip(logodata.counts[seq_index], logodata.alphabet)
900 def mycmp( c1, c2 ) :
901 # Sort by frequency. If equal frequency then reverse alphabetic
902 if c1[0] == c2[0] : return cmp(c2[1], c1[1])
903 return cmp(c1[0], c2[0])
904
905 s.sort(mycmp)
906
907 C = float(sum(logodata.counts[seq_index]))
908 if C > 0.0 :
909 fraction_width = 1.0
910 if format.scale_width :
911 fraction_width = logodata.weight[seq_index]
912 for c in s:
913 data.append(" %f %f (%s) ShowSymbol" % (fraction_width, c[0]*stack_height/C, c[1]) )
914
915 # Draw error bar on top of logo. Replaced by DrawErrorbarFirst above.
916 if logodata.entropy_interval is not None and conv_factor:
917 low, high = logodata.entropy_interval[seq_index]
918 center = logodata.entropy[seq_index]
919
920 down = (center - low) * conv_factor
921 up = (high - center) * conv_factor
922 data.append(" %f %f DrawErrorbar" % (down, up) )
923
924 data.append("EndStack")
925 data.append("")
926
927 data.append("EndLine")
928 subsitutions["logo_data"] = "\n".join(data)
929
930
931 # Create and output logo
932 template = resource_string( __name__, 'template.eps', __file__)
933 logo = Template(template).substitute(subsitutions)
934 print >>fout, logo
935
936
937 # map between output format names and logo
938 formatters = {
939 'eps': eps_formatter,
940 'pdf': pdf_formatter,
941 'png': png_formatter,
942 'png_print' : png_print_formatter,
943 'jpeg' : jpeg_formatter,
944 'txt' : txt_formatter,
945 }
946
947 default_formatter = eps_formatter
948
949
950
951
952
953 def parse_prior(composition, alphabet, weight=None) :
954 """ Parse a description of the expected monomer distribution of a sequence.
955
956 Valid compositions:
957
958 - None or 'none' : No composition sepecified
959 - 'auto' or 'automatic': Use the typical average distribution
960 for proteins and an equiprobable distribution for
961 everything else.
962 - 'equiprobable' : All monomers have the same probability.
963 - a percentage, e.g. '45%' or a fraction '0.45':
964 The fraction of CG bases for nucleotide alphabets
965 - a species name, e.g. 'E. coli', 'H. sapiens' :
966 Use the average CG percentage for the specie's
967 genome.
968 - An explicit distribution, e.g. {'A':10, 'C':40, 'G':40, 'T':10}
969 """
970
971
972 if composition is None: return None
973 comp = composition.strip()
974
975 if comp.lower() == 'none': return None
976
977 if weight is None and alphabet is not None: weight = float(len(alphabet))
978 if comp.lower() == 'equiprobable' :
979 prior = weight * equiprobable_distribution(len(alphabet))
980
981
982 elif comp.lower() == 'auto' or comp.lower() == 'automatic':
983 if alphabet == unambiguous_protein_alphabet :
984 prior = weight * asarray(aa_composition, float64)
985 else :
986 prior = weight * equiprobable_distribution(len(alphabet))
987 elif comp in std_percentCG :
988 prior = weight * base_distribution(std_percentCG[comp])
989
990 elif comp[-1] == '%' :
991 prior = weight * base_distribution( float(comp[:-1]))
992
993 elif isfloat(comp) :
994 prior = weight * base_distribution( float(comp)*100. )
995
996 elif composition[0] == '{' and composition[-1] == '}' :
997 explicit = composition[1: -1]
998 explicit = explicit.replace(',',' ').replace("'", ' ').replace('"',' ').replace(':', ' ').split()
999
1000 if len(explicit) != len(alphabet)*2 :
1001 #print explicit
1002 raise ValueError("Explicit prior does not match length of alphabet")
1003 prior = - ones(len(alphabet), float64)
1004 try :
1005 for r in range(len(explicit)/2) :
1006 letter = explicit[r*2]
1007 index = alphabet.index(letter)
1008 value = float(explicit[r*2 +1])
1009 prior[index] = value
1010 except ValueError :
1011 raise ValueError("Cannot parse explicit composition")
1012
1013 if any(prior==-1.) :
1014 raise ValueError("Explicit prior does not match alphabet")
1015 prior/= sum(prior)
1016 prior *= weight
1017
1018
1019 else :
1020 raise ValueError("Unknown or malformed composition: %s"%composition)
1021 if len(prior) != len(alphabet) :
1022 raise ValueError(
1023 "The sequence alphabet and composition are incompatible.")
1024
1025 return prior
1026
1027
1028 def base_distribution(percentCG) :
1029 A = (1. - (percentCG/100.))/2.
1030 C = (percentCG/100.)/2.
1031 G = (percentCG/100.)/2.
1032 T = (1. - (percentCG/100))/2.
1033 return asarray((A,C,G,T), float64)
1034
1035 def equiprobable_distribution( length) :
1036 return ones( (length), float64) /length
1037
1038
1039
1040
1041 def read_seq_data(fin, input_parser=seq_io.read,alphabet=None, ignore_lower_case=False, max_file_size=0):
1042 # TODO: Document this method and enviroment variable
1043 max_file_size =int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size))
1044
1045 # If max_file_size is set, or if fin==stdin (which is non-seekable), we
1046 # read the data and replace fin with a StringIO object.
1047
1048 if(max_file_size>0) :
1049 data = fin.read(max_file_size)
1050
1051 more_data = fin.read(2)
1052 if more_data != "" :
1053 raise IOError("File exceeds maximum allowed size: %d bytes" % max_file_size)
1054
1055 fin = StringIO(data)
1056 elif fin == sys.stdin:
1057 fin = StringIO(fin.read())
1058
1059 seqs = input_parser(fin)
1060
1061 if seqs is None or len(seqs) ==0 :
1062 raise ValueError("Please provide a multiple sequence alignment")
1063
1064 if ignore_lower_case :
1065 # Case is significant. Do not count lower case letters.
1066 for i,s in enumerate(seqs) :
1067 seqs[i] = s.mask()
1068
1069 global altype
1070 if(altype=="codonsT" or altype=="codonsU"):
1071 if 'T' in seqs[0] or 't' in seqs[0]:
1072 altype="codonsT"
1073 if 'U' in seqs[0] or 'u' in seqs[0]:
1074 altype="codonsU"
1075 global offset
1076 global isreversed
1077 seq2=[""]*len(seqs)
1078 seq2 = []
1079 for i in xrange(len(seqs)):
1080 seq2.append([])
1081 if(offset%6>2):
1082 for x in range(0,len(seqs)):
1083 backs=seqs[x][::-1]
1084 for y in range(0,len(backs)):
1085 seq2[x].append(str(backs[y]))
1086
1087 if(altype=="codonsU"):
1088 for x in range(0,len(seq2)):
1089 for y in range(0,len(seq2[x])):
1090 if(cmp(seq2[x][y],'G')==0):
1091 seq2[x][y]="C"
1092 elif(cmp(seq2[x][y],'A')==0):
1093 seq2[x][y]='U'
1094 elif(cmp(seq2[x][y],'U')==0):
1095 seq2[x][y]='A'
1096 elif(cmp(seq2[x][y],'C')==0):
1097 seq2[x][y]='G'
1098 if(altype=="codonsT"):
1099 for x in range(0,len(seq2)):
1100 for y in range(0,len(seq2[x])):
1101 if(cmp(seq2[x][y],'G')==0):
1102 seq2[x][y]='C'
1103 elif(cmp(seq2[x][y],'A')==0):
1104 seq2[x][y]='T'
1105 elif(cmp(seq2[x][y],'T')==0):
1106 seq2[x][y]='A'
1107 elif(cmp(seq2[x][y],'C')==0):
1108 seq2[x][y]='G'
1109 offset=offset%3
1110 isreversed=True
1111 for x in range(0,len(seqs)):
1112 seqs[x]=Seq("".join(seq2[x]))
1113
1114
1115 # Add alphabet to seqs.
1116 if alphabet :
1117 seqs.alphabet = alphabet
1118 else :
1119 seqs.alphabet = which_alphabet(seqs)
1120
1121 return seqs
1122
1123
1124
1125 #TODO: Move to seq_io?
1126 # Would have to check that no symbol outside of full alphabet?
1127 def which_alphabet(seqs) :
1128 """ Returns the most appropriate unambiguous protien, rna or dna alphabet
1129 for a Seq or SeqList.
1130 """
1131 alphabets = (unambiguous_protein_alphabet,
1132 unambiguous_rna_alphabet,
1133 unambiguous_dna_alphabet
1134 )
1135 # Heuristic
1136 # Count occurances of each letter. Downweight longer alphabet.
1137 #for x in seqs:
1138
1139
1140 if( altype=="codonsU"):
1141 return codon_alphabetU
1142 if( altype=="codonsT"):
1143 return codon_alphabetT
1144 else:
1145 score = [1.0*asarray(seqs.tally(a)).sum()/sqrt(len(a)) for a in alphabets]
1146 #print score
1147 best = argmax(score) # Ties go to last in list.
1148 a = alphabets[best]
1149 return a
1150
1151
1152
1153 class LogoData(object) :
1154 """The data needed to generate a sequence logo.
1155
1156 - alphabet
1157 - length
1158 - counts -- An array of character counts
1159 - entropy -- The relative entropy of each column
1160 - entropy_interval -- entropy confidence interval
1161 """
1162
1163 def __init__(self, length=None, alphabet = None, counts =None,
1164 entropy =None, entropy_interval = None, weight=None) :
1165 """Creates a new LogoData object"""
1166 self.length = length
1167 self.alphabet = alphabet
1168 self.counts = counts
1169 self.entropy = entropy
1170 self.entropy_interval = entropy_interval
1171 self.weight = weight
1172
1173
1174
1175 #@classmethod
1176 def from_counts(cls, alphabet, counts, prior= None):
1177
1178 """Build a logodata object from counts."""
1179 seq_length, A = counts.shape
1180
1181 if prior is not None: prior = array(prior, float64)
1182 if prior is None :
1183 R = log(A)
1184 ent = zeros( seq_length, float64)
1185 entropy_interval = None
1186 for i in range (0, seq_length) :
1187 C = sum(counts[i])
1188 #FIXME: fixup corebio.moremath.entropy()?
1189 if C == 0 :
1190 ent[i] = 0.0
1191 else :
1192 ent[i] = R - entropy(counts[i])
1193 else :
1194 ent = zeros( seq_length, float64)
1195 entropy_interval = zeros( (seq_length,2) , float64)
1196
1197 R = log(A)
1198
1199 for i in range (0, seq_length) :
1200 alpha = array(counts[i] , float64)
1201 alpha += prior
1202
1203 posterior = Dirichlet(alpha)
1204 ent[i] = posterior.mean_relative_entropy(prior/sum(prior))
1205 entropy_interval[i][0], entropy_interval[i][1] = \
1206 posterior.interval_relative_entropy(prior/sum(prior), 0.95)
1207 weight = array( na.sum(counts,axis=1) , float)
1208
1209 weight /= max(weight)
1210
1211 return cls(seq_length, alphabet, counts, ent, entropy_interval, weight)
1212 from_counts = classmethod(from_counts)
1213
1214
1215 #@classmethod
1216 def from_seqs(cls, seqs, prior= None):
1217
1218
1219 alphabet=seqs.alphabet
1220
1221 #get the offset and if it's greater than 2 swap bases to get the reverse compliment and reverse.
1222 for x in range(0,len(seqs)):
1223 seqs[x]=seqs[x].upper()
1224 counter=0
1225
1226
1227 """Build a 2D array from a SeqList, a list of sequences."""
1228 # --- VALIDATE DATA ---
1229 # check that there is at least one sequence of length 1
1230 if len(seqs)==0 or len(seqs[0]) ==0:
1231 raise ValueError("No sequence data found.")
1232 sys.exit(0)
1233 # Check sequence lengths
1234 seq_length = len(seqs[0])
1235 for i,s in enumerate(seqs) :
1236 #print i,s, len(s)
1237 if seq_length != len(s) :
1238 raise ArgumentError(
1239 "Sequence number %d differs in length from the previous sequences" % (i+1) ,'sequences')
1240 sys.exit(0)
1241
1242 if(altype=="codonsT" or altype=="codonsU"):
1243 x = [[0]*64 for x in xrange(seq_length/3)]
1244 counter=offset
1245
1246 while counter+offset<seq_length:
1247 for i in range(0,len(seqs)):
1248 if len(str(seqs[i][(counter):(counter+3)]))==3 and len(seqs[i][(counter):(counter+3)].strip("GATUC"))==0 :
1249 if(str(seqs[i][(counter):(counter+3)]) in alphabet):
1250 x[counter/3][ (alphabet.index(str(seqs[i][(counter):(counter+3)]))) ]+=1
1251 counter=counter+3
1252 counts=asarray(x)
1253 else:
1254 counts = asarray(seqs.tally())
1255
1256 return cls.from_counts(alphabet, counts, prior)
1257 from_seqs = classmethod(from_seqs)
1258
1259
1260
1261 def __str__(self) :
1262 out = StringIO()
1263 print >>out, '## LogoData'
1264 print >>out, '# First column is position number, couting from zero'
1265 print >>out, '# Subsequent columns are raw symbol counts'
1266 print >>out, '# Entropy is mean entropy measured in nats.'
1267 print >>out, '# Low and High are the 95% confidence limits.'
1268 print >>out, '# Weight is the fraction of non-gap symbols in the column.'
1269 print >>out, '#\t'
1270 print >>out, '#\t',
1271 for a in self.alphabet :
1272 print >>out, a, '\t',
1273 print >>out, 'Entropy\tLow\tHigh\tWeight'
1274
1275 for i in range(self.length) :
1276 print >>out, i, '\t',
1277 for c in self.counts[i] : print >>out, c, '\t',
1278 print >>out, self.entropy[i], '\t',
1279 if self.entropy_interval is not None:
1280 print >>out, self.entropy_interval[i][0], '\t',
1281 print >>out, self.entropy_interval[i][1], '\t',
1282 else :
1283 print >>out, '\t','\t',
1284 if self.weight is not None :
1285 print >>out, self.weight[i],
1286 print >>out, ''
1287 print >>out, '# End LogoData'
1288
1289 return out.getvalue()
1290
1291 # ====================== Main: Parse Command line =============================
1292 def main():
1293 """CodonLogo command line interface """
1294
1295 # ------ Parse Command line ------
1296 parser = _build_option_parser()
1297 (opts, args) = parser.parse_args(sys.argv[1:])
1298 if args : parser.error("Unparsable arguments: %s " % args)
1299
1300 if opts.serve:
1301 httpd_serve_forever(opts.port) # Never returns?
1302 sys.exit(0)
1303
1304
1305 # ------ Create Logo ------
1306 try:
1307 data = _build_logodata(opts)
1308
1309
1310 format = _build_logoformat(data, opts)
1311
1312
1313 formatter = opts.formatter
1314 formatter(data, format, opts.fout)
1315
1316 except ValueError, err :
1317 print >>sys.stderr, 'Error:', err
1318 sys.exit(2)
1319 except KeyboardInterrupt, err:
1320 sys.exit(0)
1321 # End main()
1322
1323
1324 def httpd_serve_forever(port=8080) :
1325 """ Start a webserver on a local port."""
1326 import BaseHTTPServer
1327 import CGIHTTPServer
1328
1329 class __HTTPRequestHandler(CGIHTTPServer.CGIHTTPRequestHandler):
1330 def is_cgi(self) :
1331 if self.path == "/create.cgi":
1332 self.cgi_info = '', 'create.cgi'
1333 return True
1334 return False
1335
1336 # Add current directory to PYTHONPATH. This is
1337 # so that we can run the standalone server
1338 # without having to run the install script.
1339 pythonpath = os.getenv("PYTHONPATH", '')
1340 pythonpath += ":" + os.path.abspath(sys.path[0]).split()[0]
1341 os.environ["PYTHONPATH"] = pythonpath
1342
1343 htdocs = resource_filename(__name__, 'htdocs', __file__)
1344 os.chdir(htdocs)
1345
1346 HandlerClass = __HTTPRequestHandler
1347 ServerClass = BaseHTTPServer.HTTPServer
1348 httpd = ServerClass(('', port), HandlerClass)
1349 print "Serving HTTP on localhost:%d ..." % port
1350
1351 try :
1352 httpd.serve_forever()
1353 except KeyboardInterrupt:
1354 sys.exit(0)
1355 # end httpd_serve_forever()
1356
1357 def read_priors(finp, alphabet ,max_file_size=0):
1358
1359 max_file_size =int(os.environ.get("WEBLOGO_MAX_FILE_SIZE", max_file_size))
1360 if(max_file_size>0) :
1361 data = finp.read(max_file_size)
1362 more_data = finp.read(2)
1363 if more_data != "" :
1364 raise IOError("File exceeds maximum allowed size: %d bytes" % max_file_size)
1365 finp = StringIO(data)
1366 priordict={}
1367 while 1:
1368 line = finp.readline()
1369 if not line:
1370 break
1371 line = line.split()
1372 priordict[line[0]]=line[1]
1373 return priordict
1374
1375 def _build_logodata(options) :
1376 global offset
1377 offset=options.frame
1378 options.alphabet = None
1379 options.ignore_lower_case = False
1380 #options.default_color = Color.by_name("black")
1381 options.color_scheme=None
1382 #options.colors=[]
1383 options.show_ends=False
1384 seqs = read_seq_data(options.fin,
1385 options.input_parser.read,
1386 alphabet=options.alphabet,
1387 ignore_lower_case = options.ignore_lower_case)
1388 if(options.priorfile!=None):
1389 if(altype=="CodonsT"):
1390 options.composition= str(read_priors(options.priorfile,codon_alphabetT))
1391 options.alphabet = codon_alphabetT
1392 else:
1393 options.composition= str(read_priors(options.priorfile,codon_alphabetU))
1394 options.alphabet = codon_alphabetU
1395
1396 prior = parse_prior( options.composition,seqs.alphabet, options.weight)
1397 data = LogoData.from_seqs(seqs, prior)
1398 return data
1399
1400
1401 def _build_logoformat( logodata, opts) :
1402 """ Extract and process relevant option values and return a
1403 LogoFormat object."""
1404
1405 args = {}
1406 direct_from_opts = [
1407 "stacks_per_line",
1408 "logo_title",
1409 "yaxis_label",
1410 "show_xaxis",
1411 "show_yaxis",
1412 "xaxis_label",
1413 "show_ends",
1414 "fineprint",
1415 "show_errorbars",
1416 "show_boxes",
1417 "yaxis_tic_interval",
1418 "resolution",
1419 "alphabet",
1420 "debug",
1421 "show_ends",
1422 "default_color",
1423 #"show_color_key",
1424 "color_scheme",
1425 "unit_name",
1426 "logo_label",
1427 "yaxis_scale",
1428 "first_index",
1429 "logo_start",
1430 "logo_end",
1431 "scale_width",
1432 "frame",
1433 ]
1434
1435 for k in direct_from_opts:
1436 args[k] = opts.__dict__[k]
1437 logo_size = copy.copy(opts.__dict__['logo_size'])
1438 size_from_opts = ["stack_width", "stack_height"]
1439 for k in size_from_opts :
1440 length = getattr(opts, k)
1441 if length : setattr( logo_size, k, length )
1442 args["size"] = logo_size
1443
1444 global col
1445
1446 if opts.colors:
1447 color_scheme = ColorScheme()
1448 for color, symbols, desc in opts.colors:
1449 try :
1450 #c = Color.from_string(color)
1451 color_scheme.groups.append( ColorGroup(symbols, color, desc) )
1452 #print >> sys.stderr, color
1453
1454 col.append( ColorGroup(symbols, color, desc) )
1455
1456 except ValueError :
1457 raise ValueError(
1458 "error: option --color: invalid value: '%s'" % color )
1459 if(altype!="codonsU" and altype!="codonsT") :
1460 args["color_scheme"] = color_scheme
1461
1462 #cf = colorscheme.format_color(col[0])
1463 #col.append( " ("+group.symbols+") " + cf )
1464
1465 logooptions = LogoOptions()
1466 for a, v in args.iteritems() :
1467 setattr(logooptions,a,v)
1468
1469
1470
1471 theformat = LogoFormat(logodata, logooptions )
1472
1473 return theformat
1474
1475
1476
1477
1478
1479
1480 # ========================== OPTIONS ==========================
1481 def _build_option_parser() :
1482 defaults = LogoOptions()
1483 parser = DeOptionParser(usage="%prog [options] < sequence_data.fa > sequence_logo.eps",
1484 description = description,
1485 version = __version__ ,
1486 add_verbose_options = False
1487 )
1488
1489 io_grp = OptionGroup(parser, "Input/Output Options",)
1490 data_grp = OptionGroup(parser, "Logo Data Options",)
1491 format_grp = OptionGroup(parser, "Logo Format Options",
1492 "These options control the format and display of the logo.")
1493 color_grp = OptionGroup(parser, "Color Options",
1494 "Colors can be specified using CSS2 syntax. e.g. 'red', '#FF0000', etc.")
1495 advanced_grp = OptionGroup(parser, "Advanced Format Options",
1496 "These options provide fine control over the display of the logo. ")
1497 server_grp = OptionGroup(parser, "CodonLogo Server",
1498 "Run a standalone webserver on a local port.")
1499
1500
1501 parser.add_option_group(io_grp)
1502 parser.add_option_group(data_grp)
1503 parser.add_option_group(format_grp)
1504 parser.add_option_group(color_grp)
1505 parser.add_option_group(advanced_grp)
1506 parser.add_option_group(server_grp)
1507
1508 # ========================== IO OPTIONS ==========================
1509
1510
1511
1512 io_grp.add_option( "-f", "--fin",
1513 dest="fin",
1514 action="store",
1515 type="file_in",
1516 default=sys.stdin,
1517 help="Sequence input file (default: stdin)",
1518 metavar="FILENAME")
1519
1520 io_grp.add_option( "-R", "--prior",
1521 dest="priorfile",
1522 action="store",
1523 type="file_in",
1524 help="A file with 64 codons and their prior probabilities, one per line, each codon followed by a space and it's probability.",
1525 metavar="FILENAME")
1526
1527 io_grp.add_option("", "--fin-format",
1528 dest="input_parser",
1529 action="store", type ="dict",
1530 default = seq_io,
1531 choices = seq_io.format_names(),
1532 help="Multiple sequence alignment format: (%s)" %
1533 ', '.join([ f.names[0] for f in seq_io.formats]),
1534 metavar="FORMAT")
1535
1536 io_grp.add_option("-o", "--fout", dest="fout",
1537 type="file_out",
1538 default=sys.stdout,
1539 help="Output file (default: stdout)",
1540 metavar="FILENAME")
1541
1542
1543
1544 io_grp.add_option( "-F", "--format",
1545 dest="formatter",
1546 action="store",
1547 type="dict",
1548 choices = formatters,
1549 metavar= "FORMAT",
1550 help="Format of output: eps (default), png, png_print, pdf, jpeg, txt",
1551 default = default_formatter)
1552
1553
1554 # ========================== Data OPTIONS ==========================
1555
1556 data_grp.add_option("-m", "--frame",
1557 dest="frame",
1558 action="store",
1559 type="int",
1560 default=0,
1561 help="Offset of the reading frame you wish to look in (default: 0) 3-6 indicate the reverse complement in which case codonlogo will read from the last symbol in the sequences backwards and replace each base with it's complement.",
1562 metavar="NUMBER")
1563
1564 #data_grp.add_option("-T", "--type",
1565 #dest="altype",
1566 #action="store",
1567 #type="boolean",
1568 #default=True,
1569 #help="Generate a codon logo rather than a sequence logo (default: True)",
1570 #metavar="YES/NO")
1571
1572
1573
1574 #data_grp.add_option( "-A", "--sequence-type",
1575 #dest="alphabet",
1576 #action="store",
1577 #type="dict",
1578 #choices = std_alphabets,
1579 #help="The type of sequence data: 'protein', 'rna' or 'dna'.",
1580 #metavar="TYPE")
1581
1582 #data_grp.add_option( "-a", "--alphabet",
1583 #dest="alphabet",
1584 #action="store",
1585 #help="The set of symbols to count, e.g. 'AGTC'. "
1586 #"All characters not in the alphabet are ignored. "
1587 #"If neither the alphabet nor sequence-type are specified then codonlogo will examine the input data and make an educated guess. "
1588 #"See also --sequence-type, --ignore-lower-case" )
1589
1590 # FIXME Add test?
1591 #data_grp.add_option( "", "--ignore-lower-case",
1592 #dest="ignore_lower_case",
1593 #action="store",
1594 #type = "boolean",
1595 #default=False,
1596 #metavar = "YES/NO",
1597 #help="Disregard lower case letters and only count upper case letters in sequences? (Default: No)"
1598 #)
1599
1600 data_grp.add_option( "-U", "--units",
1601 dest="unit_name",
1602 action="store",
1603 choices = std_units.keys(),
1604 type="choice",
1605 default = defaults.unit_name,
1606 help="A unit of entropy ('bits' (default), 'nats', 'digits'), or a unit of free energy ('kT', 'kJ/mol', 'kcal/mol'), or 'probability' for probabilities",
1607 metavar = "NUMBER")
1608
1609
1610 data_grp.add_option( "", "--composition",
1611 dest="composition",
1612 action="store",
1613 type="string",
1614 default = "auto",
1615 help="The expected composition of the sequences: 'auto' (default), 'equiprobable', 'none' (Do not perform any compositional adjustment), ",
1616 metavar="COMP.")
1617
1618 data_grp.add_option( "", "--weight",
1619 dest="weight",
1620 action="store",
1621 type="float",
1622 default = None,
1623 help="The weight of prior data. Default: total pseudocounts equal to the number of monomer types.",
1624 metavar="NUMBER")
1625
1626 data_grp.add_option( "-i", "--first-index",
1627 dest="first_index",
1628 action="store",
1629 type="int",
1630 default = 1,
1631 help="Index of first position in sequence data (default: 1)",
1632 metavar="INDEX")
1633
1634 data_grp.add_option( "-l", "--lower",
1635 dest="logo_start",
1636 action="store",
1637 type="int",
1638 help="Lower bound of sequence to display",
1639 metavar="INDEX")
1640
1641 data_grp.add_option( "-u", "--upper",
1642 dest="logo_end",
1643 action="store",
1644 type="int",
1645 help="Upper bound of sequence to display",
1646 metavar="INDEX")
1647
1648 # ========================== FORMAT OPTIONS ==========================
1649
1650 format_grp.add_option( "-s", "--size",
1651 dest="logo_size",
1652 action="store",
1653 type ="dict",
1654 choices = std_sizes,
1655 metavar = "LOGOSIZE",
1656 default = defaults.size,
1657 help="Specify a standard logo size (small, medium (default), large)" )
1658
1659
1660
1661 format_grp.add_option( "-n", "--stacks-per-line",
1662 dest="stacks_per_line",
1663 action="store",
1664 type="int",
1665 help="Maximum number of logo stacks per logo line. (default: %default)",
1666 default = defaults.stacks_per_line,
1667 metavar="COUNT")
1668
1669 format_grp.add_option( "-t", "--title",
1670 dest="logo_title",
1671 action="store",
1672 type="string",
1673 help="Logo title text.",
1674 default = defaults.logo_title,
1675 metavar="TEXT")
1676
1677 format_grp.add_option( "", "--label",
1678 dest="logo_label",
1679 action="store",
1680 type="string",
1681 help="A figure label, e.g. '2a'",
1682 default = defaults.logo_label,
1683 metavar="TEXT")
1684
1685 format_grp.add_option( "-X", "--show-xaxis",
1686 action="store",
1687 type = "boolean",
1688 default= defaults.show_xaxis,
1689 metavar = "YES/NO",
1690 help="Display sequence numbers along x-axis? (default: %default)")
1691
1692 format_grp.add_option( "-x", "--xlabel",
1693 dest="xaxis_label",
1694 action="store",
1695 type="string",
1696 default = defaults.xaxis_label,
1697 help="X-axis label",
1698 metavar="TEXT")
1699
1700 format_grp.add_option( "-S", "--yaxis",
1701 dest="yaxis_scale",
1702 action="store",
1703 type="float",
1704 help="Height of yaxis in units. (Default: Maximum value with uninformative prior.)",
1705 metavar = "UNIT")
1706
1707 format_grp.add_option( "-Y", "--show-yaxis",
1708 action="store",
1709 type = "boolean",
1710 dest = "show_yaxis",
1711 default= defaults.show_yaxis,
1712 metavar = "YES/NO",
1713 help="Display entropy scale along y-axis? (default: %default)")
1714
1715 format_grp.add_option( "-y", "--ylabel",
1716 dest="yaxis_label",
1717 action="store",
1718 type="string",
1719 help="Y-axis label (default depends on plot type and units)",
1720 metavar="TEXT")
1721
1722 #format_grp.add_option( "-E", "--show-ends",
1723 #action="store",
1724 #type = "boolean",
1725 #default= defaults.show_ends,
1726 #metavar = "YES/NO",
1727 #help="Label the ends of the sequence? (default: %default)")
1728
1729 format_grp.add_option( "-P", "--fineprint",
1730 dest="fineprint",
1731 action="store",
1732 type="string",
1733 default= defaults.fineprint,
1734 help="The fine print (default: Codonlogo version)",
1735 metavar="TEXT")
1736
1737 format_grp.add_option( "", "--ticmarks",
1738 dest="yaxis_tic_interval",
1739 action="store",
1740 type="float",
1741 default= defaults.yaxis_tic_interval,
1742 help="Distance between ticmarks (default: %default)",
1743 metavar = "NUMBER")
1744
1745
1746 format_grp.add_option( "", "--errorbars",
1747 dest = "show_errorbars",
1748 action="store",
1749 type = "boolean",
1750 default= defaults.show_errorbars,
1751 metavar = "YES/NO",
1752 help="Display error bars? (default: %default)")
1753
1754
1755
1756 # ========================== Color OPTIONS ==========================
1757 # TODO: Future Feature
1758 # color_grp.add_option( "-K", "--color-key",
1759 # dest= "show_color_key",
1760 # action="store",
1761 # type = "boolean",
1762 # default= defaults.show_color_key,
1763 # metavar = "YES/NO",
1764 # help="Display a color key (default: %default)")
1765
1766
1767 #color_scheme_choices = std_color_schemes.keys()
1768 #color_scheme_choices.sort()
1769 #color_grp.add_option( "-c", "--color-scheme",
1770 #dest="color_scheme",
1771 #action="store",
1772 #type ="dict",
1773 #choices = std_color_schemes,
1774 #metavar = "SCHEME",
1775 #default = None, # Auto
1776 #help="Specify a standard color scheme (%s)" % \
1777 #", ".join(color_scheme_choices) )
1778
1779 color_grp.add_option( "-C", "--color",
1780 dest="colors",
1781 action="append",
1782 metavar="COLOR SYMBOLS DESCRIPTION ",
1783 nargs = 3,
1784 default=[],
1785 help="Specify symbol colors, e.g. --color black AG 'Purine' --color red TC 'Pyrimidine' ")
1786
1787 color_grp.add_option( "", "--default-color",
1788 dest="default_color",
1789 action="store",
1790 metavar="COLOR",
1791 default= defaults.default_color,
1792 help="Symbol color if not otherwise specified.")
1793
1794 # ========================== Advanced options =========================
1795
1796 advanced_grp.add_option( "-W", "--stack-width",
1797 dest="stack_width",
1798 action="store",
1799 type="float",
1800 default= None,
1801 help="Width of a logo stack (default: %s)"% defaults.size.stack_width,
1802 metavar="POINTS" )
1803
1804 advanced_grp.add_option( "-H", "--stack-height",
1805 dest="stack_height",
1806 action="store",
1807 type="float",
1808 default= None,
1809 help="Height of a logo stack (default: %s)"%defaults.size.stack_height,
1810 metavar="POINTS" )
1811
1812 advanced_grp.add_option( "", "--box",
1813 dest="show_boxes",
1814 action="store",
1815 type = "boolean",
1816 default=False,
1817 metavar = "YES/NO",
1818 help="Draw boxes around symbols? (default: no)")
1819
1820 advanced_grp.add_option( "", "--resolution",
1821 dest="resolution",
1822 action="store",
1823 type="float",
1824 default=96,
1825 help="Bitmap resolution in dots per inch (DPI). (default: 96 DPI, except png_print, 600 DPI) Low resolution bitmaps (DPI<300) are antialiased.",
1826 metavar="DPI")
1827
1828 advanced_grp.add_option( "", "--scale-width",
1829 dest="scale_width",
1830 action="store",
1831 type = "boolean",
1832 default= True,
1833 metavar = "YES/NO",
1834 help="Scale the visible stack width by the fraction of symbols in the column? (i.e. columns with many gaps of unknowns are narrow.) (default: yes)")
1835
1836 advanced_grp.add_option( "", "--debug",
1837 action="store",
1838 type = "boolean",
1839 default= defaults.debug,
1840 metavar = "YES/NO",
1841 help="Output additional diagnostic information. (default: %default)")
1842
1843
1844 # ========================== Server options =========================
1845 server_grp.add_option( "", "--serve",
1846 dest="serve",
1847 action="store_true",
1848 default= False,
1849 help="Start a standalone CodonLogo server for creating sequence logos.")
1850
1851 server_grp.add_option( "", "--port",
1852 dest="port",
1853 action="store",
1854 type="int",
1855 default= 8080,
1856 help="Listen to this local port. (Default: %default)",
1857 metavar="PORT")
1858
1859 return parser
1860
1861 # END _build_option_parser
1862
1863
1864 ##############################################################
1865
1866 class Dirichlet(object) :
1867 """The Dirichlet probability distribution. The Dirichlet is a continuous
1868 multivariate probability distribution across non-negative unit length
1869 vectors. In other words, the Dirichlet is a probability distribution of
1870 probability distributions. It is conjugate to the multinomial
1871 distribution and is widely used in Bayesian statistics.
1872
1873 The Dirichlet probability distribution of order K-1 is
1874
1875 p(theta_1,...,theta_K) d theta_1 ... d theta_K =
1876 (1/Z) prod_i=1,K theta_i^{alpha_i - 1} delta(1 -sum_i=1,K theta_i)
1877
1878 The normalization factor Z can be expressed in terms of gamma functions:
1879
1880 Z = {prod_i=1,K Gamma(alpha_i)} / {Gamma( sum_i=1,K alpha_i)}
1881
1882 The K constants, alpha_1,...,alpha_K, must be positive. The K parameters,
1883 theta_1,...,theta_K are nonnegative and sum to 1.
1884
1885 Status:
1886 Alpha
1887 """
1888 __slots__ = 'alpha', '_total', '_mean',
1889
1890
1891
1892
1893 def __init__(self, alpha) :
1894 """
1895 Args:
1896 - alpha -- The parameters of the Dirichlet prior distribution.
1897 A vector of non-negative real numbers.
1898 """
1899 # TODO: Check that alphas are positive
1900 #TODO : what if alpha's not one dimensional?
1901 self.alpha = asarray(alpha, float64)
1902
1903 self._total = sum(alpha)
1904 self._mean = None
1905
1906
1907 def sample(self) :
1908 """Return a randomly generated probability vector.
1909
1910 Random samples are generated by sampling K values from gamma
1911 distributions with parameters a=\alpha_i, b=1, and renormalizing.
1912
1913 Ref:
1914 A.M. Law, W.D. Kelton, Simulation Modeling and Analysis (1991).
1915 Authors:
1916 Gavin E. Crooks <gec@compbio.berkeley.edu> (2002)
1917 """
1918 alpha = self.alpha
1919 K = len(alpha)
1920 theta = zeros( (K,), float64)
1921
1922 for k in range(K):
1923 theta[k] = random.gammavariate(alpha[k], 1.0)
1924 theta /= sum(theta)
1925
1926 return theta
1927
1928 def mean(self) :
1929 if self._mean ==None:
1930 self._mean = self.alpha / self._total
1931 return self._mean
1932
1933 def covariance(self) :
1934 alpha = self.alpha
1935 A = sum(alpha)
1936 #A2 = A * A
1937 K = len(alpha)
1938 cv = zeros( (K,K), float64)
1939
1940 for i in range(K) :
1941 cv[i,i] = alpha[i] * (1. - alpha[i]/A) / (A * (A+1.) )
1942
1943 for i in range(K) :
1944 for j in range(i+1,K) :
1945 v = - alpha[i] * alpha[j] / (A * A * (A+1.) )
1946 cv[i,j] = v
1947 cv[j,i] = v
1948 return cv
1949
1950 def mean_x(self, x) :
1951 x = asarray(x, float64)
1952 if shape(x) != shape(self.alpha) :
1953 raise ValueError("Argument must be same dimension as Dirichlet")
1954 return sum( x * self.mean())
1955
1956 def variance_x(self, x) :
1957 x = asarray(x, float64)
1958 if shape(x) != shape(self.alpha) :
1959 raise ValueError("Argument must be same dimension as Dirichlet")
1960
1961 cv = self.covariance()
1962 var = na.dot(na.dot(na.transpose( x), cv), x)
1963 return var
1964
1965
1966 def mean_entropy(self) :
1967 """Calculate the average entropy of probabilities sampled
1968 from this Dirichlet distribution.
1969
1970 Returns:
1971 The average entropy.
1972
1973 Ref:
1974 Wolpert & Wolf, PRE 53:6841-6854 (1996) Theorem 7
1975 (Warning: this paper contains typos.)
1976 Status:
1977 Alpha
1978 Authors:
1979 GEC 2005
1980
1981 """
1982 # TODO: Optimize
1983 alpha = self.alpha
1984 A = float(sum(alpha))
1985 ent = 0.0
1986 for a in alpha:
1987 if a>0 : ent += - 1.0 * a * digamma( 1.0+a) # FIXME: Check
1988 ent /= A
1989 ent += digamma(A+1.0)
1990 return ent
1991
1992
1993
1994 def variance_entropy(self):
1995 """Calculate the variance of the Dirichlet entropy.
1996
1997 Ref:
1998 Wolpert & Wolf, PRE 53:6841-6854 (1996) Theorem 8
1999 (Warning: this paper contains typos.)
2000 """
2001 alpha = self.alpha
2002 A = float(sum(alpha))
2003 A2 = A * (A+1)
2004 L = len(alpha)
2005
2006 dg1 = zeros( (L) , float64)
2007 dg2 = zeros( (L) , float64)
2008 tg2 = zeros( (L) , float64)
2009
2010 for i in range(L) :
2011 dg1[i] = digamma(alpha[i] + 1.0)
2012 dg2[i] = digamma(alpha[i] + 2.0)
2013 tg2[i] = trigamma(alpha[i] + 2.0)
2014
2015 dg_Ap2 = digamma( A+2. )
2016 tg_Ap2 = trigamma( A+2. )
2017
2018 mean = self.mean_entropy()
2019 var = 0.0
2020
2021 for i in range(L) :
2022 for j in range(L) :
2023 if i != j :
2024 var += (
2025 ( dg1[i] - dg_Ap2 ) * (dg1[j] - dg_Ap2 ) - tg_Ap2
2026 ) * (alpha[i] * alpha[j] ) / A2
2027 else :
2028 var += (
2029 ( dg2[i] - dg_Ap2 ) **2 + ( tg2[i] - tg_Ap2 )
2030 ) * ( alpha[i] * (alpha[i]+1.) ) / A2
2031
2032 var -= mean**2
2033 return var
2034
2035
2036
2037 def mean_relative_entropy(self, pvec) :
2038 ln_p = na.log(pvec)
2039 return - self.mean_x(ln_p) - self.mean_entropy()
2040
2041
2042 def variance_relative_entropy(self, pvec) :
2043 ln_p = na.log(pvec)
2044 return self.variance_x(ln_p) + self.variance_entropy()
2045
2046
2047 def interval_relative_entropy(self, pvec, frac) :
2048 mean = self.mean_relative_entropy(pvec)
2049 variance = self.variance_relative_entropy(pvec)
2050 # If the variance is small, use the standard 95%
2051 # confidence interval: mean +/- 1.96 * sd
2052 if variance< 0.1 :
2053 sd = sqrt(variance)
2054 return max(0.0, mean - sd*1.96), mean + sd*1.96
2055 sd = sqrt(variance)
2056 return max(0.0, mean - sd*1.96), mean + sd*1.96
2057
2058 g = gamma.from_mean_variance(mean, variance)
2059 low_limit = g.inverse_cdf( (1.-frac)/2.)
2060 high_limit = g.inverse_cdf( 1. - (1.-frac)/2. )
2061
2062 return low_limit, high_limit
2063
2064
2065 # Standard python voodoo for CLI
2066 if __name__ == "__main__":
2067 ## Code Profiling. Uncomment these lines
2068 #import hotshot, hotshot.stats
2069 #prof = hotshot.Profile("stones.prof")
2070 #prof.runcall(main)
2071 #prof.close()
2072 #stats = hotshot.stats.load("stones.prof")
2073 #stats.strip_dirs()
2074 #stats.sort_stats('cumulative', 'calls')
2075 #stats.print_stats(40)
2076 #sys.exit()
2077
2078 main()
2079
2080
2081
2082
2083
2084