0
|
1
|
|
2 # Copyright (c) 2005 Gavin E. Crooks <gec@compbio.berkeley.edu>
|
|
3 #
|
|
4 # This software is distributed under the MIT Open Source License.
|
|
5 # <http://www.opensource.org/licenses/mit-license.html>
|
|
6 #
|
|
7 # Permission is hereby granted, free of charge, to any person obtaining a
|
|
8 # copy of this software and associated documentation files (the "Software"),
|
|
9 # to deal in the Software without restriction, including without limitation
|
|
10 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
11 # and/or sell copies of the Software, and to permit persons to whom the
|
|
12 # Software is furnished to do so, subject to the following conditions:
|
|
13 #
|
|
14 # The above copyright notice and this permission notice shall be included
|
|
15 # in all copies or substantial portions of the Software.
|
|
16 #
|
|
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
23 # THE SOFTWARE.
|
|
24 #
|
|
25
|
|
26
|
|
27
|
|
28 """ Alphabetic sequences and associated tools and data.
|
|
29
|
|
30 Seq is a subclass of a python string with additional annotation and an alphabet.
|
|
31 The characters in string must be contained in the alphabet. Various standard
|
|
32 alphabets are provided.
|
|
33
|
|
34
|
|
35 Classes :
|
|
36 Alphabet -- A subset of non-null ascii characters
|
|
37 Seq -- An alphabetic string
|
|
38 SeqList -- A collection of Seq's
|
|
39
|
|
40 Alphabets :
|
|
41 o generic_alphabet -- A generic alphabet. Any printable ASCII character.
|
|
42 o protein_alphabet -- IUCAP/IUB Amino Acid one letter codes.
|
|
43 o nucleic_alphabet -- IUPAC/IUB Nucleic Acid codes 'ACGTURYSWKMBDHVN-'
|
|
44 o dna_alphabet -- Same as nucleic_alphabet, with 'U' (Uracil) an
|
|
45 alternative for 'T' (Thymidine).
|
|
46 o rna_alphabet -- Same as nucleic_alphabet, with 'T' (Thymidine) an
|
|
47 alternative for 'U' (Uracil).
|
|
48 o reduced_nucleic_alphabet -- All ambiguous codes in 'nucleic_alphabet' are
|
|
49 alternative to 'N' (aNy)
|
|
50 o reduced_protein_alphabet -- All ambiguous ('BZJ') and non-canonical amino
|
|
51 acids codes ( 'U', Selenocysteine and 'O', Pyrrolysine) in
|
|
52 'protein_alphabet' are alternative to 'X'.
|
|
53 o unambiguous_dna_alphabet -- 'ACGT'
|
|
54 o unambiguous_rna_alphabet -- 'ACGU'
|
|
55 o unambiguous_protein_alphabet -- The twenty canonical amino acid one letter
|
|
56 codes, in alphabetic order, 'ACDEFGHIKLMNPQRSTVWY'
|
|
57
|
|
58 Amino Acid Codes:
|
|
59 Code Alt. Meaning
|
|
60 -----------------
|
|
61 A Alanine
|
|
62 B Aspartic acid or Asparagine
|
|
63 C Cysteine
|
|
64 D Aspartate
|
|
65 E Glutamate
|
|
66 F Phenylalanine
|
|
67 G Glycine
|
|
68 H Histidine
|
|
69 I Isoleucine
|
|
70 J Leucine or Isoleucine
|
|
71 K Lysine
|
|
72 L Leucine
|
|
73 M Methionine
|
|
74 N Asparagine
|
|
75 O Pyrrolysine
|
|
76 P Proline
|
|
77 Q Glutamine
|
|
78 R Arginine
|
|
79 S Serine
|
|
80 T Threonine
|
|
81 U Selenocysteine
|
|
82 V Valine
|
|
83 W Tryptophan
|
|
84 Y Tyrosine
|
|
85 Z Glutamate or Glutamine
|
|
86 X ? any
|
|
87 * translation stop
|
|
88 - .~ gap
|
|
89
|
|
90 Nucleotide Codes:
|
|
91 Code Alt. Meaning
|
|
92 ------------------------------
|
|
93 A Adenosine
|
|
94 C Cytidine
|
|
95 G Guanine
|
|
96 T Thymidine
|
|
97 U Uracil
|
|
98 R G A (puRine)
|
|
99 Y T C (pYrimidine)
|
|
100 K G T (Ketone)
|
|
101 M A C (aMino group)
|
|
102 S G C (Strong interaction)
|
|
103 W A T (Weak interaction)
|
|
104 B G T C (not A) (B comes after A)
|
|
105 D G A T (not C) (D comes after C)
|
|
106 H A C T (not G) (H comes after G)
|
|
107 V G C A (not T, not U) (V comes after U)
|
|
108 N X? A G C T (aNy)
|
|
109 - .~ A gap
|
|
110
|
|
111
|
|
112
|
|
113
|
|
114 Refs:
|
|
115 http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html
|
|
116 http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html
|
|
117 Status:
|
|
118 Beta
|
|
119 Authors:
|
|
120 GEC 2004,2005
|
|
121 """
|
|
122
|
|
123 # TODO: Add this to docstring somewhere.
|
|
124 # To replace all ambiguous nucleic code by 'N', replace alphabet and then n
|
|
125 # normalize.
|
|
126 #
|
|
127 # >>> Seq( 'ACGT-RYKM', reduced_nucleic_alphabet).normalized()
|
|
128 # 'ACGT-NNNN'
|
|
129
|
|
130 from array import array
|
|
131 from string import maketrans
|
|
132 from corebio.moremath import argmax, sqrt
|
|
133
|
|
134 __all__ = [
|
|
135 'Alphabet',
|
|
136 'Seq',
|
|
137 'rna', 'dna', 'protein',
|
|
138 'SeqList',
|
|
139 'generic_alphabet',
|
|
140 'protein_alphabet',
|
|
141 'nucleic_alphabet',
|
|
142 'dna_alphabet',
|
|
143 'rna_alphabet',
|
|
144 'reduced_nucleic_alphabet',
|
|
145 'reduced_protein_alphabet',
|
|
146 'unambiguous_dna_alphabet',
|
|
147 'unambiguous_dna_alphabet',
|
|
148 'unambiguous_rna_alphabet',
|
|
149 'unambiguous_protein_alphabet',
|
|
150 'generic_alphabet'
|
|
151 ]
|
|
152
|
|
153
|
|
154
|
|
155 class Alphabet(object) :
|
|
156 """An ordered subset of printable ascii characters.
|
|
157
|
|
158 Status:
|
|
159 Beta
|
|
160 Authors:
|
|
161 - GEC 2005
|
|
162 """
|
|
163 __slots__ = ['_letters', '_alternatives','_ord_table', '_chr_table']
|
|
164
|
|
165 # We're immutable, so use __new__ not __init__
|
|
166 def __new__(cls, letters, alternatives= None) :
|
|
167 """Create a new, immutable Alphabet.
|
|
168
|
|
169 arguments:
|
|
170 - letters -- the letters in the alphabet. The ordering determines
|
|
171 the ordinal position of each character in this alphabet.
|
|
172 - alt -- A list of (alternative, canonical) letters. The alternatives
|
|
173 are given the same ordinal position as the canonical character.
|
|
174 e.g. (('?','X'),('x', 'X')) states that '?' and 'x' are synomonous
|
|
175 with 'X'. Values that are not in 'letters' are ignored. Alternatives
|
|
176 that are already in 'letters' are also ignored. If the same
|
|
177 alternative character is used twice then the alternative is assigned
|
|
178 to the canonical character that occurs first in 'letters'. The
|
|
179 default is to assume that upper and lower case characters are
|
|
180 equivalent, unless both cases are included in 'letters'.
|
|
181 raises:
|
|
182 ValueError : Repetitive or otherwise illegal set of letters.
|
|
183 """
|
|
184 self = object.__new__(cls)
|
|
185
|
|
186 # Printable Ascii characters
|
|
187 ascii_letters = "".join([chr(__i) for __i in range(32,128)])
|
|
188
|
|
189 if letters is None : letters = ascii_letters
|
|
190 self._letters = letters
|
|
191
|
|
192 equivalent_by_case = zip( 'abcdefghijklmnopqrstuvwxyz',
|
|
193 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
|
|
194
|
|
195 if alternatives is None : alternatives = equivalent_by_case
|
|
196
|
|
197
|
|
198 # The ord_table maps between the ordinal position of a character in ascii
|
|
199 # and the ordinal position in this alphabet. Characters not in the
|
|
200 # alphabet are given a position of 255. The ord_table is stored as a
|
|
201 # string.
|
|
202 ord_table = ["\xff",] * 256
|
|
203 for i,a in enumerate(letters) :
|
|
204 n = ord(a)
|
|
205 if n == 0 :
|
|
206 raise ValueError("Alphabet cannot contain null character \\0")
|
|
207 if ord_table[ n ] != "\xff":
|
|
208 raise ValueError("Repetitive alphabet")
|
|
209 ord_table[ n ] = chr(i)
|
|
210
|
|
211 # Add alternatives
|
|
212 _from = []
|
|
213 _to = []
|
|
214 for e, c in alternatives :
|
|
215 if c in letters :
|
|
216 n = ord(e)
|
|
217 if ord_table[ n ] == "\xff" : # empty
|
|
218 ord_table[ n ] = ord_table[ ord(c)]
|
|
219 _from.append(e)
|
|
220 _to.append(c)
|
|
221 self._alternatives = (''.join(_from), ''.join(_to))
|
|
222
|
|
223 ord_table = "".join(ord_table)
|
|
224 assert( ord_table[0] == "\xff")
|
|
225 self._ord_table = ord_table
|
|
226
|
|
227 # The chr_table maps between ordinal position in the alphabet letters
|
|
228 # and the ordinal position in ascii. This map is not the inverse of
|
|
229 # ord_table if there are alternatives.
|
|
230 chr_table = ["\x00"]*256
|
|
231 for i,a in enumerate(letters) :
|
|
232 chr_table[ i ] = a
|
|
233 chr_table = "".join(chr_table)
|
|
234 self._chr_table = chr_table
|
|
235
|
|
236 return self
|
|
237
|
|
238
|
|
239 def alphabetic(self, string) :
|
|
240 """True if all characters of the string are in this alphabet."""
|
|
241 table = self._ord_table
|
|
242 for s in str(string):
|
|
243 if table[ord(s)] == "\xff" :
|
|
244 return False
|
|
245 return True
|
|
246
|
|
247 def chr(self, n) :
|
|
248 """ The n'th character in the alphabet (zero indexed) or \\0 """
|
|
249 return self._chr_table[n]
|
|
250
|
|
251 def ord(self, c) :
|
|
252 """The ordinal position of the character c in this alphabet,
|
|
253 or 255 if no such character.
|
|
254 """
|
|
255 return ord(self._ord_table[ord(c)])
|
|
256
|
|
257 def chrs(self, sequence_of_ints) :
|
|
258 """Convert a sequence of ordinals into an alphabetic string."""
|
|
259 if not isinstance(sequence_of_ints, array) :
|
|
260 sequence_of_ints = array('B', sequence_of_ints)
|
|
261 s = sequence_of_ints.tostring().translate(self._chr_table)
|
|
262 return Seq(s, self)
|
|
263
|
|
264 def ords(self, string) :
|
|
265 """Convert an alphabetic string into a byte array of ordinals."""
|
|
266 string = str(string)
|
|
267 s = string.translate(self._ord_table)
|
|
268 a = array('B',s)
|
|
269 return a
|
|
270
|
|
271
|
|
272 def normalize(self, string) :
|
|
273 """Normalize an alphabetic string by converting all alternative symbols
|
|
274 to the canonical equivalent in 'letters'.
|
|
275 """
|
|
276 if not self.alphabetic(string) :
|
|
277 raise ValueError("Not an alphabetic string.")
|
|
278 return self.chrs(self.ords(string))
|
|
279
|
|
280 def letters(self) :
|
|
281 """ Letters of the alphabet as a string."""
|
|
282 return str(self)
|
|
283
|
|
284 def _all_letters(self) :
|
|
285 """ All allowed letters, including alternatives."""
|
|
286 let = []
|
|
287 let.append(self._letters)
|
|
288 for key, value in self._alternatives :
|
|
289 let.append(value)
|
|
290 return ''.join(let)
|
|
291
|
|
292 def __repr__(self) :
|
|
293 return "Alphabet( '" + self._letters +"', zip"+ repr(self._alternatives)+" )"
|
|
294
|
|
295 def __str__(self) :
|
|
296 return str(self._letters)
|
|
297
|
|
298 def __len__(self) :
|
|
299 return len(self._letters)
|
|
300
|
|
301 def __eq__(self, other) :
|
|
302 if not hasattr(other, "_ord_table") : return False
|
|
303 return self._ord_table == other._ord_table
|
|
304
|
|
305 def __ne__(self, other) :
|
|
306 return not self.__eq__(other)
|
|
307
|
|
308 def __iter__(self) :
|
|
309 return iter(self._letters)
|
|
310
|
|
311 def __getitem__(self, key) :
|
|
312 return self._letters[key]
|
|
313
|
|
314
|
|
315 # End class Alphabet
|
|
316
|
|
317 # ------------------- Standard ALPHABETS -------------------
|
|
318 # Standard alphabets are defined here, after Alphabet class.
|
|
319
|
|
320 generic_alphabet = Alphabet(None, None)
|
|
321
|
|
322
|
|
323 protein_alphabet = Alphabet('ACDEFGHIKLMNOPQRSTUVWYBJZX*-',
|
|
324 zip('acdefghiklmnopqrstuvwybjzx?.~',
|
|
325 'ACDEFGHIKLMNOPQRSTUVWYBJZXX--') )
|
|
326
|
|
327
|
|
328 nucleic_alphabet = Alphabet("ACGTURYSWKMBDHVN-",
|
|
329 zip("acgturyswkmbdhvnXx?.~",
|
|
330 "ACGTURYSWKMBDHVNNNN--") )
|
|
331
|
|
332 dna_alphabet = Alphabet("ACGTRYSWKMBDHVN-",
|
|
333 zip('acgtryswkmbdhvnXx?.~Uu',
|
|
334 'ACGTRYSWKMBDHVNNNN--TT') )
|
|
335
|
|
336 rna_alphabet = Alphabet("ACGURYSWKMBDHVN-",
|
|
337 zip('acguryswkmbdhvnXx?.~Tt',
|
|
338 'ACGURYSWKMBDHVNNNN--UU') )
|
|
339
|
|
340 reduced_nucleic_alphabet = Alphabet("ACGTN-",
|
|
341 zip('acgtryswkmbdhvnXx?.~TtRYSWKMBDHV',
|
|
342 'ACGTNNNNNNNNNNNNNN--TTNNNNNNNNNN') )
|
|
343
|
|
344 reduced_protein_alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWYX*-',
|
|
345 zip('acdefghiklmnpqrstvwyx?.~BbZzUu',
|
|
346 'ACDEFGHIKLMNPQRSTVWYXX--XXXXCC') )
|
|
347
|
|
348 unambiguous_dna_alphabet = Alphabet("ACGT", zip('acgt','ACGT') )
|
|
349
|
|
350 unambiguous_rna_alphabet = Alphabet("ACGU", zip('acgu','ACGU') )
|
|
351
|
|
352 unambiguous_protein_alphabet = Alphabet("ACDEFGHIKLMNPQRSTVWY",
|
|
353 zip('acdefghiklmnopqrstuvwy',
|
|
354 'ACDEFGHIKLMNOPQRSTUVWY') )
|
|
355
|
|
356
|
|
357 _complement_table = maketrans("ACGTRYSWKMBDHVN-acgtUuryswkmbdhvnXx?.~",
|
|
358 "TGCAYRSWMKVHDBN-tgcaAayrswmkvhdbnXx?.~")
|
|
359
|
|
360
|
|
361
|
|
362 class Seq(str):
|
|
363 """ An alphabetic string. A subclass of "str" consisting solely of
|
|
364 letters from the same alphabet.
|
|
365
|
|
366 Attributes:
|
|
367 alphabet -- A string or Alphabet of allowed characters.
|
|
368 name -- A short string used to identify the sequence.
|
|
369 description -- A string describing the sequence
|
|
370
|
|
371 Authors :
|
|
372 GEC 2005
|
|
373 """
|
|
374 # TODO: need a method to return a copy of the string with a new alphabet,
|
|
375 # preserving the sequence, name and alphabet?
|
|
376
|
|
377 def __new__(cls, obj,
|
|
378 alphabet= generic_alphabet,
|
|
379 name =None, description=None,
|
|
380 ):
|
|
381 self = str.__new__(cls, obj)
|
|
382 if alphabet is None:
|
|
383 alphabet = generic_alphabet
|
|
384 if not isinstance(alphabet, Alphabet):
|
|
385 alphabet = Alphabet(alphabet)
|
|
386 if not alphabet.alphabetic(self) :
|
|
387 raise ValueError("Sequence not alphabetic %s, '%s'" %(alphabet, self))
|
|
388
|
|
389 self._alphabet=alphabet
|
|
390 self.name = name
|
|
391 self.description = description
|
|
392
|
|
393 return self
|
|
394
|
|
395 # BEGIN PROPERTIES
|
|
396
|
|
397 # Make alphabet constant
|
|
398 def _get_alphabet(self):
|
|
399 return self._alphabet
|
|
400 alphabet = property(_get_alphabet)
|
|
401
|
|
402 # END PROPERTIES
|
|
403
|
|
404
|
|
405 def ords(self) :
|
|
406 """ Convert sequence to an array of integers
|
|
407 in the range [0, len(alphabet) )
|
|
408 """
|
|
409 return self.alphabet.ords(self)
|
|
410
|
|
411 def tally(self, alphabet = None):
|
|
412 """Counts the occurrences of alphabetic characters.
|
|
413
|
|
414 Arguments:
|
|
415 - alphabet -- an optional alternative alphabet
|
|
416
|
|
417 Returns :
|
|
418 A list of character counts in alphabetic order.
|
|
419 """
|
|
420 # Renamed from count() since this conflicts with str.count().
|
|
421 if not alphabet : alphabet = self.alphabet
|
|
422 L = len(alphabet)
|
|
423 counts = [0,] * L
|
|
424
|
|
425 ords = alphabet.ords(self)
|
|
426
|
|
427 for n in ords:
|
|
428 if n<L : counts[n] +=1
|
|
429
|
|
430 return counts
|
|
431
|
|
432
|
|
433 def kmers(self, alphabet = None, k=1):
|
|
434 """Counts the occurrences of overlapping alphabetic subsequences.
|
|
435
|
|
436 Arguments:
|
|
437 - alphabet -- an optional alternative alphabet
|
|
438 - k -- subsequence length. Default: 1 (monomers)
|
|
439
|
|
440 Returns :
|
|
441 A list of kmers counts in alphabetic order.
|
|
442 Status :
|
|
443 Alpha -- Not sure on interface. Will only work for small k
|
|
444 """
|
|
445 # TODO: Refactor?
|
|
446 # TODO: Rename 'kmers' to 'words' or word_count
|
|
447 if not alphabet : alphabet = self.alphabet
|
|
448
|
|
449 L = len(alphabet)
|
|
450 N = L**k
|
|
451 counts = [0,]*N
|
|
452
|
|
453 ords = alphabet.ords(self)
|
|
454
|
|
455
|
|
456 # Easy case
|
|
457 if k==1 :
|
|
458 for n in ords:
|
|
459 if n<N : counts[n] +=1
|
|
460 return counts
|
|
461
|
|
462 # Kmer counting.
|
|
463 # FIXME: This code assumes that k isn't too large.
|
|
464
|
|
465 # e.g. L =10, k = 3, multi = [100,10,1]
|
|
466 multi = [ L**i for i in range(k-1,-1,-1)]
|
|
467
|
|
468 for i in range(len(ords)-k+1) :
|
|
469 if ords[i] >= N : # Skip non-alphabetic kmers
|
|
470 i += k
|
|
471 continue
|
|
472 #FIXME: this should be a function of alphabet?
|
|
473 n = sum([multi[j]* ords[i+j] for j in range(k) ])
|
|
474 counts[n] +=1
|
|
475
|
|
476 return counts
|
|
477
|
|
478 def __getslice__(self, i, j):
|
|
479 cls = self.__class__
|
|
480 return cls( str.__getslice__(self,i,j), self.alphabet)
|
|
481
|
|
482 def __getitem__(self, key) :
|
|
483 cls = self.__class__
|
|
484 return cls( str.__getitem__(self,key), self.alphabet)
|
|
485
|
|
486 def __add__(self, other) :
|
|
487 # called for "self + other"
|
|
488 cls = self.__class__
|
|
489 return cls( str.__add__(self, other), self.alphabet)
|
|
490
|
|
491 def __radd__(self, other) :
|
|
492 # Called when "other + self" and other is superclass of self
|
|
493 cls = self.__class__
|
|
494 return cls( str.__add__(self, other), self.alphabet)
|
|
495
|
|
496 def join(self, str_list) :
|
|
497 cls = self.__class__
|
|
498 return cls( super(Seq, self).join(str_list), self.alphabet)
|
|
499
|
|
500 def __eq__(self, other) :
|
|
501 if not hasattr(other, "alphabet") : return False
|
|
502 if self.alphabet != other.alphabet :
|
|
503 return False
|
|
504 return str.__eq__(self, other)
|
|
505
|
|
506 def __ne__(self, other) :
|
|
507 return not self.__eq__(other)
|
|
508
|
|
509 def tostring(self) :
|
|
510 """ Converts Seq to a raw string.
|
|
511 """
|
|
512 # Compatibility with biopython
|
|
513 return str(self)
|
|
514
|
|
515 # ---- Transformations of Seq ----
|
|
516 def reverse(self) :
|
|
517 """Return the reversed sequence.
|
|
518
|
|
519 Not that this method returns a new object, in contrast to
|
|
520 the in-place reverse() method of list objects.
|
|
521 """
|
|
522 cls = self.__class__
|
|
523 return cls( self[::-1], self.alphabet)
|
|
524
|
|
525 def ungap(self) :
|
|
526 # FIXME: Gap symbols should be specified by the Alphabet?
|
|
527 return self.remove( '-.~')
|
|
528
|
|
529 def remove(self, delchars) :
|
|
530 """Return a new alphabetic sequence with all characters in 'delchars'
|
|
531 removed.
|
|
532 """
|
|
533 cls = self.__class__
|
|
534 return cls( str(self).translate(maketrans('',''), delchars), self.alphabet)
|
|
535
|
|
536 def lower(self) :
|
|
537 """Return a lower case copy of the sequence. """
|
|
538 cls = self.__class__
|
|
539 trans = maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')
|
|
540 return cls(str(self).translate(trans), self.alphabet)
|
|
541
|
|
542 def upper(self) :
|
|
543 """Return a lower case copy of the sequence. """
|
|
544 cls = self.__class__
|
|
545 trans = maketrans('abcdefghijklmnopqrstuvwxyz','ABCDEFGHIJKLMNOPQRSTUVWXYZ')
|
|
546 return cls(str(self).translate(trans), self.alphabet)
|
|
547
|
|
548 def mask(self, letters= 'abcdefghijklmnopqrstuvwxyz', mask='X') :
|
|
549 """Replace all occurences of letters with the mask character.
|
|
550 The default is to replace all lower case letters with 'X'.
|
|
551 """
|
|
552 LL = len(letters)
|
|
553 if len(mask) !=1 :
|
|
554 raise ValueError("Mask should be single character")
|
|
555 to = mask * LL
|
|
556 trans = maketrans( letters, to)
|
|
557 cls = self.__class__
|
|
558 return cls(str(self).translate(trans), self.alphabet)
|
|
559
|
|
560 def translate(self) :
|
|
561 """Translate a nucleotide sequence to a polypeptide using full
|
|
562 IUPAC ambiguities in DNA/RNA and amino acid codes, using the
|
|
563 standard genetic code. See corebio.transform.GeneticCode for
|
|
564 details and more options.
|
|
565 """
|
|
566 # Note: masks str.translate
|
|
567 from transform import GeneticCode
|
|
568 return GeneticCode.std().translate(self)
|
|
569
|
|
570 def back_translate(self) :
|
|
571 """Translate a protein sequence back into coding DNA, using using the
|
|
572 standard genetic code. See corebio.transform.GeneticCode for
|
|
573 details and more options.
|
|
574 """
|
|
575 from transform import GeneticCode
|
|
576 return GeneticCode.std().back_translate(self)
|
|
577
|
|
578
|
|
579 def reverse_complement(self) :
|
|
580 """Returns reversed complementary nucleic acid sequence (i.e. the other
|
|
581 strand of a DNA sequence.)
|
|
582 """
|
|
583 return self.reverse().complement()
|
|
584
|
|
585 def complement(self) :
|
|
586 """Returns complementary nucleic acid sequence."""
|
|
587 if not nucleic_alphabet.alphabetic(self.alphabet):
|
|
588 raise ValueError("Incompatable alphabets")
|
|
589 s = str.translate(self, _complement_table)
|
|
590 cls = self.__class__
|
|
591 return cls(s, self.alphabet, self.name, self.description)
|
|
592
|
|
593
|
|
594 # end class Seq
|
|
595
|
|
596
|
|
597 class SeqList(list):
|
|
598 """ A list of sequences.
|
|
599
|
|
600 Status:
|
|
601 Beta
|
|
602 """
|
|
603 # TODO: If alphabet given, we should ensure that all sequences conform.
|
|
604 # TODO: Need an isaligned() method. All seqs same length, same alphabet.
|
|
605 __slots__ =["alphabet", "name", "description"]
|
|
606
|
|
607 def __init__(self, alist=[], alphabet=None, name=None, description=None):
|
|
608 list.__init__(self, alist)
|
|
609 self.alphabet = alphabet
|
|
610 self.name = name
|
|
611 self.description = description
|
|
612
|
|
613 # TOOWTDI. Replicates seq_io.read()
|
|
614 #@classmethod
|
|
615 #def read(cls, afile, alphabet = None):
|
|
616 # return corebio.seq_io.read(afile, alphabet)
|
|
617 #read = classmethod(read)
|
|
618
|
|
619 def ords(self, alphabet=None) :
|
|
620 """ Convert sequence list into a 2D array of ordinals.
|
|
621 """
|
|
622 if not alphabet : alphabet = self.alphabet
|
|
623 if not alphabet : raise ValueError("No alphabet")
|
|
624 k = []
|
|
625 for s in self:
|
|
626 k.append( alphabet.ords(s) )
|
|
627 return k
|
|
628
|
|
629 def tally(self, alphabet = None):
|
|
630 """Counts the occurrences of characters in each column."""
|
|
631 if not alphabet : alphabet = self.alphabet
|
|
632 if not alphabet : raise ValueError("No alphabet")
|
|
633
|
|
634 N = len(alphabet)
|
|
635 ords = self.ords(alphabet)
|
|
636 L = len(ords[0])
|
|
637 counts = [ [0,]*N for l in range(0,L)]
|
|
638
|
|
639 for o in ords :
|
|
640 for j,n in enumerate(o) :
|
|
641 if n<N : counts[ j][n] +=1
|
|
642
|
|
643 return counts
|
|
644 # end class SeqList
|
|
645
|
|
646
|
|
647 def dna(string) :
|
|
648 """Create an alphabetic sequence representing a stretch of DNA.
|
|
649 """
|
|
650 return Seq(string, alphabet = dna_alphabet)
|
|
651
|
|
652 def rna(string) :
|
|
653 """Create an alphabetic sequence representing a stretch of RNA.
|
|
654 """
|
|
655 return Seq(string, alphabet = rna_alphabet)
|
|
656
|
|
657 def protein(string) :
|
|
658 """Create an alphabetic sequence representing a stretch of polypeptide.
|
|
659 """
|
|
660 return Seq(string, alphabet = protein_alphabet)
|
|
661
|
|
662
|
|
663
|
|
664
|
|
665 |