Mercurial > repos > davidmurphy > codonlogo
diff corebio/seq.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/corebio/seq.py Thu Oct 27 12:09:09 2011 -0400 @@ -0,0 +1,665 @@ + +# Copyright (c) 2005 Gavin E. Crooks <gec@compbio.berkeley.edu> +# +# This software is distributed under the MIT Open Source License. +# <http://www.opensource.org/licenses/mit-license.html> +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# + + + +""" Alphabetic sequences and associated tools and data. + +Seq is a subclass of a python string with additional annotation and an alphabet. +The characters in string must be contained in the alphabet. Various standard +alphabets are provided. + + +Classes : + Alphabet -- A subset of non-null ascii characters + Seq -- An alphabetic string + SeqList -- A collection of Seq's + +Alphabets : + o generic_alphabet -- A generic alphabet. Any printable ASCII character. + o protein_alphabet -- IUCAP/IUB Amino Acid one letter codes. + o nucleic_alphabet -- IUPAC/IUB Nucleic Acid codes 'ACGTURYSWKMBDHVN-' + o dna_alphabet -- Same as nucleic_alphabet, with 'U' (Uracil) an + alternative for 'T' (Thymidine). + o rna_alphabet -- Same as nucleic_alphabet, with 'T' (Thymidine) an + alternative for 'U' (Uracil). + o reduced_nucleic_alphabet -- All ambiguous codes in 'nucleic_alphabet' are + alternative to 'N' (aNy) + o reduced_protein_alphabet -- All ambiguous ('BZJ') and non-canonical amino + acids codes ( 'U', Selenocysteine and 'O', Pyrrolysine) in + 'protein_alphabet' are alternative to 'X'. + o unambiguous_dna_alphabet -- 'ACGT' + o unambiguous_rna_alphabet -- 'ACGU' + o unambiguous_protein_alphabet -- The twenty canonical amino acid one letter + codes, in alphabetic order, 'ACDEFGHIKLMNPQRSTVWY' + +Amino Acid Codes: + Code Alt. Meaning + ----------------- + A Alanine + B Aspartic acid or Asparagine + C Cysteine + D Aspartate + E Glutamate + F Phenylalanine + G Glycine + H Histidine + I Isoleucine + J Leucine or Isoleucine + K Lysine + L Leucine + M Methionine + N Asparagine + O Pyrrolysine + P Proline + Q Glutamine + R Arginine + S Serine + T Threonine + U Selenocysteine + V Valine + W Tryptophan + Y Tyrosine + Z Glutamate or Glutamine + X ? any + * translation stop + - .~ gap + +Nucleotide Codes: + Code Alt. Meaning + ------------------------------ + A Adenosine + C Cytidine + G Guanine + T Thymidine + U Uracil + R G A (puRine) + Y T C (pYrimidine) + K G T (Ketone) + M A C (aMino group) + S G C (Strong interaction) + W A T (Weak interaction) + B G T C (not A) (B comes after A) + D G A T (not C) (D comes after C) + H A C T (not G) (H comes after G) + V G C A (not T, not U) (V comes after U) + N X? A G C T (aNy) + - .~ A gap + + + + +Refs: + http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html + http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html +Status: + Beta +Authors: + GEC 2004,2005 +""" + +# TODO: Add this to docstring somewhere. +# To replace all ambiguous nucleic code by 'N', replace alphabet and then n +# normalize. +# +# >>> Seq( 'ACGT-RYKM', reduced_nucleic_alphabet).normalized() +# 'ACGT-NNNN' + +from array import array +from string import maketrans +from corebio.moremath import argmax, sqrt + +__all__ = [ + 'Alphabet', + 'Seq', + 'rna', 'dna', 'protein', + 'SeqList', + 'generic_alphabet', + 'protein_alphabet', + 'nucleic_alphabet', + 'dna_alphabet', + 'rna_alphabet', + 'reduced_nucleic_alphabet', + 'reduced_protein_alphabet', + 'unambiguous_dna_alphabet', + 'unambiguous_dna_alphabet', + 'unambiguous_rna_alphabet', + 'unambiguous_protein_alphabet', + 'generic_alphabet' + ] + + + +class Alphabet(object) : + """An ordered subset of printable ascii characters. + + Status: + Beta + Authors: + - GEC 2005 + """ + __slots__ = ['_letters', '_alternatives','_ord_table', '_chr_table'] + + # We're immutable, so use __new__ not __init__ + def __new__(cls, letters, alternatives= None) : + """Create a new, immutable Alphabet. + + arguments: + - letters -- the letters in the alphabet. The ordering determines + the ordinal position of each character in this alphabet. + - alt -- A list of (alternative, canonical) letters. The alternatives + are given the same ordinal position as the canonical character. + e.g. (('?','X'),('x', 'X')) states that '?' and 'x' are synomonous + with 'X'. Values that are not in 'letters' are ignored. Alternatives + that are already in 'letters' are also ignored. If the same + alternative character is used twice then the alternative is assigned + to the canonical character that occurs first in 'letters'. The + default is to assume that upper and lower case characters are + equivalent, unless both cases are included in 'letters'. + raises: + ValueError : Repetitive or otherwise illegal set of letters. + """ + self = object.__new__(cls) + + # Printable Ascii characters + ascii_letters = "".join([chr(__i) for __i in range(32,128)]) + + if letters is None : letters = ascii_letters + self._letters = letters + + equivalent_by_case = zip( 'abcdefghijklmnopqrstuvwxyz', + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') + + if alternatives is None : alternatives = equivalent_by_case + + + # The ord_table maps between the ordinal position of a character in ascii + # and the ordinal position in this alphabet. Characters not in the + # alphabet are given a position of 255. The ord_table is stored as a + # string. + ord_table = ["\xff",] * 256 + for i,a in enumerate(letters) : + n = ord(a) + if n == 0 : + raise ValueError("Alphabet cannot contain null character \\0") + if ord_table[ n ] != "\xff": + raise ValueError("Repetitive alphabet") + ord_table[ n ] = chr(i) + + # Add alternatives + _from = [] + _to = [] + for e, c in alternatives : + if c in letters : + n = ord(e) + if ord_table[ n ] == "\xff" : # empty + ord_table[ n ] = ord_table[ ord(c)] + _from.append(e) + _to.append(c) + self._alternatives = (''.join(_from), ''.join(_to)) + + ord_table = "".join(ord_table) + assert( ord_table[0] == "\xff") + self._ord_table = ord_table + + # The chr_table maps between ordinal position in the alphabet letters + # and the ordinal position in ascii. This map is not the inverse of + # ord_table if there are alternatives. + chr_table = ["\x00"]*256 + for i,a in enumerate(letters) : + chr_table[ i ] = a + chr_table = "".join(chr_table) + self._chr_table = chr_table + + return self + + + def alphabetic(self, string) : + """True if all characters of the string are in this alphabet.""" + table = self._ord_table + for s in str(string): + if table[ord(s)] == "\xff" : + return False + return True + + def chr(self, n) : + """ The n'th character in the alphabet (zero indexed) or \\0 """ + return self._chr_table[n] + + def ord(self, c) : + """The ordinal position of the character c in this alphabet, + or 255 if no such character. + """ + return ord(self._ord_table[ord(c)]) + + def chrs(self, sequence_of_ints) : + """Convert a sequence of ordinals into an alphabetic string.""" + if not isinstance(sequence_of_ints, array) : + sequence_of_ints = array('B', sequence_of_ints) + s = sequence_of_ints.tostring().translate(self._chr_table) + return Seq(s, self) + + def ords(self, string) : + """Convert an alphabetic string into a byte array of ordinals.""" + string = str(string) + s = string.translate(self._ord_table) + a = array('B',s) + return a + + + def normalize(self, string) : + """Normalize an alphabetic string by converting all alternative symbols + to the canonical equivalent in 'letters'. + """ + if not self.alphabetic(string) : + raise ValueError("Not an alphabetic string.") + return self.chrs(self.ords(string)) + + def letters(self) : + """ Letters of the alphabet as a string.""" + return str(self) + + def _all_letters(self) : + """ All allowed letters, including alternatives.""" + let = [] + let.append(self._letters) + for key, value in self._alternatives : + let.append(value) + return ''.join(let) + + def __repr__(self) : + return "Alphabet( '" + self._letters +"', zip"+ repr(self._alternatives)+" )" + + def __str__(self) : + return str(self._letters) + + def __len__(self) : + return len(self._letters) + + def __eq__(self, other) : + if not hasattr(other, "_ord_table") : return False + return self._ord_table == other._ord_table + + def __ne__(self, other) : + return not self.__eq__(other) + + def __iter__(self) : + return iter(self._letters) + + def __getitem__(self, key) : + return self._letters[key] + + +# End class Alphabet + +# ------------------- Standard ALPHABETS ------------------- +# Standard alphabets are defined here, after Alphabet class. + +generic_alphabet = Alphabet(None, None) + + +protein_alphabet = Alphabet('ACDEFGHIKLMNOPQRSTUVWYBJZX*-', + zip('acdefghiklmnopqrstuvwybjzx?.~', + 'ACDEFGHIKLMNOPQRSTUVWYBJZXX--') ) + + +nucleic_alphabet = Alphabet("ACGTURYSWKMBDHVN-", + zip("acgturyswkmbdhvnXx?.~", + "ACGTURYSWKMBDHVNNNN--") ) + +dna_alphabet = Alphabet("ACGTRYSWKMBDHVN-", + zip('acgtryswkmbdhvnXx?.~Uu', + 'ACGTRYSWKMBDHVNNNN--TT') ) + +rna_alphabet = Alphabet("ACGURYSWKMBDHVN-", + zip('acguryswkmbdhvnXx?.~Tt', + 'ACGURYSWKMBDHVNNNN--UU') ) + +reduced_nucleic_alphabet = Alphabet("ACGTN-", + zip('acgtryswkmbdhvnXx?.~TtRYSWKMBDHV', + 'ACGTNNNNNNNNNNNNNN--TTNNNNNNNNNN') ) + +reduced_protein_alphabet = Alphabet('ACDEFGHIKLMNPQRSTVWYX*-', + zip('acdefghiklmnpqrstvwyx?.~BbZzUu', + 'ACDEFGHIKLMNPQRSTVWYXX--XXXXCC') ) + +unambiguous_dna_alphabet = Alphabet("ACGT", zip('acgt','ACGT') ) + +unambiguous_rna_alphabet = Alphabet("ACGU", zip('acgu','ACGU') ) + +unambiguous_protein_alphabet = Alphabet("ACDEFGHIKLMNPQRSTVWY", + zip('acdefghiklmnopqrstuvwy', + 'ACDEFGHIKLMNOPQRSTUVWY') ) + + +_complement_table = maketrans("ACGTRYSWKMBDHVN-acgtUuryswkmbdhvnXx?.~", + "TGCAYRSWMKVHDBN-tgcaAayrswmkvhdbnXx?.~") + + + +class Seq(str): + """ An alphabetic string. A subclass of "str" consisting solely of + letters from the same alphabet. + + Attributes: + alphabet -- A string or Alphabet of allowed characters. + name -- A short string used to identify the sequence. + description -- A string describing the sequence + + Authors : + GEC 2005 + """ + # TODO: need a method to return a copy of the string with a new alphabet, + # preserving the sequence, name and alphabet? + + def __new__(cls, obj, + alphabet= generic_alphabet, + name =None, description=None, + ): + self = str.__new__(cls, obj) + if alphabet is None: + alphabet = generic_alphabet + if not isinstance(alphabet, Alphabet): + alphabet = Alphabet(alphabet) + if not alphabet.alphabetic(self) : + raise ValueError("Sequence not alphabetic %s, '%s'" %(alphabet, self)) + + self._alphabet=alphabet + self.name = name + self.description = description + + return self + + # BEGIN PROPERTIES + + # Make alphabet constant + def _get_alphabet(self): + return self._alphabet + alphabet = property(_get_alphabet) + + # END PROPERTIES + + + def ords(self) : + """ Convert sequence to an array of integers + in the range [0, len(alphabet) ) + """ + return self.alphabet.ords(self) + + def tally(self, alphabet = None): + """Counts the occurrences of alphabetic characters. + + Arguments: + - alphabet -- an optional alternative alphabet + + Returns : + A list of character counts in alphabetic order. + """ + # Renamed from count() since this conflicts with str.count(). + if not alphabet : alphabet = self.alphabet + L = len(alphabet) + counts = [0,] * L + + ords = alphabet.ords(self) + + for n in ords: + if n<L : counts[n] +=1 + + return counts + + + def kmers(self, alphabet = None, k=1): + """Counts the occurrences of overlapping alphabetic subsequences. + + Arguments: + - alphabet -- an optional alternative alphabet + - k -- subsequence length. Default: 1 (monomers) + + Returns : + A list of kmers counts in alphabetic order. + Status : + Alpha -- Not sure on interface. Will only work for small k + """ + # TODO: Refactor? + # TODO: Rename 'kmers' to 'words' or word_count + if not alphabet : alphabet = self.alphabet + + L = len(alphabet) + N = L**k + counts = [0,]*N + + ords = alphabet.ords(self) + + + # Easy case + if k==1 : + for n in ords: + if n<N : counts[n] +=1 + return counts + + # Kmer counting. + # FIXME: This code assumes that k isn't too large. + + # e.g. L =10, k = 3, multi = [100,10,1] + multi = [ L**i for i in range(k-1,-1,-1)] + + for i in range(len(ords)-k+1) : + if ords[i] >= N : # Skip non-alphabetic kmers + i += k + continue + #FIXME: this should be a function of alphabet? + n = sum([multi[j]* ords[i+j] for j in range(k) ]) + counts[n] +=1 + + return counts + + def __getslice__(self, i, j): + cls = self.__class__ + return cls( str.__getslice__(self,i,j), self.alphabet) + + def __getitem__(self, key) : + cls = self.__class__ + return cls( str.__getitem__(self,key), self.alphabet) + + def __add__(self, other) : + # called for "self + other" + cls = self.__class__ + return cls( str.__add__(self, other), self.alphabet) + + def __radd__(self, other) : + # Called when "other + self" and other is superclass of self + cls = self.__class__ + return cls( str.__add__(self, other), self.alphabet) + + def join(self, str_list) : + cls = self.__class__ + return cls( super(Seq, self).join(str_list), self.alphabet) + + def __eq__(self, other) : + if not hasattr(other, "alphabet") : return False + if self.alphabet != other.alphabet : + return False + return str.__eq__(self, other) + + def __ne__(self, other) : + return not self.__eq__(other) + + def tostring(self) : + """ Converts Seq to a raw string. + """ + # Compatibility with biopython + return str(self) + + # ---- Transformations of Seq ---- + def reverse(self) : + """Return the reversed sequence. + + Not that this method returns a new object, in contrast to + the in-place reverse() method of list objects. + """ + cls = self.__class__ + return cls( self[::-1], self.alphabet) + + def ungap(self) : + # FIXME: Gap symbols should be specified by the Alphabet? + return self.remove( '-.~') + + def remove(self, delchars) : + """Return a new alphabetic sequence with all characters in 'delchars' + removed. + """ + cls = self.__class__ + return cls( str(self).translate(maketrans('',''), delchars), self.alphabet) + + def lower(self) : + """Return a lower case copy of the sequence. """ + cls = self.__class__ + trans = maketrans('ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz') + return cls(str(self).translate(trans), self.alphabet) + + def upper(self) : + """Return a lower case copy of the sequence. """ + cls = self.__class__ + trans = maketrans('abcdefghijklmnopqrstuvwxyz','ABCDEFGHIJKLMNOPQRSTUVWXYZ') + return cls(str(self).translate(trans), self.alphabet) + + def mask(self, letters= 'abcdefghijklmnopqrstuvwxyz', mask='X') : + """Replace all occurences of letters with the mask character. + The default is to replace all lower case letters with 'X'. + """ + LL = len(letters) + if len(mask) !=1 : + raise ValueError("Mask should be single character") + to = mask * LL + trans = maketrans( letters, to) + cls = self.__class__ + return cls(str(self).translate(trans), self.alphabet) + + def translate(self) : + """Translate a nucleotide sequence to a polypeptide using full + IUPAC ambiguities in DNA/RNA and amino acid codes, using the + standard genetic code. See corebio.transform.GeneticCode for + details and more options. + """ + # Note: masks str.translate + from transform import GeneticCode + return GeneticCode.std().translate(self) + + def back_translate(self) : + """Translate a protein sequence back into coding DNA, using using the + standard genetic code. See corebio.transform.GeneticCode for + details and more options. + """ + from transform import GeneticCode + return GeneticCode.std().back_translate(self) + + + def reverse_complement(self) : + """Returns reversed complementary nucleic acid sequence (i.e. the other + strand of a DNA sequence.) + """ + return self.reverse().complement() + + def complement(self) : + """Returns complementary nucleic acid sequence.""" + if not nucleic_alphabet.alphabetic(self.alphabet): + raise ValueError("Incompatable alphabets") + s = str.translate(self, _complement_table) + cls = self.__class__ + return cls(s, self.alphabet, self.name, self.description) + + +# end class Seq + + +class SeqList(list): + """ A list of sequences. + + Status: + Beta + """ + # TODO: If alphabet given, we should ensure that all sequences conform. + # TODO: Need an isaligned() method. All seqs same length, same alphabet. + __slots__ =["alphabet", "name", "description"] + + def __init__(self, alist=[], alphabet=None, name=None, description=None): + list.__init__(self, alist) + self.alphabet = alphabet + self.name = name + self.description = description + + # TOOWTDI. Replicates seq_io.read() + #@classmethod + #def read(cls, afile, alphabet = None): + # return corebio.seq_io.read(afile, alphabet) + #read = classmethod(read) + + def ords(self, alphabet=None) : + """ Convert sequence list into a 2D array of ordinals. + """ + if not alphabet : alphabet = self.alphabet + if not alphabet : raise ValueError("No alphabet") + k = [] + for s in self: + k.append( alphabet.ords(s) ) + return k + + def tally(self, alphabet = None): + """Counts the occurrences of characters in each column.""" + if not alphabet : alphabet = self.alphabet + if not alphabet : raise ValueError("No alphabet") + + N = len(alphabet) + ords = self.ords(alphabet) + L = len(ords[0]) + counts = [ [0,]*N for l in range(0,L)] + + for o in ords : + for j,n in enumerate(o) : + if n<N : counts[ j][n] +=1 + + return counts +# end class SeqList + + +def dna(string) : + """Create an alphabetic sequence representing a stretch of DNA. + """ + return Seq(string, alphabet = dna_alphabet) + +def rna(string) : + """Create an alphabetic sequence representing a stretch of RNA. + """ + return Seq(string, alphabet = rna_alphabet) + +def protein(string) : + """Create an alphabetic sequence representing a stretch of polypeptide. + """ + return Seq(string, alphabet = protein_alphabet) + + + + + \ No newline at end of file