view corebio/transform.py @ 10:20716450be87

Uploaded
author davidmurphy
date Mon, 30 Jan 2012 21:17:50 -0500
parents c55bdc2fb9fa
children
line wrap: on
line source

#  Copyright (c) 2006 John Gilman
#
#  This software is distributed under the MIT Open Source License.
#  <http://www.opensource.org/licenses/mit-license.html>
#
#  Permission is hereby granted, free of charge, to any person obtaining a 
#  copy of this software and associated documentation files (the "Software"),
#  to deal in the Software without restriction, including without limitation
#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
#  and/or sell copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included
#  in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
#  THE SOFTWARE.

""" Transformations of Seqs (alphabetic sequences).



Classes :
- Transform   -- Simple transforms of alphabetic strings.
- GeneticCode -- The genetic mapping of dna to protein.

Functions :
-  mask_low_complexity -- Implementation of Seg algorithm to remove low complexity  
        regions from protein sequences.
   
    
"""


from corebio.data import dna_extended_letters, dna_ambiguity
from corebio.seq import Seq, protein_alphabet, nucleic_alphabet, dna_alphabet
from string import maketrans
from corebio.moremath import log2 , entropy

__all__ = [
    'Transform',
    'mask_low_complexity',
    'GeneticCode'
    ]

class Transform(object) :
    """A translation between alphabetic strings.
    (This class is not called 'Translation' to avoid confusion with the
    biological translation of rna to protein.)
    
    Example:
    trans = Transform( 
        Seq("ACGTRYSWKMBDHVN-acgtUuryswkmbdhvnXx?.~'", dna_alphabet),                    
        Seq("ACGTRYSWKMNNNNN-acgtUuryswkmbnnnnXx?.~", reduced_nucleic_alphabet)         
        )
    s0 = Seq("AAAAAV", nucleic_alphabet)
    s1 = trans(s0)              
    assert(s1.alphabet == reduced_nucleic_alphabet)
    assert(s2 == Seq("AAAAAN",  reduced_nucleic_alphabet)
        
    Status : Beta 
    """
    
    __slots__ = ["table", "source", "target"]
    def __init__(self, source, target) :

        self.table = maketrans(source, target)
        self.source = source
        self.target = target
     
     
    def __call__(self, seq) :
        """Translate sequence."""
        if not self.source.alphabet.alphabetic(seq) :
            raise ValueError("Incompatable alphabets")
        s = str.translate(seq, self.table)
        cls = self.target.__class__
        return cls(s, self.target.alphabet, seq.name, seq.description)
# End class Translation

# FIXME: Test, document, add to seq.
dna_complement = Transform(
        Seq("ACGTRYSWKMBDHVN-acgtUuryswkmbdhvnXx?.~", dna_alphabet),  
        Seq("TGCAYRSWMKVHDBN-tgcaAayrswmkvhdbnXx?.~", dna_alphabet),  
    )

     

def mask_low_complexity(seq, width =12, trigger=1.8, extension=2.0, mask='X') :
    """ Mask low complexity regions in protein sequences.
    
    Uses the method of Seg [1] by Wootton & Federhen [2] to divide a sequence   
    into regions of high and low complexity. The sequence is divided into
    overlapping windows. Low complexity windows either have a sequence entropy
    less that the trigger complexity, or have an entropy less than the extension    
    complexity and neighbor other low-complexity windows. The sequence within   
    low complexity regions are replaced with the mask character (default 'X'), 
    and the masked alphabetic sequence is returned.
    
    The default parameters, width=12, trigger=1.8, extension=2.0, mask='X' are
    suitable for masking protein sequences before a database search. The 
    standard default seg parameters are width=12, trigger=2.2, extension=2.5
    
    Arguments:
        Seq seq         -- An alphabetic sequence
        int width       -- Window width
        float trigger   -- Entropy in bits between 0 and 4.3.. ( =log_2(20) )
        float extension -- Entropy in bits between 0 and 4.3.. ( =log_2(20) )
        char mask       -- The mask character (default: 'X') 
    Returns :
        Seq         -- A masked alphabetic sequence
    Raises :
        ValueError  -- On invalid arguments
    Refs:
        [1] seg man page: 
            http://bioportal.weizmann.ac.il/education/materials/gcg/seg.html
        [2] Wootton & Federhen (Computers and Chemistry 17; 149-163, (1993)) 
    Authors:
        GEC 2005
    Future :
        - Optional mask character.
        - Option to lower case masked symbols.
        - Remove arbitary restriction to protein.
    """
    
    lg20 = log2(20)
    if trigger<0 or trigger>lg20 :
        raise ValueError("Invalid trigger complexity: %f"% trigger) 
    if extension<0 or extension>lg20 or extension<trigger:
        raise ValueError("Invalid extension complexity: %f"% extension)
    if width<0 :
        raise ValueError("Invalid width: %d"% width)

    if width > len(seq) : return seq
    
    s = seq.ords()

    X = seq.alphabet.ord(mask)

    
    nwindows = len(seq)- width +1
    ent = [ 0 for x in range(0, nwindows)]
    count = [ 0 for x in range(0, len(seq.alphabet) )]
    
    for c in s[0:width] : count[c] +=1
    ent[0] = entropy(count,2)
    
    for i in range(1, nwindows) :
        count[ s[i-1] ] -= 1
        count[ s[i+width-1] ] +=1
        ent[i] = entropy(count,2)
    
    prev_segged = False 
    for i in range(0, nwindows) :
        if ((prev_segged and ent[i]< extension) or 
            ent[i]< trigger) :
            for j in range(0, width) : s[i+j]=X
            prev_segged=True
        else :
            prev_segged = False


    # Redo, only backwards
    prev_segged = False 
    for i in range(nwindows-1, -1, -1) :
        if ((prev_segged and ent[i]< extension) or 
            ent[i]< trigger) :
            for j in range(0, width) : s[i+j]=X
            prev_segged=True
        else :
            prev_segged = False

    
    return  seq.alphabet.chrs(s)
# end mask_low_complexity()    
     

class GeneticCode(object):
    """An encoding of amino acids by DNA triplets.
 
    Example : 
    
    Genetic Code [1]: Standard   
          T         C         A         G      
       +---------+---------+---------+---------+
     T | TTT F   | TCT S   | TAT Y   | TGT C   | T
     T | TTC F   | TCC S   | TAC Y   | TGC C   | C
     T | TTA L   | TCA S   | TAA Stop| TGA Stop| A
     T | TTG L(s)| TCG S   | TAG Stop| TGG W   | G
       +---------+---------+---------+---------+
     C | CTT L   | CCT P   | CAT H   | CGT R   | T
     C | CTC L   | CCC P   | CAC H   | CGC R   | C
     C | CTA L   | CCA P   | CAA Q   | CGA R   | A
     C | CTG L(s)| CCG P   | CAG Q   | CGG R   | G
       +---------+---------+---------+---------+
     A | ATT I   | ACT T   | AAT N   | AGT S   | T
     A | ATC I   | ACC T   | AAC N   | AGC S   | C
     A | ATA I   | ACA T   | AAA K   | AGA R   | A
     A | ATG M(s)| ACG T   | AAG K   | AGG R   | G
       +---------+---------+---------+---------+
     G | GTT V   | GCT A   | GAT D   | GGT G   | T
     G | GTC V   | GCC A   | GAC D   | GGC G   | C
     G | GTA V   | GCA A   | GAA E   | GGA G   | A
     G | GTG V   | GCG A   | GAG E   | GGG G   | G
       +---------+---------+---------+---------+

    
    See Also :
    -- http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
    -- http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html#7.5
    Authors:
        JXG, GEC
    """
    # TODO: Explain use of '?' in translated sequence.
    # TODO: Does translate fails with aproriate execption when fed gaps?
    # TODO: Can back_translate handle gaps?
    
    def __init__(self, ident, description,
        amino_acid, start, base1, base2, base3):
        """Create a new GeneticCode.

        Args:
        -- ident - Standarad identifier (Or zero). An integer
        -- description 
        -- amino acid - A sequecne of amino acids and stop codons. e.g.
            "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"
        -- start - A sequence indicating start codons, e.g.,
            "---M---------------M---------------M----------------------------"
        -- base1 - The first base of each codon. e.g., 
            "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG"
        -- base2 - The second base of each codon. e.g.,
            "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG"
        -- base3 - The last base of each codon. e.g., 
            "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"            
        """
        self.ident = ident
        self.description = description
   
        self.amino_acid = amino_acid
        self.start = start
        self.base1 = base1
        self.base2 = base2
        self.base3 = base3
    
        stop_codons = []
        start_codons = []
        for i, a in enumerate(amino_acid) :
            codon = base1[i] + base2[i] + base3[i]
            if a=='*' :  stop_codons.append(codon)
            if start[i] == 'M': start_codons.append(codon)
            
        self.stop_codons = tuple(stop_codons)
        self.start_codons = tuple(start_codons)
        
        # Building the full translation table is expensive,
        # so we avoid doing so until necessary.
        self._table = None
        self._back_table = None

    #@staticmethod
    def std_list():
        "Return a list of standard genetic codes."
        return _codon_tables
    std_list = staticmethod(std_list)

    #@staticmethod
    def std():
        "The standard 'universal' genetic code."
        return _codon_tables[0]
    std = staticmethod(std)

    
    #@staticmethod
    def by_name(name) :
        """Find a genetic code in the code list by name or identifier.
        """
        for t in _codon_tables :
            if t.ident == name or t.description == name :
                return t
        raise ValueError("No such translation table: %s" % str(name) )
    by_name = staticmethod(by_name)        
    
  
    def _get_table(self) :
        if self._table is None : self._create_table() 
        return self._table  
    table = property(_get_table, None, "A map between codons and amino acids")

    def _get_back_table(self) :
        if self._back_table is None : 
            self._create_table() 
        return self._back_table  
    back_table = property(_get_back_table, None, "A map between amino acids and codons")


    def _create_table(self) :
        aa = self.amino_acid
        base1 = self.base1
        base2 = self.base2
        base3 = self.base3
        
        # Construct a table of unambiguous codon translations
        table = {}
        for i, a in enumerate(aa) :
            codon = base1[i] + base2[i] + base3[i]
            table[codon] = a
        
        # Build the back table.
        back_table = {}
        items = table.items()
        items.sort()
        for codon, aa in items[::-1] :
            back_table[aa] = codon   # Use first codon, alphabetically.
        back_table['X'] = 'NNN'
        back_table['B'] = 'NNN'         
        back_table['Z'] = 'NNN'         
        back_table['J'] = 'NNN'         
        self._back_table = back_table
                
        ltable = {}
        letters = dna_extended_letters+'U' # include RNA in table

        # Create a list of all possble codons
        codons = []
        for c1 in letters:
            for c2 in letters:
                for c3 in letters :
                    codons.append( c1+c2+c3)        

        # For each ambiguous codon, construct all compatible unambiguous codons.
        # Translate and collect a set of all possible translated amino acids.
        # If more than one translation look for possible amino acid ambiguity       
        # codes. 
        for C in codons :
            translated = dict() # Use dict, because no set in py2.3
            c = C.replace('U', 'T') # Convert rna codon to dna
            for c1 in dna_ambiguity[c[0]]:
                for c2 in dna_ambiguity[c[1]]:
                    for c3 in dna_ambiguity[c[2]]:
                        aa = table[ c1+c2+c3 ]
                        translated[aa] = ''
            translated = list(translated.keys())
            translated.sort()
            if len(translated) ==1 :
                trans = list(translated)[0]
            elif translated == ['D','N'] :
                trans = 'B'      
            elif translated == ['E','Q'] :
                trans = 'Z' 
            elif translated == ['I','L'] :
                trans = 'J'             
            elif '*' in translated:
                trans = '?'
            else :
                trans = 'X'
            ltable[C] = trans

        self._table = ltable
    # End create tables

    def translate(self, seq, frame=0) :
        """Translate a DNA sequence to a polypeptide using full
        IUPAC ambiguities in DNA/RNA and amino acid codes.
        
        Returns : 
        -- Seq - A polypeptide sequence 
        """
        # TODO: Optimize.
        # TODO: Insanity check alphabet.
        seq = str(seq)
        table = self.table
        trans = []
        L = len(seq)
        for i in range(frame, L-2, 3) :   
            codon = seq[i:i+3].upper()
            trans.append( table[codon])
        return Seq(''.join(trans), protein_alphabet)

             
    def back_translate(self, seq) :
        """Convert protein back into coding DNA.
        
        Args:
        -- seq - A polypeptide sequence.
        
        Returns :
        -- Seq - A dna sequence
        """
        # TODO: Optimzie
        # TODO: Insanity check alphabet.
        table = self.back_table
        seq = str(seq)
        trans = [ table[a] for a in seq]
        return Seq(''.join(trans), dna_alphabet)
        
 #TODO: translate_orf(self, seq, start) ?
 #TODO: translate_to_stop(self, seq, frame) ?       
 #TODO: translate_all_frames(self,seq) -> 6 translations.
 
    def __repr__(self) :
        string = []
        string += 'GeneticCode( %d, "' % self.ident
        string += self.description
        string += '", \n'
        string += '    amino_acid = "'
        string += self.amino_acid
        string += '",\n'
        string += '    start =      "'
        string += self.start
        string += '",\n'
        string += '    base1 =      "'
        string += self.base1
        string += '",\n'
        string += '    base2 =      "'
        string += self.base2
        string += '",\n'
        string += '    base3 =      "'
        string += self.base3
        string += '" )'
        return ''.join(string)
        
        
    def __str__(self) :
        """Returns a text representation of this genetic code."""
        # Inspired by http://bugzilla.open-bio.org/show_bug.cgi?id=1963
        letters = "TCAG" # Convectional ordering for codon tables.
        string = []
        
        if self.ident :
            string += 'Genetic Code [%d]: ' % self.ident
        else :
            string += 'Genetic Code: '
        string +=  self.description or ''        

        string += "\n    "
        string += " ".join( ["  %s      " % c2 for c2 in letters] ) 
        
        string += "\n   +" 
        string +=  "+".join(["---------" for c2 in letters]) + "+  "
        
        table = self.table
        
        for c1 in letters :
            for c3 in letters :
                string += '\n '
                string  += c1 
                string  += " |"
                for c2 in letters :
                    codon = c1+c2+c3
                    string += " " + codon
                    if codon in self.stop_codons :
                        string  += " Stop|"
                    else :
                        amino = table.get(codon, '?')
                        if codon in self.start_codons :
                            string += " %s(s)|" % amino
                        else :
                            string += " %s   |" % amino
                string += " " + c3
                
            string += "\n   +"
            string += "+".join(["---------" for c2 in letters]) 
            string += "+  "
        string += '\n'
        return ''.join(string)
# end class GeneticCode

 
# Data from http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html#7.5
# Aug. 2006
# Genetic Code Tables
# 
# Authority      International Sequence Databank Collaboration
# Contact        NCBI
# Scope          /transl_table qualifier
# URL            http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c   
_codon_tables = ( 
    GeneticCode(1, "Standard",
        "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "---M---------------M---------------M----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),
        
    GeneticCode(2, "Vertebrate Mitochondrial",
        "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
        "--------------------------------MMMM---------------M------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),
 
    GeneticCode(3, "Yeast Mitochondrial",
        "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "----------------------------------MM----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"), 

    GeneticCode(4, "Mold, Protozoan, Coelenterate Mitochondrial & Mycoplasma/Spiroplasma",
        "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "--MM---------------M------------MMMM---------------M------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),
        
    GeneticCode(5, "Invertebrate Mitochondrial", 
        "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
        "---M----------------------------MMMM---------------M------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),

    GeneticCode(6, "Ciliate, Dasycladacean and Hexamita Nuclear",    
        "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "-----------------------------------M----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),
 
    GeneticCode(9, "Echinoderm and Flatworm Mitochondrial", 
        "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
        "-----------------------------------M---------------M------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),

    GeneticCode(10, "Euplotid Nuclear",
        "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "-----------------------------------M----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),
  
    GeneticCode(11, "Bacterial and Plant Plastid",
        "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "---M---------------M------------MMMM---------------M------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),

    GeneticCode(12, "Alternative Yeast Nuclear",
        "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "-------------------M---------------M----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),
                                                                   
    GeneticCode(13,"Ascidian Mitochondrial", 
        "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
        "-----------------------------------M----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),

    GeneticCode(14, "Alternative Flatworm Mitochondrial",
        "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
        "-----------------------------------M----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),

    GeneticCode(15, "Blepharisma Nuclear",
        "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "-----------------------------------M----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),

    GeneticCode(16, "Chlorophycean Mitochondrial",
        "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "-----------------------------------M----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),

    GeneticCode(21, "Trematode Mitochondrial",
        "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
        "-----------------------------------M---------------M------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),

    GeneticCode(22, "Scenedesmus obliquus Mitochondrial",
        "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "-----------------------------------M----------------------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG"),

    GeneticCode(23,"Thraustochytrium Mitochondrial",
        "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
        "--------------------------------M--M---------------M------------",
        "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG",
        "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG",
        "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG",),
    )