view corebio/data.py @ 10:20716450be87

Uploaded
author davidmurphy
date Mon, 30 Jan 2012 21:17:50 -0500
parents c55bdc2fb9fa
children
line wrap: on
line source

#  Copyright (c) 2006, The Regents of the University of California, through 
#  Lawrence Berkeley National Laboratory (subject to receipt of any required
#  approvals from the U.S. Dept. of Energy).  All rights reserved.

#  This software is distributed under the new BSD Open Source License.
#  <http://www.opensource.org/licenses/bsd-license.html>
#
#  Redistribution and use in source and binary forms, with or without 
#  modification, are permitted provided that the following conditions are met: 
#
#  (1) Redistributions of source code must retain the above copyright notice, 
#  this list of conditions and the following disclaimer. 
#
#  (2) Redistributions in binary form must reproduce the above copyright 
#  notice, this list of conditions and the following disclaimer in the 
#  documentation and or other materials provided with the distribution. 
#
#  (3) Neither the name of the University of California, Lawrence Berkeley 
#  National Laboratory, U.S. Dept. of Energy nor the names of its contributors 
#  may be used to endorse or promote products derived from this software 
#  without specific prior written permission. 
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
#  POSSIBILITY OF SUCH DAMAGE. 

"""
Standard information used in computational biology.


To convert a property dictionary to a list :
>>> comp = [ amino_acid_composition[k] for k in amino_acid_letters]


Resources: (Various standard data files.)
 

BLOSUM Scoring Matrices
    Source: ftp://ftp.ncbi.nih.gov/repository/blocks/unix/blosum
    These are all new blast style with 1/3 bit scaling
    - blosum35
    - blosum45    
    - blosum62    
    - blosum40    
    - blosum50    
    - blosum80    
    - blosum100   

Other substitution scoring matrices:
    - dist20_comp 
    - pam250
    - pam120
    
    
Status: Beta (Data needs to be proof checked.)    
"""
# TODO: add this datafile?
# Description of database cross references :
#    - dbxref.txt (http://www.expasy.org/cgi-bin/lists?dbxref.txt)
    

# FIXME: Move documentation of data to docstring above. docstrings
# after variables don't work.


# The ExPasy ProtScale tool is a great source of amino acid properties.
# http://au.expasy.org/cgi-bin/protscale.pl       

from StringIO import StringIO
from corebio._future import resource_string, resource_stream,resource_filename
from corebio import utils

# Explicitly list the set of available data resources. We want to be able to
# access these resources in, for example, a webapp, without inadvertently
# allowing unrestricted read access to the local file system.

# Names of the bundled data resources, in their published order.
resource_names = [
    'blosum35', 'blosum45', 'blosum62', 'blosum40', 'blosum50',
    'blosum80', 'blosum100',
    'dist20_comp',
    'pam250', 'pam120',
]

# Resource name -> relative path of its data file. Every bundled resource
# is stored as 'data/<name>.mat', so the table is generated from the list
# of names above instead of being spelled out a second time.
_resource_filenames = dict(
    (resource, 'data/%s.mat' % resource) for resource in resource_names)

# Resource name -> parser callable for that resource. Currently empty.
# TODO: Substitution matrix parser, SeqMatrix.read
_resource_parsers = {}

def data_string(name):
    """Return the named data resource as produced by ``resource_string``.

    ``name`` must be a key of ``_resource_filenames``; any other value
    raises ``KeyError``. The registered relative path is handed to
    ``resource_string`` together with this module's ``__name__`` and
    ``__file__``.
    """
    return resource_string(__name__, _resource_filenames[name], __file__)

def data_stream(name):
    """Return the named data resource as produced by ``resource_stream``.

    ``name`` must be a key of ``_resource_filenames``; any other value
    raises ``KeyError``. The registered relative path is handed to
    ``resource_stream`` together with this module's ``__name__`` and
    ``__file__``.
    """
    return resource_stream(__name__, _resource_filenames[name], __file__)

def data_filename(name):
    """Return the result of ``resource_filename`` for the named resource.

    ``name`` must be a key of ``_resource_filenames``; any other value
    raises ``KeyError``. The registered relative path is handed to
    ``resource_filename`` together with this module's ``__name__`` and
    ``__file__``.
    """
    return resource_filename(__name__, _resource_filenames[name], __file__)

def data_object(name, parser=None):
    """Load the named data resource and turn it into an object.

    name   -- a registered resource name (a key of ``_resource_filenames``).
    parser -- optional callable; it is called with the value returned by
              ``data_stream(name)`` and its result is returned. When
              omitted, the parser registered for ``name`` in
              ``_resource_parsers`` is used, if there is one.

    When no parser is supplied and none is registered, the resource is
    returned via ``data_string(name)``. The earlier fallback applied
    ``str`` to the stream object itself, which for file-like objects gives
    the object's repr instead of the data it holds.

    Raises ``KeyError`` if ``name`` is not a registered resource.
    """
    if parser is None:
        parser = _resource_parsers.get(name)
    if parser is None:
        # Nothing to parse with: hand back the raw contents rather than
        # stringifying the stream object.
        return data_string(name)
    return parser(data_stream(name))


# Standard codes for the 20 canonical amino acids, in alphabetic order.
amino_acid_letters = "ACDEFGHIKLMNPQRSTVWY"

# Amino acid one letter codes, alphabetic by three letter codes.
amino_acid_alternative_letters = "ARNDCQEGHILKMFPSTWYV"

# The canonical codes plus O and U in alphabetic order, followed by
# B, J, Z, X, '*' and '-'.
amino_acid_extended_letters = "ACDEFGHIKLMNOPQRSTUVWYBJZX*-"


# Nucleic acid alphabets. Each RNA alphabet is the matching DNA alphabet
# with T replaced by U.
dna_letters = "GATC"
dna_extended_letters = dna_letters + "RYWSMKHBVDN"

rna_letters = dna_letters.replace("T", "U")
rna_extended_letters = dna_extended_letters.replace("T", "U")


# One letter DNA code -> the bases that code can stand for.
dna_ambiguity = {
    # Unambiguous bases stand for themselves.
    "A": "A", "C": "C", "G": "G", "T": "T",
    # Codes covering two bases.
    "M": "AC", "R": "AG", "W": "AT",
    "S": "CG", "Y": "CT", "K": "GT",
    # Codes covering three bases.
    "V": "ACG", "H": "ACT", "D": "AGT", "B": "CGT",
    # Codes covering all four bases.
    "X": "GATC", "N": "GATC",
}

# One letter RNA code -> the bases that code can stand for.
rna_ambiguity = {
    # Unambiguous bases stand for themselves.
    "A": "A", "C": "C", "G": "G", "U": "U",
    # Codes covering two bases.
    "M": "AC", "R": "AG", "W": "AU",
    "S": "CG", "Y": "CU", "K": "GU",
    # Codes covering three bases.
    "V": "ACG", "H": "ACU", "D": "AGU", "B": "CGU",
    # Codes covering all four bases.
    "X": "GAUC", "N": "GAUC",
}

# One letter amino acid code -> the residues that code can stand for.
# B, Z and J each cover two residues, X covers the twenty canonical ones,
# and every other code maps to itself.
amino_acid_ambiguity = {
    "A": "A", "B": "ND", "C": "C", "D": "D", "E": "E",
    "F": "F", "G": "G", "H": "H", "I": "I", "K": "K",
    "L": "L", "M": "M", "N": "N", "P": "P", "Q": "Q",
    "R": "R", "S": "S", "T": "T", "V": "V", "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    "Y": "Y", "Z": "QE", "J": "IL",
    "U": "U", "O": "O",
}


# Monomer isotopically averaged molecular mass, keyed by one letter code.
# Data Checked GEC Nov 2006
#
# B, X and Z are averaged proportional to amino_acid_composition.
# There is no entry for "O" yet.
amino_acid_mass = {
    "A":  89.09, "B": 132.66, "C": 121.16, "D": 133.10, "E": 147.13,
    "F": 165.19, "G":  75.07, "H": 155.16, "I": 131.18, "J": 131.18,
    "K": 146.19, "L": 131.18, "M": 149.21, "N": 132.12,
    # "O": ???,  # TODO
    "P": 115.13, "Q": 146.15, "R": 174.20, "S": 105.09, "T": 119.12,
    "U": 168.05, "V": 117.15, "W": 204.23, "X": 129.15, "Y": 181.19,
    "Z": 146.76,
}
    
# Mass recorded for each DNA letter.
dna_mass = {"A": 347., "C": 323., "G": 363., "T": 322.}

# Mass recorded for each RNA letter.
# NOTE(review): A and G are each 16 above their dna_mass values, but C
# (319.) is 4 below the dna_mass value for C (323.). This looks
# inconsistent -- confirm against the original data source.
rna_mass = {"A": 363., "C": 319., "G": 379., "U": 340.}

# Map between standard 1 letter amino acid codes and standard three letter
# codes.
#
# Ref: http://www.ebi.ac.uk/RESID/faq.html
one_to_three = {
    'A': 'Ala', 'B': 'Asx', 'C': 'Cys', 'D': 'Asp', 'E': 'Glu',
    'F': 'Phe', 'G': 'Gly', 'H': 'His', 'I': 'Ile', 'K': 'Lys',
    'L': 'Leu', 'M': 'Met', 'N': 'Asn', 'P': 'Pro', 'Q': 'Gln',
    'R': 'Arg', 'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp',
    'Y': 'Tyr', 'Z': 'Glx', 'X': 'Xaa',
    'U': 'Sec', 'J': 'Xle', 'O': 'Pyl',
}

# Reverse lookup of one_to_three, built by utils.invert_dict: standard three
# letter amino acid code -> one letter code.
# NOTE(review): presumably invert_dict swaps keys and values (so that
# 'Ala' -> 'A'); confirm in corebio.utils.
standard_three_to_one = utils.invert_dict(one_to_three)
""" Map between standard three letter amino acid codes and standard one letter codes. 

Ref: http://www.ebi.ac.uk/RESID/faq.html
"""


# Map between three letter amino acid codes and standard one letter codes.
# This map contains many nonstandard three letter codes, used, for example,
# to specify chemically modified amino acids in PDB files.
#
# Ref: http://astral.berkeley.edu/
# Ref: http://www.ebi.ac.uk/RESID/faq.html
#
# Provenance of the entries:
#   - Initial table is from the ASTRAL RAF release notes.
#   - Added Unk.
#   - Extra IUPAC: Xle, Xaa, Sec, Pyl
#   - The following have been seen in biopython code:
#       Ter : '*'     Termination
#       Sel : 'U'     A typo for Sec, selenocysteine?
#       Xer : 'X'     Another alternative for unknown?
extended_three_to_one = {
    '2as': 'D', '3ah': 'H', '5hp': 'E',
    'Acl': 'R', 'Agm': 'R', 'Aib': 'A', 'Ala': 'A', 'Alm': 'A', 'Alo': 'T',
    'Aly': 'K', 'Arg': 'R', 'Arm': 'R', 'Asa': 'D', 'Asb': 'D', 'Ask': 'D',
    'Asl': 'D', 'Asn': 'N', 'Asp': 'D', 'Asq': 'D', 'Asx': 'B', 'Aya': 'A',
    'Bcs': 'C', 'Bhd': 'D', 'Bmt': 'T', 'Bnn': 'A', 'Buc': 'C', 'Bug': 'L',
    'C5c': 'C', 'C6c': 'C', 'Ccs': 'C', 'Cea': 'C', 'Cgu': 'E', 'Chg': 'A',
    'Cle': 'L', 'Cme': 'C', 'Csd': 'A', 'Cso': 'C', 'Csp': 'C', 'Css': 'C',
    'Csw': 'C', 'Csx': 'C', 'Cxm': 'M', 'Cy1': 'C', 'Cy3': 'C', 'Cyg': 'C',
    'Cym': 'C', 'Cyq': 'C', 'Cys': 'C',
    'Dah': 'F', 'Dal': 'A', 'Dar': 'R', 'Das': 'D', 'Dcy': 'C', 'Dgl': 'E',
    'Dgn': 'Q', 'Dha': 'A', 'Dhi': 'H', 'Dil': 'I', 'Div': 'V', 'Dle': 'L',
    'Dly': 'K', 'Dnp': 'A', 'Dpn': 'F', 'Dpr': 'P', 'Dsn': 'S', 'Dsp': 'D',
    'Dth': 'T', 'Dtr': 'W', 'Dty': 'Y', 'Dva': 'V',
    'Efc': 'C',
    'Fla': 'A', 'Fme': 'M',
    'Ggl': 'E', 'Gl3': 'G', 'Gln': 'Q', 'Glu': 'E', 'Glx': 'Z', 'Gly': 'G',
    'Glz': 'G', 'Gma': 'E', 'Gsc': 'G',
    'Hac': 'A', 'Har': 'R', 'Hic': 'H', 'Hip': 'H', 'His': 'H', 'Hmr': 'R',
    'Hpq': 'F', 'Htr': 'W', 'Hyp': 'P',
    'Iil': 'I', 'Ile': 'I', 'Iyr': 'Y',
    'Kcx': 'K',
    'Leu': 'L', 'Llp': 'K', 'Lly': 'K', 'Ltr': 'W', 'Lym': 'K', 'Lys': 'K',
    'Lyz': 'K',
    'Maa': 'A', 'Men': 'N', 'Met': 'M', 'Mhs': 'H', 'Mis': 'S', 'Mle': 'L',
    'Mpq': 'G', 'Msa': 'G', 'Mse': 'M', 'Mva': 'V',
    'Nem': 'H', 'Nep': 'H', 'Nle': 'L', 'Nln': 'L', 'Nlp': 'L', 'Nmc': 'G',
    'Oas': 'S', 'Ocs': 'C', 'Omt': 'M',
    'Paq': 'Y', 'Pca': 'E', 'Pec': 'C', 'Phe': 'F', 'Phi': 'F', 'Phl': 'F',
    'Pr3': 'C', 'Pro': 'P', 'Prr': 'A', 'Ptr': 'Y', 'Pyl': 'O',
    'Sac': 'S', 'Sar': 'G', 'Sch': 'C', 'Scs': 'C', 'Scy': 'C', 'Sec': 'U',
    'Sel': 'U', 'Sep': 'S', 'Ser': 'S', 'Set': 'S', 'Shc': 'C', 'Shr': 'K',
    'Smc': 'C', 'Soc': 'C', 'Sty': 'Y', 'Sva': 'S',
    'Ter': '*', 'Thr': 'T', 'Tih': 'A', 'Tpl': 'W', 'Tpo': 'T', 'Tpq': 'A',
    'Trg': 'K', 'Tro': 'W', 'Trp': 'W', 'Tyb': 'Y', 'Tyq': 'Y', 'Tyr': 'Y',
    'Tys': 'Y', 'Tyy': 'Y',
    'Unk': 'X',
    'Val': 'V',
    'Xaa': 'X', 'Xer': 'X', 'Xle': 'J',
}


# One letter amino acid code (plus '*' and '-') -> descriptive name.
amino_acid_names = {
    # The twenty canonical amino acids.
    'A': 'alanine',         'M': 'methionine',
    'C': 'cysteine',        'N': 'asparagine',
    'D': 'aspartic acid',   'P': 'proline',
    'E': 'glutamic acid',   'Q': 'glutamine',
    'F': 'phenylalanine',   'R': 'arginine',
    'G': 'glycine',         'S': 'serine',
    'H': 'histidine',       'T': 'threonine',
    'I': 'isoleucine',      'V': 'valine',
    'K': 'lysine',          'W': 'tryptophan',
    'L': 'leucine',         'Y': 'tyrosine',
    # Ambiguity codes.
    'B': 'aspartic acid or asparagine',
    'J': 'leucine or isoleucine',
    'X': 'unknown',
    'Z': 'glutamic acid or glutamine',
    # Additional amino acids.
    'U': 'selenocysteine',
    'O': 'pyrrolysine',
    # Non-residue symbols.
    '*': 'translation stop',
    '-': 'gap',
}

# Overall amino acid composition of proteins, keyed by one letter code.
# Ref: McCaldon P., Argos P. Proteins 4:99-122 (1988).
# FIXME : Proof these values
amino_acid_composition = {
    'A': .082, 'R': .057, 'N': .044, 'D': .053, 'C': .017,
    'Q': .040, 'E': .062, 'G': .072, 'H': .022, 'I': .052,
    'L': .090, 'K': .057, 'M': .024, 'F': .039, 'P': .051,
    'S': .069, 'T': .058, 'W': .013, 'Y': .032, 'V': .066,
}

# Kyte-Doolittle hydrophobicity scale, keyed by one letter amino acid code.
# Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132 (1982)
# FIXME : Proof these values
kyte_doolittle_hydrophobicity = {
    'A':  1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C':  2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I':  4.5,
    'L':  3.8, 'K': -3.9, 'M':  1.9, 'F':  2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V':  4.2,
}


# One letter nucleotide code (plus '-') -> descriptive name. For the
# ambiguity codes the description lists the bases covered and a mnemonic.
# NOTE(review): A, C and T carry nucleoside names (Adenosine, Cytidine,
# Thymidine) while G and U carry base names (Guanine, Uracil) -- confirm
# which convention is intended before changing any of these strings.
nucleotide_names = {
    'A': 'Adenosine',
    'C': 'Cytidine',
    'G': 'Guanine',
    'T': 'Thymidine',
    'U': 'Uracil',
    'R': 'G A (puRine)',
    'Y': 'T C (pYrimidine)',
    'K': 'G T (Ketone)',
    'M': 'A C (aMino group)',
    'S': 'G C (Strong interaction)',
    'W': 'A T (Weak interaction)',
    'B': 'G T C (not A) (B comes after A)',
    'D': 'G A T (not C) (D comes after C)',
    'H': 'A C T (not G) (H comes after G)',
    'V': 'G C A (not T, not U) (V comes after U)',
    'N': 'A G C T (aNy)',
    '-': 'gap',
}