diff corebio/data.py @ 4:4d47ab2b7bcc

Uploaded
author davidmurphy
date Fri, 13 Jan 2012 07:18:19 -0500
parents c55bdc2fb9fa
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/corebio/data.py	Fri Jan 13 07:18:19 2012 -0500
@@ -0,0 +1,385 @@
+#  Copyright (c) 2006, The Regents of the University of California, through 
+#  Lawrence Berkeley National Laboratory (subject to receipt of any required
+#  approvals from the U.S. Dept. of Energy).  All rights reserved.
+
+#  This software is distributed under the new BSD Open Source License.
+#  <http://www.opensource.org/licenses/bsd-license.html>
+#
+#  Redistribution and use in source and binary forms, with or without 
+#  modification, are permitted provided that the following conditions are met: 
+#
+#  (1) Redistributions of source code must retain the above copyright notice, 
+#  this list of conditions and the following disclaimer. 
+#
+#  (2) Redistributions in binary form must reproduce the above copyright 
+#  notice, this list of conditions and the following disclaimer in the 
+#  documentation and or other materials provided with the distribution. 
+#
+#  (3) Neither the name of the University of California, Lawrence Berkeley 
+#  National Laboratory, U.S. Dept. of Energy nor the names of its contributors 
+#  may be used to endorse or promote products derived from this software 
+#  without specific prior written permission. 
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+#  POSSIBILITY OF SUCH DAMAGE. 
+
+"""
+Standard information used in computational biology.
+
+
+To convert a property dictionary to a list :
+>>> comp = [ amino_acid_composition[k] for k in amino_acid_letters]
+
+
+Resources: (Various standard data files.)
+ 
+
+BLOSUM Scoring Matrices
+    Source: ftp://ftp.ncbi.nih.gov/repository/blocks/unix/blosum
+    These are all new blast style with 1/3 bit scaling
+    - blosum35
+    - blosum45    
+    - blosum62    
+    - blosum40    
+    - blosum50    
+    - blosum80    
+    - blosum100   
+
+Other substitution scoring matrices:
+    - dist20_comp 
+    - pam250
+    - pam120
+    
+    
+Status: Beta (Data needs to be proof checked.)    
+"""
+# TODO: add this datafile?
+# Description of database cross references :
+#    - dbxref.txt (http://www.expasy.org/cgi-bin/lists?dbxref.txt)
+    
+
+# FIXME: Move documentation of data to docstring above. docstrings
+# after variables don't work.
+
+
+# The ExPASy ProtScale tool is a great source of amino acid properties.
+# http://au.expasy.org/cgi-bin/protscale.pl       
+
+from StringIO import StringIO
+from corebio._future import resource_string, resource_stream,resource_filename
+from corebio import utils
+
+# Explicitly list the set of available data resources. We want to be able to
+# access these resources in, for example, a webapp, without inadvertently
+# allowing unrestricted read access to the local file system.
+
+resource_names = [  # names of the bundled data resources; same entries as the keys of _resource_filenames
+    'blosum35',
+    'blosum45',    
+    'blosum62',    
+    'blosum40',    
+    'blosum50',    
+    'blosum80',    
+    'blosum100',   
+    'dist20_comp', 
+    'pam250',
+    'pam120', 
+    ]
+
+_resource_filenames = {  # resource name -> path handed to resource_string/resource_stream/resource_filename
+    'blosum35':    'data/blosum35.mat',
+    'blosum45':    'data/blosum45.mat',    
+    'blosum62':    'data/blosum62.mat',    
+    'blosum40':    'data/blosum40.mat',    
+    'blosum50':    'data/blosum50.mat',    
+    'blosum80':    'data/blosum80.mat',    
+    'blosum100':   'data/blosum100.mat',   
+    'dist20_comp': 'data/dist20_comp.mat', 
+    'pam250':      'data/pam250.mat',
+    'pam120':      'data/pam120.mat', 
+    }  # the data_*() accessors index this dict directly, so an unlisted name raises KeyError
+
+# TODO: Substitution matrix parser, SeqMatrix.read
+_resource_parsers = {}  # resource name -> parser callable consulted by data_object(); currently empty
+
+def data_string(name):
+    """Look up `name` in _resource_filenames (KeyError if absent) and return resource_string() for it."""
+    return resource_string(__name__, _resource_filenames[name], __file__)
+
+def data_stream(name):
+    """Look up `name` in _resource_filenames (KeyError if absent) and return resource_stream() for it."""
+    return resource_stream(__name__, _resource_filenames[name], __file__)
+
+def data_filename(name):
+    """Look up `name` in _resource_filenames (KeyError if absent) and return resource_filename() for it."""
+    return resource_filename(__name__, _resource_filenames[name], __file__)
+
+def data_object(name, parser=None):
+    """Apply `parser` to data_stream(name) and return whatever the parser returns.
+
+    When `parser` is None, the callable registered for `name` in
+    _resource_parsers is used; if none is registered, `str` is the fallback."""
+    parser = _resource_parsers.get(name, str) if parser is None else parser
+    return parser(data_stream(name))
+
+
+amino_acid_letters = "ACDEFGHIKLMNPQRSTVWY"
+"""Standard codes for the 20 canonical amino acids, in alphabetic order."""
+
+amino_acid_alternative_letters = "ARNDCQEGHILKMFPSTWYV"
+"""Amino acid one letter codes, alphabetic by three letter codes."""
+
+amino_acid_extended_letters = "ACDEFGHIKLMNOPQRSTUVWYBJZX*-"  # the 20 standard codes plus O and U, then the ambiguity codes B, J, Z, X, then '*' and '-'
+
+
+dna_letters = "GATC"
+dna_extended_letters = "GATCRYWSMKHBVDN"  # the four bases followed by the ambiguity codes; 'X' (a key of dna_ambiguity) is not included
+
+rna_letters = "GAUC"  
+rna_extended_letters = "GAUCRYWSMKHBVDN"  # same layout as dna_extended_letters with U in place of T
+
+
+dna_ambiguity = {  # one-letter code -> string of the concrete bases (A, C, G, T) that the code can stand for
+    "A": "A",
+    "C": "C",
+    "G": "G",
+    "T": "T",
+    "M": "AC",
+    "R": "AG",
+    "W": "AT",
+    "S": "CG",
+    "Y": "CT",
+    "K": "GT",
+    "V": "ACG",
+    "H": "ACT",
+    "D": "AGT",
+    "B": "CGT",
+    "X": "GATC",  # same expansion as "N"
+    "N": "GATC",
+}
+
+rna_ambiguity = {  # one-letter code -> string of the concrete bases (A, C, G, U) that the code can stand for
+    "A": "A",
+    "C": "C",
+    "G": "G",
+    "U": "U",
+    "M": "AC",
+    "R": "AG",
+    "W": "AU",
+    "S": "CG",
+    "Y": "CU",
+    "K": "GU",
+    "V": "ACG",
+    "H": "ACU",
+    "D": "AGU",
+    "B": "CGU",
+    "X": "GAUC",  # same expansion as "N"
+    "N": "GAUC",
+}
+
+amino_acid_ambiguity = {  # one-letter code -> string of the residue codes it can stand for
+    "A": "A",
+    "B": "ND",
+    "C": "C",
+    "D": "D",
+    "E": "E",
+    "F": "F",
+    "G": "G",
+    "H": "H",
+    "I": "I",
+    "K": "K",
+    "L": "L",
+    "M": "M",
+    "N": "N",
+    "P": "P",
+    "Q": "Q",
+    "R": "R",
+    "S": "S",
+    "T": "T",
+    "V": "V",
+    "W": "W",
+    "X": "ACDEFGHIKLMNPQRSTVWY",  # the 20 standard codes only; U and O are not part of the expansion
+    "Y": "Y",
+    "Z": "QE",
+    "J": "IL",
+    'U': 'U',  # U and O expand only to themselves
+    'O': 'O',
+}
+
+
+# Monomer isotopically averaged molecular mass 
+# Data Checked GEC Nov 2006
+amino_acid_mass = {  # one-letter code -> monomer mass; NOTE(review): units are not stated, presumably Da — confirm
+    "A": 89.09,
+    "B" : 132.66,  # Averaged proportional to amino_acid_composition
+    "C": 121.16,
+    "D": 133.10,
+    "E": 147.13,
+    "F": 165.19,
+    "G": 75.07,
+    "H": 155.16, 
+    "I": 131.18,
+    "J": 131.18,  # same value as both I and L, the two residues J stands for
+    "K": 146.19,
+    "L": 131.18,
+    "M": 149.21,
+    "N": 132.12,
+    # "O" : ???, # TODO: no entry yet, so amino_acid_mass["O"] raises KeyError
+    "P": 115.13,
+    "Q": 146.15,
+    "R": 174.20,
+    "S": 105.09,
+    "T": 119.12,
+    "U" : 168.05,
+    "V": 117.15,
+    "W": 204.23,
+    "X" : 129.15, # Averaged proportional to amino_acid_composition  
+    "Y": 181.19,
+    "Z" : 146.76, # Averaged proportional to amino_acid_composition    
+    }
+    
+dna_mass = {  # base letter -> mass, given as whole-number floats; NOTE(review): source and units are not stated — confirm
+    "A": 347.,
+    "C": 323.,
+    "G": 363.,
+    "T": 322.,
+    }  # only the four unambiguous letters are present; ambiguity codes have no entry
+
+rna_mass = {  # base letter -> mass, given as whole-number floats; NOTE(review): source and units are not stated — confirm
+    "A": 363.,
+    "C": 319.,
+    "G": 379.,
+    "U": 340.,
+}  # only the four unambiguous letters are present; ambiguity codes have no entry
+
+one_to_three = {  # one-letter code -> capitalised three-letter code
+    'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
+    'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
+    'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
+    'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
+    'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
+    'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', 
+    'U':'Sec', 'J':'Xle', 'O':'Pyl'
+    }  # 26 entries: the 20 standard residues plus B, Z, X, J, U and O; no '*' or '-'
+""" Map between standard 1 letter amino acid codes and standard three letter codes. 
+
+Ref: http://www.ebi.ac.uk/RESID/faq.html
+"""
+
+standard_three_to_one = utils.invert_dict(one_to_three)  # derived from one_to_three at import time; every three-letter value above is unique
+""" Map between standard three letter amino acid codes and standard one letter codes. 
+
+Ref: http://www.ebi.ac.uk/RESID/faq.html
+"""
+
+
+extended_three_to_one= {  # three-letter code -> one-letter code; keys use the capitalised form (e.g. 'Mse') and look-ups are case-sensitive
+'2as':'D', '3ah':'H', '5hp':'E', 'Acl':'R', 'Agm':'R', 'Aib':'A', 'Ala':'A', 'Alm':'A', 'Alo':'T', 'Aly':'K', 'Arg':'R', 'Arm':'R', 'Asa':'D', 'Asb':'D', 'Ask':'D', 'Asl':'D', 'Asn':'N', 'Asp':'D', 'Asq':'D', 'Asx':'B', 'Aya':'A', 'Bcs':'C', 'Bhd':'D', 'Bmt':'T', 'Bnn':'A', 'Buc':'C', 'Bug':'L', 'C5c':'C', 'C6c':'C', 'Ccs':'C', 'Cea':'C', 'Cgu':'E', 'Chg':'A', 'Cle':'L', 'Cme':'C', 'Csd':'A', 'Cso':'C', 'Csp':'C', 'Css':'C', 'Csw':'C', 'Csx':'C', 'Cxm':'M', 'Cy1':'C', 'Cy3':'C', 'Cyg':'C', 'Cym':'C', 'Cyq':'C', 'Cys':'C', 'Dah':'F', 'Dal':'A', 'Dar':'R', 'Das':'D', 'Dcy':'C', 'Dgl':'E', 'Dgn':'Q', 'Dha':'A', 'Dhi':'H', 'Dil':'I', 'Div':'V', 'Dle':'L', 'Dly':'K', 'Dnp':'A', 'Dpn':'F', 'Dpr':'P', 'Dsn':'S', 'Dsp':'D', 'Dth':'T', 'Dtr':'W', 'Dty':'Y', 'Dva':'V', 'Efc':'C', 'Fla':'A', 'Fme':'M', 'Ggl':'E', 'Gl3':'G', 'Gln':'Q', 'Glu':'E', 'Glx':'Z', 'Gly':'G', 'Glz':'G', 'Gma':'E', 'Gsc':'G', 'Hac':'A', 'Har':'R', 'Hic':'H', 'Hip':'H', 'His':'H', 'Hmr':'R', 'Hpq':'F', 'Htr':'W', 'Hyp':'P', 'Iil':'I', 'Ile':'I', 'Iyr':'Y', 'Kcx':'K', 'Leu':'L', 'Llp':'K', 'Lly':'K', 'Ltr':'W', 'Lym':'K', 'Lys':'K', 'Lyz':'K', 'Maa':'A', 'Men':'N', 'Met':'M', 'Mhs':'H', 'Mis':'S', 'Mle':'L', 'Mpq':'G', 'Msa':'G', 'Mse':'M', 'Mva':'V', 'Nem':'H', 'Nep':'H', 'Nle':'L', 'Nln':'L', 'Nlp':'L', 'Nmc':'G', 'Oas':'S', 'Ocs':'C', 'Omt':'M', 'Paq':'Y', 'Pca':'E', 'Pec':'C', 'Phe':'F', 'Phi':'F', 'Phl':'F', 'Pr3':'C', 'Pro':'P', 'Prr':'A', 'Ptr':'Y', 'Pyl':'O', 'Sac':'S', 'Sar':'G', 'Sch':'C', 'Scs':'C', 'Scy':'C', 'Sec':'U', 'Sel':'U', 'Sep':'S', 'Ser':'S', 'Set':'S', 'Shc':'C', 'Shr':'K', 'Smc':'C', 'Soc':'C', 'Sty':'Y', 'Sva':'S', 'Ter':'*', 'Thr':'T', 'Tih':'A', 'Tpl':'W', 'Tpo':'T', 'Tpq':'A', 'Trg':'K', 'Tro':'W', 'Trp':'W', 'Tyb':'Y', 'Tyq':'Y', 'Tyr':'Y', 'Tys':'Y', 'Tyy':'Y', 'Unk':'X', 'Val':'V', 'Xaa':'X', 'Xer':'X', 'Xle':'J'}
+
+""" Map between three letter amino acid codes and standard one letter codes. 
+This map contains many nonstandard three letter codes, used, for example, to specify chemically modified amino acids in PDB files.
+
+Ref: http://astral.berkeley.edu/ 
+Ref: http://www.ebi.ac.uk/RESID/faq.html
+"""
+# Initial table is from the ASTRAL RAF release notes.
+# added UNK
+# Extra IUPAC: Xle, Xaa, Sec, Pyl
+# The following have been seen in biopython code.
+# Ter : '*'     Termination
+# Sel : 'U'     A typo for Sec, selenocysteine? 
+# Xer : 'X'     Another alternative for unknown?
+
+
+amino_acid_names = {  # one-letter code -> lower-case English name; entries are not in alphabetical order
+    'A'	: 'alanine',	
+    'M'	: 'methionine',  
+    'C'	: 'cysteine',
+    'N'	: 'asparagine',
+    'D'	: 'aspartic acid',
+    'P'	: 'proline',
+    'E'	: 'glutamic acid',
+    'Q'	: 'glutamine',
+    'F'	: 'phenylalanine',
+    'R'	: 'arginine',
+    'G'	: 'glycine',	
+    'S'	: 'serine',
+    'H'	: 'histidine',	
+    'T' : 'threonine',
+    'I'	: 'isoleucine',	
+    'V'	: 'valine',
+    'K'	: 'lysine',
+    'W'	: 'tryptophan', 
+    'L'	: 'leucine',	
+    'Y'	: 'tyrosine', 
+    'B' : 'aspartic acid or asparagine',
+    'J' : 'leucine or isoleucine',
+    'X' : 'unknown',
+    'Z' : 'glutamic acid or glutamine',
+    'U' : 'selenocysteine',
+    'O' : 'pyrrolysine',
+    '*' : 'translation stop',  # '*' and '-' are not residues; they name the stop and gap symbols
+    '-' : 'gap'
+    }
+
+amino_acid_composition = dict(  # one-letter code -> fractional frequency for the 20 standard residues
+    A = .082, R = .057, N = .044, D = .053, C = .017, 
+    Q = .040, E = .062, G = .072, H = .022, I = .052,  
+    L = .090, K = .057, M = .024, F =.039, P = .051, 
+    S = .069, T = .058, W = .013, Y= .032, V =.066 )  # the 20 values sum to 1.000
+      
+"""
+Overall amino acid composition of proteins.
+Ref: McCaldon P., Argos P. Proteins 4:99-122 (1988).
+"""
+# FIXME : Proof these values
+
+kyte_doolittle_hydrophobicity = dict(  # one-letter code -> scale value for the 20 standard residues
+    A=1.8, R=-4.5, N=-3.5, D=-3.5,  C=2.5, 
+    Q=-3.5, E=-3.5, G=-0.4, H=-3.2, I=4.5,
+    L=3.8, K=-3.9,  M=1.9,  F=2.8, P=-1.6,
+    S=-0.8, T=-0.7, W=-0.9, Y=-1.3, V=4.2 )  # values span -4.5 (R) to 4.5 (I); no entries for ambiguity codes
+"""
+Kyte-Doolittle hydrophobicity scale.
+Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132 (1982)
+"""
+# FIXME : Proof these values
+
+
+nucleotide_names = {  # one-letter code -> display name; ambiguity codes list their bases plus a mnemonic
+    'A' : 'Adenosine',
+    'C'	: 'Cytidine',
+    'G'	: 'Guanine',  # NOTE(review): a base name, while A/C/T carry nucleoside names (cf. 'Guanosine') — confirm intended
+    'T'	: 'Thymidine',
+    'U'	: 'Uracil',  # NOTE(review): a base name as well (cf. 'Uridine') — confirm intended
+    'R'	: 'G A (puRine)',
+    'Y'	: 'T C (pYrimidine)',
+    'K'	: 'G T (Ketone)',
+    'M'	: 'A C (aMino group)',
+    'S'	: 'G C (Strong interaction)',
+    'W'	: 'A T (Weak interaction)',
+    'B'	: 'G T C (not A) (B comes after A)',
+    'D'	: 'G A T (not C) (D comes after C)',
+    'H'	: 'A C T (not G) (H comes after G)',
+    'V'	: 'G C A (not T, not U) (V comes after U)',
+    'N' : 'A G C T (aNy)',
+    '-' : 'gap', 
+    }  # there is no 'X' entry, although dna_ambiguity and rna_ambiguity both define 'X'
+    
+    
+
+    
+    
+    
+    
+    
+    
\ No newline at end of file