| 0 | 1 #  Copyright (c) 2006, The Regents of the University of California, through | 
|  | 2 #  Lawrence Berkeley National Laboratory (subject to receipt of any required | 
|  | 3 #  approvals from the U.S. Dept. of Energy).  All rights reserved. | 
|  | 4 | 
|  | 5 #  This software is distributed under the new BSD Open Source License. | 
|  | 6 #  <http://www.opensource.org/licenses/bsd-license.html> | 
|  | 7 # | 
|  | 8 #  Redistribution and use in source and binary forms, with or without | 
|  | 9 #  modification, are permitted provided that the following conditions are met: | 
|  | 10 # | 
|  | 11 #  (1) Redistributions of source code must retain the above copyright notice, | 
|  | 12 #  this list of conditions and the following disclaimer. | 
|  | 13 # | 
|  | 14 #  (2) Redistributions in binary form must reproduce the above copyright | 
|  | 15 #  notice, this list of conditions and the following disclaimer in the | 
|  | 16 #  documentation and or other materials provided with the distribution. | 
|  | 17 # | 
|  | 18 #  (3) Neither the name of the University of California, Lawrence Berkeley | 
|  | 19 #  National Laboratory, U.S. Dept. of Energy nor the names of its contributors | 
|  | 20 #  may be used to endorse or promote products derived from this software | 
|  | 21 #  without specific prior written permission. | 
|  | 22 # | 
|  | 23 #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
|  | 24 #  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|  | 25 #  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|  | 26 #  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
|  | 27 #  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
|  | 28 #  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
|  | 29 #  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
|  | 30 #  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
|  | 31 #  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
|  | 32 #  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
|  | 33 #  POSSIBILITY OF SUCH DAMAGE. | 
|  | 34 | 
|  | 35 """ | 
|  | 36 Standard information used in computational biology. | 
|  | 37 | 
|  | 38 | 
|  | 39 To convert a property dictionary to a list : | 
|  | 40 >>> comp = [ amino_acid_composition[k] for k in amino_acid_letters] | 
|  | 41 | 
|  | 42 | 
|  | 43 Resources: (Various standard data files.) | 
|  | 44 | 
|  | 45 | 
|  | 46 BLOSUM Scoring Matrices | 
|  | 47     Source: ftp://ftp.ncbi.nih.gov/repository/blocks/unix/blosum | 
|  | 48     These are all new blast style with 1/3 bit scaling | 
|  | 49     - blosum35 | 
|  | 50     - blosum45 | 
|  | 51     - blosum62 | 
|  | 52     - blosum40 | 
|  | 53     - blosum50 | 
|  | 54     - blosum80 | 
|  | 55     - blosum100 | 
|  | 56 | 
|  | 57 Other substitution scoring matrices: | 
|  | 58     - dist20_comp | 
|  | 59     - pam250 | 
|  | 60     - pam120 | 
|  | 61 | 
|  | 62 | 
|  | 63 Status: Beta (Data needs to be proof checked.) | 
|  | 64 """ | 
|  | 65 # TODO: add this datafile? | 
|  | 66 # Description of database cross references : | 
|  | 67 #    - dbxref.txt (http://www.expasy.org/cgi-bin/lists?dbxref.txt) | 
|  | 68 | 
|  | 69 | 
|  | 70 # FIXME: Move documentation of data to docstring above. docstrings | 
|  | 71 # after variables don't work. | 
|  | 72 | 
|  | 73 | 
|  | 74 # The ExPasy ProtScale tool is a great source of amino acid properties. | 
|  | 75 # http://au.expasy.org/cgi-bin/protscale.pl | 
|  | 76 | 
|  | 77 from StringIO import StringIO | 
|  | 78 from corebio._future import resource_string, resource_stream,resource_filename | 
|  | 79 from corebio import utils | 
|  | 80 | 
|  | 81 # Explicitly list the set of available data resources. We want to be able to access | 
|  | 82 # these resources in, for example, a webapp, without inadvertently allowing | 
|  | 83 # unrestricted read access to the local file system. | 
|  | 84 | 
# Whitelist of the data resources this module is willing to serve.
resource_names = (
    "blosum35 blosum45 blosum62 blosum40 blosum50 blosum80 blosum100 "
    "dist20_comp pam250 pam120"
).split()

# Data file path for each whitelisted resource: 'data/<name>.mat'.
_resource_filenames = dict(
    (resource, 'data/%s.mat' % resource) for resource in resource_names
)

# Optional default parser per resource name, consulted by data_object().
# TODO: Substitution matrix parser, SeqMatrix.read
_resource_parsers = {}
|  | 113 | 
def data_string(name):
    """Look up the data file registered for `name` and pass it to
    resource_string(), returning whatever that helper returns.

    Only names present in _resource_filenames are accepted; any other name
    raises KeyError before the resource helper is called.
    """
    return resource_string(__name__, _resource_filenames[name], __file__)
|  | 117 | 
def data_stream(name):
    """Look up the data file registered for `name` and pass it to
    resource_stream(), returning whatever that helper returns.

    Only names present in _resource_filenames are accepted; any other name
    raises KeyError before the resource helper is called.
    """
    return resource_stream(__name__, _resource_filenames[name], __file__)
|  | 121 | 
def data_filename(name):
    """Look up the data file registered for `name` and pass it to
    resource_filename(), returning whatever that helper returns.

    Only names present in _resource_filenames are accepted; any other name
    raises KeyError before the resource helper is called.
    """
    return resource_filename(__name__, _resource_filenames[name], __file__)
|  | 125 | 
def data_object(name, parser=None):
    """Load the data resource `name` and return it in parsed form.

    The parser is chosen in this order: the explicit `parser` argument, then
    the entry for `name` in _resource_parsers. The chosen parser is called
    with the stream from data_stream(name) and its result is returned as is.

    When no parser is available, the contents of the resource are read from
    the stream and returned. (The previous fallback was `str`, which was
    applied to the stream object itself and therefore produced the stream's
    printable representation rather than the resource data.)

    Raises KeyError (from data_stream) if `name` is not a registered
    resource.
    """
    if parser is None:
        parser = _resource_parsers.get(name)
    if parser is not None:
        return parser(data_stream(name))

    # No parser known: return the raw contents and release the stream that
    # was opened here. Assumes data_stream() returns a file-like object with
    # read() and close() -- TODO confirm against corebio._future.
    stream = data_stream(name)
    try:
        return stream.read()
    finally:
        stream.close()
|  | 133 | 
|  | 134 | 
# Standard codes for the 20 canonical amino acids, in alphabetic order.
amino_acid_letters = 'ACDEFGHIKLMNPQRSTVWY'

# Amino acid one letter codes, alphabetic by three letter codes.
amino_acid_alternative_letters = 'ARNDCQEGHILKMFPSTWYV'

# The canonical codes with O and U added, followed by B, J, Z, X, '*' and '-'.
amino_acid_extended_letters = 'ACDEFGHIKLMNOPQRSTUVWY' + 'BJZX*-'


# Nucleic acid alphabets. Each extended alphabet is the four base letters
# followed by the remaining single-letter codes.
dna_letters = 'GATC'
dna_extended_letters = dna_letters + 'RYWSMKHBVDN'

rna_letters = 'GAUC'
rna_extended_letters = rna_letters + 'RYWSMKHBVDN'
|  | 149 | 
|  | 150 | 
# Expansion of each DNA code into the set of bases it may stand for.
dna_ambiguity = {
    # Unambiguous bases
    'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
    # Codes covering two bases
    'M': 'AC', 'R': 'AG', 'W': 'AT', 'S': 'CG', 'Y': 'CT', 'K': 'GT',
    # Codes covering three bases
    'V': 'ACG', 'H': 'ACT', 'D': 'AGT', 'B': 'CGT',
    # Codes covering all four bases
    'X': 'GATC', 'N': 'GATC',
}
|  | 169 | 
# Expansion of each RNA code into the set of bases it may stand for.
rna_ambiguity = {
    # Unambiguous bases
    'A': 'A', 'C': 'C', 'G': 'G', 'U': 'U',
    # Codes covering two bases
    'M': 'AC', 'R': 'AG', 'W': 'AU', 'S': 'CG', 'Y': 'CU', 'K': 'GU',
    # Codes covering three bases
    'V': 'ACG', 'H': 'ACU', 'D': 'AGU', 'B': 'CGU',
    # Codes covering all four bases
    'X': 'GAUC', 'N': 'GAUC',
}
|  | 188 | 
# Expansion of each amino acid code into the set of residues it may stand
# for. B, X, Z and J are the ambiguous codes; every other code maps to itself.
amino_acid_ambiguity = {
    'A': 'A', 'B': 'ND', 'C': 'C', 'D': 'D', 'E': 'E',
    'F': 'F', 'G': 'G', 'H': 'H', 'I': 'I', 'K': 'K',
    'L': 'L', 'M': 'M', 'N': 'N', 'P': 'P', 'Q': 'Q',
    'R': 'R', 'S': 'S', 'T': 'T', 'V': 'V', 'W': 'W',
    'X': 'ACDEFGHIKLMNPQRSTVWY',
    'Y': 'Y', 'Z': 'QE', 'J': 'IL', 'U': 'U', 'O': 'O',
}
|  | 217 | 
|  | 218 | 
# Monomer isotopically averaged molecular mass, keyed by one letter code.
# Data Checked GEC Nov 2006
# B, X and Z are averaged proportional to amino_acid_composition.
# TODO: "O" has no entry yet.
amino_acid_mass = {
    'A': 89.09,  'B': 132.66, 'C': 121.16, 'D': 133.10, 'E': 147.13,
    'F': 165.19, 'G': 75.07,  'H': 155.16, 'I': 131.18, 'J': 131.18,
    'K': 146.19, 'L': 131.18, 'M': 149.21, 'N': 132.12, 'P': 115.13,
    'Q': 146.15, 'R': 174.20, 'S': 105.09, 'T': 119.12, 'U': 168.05,
    'V': 117.15, 'W': 204.23, 'X': 129.15, 'Y': 181.19, 'Z': 146.76,
}
|  | 249 | 
# Mass assigned to each of the four DNA letters.
dna_mass = {'A': 347.0, 'C': 323.0, 'G': 363.0, 'T': 322.0}
|  | 256 | 
# Mass assigned to each of the four RNA letters.
rna_mass = {'A': 363.0, 'C': 319.0, 'G': 379.0, 'U': 340.0}
|  | 263 | 
# Map between standard 1 letter amino acid codes and standard three letter
# codes.
#
# Ref: http://www.ebi.ac.uk/RESID/faq.html
one_to_three = {
    'A': 'Ala',
    'B': 'Asx',
    'C': 'Cys',
    'D': 'Asp',
    'E': 'Glu',
    'F': 'Phe',
    'G': 'Gly',
    'H': 'His',
    'I': 'Ile',
    'K': 'Lys',
    'L': 'Leu',
    'M': 'Met',
    'N': 'Asn',
    'P': 'Pro',
    'Q': 'Gln',
    'R': 'Arg',
    'S': 'Ser',
    'T': 'Thr',
    'V': 'Val',
    'W': 'Trp',
    'Y': 'Tyr',
    'Z': 'Glx',
    'X': 'Xaa',
    'U': 'Sec',
    'J': 'Xle',
    'O': 'Pyl',
}
|  | 277 | 
# Map between standard three letter amino acid codes and standard one letter
# codes, built from one_to_three via utils.invert_dict.
#
# Ref: http://www.ebi.ac.uk/RESID/faq.html
standard_three_to_one = utils.invert_dict(one_to_three)
|  | 283 | 
|  | 284 | 
# Three letter residue code -> one letter code, covering the standard codes
# plus many nonstandard ones (for example the codes used for chemically
# modified amino acids in PDB files). Keys are capitalized as 'Xxx'; callers
# holding upper-case codes such as 'ALA' need to normalize the case first.
# The table is hand curated: keep it in alphabetic order when adding entries.
extended_three_to_one= {
'2as':'D', '3ah':'H', '5hp':'E', 'Acl':'R', 'Agm':'R', 'Aib':'A', 'Ala':'A', 'Alm':'A', 'Alo':'T', 'Aly':'K', 'Arg':'R', 'Arm':'R', 'Asa':'D', 'Asb':'D', 'Ask':'D', 'Asl':'D', 'Asn':'N', 'Asp':'D', 'Asq':'D', 'Asx':'B', 'Aya':'A', 'Bcs':'C', 'Bhd':'D', 'Bmt':'T', 'Bnn':'A', 'Buc':'C', 'Bug':'L', 'C5c':'C', 'C6c':'C', 'Ccs':'C', 'Cea':'C', 'Cgu':'E', 'Chg':'A', 'Cle':'L', 'Cme':'C', 'Csd':'A', 'Cso':'C', 'Csp':'C', 'Css':'C', 'Csw':'C', 'Csx':'C', 'Cxm':'M', 'Cy1':'C', 'Cy3':'C', 'Cyg':'C', 'Cym':'C', 'Cyq':'C', 'Cys':'C', 'Dah':'F', 'Dal':'A', 'Dar':'R', 'Das':'D', 'Dcy':'C', 'Dgl':'E', 'Dgn':'Q', 'Dha':'A', 'Dhi':'H', 'Dil':'I', 'Div':'V', 'Dle':'L', 'Dly':'K', 'Dnp':'A', 'Dpn':'F', 'Dpr':'P', 'Dsn':'S', 'Dsp':'D', 'Dth':'T', 'Dtr':'W', 'Dty':'Y', 'Dva':'V', 'Efc':'C', 'Fla':'A', 'Fme':'M', 'Ggl':'E', 'Gl3':'G', 'Gln':'Q', 'Glu':'E', 'Glx':'Z', 'Gly':'G', 'Glz':'G', 'Gma':'E', 'Gsc':'G', 'Hac':'A', 'Har':'R', 'Hic':'H', 'Hip':'H', 'His':'H', 'Hmr':'R', 'Hpq':'F', 'Htr':'W', 'Hyp':'P', 'Iil':'I', 'Ile':'I', 'Iyr':'Y', 'Kcx':'K', 'Leu':'L', 'Llp':'K', 'Lly':'K', 'Ltr':'W', 'Lym':'K', 'Lys':'K', 'Lyz':'K', 'Maa':'A', 'Men':'N', 'Met':'M', 'Mhs':'H', 'Mis':'S', 'Mle':'L', 'Mpq':'G', 'Msa':'G', 'Mse':'M', 'Mva':'V', 'Nem':'H', 'Nep':'H', 'Nle':'L', 'Nln':'L', 'Nlp':'L', 'Nmc':'G', 'Oas':'S', 'Ocs':'C', 'Omt':'M', 'Paq':'Y', 'Pca':'E', 'Pec':'C', 'Phe':'F', 'Phi':'F', 'Phl':'F', 'Pr3':'C', 'Pro':'P', 'Prr':'A', 'Ptr':'Y', 'Pyl':'O', 'Sac':'S', 'Sar':'G', 'Sch':'C', 'Scs':'C', 'Scy':'C', 'Sec':'U', 'Sel':'U', 'Sep':'S', 'Ser':'S', 'Set':'S', 'Shc':'C', 'Shr':'K', 'Smc':'C', 'Soc':'C', 'Sty':'Y', 'Sva':'S', 'Ter':'*', 'Thr':'T', 'Tih':'A', 'Tpl':'W', 'Tpo':'T', 'Tpq':'A', 'Trg':'K', 'Tro':'W', 'Trp':'W', 'Tyb':'Y', 'Tyq':'Y', 'Tyr':'Y', 'Tys':'Y', 'Tyy':'Y', 'Unk':'X', 'Val':'V', 'Xaa':'X', 'Xer':'X', 'Xle':'J'}

""" Map between three letter amino acid codes and standard one letter codes.
This map contains many nonstandard three letter codes, used, for example, to specify chemically modified amino acids in PDB files.

Ref: http://astral.berkeley.edu/
Ref: http://www.ebi.ac.uk/RESID/faq.html
"""
# Provenance of the entries:
# Initial table is from the ASTRAL RAF release notes.
# Added afterwards: Unk
# Extra IUPAC: Xle, Xaa, Sec, Pyl
# The following have been seen in biopython code.
# Ter : '*'     Termination
# Sel : 'U'     A typo for Sec, selenocysteine?
# Xer : 'X'     Another alternative for unknown?
|  | 302 | 
# Full name for each one letter amino acid code, including the ambiguity
# codes, the stop symbol and the gap symbol.
amino_acid_names = {
    # The 20 canonical residues
    'A': 'alanine',        'M': 'methionine',
    'C': 'cysteine',       'N': 'asparagine',
    'D': 'aspartic acid',  'P': 'proline',
    'E': 'glutamic acid',  'Q': 'glutamine',
    'F': 'phenylalanine',  'R': 'arginine',
    'G': 'glycine',        'S': 'serine',
    'H': 'histidine',      'T': 'threonine',
    'I': 'isoleucine',     'V': 'valine',
    'K': 'lysine',         'W': 'tryptophan',
    'L': 'leucine',        'Y': 'tyrosine',
    # Ambiguity codes
    'B': 'aspartic acid or asparagine',
    'J': 'leucine or isoleucine',
    'X': 'unknown',
    'Z': 'glutamic acid or glutamine',
    # Additional residues
    'U': 'selenocysteine',
    'O': 'pyrrolysine',
    # Non-residue symbols
    '*': 'translation stop',
    '-': 'gap',
}
|  | 333 | 
# Overall amino acid composition of proteins, as a fraction per residue.
# Ref: McCaldon P., Argos P. Proteins 4:99-122 (1988).
# FIXME : Proof these values
amino_acid_composition = {
    'A': 0.082, 'R': 0.057, 'N': 0.044, 'D': 0.053, 'C': 0.017,
    'Q': 0.040, 'E': 0.062, 'G': 0.072, 'H': 0.022, 'I': 0.052,
    'L': 0.090, 'K': 0.057, 'M': 0.024, 'F': 0.039, 'P': 0.051,
    'S': 0.069, 'T': 0.058, 'W': 0.013, 'Y': 0.032, 'V': 0.066,
}
|  | 345 | 
# Kyte-Doolittle hydrophobicity scale, keyed by one letter amino acid code.
# Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132 (1982)
# FIXME : Proof these values
kyte_doolittle_hydrophobicity = {
    'A': 1.8,  'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
    'L': 3.8,  'K': -3.9, 'M': 1.9,  'F': 2.8,  'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
}
|  | 356 | 
|  | 357 | 
# Descriptive name for each nucleotide code, including the ambiguity codes
# and the gap symbol.
# NOTE(review): 'G' and 'U' carry base names (Guanine, Uracil) while 'A', 'C'
# and 'T' carry nucleoside names (Adenosine, Cytidine, Thymidine). The values
# are kept as they are because callers may display or compare them.
nucleotide_names = {
    # Single nucleotides
    'A': 'Adenosine',
    'C': 'Cytidine',
    'G': 'Guanine',
    'T': 'Thymidine',
    'U': 'Uracil',
    # Two-way ambiguity codes
    'R': 'G A (puRine)',
    'Y': 'T C (pYrimidine)',
    'K': 'G T (Ketone)',
    'M': 'A C (aMino group)',
    'S': 'G C (Strong interaction)',
    'W': 'A T (Weak interaction)',
    # Three-way ambiguity codes
    'B': 'G T C (not A) (B comes after A)',
    'D': 'G A T (not C) (D comes after C)',
    'H': 'A C T (not G) (H comes after G)',
    'V': 'G C A (not T, not U) (V comes after U)',
    # Any nucleotide, and the alignment gap
    'N': 'A G C T (aNy)',
    '-': 'gap',
}
|  | 377 | 
|  | 378 | 
|  | 379 | 
|  | 380 | 
|  | 381 | 
|  | 382 | 
|  | 383 | 
|  | 384 | 
|  | 385 |