comparison corebio/data.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1 # Copyright (c) 2006, The Regents of the University of California, through
2 # Lawrence Berkeley National Laboratory (subject to receipt of any required
3 # approvals from the U.S. Dept. of Energy). All rights reserved.
4
5 # This software is distributed under the new BSD Open Source License.
6 # <http://www.opensource.org/licenses/bsd-license.html>
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions are met:
10 #
11 # (1) Redistributions of source code must retain the above copyright notice,
12 # this list of conditions and the following disclaimer.
13 #
14 # (2) Redistributions in binary form must reproduce the above copyright
15 # notice, this list of conditions and the following disclaimer in the
16 # documentation and or other materials provided with the distribution.
17 #
18 # (3) Neither the name of the University of California, Lawrence Berkeley
19 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors
20 # may be used to endorse or promote products derived from this software
21 # without specific prior written permission.
22 #
23 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 # POSSIBILITY OF SUCH DAMAGE.
34
35 """
36 Standard information used in computational biology.
37
38
39 To convert a property dictionary to a list :
40 >>> comp = [ amino_acid_composition[k] for k in amino_acid_letters]
41
42
43 Resources: (Various standard data files.)
44
45
46 BLOSUM Scoring Matrices
47 Source: ftp://ftp.ncbi.nih.gov/repository/blocks/unix/blosum
48 These are all new blast style with 1/3 bit scaling
49 - blosum35
50 - blosum45
51 - blosum62
52 - blosum40
53 - blosum50
54 - blosum80
55 - blosum100
56
57 Other substitution scoring matrices:
58 - dist20_comp
59 - pam250
60 - pam120
61
62
63 Status: Beta (Data needs to be proof checked.)
64 """
65 # TODO: add this datafile?
66 # Description of database cross references :
67 # - dbxref.txt (http://www.expasy.org/cgi-bin/lists?dbxref.txt)
68
69
70 # FIXME: Move documentation of data to docstring above. docstrings
71 # after variables don't work.
72
73
74 # The ExPasy ProtScale tool is a great source of amino acid properties.
75 # http://au.expasy.org/cgi-bin/protscale.pl
76
77 from StringIO import StringIO
78 from corebio._future import resource_string, resource_stream,resource_filename
79 from corebio import utils
80
# Explicitly list the set of available data resources. We want to be able to
# access these resources in, for example, a webapp, without inadvertently
# allowing unrestricted read access to the local file system.
resource_names = [
    # BLOSUM scoring matrices
    "blosum35", "blosum45", "blosum62", "blosum40",
    "blosum50", "blosum80", "blosum100",
    # Other substitution scoring matrices
    "dist20_comp", "pam250", "pam120",
]
97
# Package-relative path of the data file behind each resource name.
_resource_filenames = {
    "blosum35":    "data/blosum35.mat",
    "blosum45":    "data/blosum45.mat",
    "blosum62":    "data/blosum62.mat",
    "blosum40":    "data/blosum40.mat",
    "blosum50":    "data/blosum50.mat",
    "blosum80":    "data/blosum80.mat",
    "blosum100":   "data/blosum100.mat",
    "dist20_comp": "data/dist20_comp.mat",
    "pam250":      "data/pam250.mat",
    "pam120":      "data/pam120.mat",
}
110
# TODO: Substitution matrix parser, SeqMatrix.read
# Maps a resource name to the callable data_object() applies to its stream.
_resource_parsers = dict()
113
def data_string(name):
    """Return the named data resource as given by ``resource_string``.

    ``name`` must be a key of ``_resource_filenames``; any other value raises
    ``KeyError``, so only the explicitly listed resources can be reached.
    """
    return resource_string(__name__, _resource_filenames[name], __file__)
117
def data_stream(name):
    """Return the named data resource as given by ``resource_stream``.

    ``name`` must be a key of ``_resource_filenames``; any other value raises
    ``KeyError``, so only the explicitly listed resources can be reached.
    """
    return resource_stream(__name__, _resource_filenames[name], __file__)
121
def data_filename(name):
    """Return the named data resource as given by ``resource_filename``.

    ``name`` must be a key of ``_resource_filenames``; any other value raises
    ``KeyError``, so only the explicitly listed resources can be reached.
    """
    return resource_filename(__name__, _resource_filenames[name], __file__)
125
def data_object(name, parser=None):
    """Load the named data resource and return it in parsed form.

    Args:
        name: Key of ``_resource_filenames`` identifying the resource.
            An unknown name raises ``KeyError``.
        parser: Optional callable invoked as ``parser(stream)`` on the
            stream from ``data_stream(name)``. When omitted, the parser
            registered for ``name`` in ``_resource_parsers`` is used.

    Returns:
        Whatever the parser returns. When no parser is given and none is
        registered, the result of ``data_string(name)`` is returned.
    """
    if parser is None:
        parser = _resource_parsers.get(name)
    if parser is None:
        # The previous fallback was ``str``; applied to a stream object that
        # yields the object's repr rather than the resource data.
        return data_string(name)
    # The stream is handed over unclosed, exactly as before, because a parser
    # may keep reading from it after returning.
    # NOTE(review): confirm who is expected to close the stream.
    return parser(data_stream(name))
133
134
# Standard codes for the 20 canonical amino acids, in alphabetic order.
amino_acid_letters = "ACDEFGHIKLMNPQRSTVWY"

# Amino acid one letter codes, alphabetic by three letter codes.
amino_acid_alternative_letters = "ARNDCQEGHILKMFPSTWYV"

# The canonical codes plus O and U, the ambiguity codes B, J, Z and X,
# and the symbols '*' and '-'.
amino_acid_extended_letters = "ACDEFGHIKLMNOPQRSTUVWYBJZX*-"


# Nucleic acid alphabets: the four base codes, then the same four followed
# by the ambiguity codes.
dna_letters = "GATC"
dna_extended_letters = dna_letters + "RYWSMKHBVDN"

rna_letters = "GAUC"
rna_extended_letters = rna_letters + "RYWSMKHBVDN"
149
150
# Expansion of each DNA code into the set of bases it may stand for.
dna_ambiguity = {
    # unambiguous bases
    "A": "A", "C": "C", "G": "G", "T": "T",
    # two possible bases
    "M": "AC", "R": "AG", "W": "AT",
    "S": "CG", "Y": "CT", "K": "GT",
    # three possible bases
    "V": "ACG", "H": "ACT", "D": "AGT", "B": "CGT",
    # any base
    "X": "GATC", "N": "GATC",
}
169
# Expansion of each RNA code into the set of bases it may stand for.
rna_ambiguity = {
    # unambiguous bases
    "A": "A", "C": "C", "G": "G", "U": "U",
    # two possible bases
    "M": "AC", "R": "AG", "W": "AU",
    "S": "CG", "Y": "CU", "K": "GU",
    # three possible bases
    "V": "ACG", "H": "ACU", "D": "AGU", "B": "CGU",
    # any base
    "X": "GAUC", "N": "GAUC",
}
188
# Expansion of each amino acid code into the set of residues it may stand for.
amino_acid_ambiguity = {
    "A": "A", "B": "ND", "C": "C", "D": "D", "E": "E",
    "F": "F", "G": "G", "H": "H", "I": "I", "K": "K",
    "L": "L", "M": "M", "N": "N", "P": "P", "Q": "Q",
    "R": "R", "S": "S", "T": "T", "V": "V", "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    "Y": "Y", "Z": "QE", "J": "IL",
    "U": "U", "O": "O",
}
217
218
# Monomer isotopically averaged molecular mass.
# Data Checked GEC Nov 2006
#
# The entries for B, X and Z are averages weighted by
# amino_acid_composition. There is no entry for "O" yet (TODO).
amino_acid_mass = {
    "A": 89.09,
    "B": 132.66,   # averaged proportional to amino_acid_composition
    "C": 121.16,
    "D": 133.10,
    "E": 147.13,
    "F": 165.19,
    "G": 75.07,
    "H": 155.16,
    "I": 131.18,
    "J": 131.18,
    "K": 146.19,
    "L": 131.18,
    "M": 149.21,
    "N": 132.12,
    "P": 115.13,
    "Q": 146.15,
    "R": 174.20,
    "S": 105.09,
    "T": 119.12,
    "U": 168.05,
    "V": 117.15,
    "W": 204.23,
    "X": 129.15,   # averaged proportional to amino_acid_composition
    "Y": 181.19,
    "Z": 146.76,   # averaged proportional to amino_acid_composition
}
249
# Mass assigned to each of the four DNA letters.
dna_mass = {"A": 347.0, "C": 323.0, "G": 363.0, "T": 322.0}
256
# Mass assigned to each of the four RNA letters.
rna_mass = {"A": 363.0, "C": 319.0, "G": 379.0, "U": 340.0}
263
# Map between standard 1 letter amino acid codes and standard three letter
# codes.
#
# Ref: http://www.ebi.ac.uk/RESID/faq.html
one_to_three = {
    "A": "Ala",
    "B": "Asx",
    "C": "Cys",
    "D": "Asp",
    "E": "Glu",
    "F": "Phe",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "K": "Lys",
    "L": "Leu",
    "M": "Met",
    "N": "Asn",
    "P": "Pro",
    "Q": "Gln",
    "R": "Arg",
    "S": "Ser",
    "T": "Thr",
    "V": "Val",
    "W": "Trp",
    "Y": "Tyr",
    "Z": "Glx",
    "X": "Xaa",
    "U": "Sec",
    "J": "Xle",
    "O": "Pyl",
}
277
# Map between standard three letter amino acid codes and standard one letter
# codes, built by passing one_to_three to utils.invert_dict.
#
# Ref: http://www.ebi.ac.uk/RESID/faq.html
standard_three_to_one = utils.invert_dict(one_to_three)
283
284
# Map between three letter amino acid codes and standard one letter codes.
# This map contains many nonstandard three letter codes, used, for example,
# to specify chemically modified amino acids in PDB files.
#
# Ref: http://astral.berkeley.edu/
# Ref: http://www.ebi.ac.uk/RESID/faq.html
#
# Initial table is from the ASTRAL RAF release notes.
# Added: Unk
# Extra IUPAC: Xle, Xaa, Sec, Pyl
# The following have been seen in biopython code:
#   Ter : '*'  Termination
#   Sel : 'U'  A typo for Sec, selenocysteine?
#   Xer : 'X'  Another alternative for unknown?
extended_three_to_one = {
    "2as": "D", "3ah": "H", "5hp": "E",
    # A
    "Acl": "R", "Agm": "R", "Aib": "A", "Ala": "A", "Alm": "A", "Alo": "T",
    "Aly": "K", "Arg": "R", "Arm": "R", "Asa": "D", "Asb": "D", "Ask": "D",
    "Asl": "D", "Asn": "N", "Asp": "D", "Asq": "D", "Asx": "B", "Aya": "A",
    # B
    "Bcs": "C", "Bhd": "D", "Bmt": "T", "Bnn": "A", "Buc": "C", "Bug": "L",
    # C
    "C5c": "C", "C6c": "C", "Ccs": "C", "Cea": "C", "Cgu": "E", "Chg": "A",
    "Cle": "L", "Cme": "C", "Csd": "A", "Cso": "C", "Csp": "C", "Css": "C",
    "Csw": "C", "Csx": "C", "Cxm": "M", "Cy1": "C", "Cy3": "C", "Cyg": "C",
    "Cym": "C", "Cyq": "C", "Cys": "C",
    # D
    "Dah": "F", "Dal": "A", "Dar": "R", "Das": "D", "Dcy": "C", "Dgl": "E",
    "Dgn": "Q", "Dha": "A", "Dhi": "H", "Dil": "I", "Div": "V", "Dle": "L",
    "Dly": "K", "Dnp": "A", "Dpn": "F", "Dpr": "P", "Dsn": "S", "Dsp": "D",
    "Dth": "T", "Dtr": "W", "Dty": "Y", "Dva": "V",
    # E, F
    "Efc": "C", "Fla": "A", "Fme": "M",
    # G
    "Ggl": "E", "Gl3": "G", "Gln": "Q", "Glu": "E", "Glx": "Z", "Gly": "G",
    "Glz": "G", "Gma": "E", "Gsc": "G",
    # H
    "Hac": "A", "Har": "R", "Hic": "H", "Hip": "H", "His": "H", "Hmr": "R",
    "Hpq": "F", "Htr": "W", "Hyp": "P",
    # I, K
    "Iil": "I", "Ile": "I", "Iyr": "Y", "Kcx": "K",
    # L
    "Leu": "L", "Llp": "K", "Lly": "K", "Ltr": "W", "Lym": "K", "Lys": "K",
    "Lyz": "K",
    # M
    "Maa": "A", "Men": "N", "Met": "M", "Mhs": "H", "Mis": "S", "Mle": "L",
    "Mpq": "G", "Msa": "G", "Mse": "M", "Mva": "V",
    # N
    "Nem": "H", "Nep": "H", "Nle": "L", "Nln": "L", "Nlp": "L", "Nmc": "G",
    # O
    "Oas": "S", "Ocs": "C", "Omt": "M",
    # P
    "Paq": "Y", "Pca": "E", "Pec": "C", "Phe": "F", "Phi": "F", "Phl": "F",
    "Pr3": "C", "Pro": "P", "Prr": "A", "Ptr": "Y", "Pyl": "O",
    # S
    "Sac": "S", "Sar": "G", "Sch": "C", "Scs": "C", "Scy": "C", "Sec": "U",
    "Sel": "U", "Sep": "S", "Ser": "S", "Set": "S", "Shc": "C", "Shr": "K",
    "Smc": "C", "Soc": "C", "Sty": "Y", "Sva": "S",
    # T
    "Ter": "*", "Thr": "T", "Tih": "A", "Tpl": "W", "Tpo": "T", "Tpq": "A",
    "Trg": "K", "Tro": "W", "Trp": "W", "Tyb": "Y", "Tyq": "Y", "Tyr": "Y",
    "Tys": "Y", "Tyy": "Y",
    # U, V, X
    "Unk": "X", "Val": "V", "Xaa": "X", "Xer": "X", "Xle": "J",
}
301
302
# Full name for each one letter amino acid code, including the ambiguity
# codes and the '*' and '-' symbols.
amino_acid_names = {
    "A": "alanine",
    "M": "methionine",
    "C": "cysteine",
    "N": "asparagine",
    "D": "aspartic acid",
    "P": "proline",
    "E": "glutamic acid",
    "Q": "glutamine",
    "F": "phenylalanine",
    "R": "arginine",
    "G": "glycine",
    "S": "serine",
    "H": "histidine",
    "T": "threonine",
    "I": "isoleucine",
    "V": "valine",
    "K": "lysine",
    "W": "tryptophan",
    "L": "leucine",
    "Y": "tyrosine",
    # ambiguity codes
    "B": "aspartic acid or asparagine",
    "J": "leucine or isoleucine",
    "X": "unknown",
    "Z": "glutamic acid or glutamine",
    # additional residues
    "U": "selenocysteine",
    "O": "pyrrolysine",
    # symbols
    "*": "translation stop",
    "-": "gap",
}
333
# Overall amino acid composition of proteins.
# Ref: McCaldon P., Argos P. Proteins 4:99-122 (1988).
# FIXME : Proof these values
amino_acid_composition = {
    "A": 0.082, "R": 0.057, "N": 0.044, "D": 0.053, "C": 0.017,
    "Q": 0.040, "E": 0.062, "G": 0.072, "H": 0.022, "I": 0.052,
    "L": 0.090, "K": 0.057, "M": 0.024, "F": 0.039, "P": 0.051,
    "S": 0.069, "T": 0.058, "W": 0.013, "Y": 0.032, "V": 0.066,
}
345
# Kyte-Doolittle hydrophobicity scale.
# Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132 (1982)
# FIXME : Proof these values
kyte_doolittle_hydrophobicity = {
    "A": 1.8,  "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5,
    "Q": -3.5, "E": -3.5, "G": -0.4, "H": -3.2, "I": 4.5,
    "L": 3.8,  "K": -3.9, "M": 1.9,  "F": 2.8,  "P": -1.6,
    "S": -0.8, "T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2,
}
356
357
# Display name for each nucleotide code, including the ambiguity codes and
# the gap symbol.
# NOTE(review): 'A', 'C' and 'T' carry nucleoside names while 'G' and 'U'
# carry base names (Guanine, Uracil); the strings are kept verbatim here.
nucleotide_names = {
    # single nucleotides
    "A": "Adenosine",
    "C": "Cytidine",
    "G": "Guanine",
    "T": "Thymidine",
    "U": "Uracil",
    # two-way ambiguity codes
    "R": "G A (puRine)",
    "Y": "T C (pYrimidine)",
    "K": "G T (Ketone)",
    "M": "A C (aMino group)",
    "S": "G C (Strong interaction)",
    "W": "A T (Weak interaction)",
    # three-way ambiguity codes
    "B": "G T C (not A) (B comes after A)",
    "D": "G A T (not C) (D comes after C)",
    "H": "A C T (not G) (H comes after G)",
    "V": "G C A (not T, not U) (V comes after U)",
    # any nucleotide, and the gap symbol
    "N": "A G C T (aNy)",
    "-": "gap",
}
377
378
379
380
381
382
383
384
385