Mercurial > repos > davidmurphy > codonlogo
comparison corebio/data.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55bdc2fb9fa |
---|---|
1 # Copyright (c) 2006, The Regents of the University of California, through | |
2 # Lawrence Berkeley National Laboratory (subject to receipt of any required | |
3 # approvals from the U.S. Dept. of Energy). All rights reserved. | |
4 | |
5 # This software is distributed under the new BSD Open Source License. | |
6 # <http://www.opensource.org/licenses/bsd-license.html> | |
7 # | |
8 # Redistribution and use in source and binary forms, with or without | |
9 # modification, are permitted provided that the following conditions are met: | |
10 # | |
11 # (1) Redistributions of source code must retain the above copyright notice, | |
12 # this list of conditions and the following disclaimer. | |
13 # | |
14 # (2) Redistributions in binary form must reproduce the above copyright | |
15 # notice, this list of conditions and the following disclaimer in the | |
16 # documentation and or other materials provided with the distribution. | |
17 # | |
18 # (3) Neither the name of the University of California, Lawrence Berkeley | |
19 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors | |
20 # may be used to endorse or promote products derived from this software | |
21 # without specific prior written permission. | |
22 # | |
23 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
24 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
25 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
26 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
27 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
28 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
29 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
30 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
31 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
32 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
33 # POSSIBILITY OF SUCH DAMAGE. | |
34 | |
35 """ | |
36 Standard information used in computational biology. | |
37 | |
38 | |
39 To convert a property dictionary to a list : | |
40 >>> comp = [ amino_acid_composition[k] for k in amino_acid_letters] | |
41 | |
42 | |
43 Resources: (Various standard data files.) | |
44 | |
45 | |
46 BLOSUM Scoring Matrices | |
47 Source: ftp://ftp.ncbi.nih.gov/repository/blocks/unix/blosum | |
48 These are all new blast style with 1/3 bit scaling | |
49 - blosum35 | |
50 - blosum45 | |
51 - blosum62 | |
52 - blosum40 | |
53 - blosum50 | |
54 - blosum80 | |
55 - blosum100 | |
56 | |
57 Other substitution scoring matrices: | |
58 - dist20_comp | |
59 - pam250 | |
60 - pam120 | |
61 | |
62 | |
63 Status: Beta (Data needs to be proof checked.) | |
64 """ | |
65 # TODO: add this datafile? | |
66 # Description of database cross references : | |
67 # - dbxref.txt (http://www.expasy.org/cgi-bin/lists?dbxref.txt) | |
68 | |
69 | |
70 # FIXME: Move documentation of data to docstring above. docstrings | |
71 # after variables don't work. | |
72 | |
73 | |
74 # The ExPasy ProtScale tool is a great source of amino acid properties. | |
75 # http://au.expasy.org/cgi-bin/protscale.pl | |
76 | |
77 from StringIO import StringIO | |
78 from corebio._future import resource_string, resource_stream,resource_filename | |
79 from corebio import utils | |
80 | |
81 # Explicitly list set of available data resources. We want to be able to access | |
82 # these resources in, for example, a webapp, without inadvertently allowing | |
83 # unrestricted read access to the local file system. | |
84 | |
85 resource_names = [ | |
86 'blosum35', | |
87 'blosum45', | |
88 'blosum62', | |
89 'blosum40', | |
90 'blosum50', | |
91 'blosum80', | |
92 'blosum100', | |
93 'dist20_comp', | |
94 'pam250', | |
95 'pam120', | |
96 ] | |
97 | |
98 _resource_filenames = { | |
99 'blosum35': 'data/blosum35.mat', | |
100 'blosum45': 'data/blosum45.mat', | |
101 'blosum62': 'data/blosum62.mat', | |
102 'blosum40': 'data/blosum40.mat', | |
103 'blosum50': 'data/blosum50.mat', | |
104 'blosum80': 'data/blosum80.mat', | |
105 'blosum100': 'data/blosum100.mat', | |
106 'dist20_comp': 'data/dist20_comp.mat', | |
107 'pam250': 'data/pam250.mat', | |
108 'pam120': 'data/pam120.mat', | |
109 } | |
110 | |
111 # TODO: Subsitution matrix parser, SeqMatrix.read | |
112 _resource_parsers = {} | |
113 | |
def data_string( name ):
    """Return the result of `resource_string` for the resource `name`.

    `name` is looked up in `_resource_filenames`; a name that is not
    registered there raises KeyError. Presumably the result is the
    contents of the data file as a string (confirm against
    corebio._future.resource_string).
    """
    return resource_string(__name__, _resource_filenames[name], __file__)
117 | |
def data_stream( name ):
    """Return the result of `resource_stream` for the resource `name`.

    `name` is looked up in `_resource_filenames`; a name that is not
    registered there raises KeyError. Presumably the result is an open,
    file-like object over the data file (confirm against
    corebio._future.resource_stream).
    """
    return resource_stream(__name__, _resource_filenames[name], __file__)
121 | |
def data_filename( name ):
    """Return the result of `resource_filename` for the resource `name`.

    `name` is looked up in `_resource_filenames`; a name that is not
    registered there raises KeyError. Presumably the result is a file
    system path to the data file (confirm against
    corebio._future.resource_filename).
    """
    return resource_filename(__name__, _resource_filenames[name], __file__)
125 | |
def data_object( name, parser = None) :
    """Load the data resource `name` and return it in parsed form.

    name   -- key of the resource in `_resource_filenames`.
    parser -- optional callable that is handed the stream returned by
              `data_stream(name)`. When omitted, the parser registered
              for `name` in `_resource_parsers` is used, if any.

    When no parser is supplied and none is registered, the result of
    `data_string(name)` is returned.

    Raises KeyError if `name` is not a registered resource.
    """
    if parser is None :
        parser = _resource_parsers.get(name)
    if parser is None :
        # The old fallback was str(stream), which yields the repr of the
        # stream object rather than the resource's data.
        return data_string(name)
    return parser( data_stream(name) )
133 | |
134 | |
# Standard codes for the 20 canonical amino acids, in alphabetic order.
amino_acid_letters = "ACDEFGHIKLMNPQRSTVWY"

# Amino acid one letter codes, alphabetic by three letter codes.
amino_acid_alternative_letters = "ARNDCQEGHILKMFPSTWYV"

# The canonical letters plus O and U, followed by B, J, Z, X, '*' and '-'.
amino_acid_extended_letters = "ACDEFGHIKLMNOPQRSTUVWY" + "BJZX*-"


# The extended nucleotide alphabets append the same ambiguity codes to the
# four base letters.
dna_letters = "GATC"
dna_extended_letters = dna_letters + "RYWSMKHBVDN"

rna_letters = "GAUC"
rna_extended_letters = rna_letters + "RYWSMKHBVDN"
149 | |
150 | |
# Maps each DNA code to the string of bases it can stand for.
dna_ambiguity = dict(
    A="A", C="C", G="G", T="T",
    M="AC", R="AG", W="AT", S="CG", Y="CT", K="GT",
    V="ACG", H="ACT", D="AGT", B="CGT",
    X="GATC", N="GATC")

# The RNA table is the DNA table with T replaced by U in keys and values.
rna_ambiguity = dict(
    (code.replace("T", "U"), bases.replace("T", "U"))
    for code, bases in dna_ambiguity.items())
188 | |
# Maps each amino acid code to the string of residues it can stand for.
# The 20 canonical letters, plus U and O, stand only for themselves.
amino_acid_ambiguity = dict(
    (letter, letter) for letter in "ACDEFGHIKLMNPQRSTVWY" + "UO")
# Ambiguity codes expand to more than one residue.
amino_acid_ambiguity.update(
    B="ND",
    Z="QE",
    J="IL",
    X="ACDEFGHIKLMNPQRSTVWY")
217 | |
218 | |
# Monomer isotopically averaged molecular mass
# Data Checked GEC Nov 2006
# B, X and Z are averaged proportional to amino_acid_composition.
# TODO: there is no entry for "O" yet.
amino_acid_mass = dict(
    A=89.09, B=132.66, C=121.16, D=133.10, E=147.13,
    F=165.19, G=75.07, H=155.16, I=131.18, J=131.18,
    K=146.19, L=131.18, M=149.21, N=132.12, P=115.13,
    Q=146.15, R=174.20, S=105.09, T=119.12, U=168.05,
    V=117.15, W=204.23, X=129.15, Y=181.19, Z=146.76)
249 | |
# Mass table for the four DNA letters.
dna_mass = dict(A=347.0, C=323.0, G=363.0, T=322.0)

# Mass table for the four RNA letters.
rna_mass = dict(A=363.0, C=319.0, G=379.0, U=340.0)
263 | |
# Map between standard 1 letter amino acid codes and standard three letter
# codes.
#
# Ref: http://www.ebi.ac.uk/RESID/faq.html
one_to_three = dict(
    A='Ala', B='Asx', C='Cys', D='Asp', E='Glu', F='Phe',
    G='Gly', H='His', I='Ile', J='Xle', K='Lys', L='Leu',
    M='Met', N='Asn', O='Pyl', P='Pro', Q='Gln', R='Arg',
    S='Ser', T='Thr', U='Sec', V='Val', W='Trp', X='Xaa',
    Y='Tyr', Z='Glx')
277 | |
# Map between standard three letter amino acid codes and standard one letter
# codes, produced by passing one_to_three to utils.invert_dict.
#
# Ref: http://www.ebi.ac.uk/RESID/faq.html
standard_three_to_one = utils.invert_dict(one_to_three)
283 | |
284 | |
# Map between three letter amino acid codes and standard one letter codes.
# This map contains many nonstandard three letter codes, used, for example,
# to specify chemically modified amino acids in PDB files.
#
# Ref: http://astral.berkeley.edu/
# Ref: http://www.ebi.ac.uk/RESID/faq.html
#
# Initial table is from the ASTRAL RAF release notes.
# added UNK
# Extra IUPAC: Xle, Xaa, Sec, Pyl
# The following have been seen in biopython code.
# Ter : '*' Termination
# Sel : 'U' A typo for Sec, selenocysteine?
# Xer : 'X' Another alternative for unknown?
extended_three_to_one = {
    '2as': 'D', '3ah': 'H', '5hp': 'E', 'Acl': 'R', 'Agm': 'R',
    'Aib': 'A', 'Ala': 'A', 'Alm': 'A', 'Alo': 'T', 'Aly': 'K',
    'Arg': 'R', 'Arm': 'R', 'Asa': 'D', 'Asb': 'D', 'Ask': 'D',
    'Asl': 'D', 'Asn': 'N', 'Asp': 'D', 'Asq': 'D', 'Asx': 'B',
    'Aya': 'A', 'Bcs': 'C', 'Bhd': 'D', 'Bmt': 'T', 'Bnn': 'A',
    'Buc': 'C', 'Bug': 'L', 'C5c': 'C', 'C6c': 'C', 'Ccs': 'C',
    'Cea': 'C', 'Cgu': 'E', 'Chg': 'A', 'Cle': 'L', 'Cme': 'C',
    'Csd': 'A', 'Cso': 'C', 'Csp': 'C', 'Css': 'C', 'Csw': 'C',
    'Csx': 'C', 'Cxm': 'M', 'Cy1': 'C', 'Cy3': 'C', 'Cyg': 'C',
    'Cym': 'C', 'Cyq': 'C', 'Cys': 'C', 'Dah': 'F', 'Dal': 'A',
    'Dar': 'R', 'Das': 'D', 'Dcy': 'C', 'Dgl': 'E', 'Dgn': 'Q',
    'Dha': 'A', 'Dhi': 'H', 'Dil': 'I', 'Div': 'V', 'Dle': 'L',
    'Dly': 'K', 'Dnp': 'A', 'Dpn': 'F', 'Dpr': 'P', 'Dsn': 'S',
    'Dsp': 'D', 'Dth': 'T', 'Dtr': 'W', 'Dty': 'Y', 'Dva': 'V',
    'Efc': 'C', 'Fla': 'A', 'Fme': 'M', 'Ggl': 'E', 'Gl3': 'G',
    'Gln': 'Q', 'Glu': 'E', 'Glx': 'Z', 'Gly': 'G', 'Glz': 'G',
    'Gma': 'E', 'Gsc': 'G', 'Hac': 'A', 'Har': 'R', 'Hic': 'H',
    'Hip': 'H', 'His': 'H', 'Hmr': 'R', 'Hpq': 'F', 'Htr': 'W',
    'Hyp': 'P', 'Iil': 'I', 'Ile': 'I', 'Iyr': 'Y', 'Kcx': 'K',
    'Leu': 'L', 'Llp': 'K', 'Lly': 'K', 'Ltr': 'W', 'Lym': 'K',
    'Lys': 'K', 'Lyz': 'K', 'Maa': 'A', 'Men': 'N', 'Met': 'M',
    'Mhs': 'H', 'Mis': 'S', 'Mle': 'L', 'Mpq': 'G', 'Msa': 'G',
    'Mse': 'M', 'Mva': 'V', 'Nem': 'H', 'Nep': 'H', 'Nle': 'L',
    'Nln': 'L', 'Nlp': 'L', 'Nmc': 'G', 'Oas': 'S', 'Ocs': 'C',
    'Omt': 'M', 'Paq': 'Y', 'Pca': 'E', 'Pec': 'C', 'Phe': 'F',
    'Phi': 'F', 'Phl': 'F', 'Pr3': 'C', 'Pro': 'P', 'Prr': 'A',
    'Ptr': 'Y', 'Pyl': 'O', 'Sac': 'S', 'Sar': 'G', 'Sch': 'C',
    'Scs': 'C', 'Scy': 'C', 'Sec': 'U', 'Sel': 'U', 'Sep': 'S',
    'Ser': 'S', 'Set': 'S', 'Shc': 'C', 'Shr': 'K', 'Smc': 'C',
    'Soc': 'C', 'Sty': 'Y', 'Sva': 'S', 'Ter': '*', 'Thr': 'T',
    'Tih': 'A', 'Tpl': 'W', 'Tpo': 'T', 'Tpq': 'A', 'Trg': 'K',
    'Tro': 'W', 'Trp': 'W', 'Tyb': 'Y', 'Tyq': 'Y', 'Tyr': 'Y',
    'Tys': 'Y', 'Tyy': 'Y', 'Unk': 'X', 'Val': 'V', 'Xaa': 'X',
    'Xer': 'X', 'Xle': 'J',
}
301 | |
302 | |
# Full names for the one letter amino acid codes: first the 20 canonical
# residues, in alphabetic order by letter.
amino_acid_names = {
    'A': 'alanine',
    'C': 'cysteine',
    'D': 'aspartic acid',
    'E': 'glutamic acid',
    'F': 'phenylalanine',
    'G': 'glycine',
    'H': 'histidine',
    'I': 'isoleucine',
    'K': 'lysine',
    'L': 'leucine',
    'M': 'methionine',
    'N': 'asparagine',
    'P': 'proline',
    'Q': 'glutamine',
    'R': 'arginine',
    'S': 'serine',
    'T': 'threonine',
    'V': 'valine',
    'W': 'tryptophan',
    'Y': 'tyrosine',
}
# Ambiguity codes, the two extra residues, and the stop / gap symbols.
amino_acid_names.update({
    'B': 'aspartic acid or asparagine',
    'J': 'leucine or isoleucine',
    'X': 'unknown',
    'Z': 'glutamic acid or glutamine',
    'U': 'selenocysteine',
    'O': 'pyrrolysine',
    '*': 'translation stop',
    '-': 'gap',
})
333 | |
# Overall amino acid composition of proteins.
# Ref: McCaldon P., Argos P. Proteins 4:99-122 (1988).
# FIXME : Proof these values
amino_acid_composition = {
    'A': .082, 'C': .017, 'D': .053, 'E': .062, 'F': .039,
    'G': .072, 'H': .022, 'I': .052, 'K': .057, 'L': .090,
    'M': .024, 'N': .044, 'P': .051, 'Q': .040, 'R': .057,
    'S': .069, 'T': .058, 'V': .066, 'W': .013, 'Y': .032,
}
345 | |
# Kyte-Doolittle hydrophobicity scale.
# Ref: Kyte J., Doolittle R.F. J. Mol. Biol. 157:105-132 (1982)
# FIXME : Proof these values
kyte_doolittle_hydrophobicity = {
    'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
    'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
    'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
    'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
}
356 | |
357 | |
# Descriptive names for the nucleotide codes, including the ambiguity codes
# (each with a mnemonic in parentheses) and the gap symbol.
nucleotide_names = dict([
    ('A', 'Adenosine'),
    ('C', 'Cytidine'),
    ('G', 'Guanine'),
    ('T', 'Thymidine'),
    ('U', 'Uracil'),
    ('R', 'G A (puRine)'),
    ('Y', 'T C (pYrimidine)'),
    ('K', 'G T (Ketone)'),
    ('M', 'A C (aMino group)'),
    ('S', 'G C (Strong interaction)'),
    ('W', 'A T (Weak interaction)'),
    ('B', 'G T C (not A) (B comes after A)'),
    ('D', 'G A T (not C) (D comes after C)'),
    ('H', 'A C T (not G) (H comes after G)'),
    ('V', 'G C A (not T, not U) (V comes after U)'),
    ('N', 'A G C T (aNy)'),
    ('-', 'gap'),
])
377 | |
378 | |
379 | |
380 | |
381 | |
382 | |
383 | |
384 | |
385 |