Mercurial > repos > davidmurphy > codonlogo
comparison corebio/resource/astral.py @ 7:8d676bbd1f2d
Uploaded
author | davidmurphy |
---|---|
date | Mon, 16 Jan 2012 07:03:36 -0500 |
parents | c55bdc2fb9fa |
children |
comparison
equal
deleted
inserted
replaced
6:4a4aca3d57c9 | 7:8d676bbd1f2d |
---|---|
1 | |
2 # Copyright 2000 by Jeffrey Chang. All rights reserved. | |
3 # Copyright 2001 by Gavin E. Crooks. All rights reserved. | |
4 # Modifications Copyright 2004/2005 James Casbon. | |
5 # Copyright 2005 by Regents of the University of California. All rights reserved | |
6 # (Major rewrite for conformance to corebio. Gavin Crooks) | |
7 # | |
8 # This code is derived from the Biopython distribution and is governed by it's | |
9 # license. Please see the LICENSE file that should have been included | |
10 # as part of this package. | |
11 | |
12 """ ASTRAL dataset IO. | |
13 | |
14 From http://astral.berkeley.edu/ : | |
15 | |
16 The ASTRAL Compendium for Sequence and Structure Analysis | |
17 | |
18 The ASTRAL compendium provides databases and tools useful for analyzing protein structures and their sequences. It is partially derived from, and augments the SCOP: Structural Classification of Proteins database. Most of the resources depend upon the coordinate files maintained and distributed by the Protein Data Bank. | |
19 | |
20 * Classes : | |
21 - Raf -- A file ofASTRAL RAF (Rapid Access Format) Sequence Maps. | |
22 - RafSeqMap -- A sequence map, a RAF record. | |
23 - Res -- A single residue mapping from a RAF record. | |
24 | |
25 * Functions : | |
26 - parse_domain -- Convert an ASTRAL fasta header string into a Scop domain. | |
27 - normalize_letters -- Bormalize RAF amino acid codes. | |
28 | |
29 """ | |
30 | |
31 # TODO : Need to pull more of James Casbon's Astral code. | |
32 | |
33 import re | |
34 from copy import copy | |
35 | |
36 from corebio.resource.scop import Domain, Residues | |
37 from corebio.data import extended_three_to_one as to_one_letter_code | |
38 from corebio.utils import FileIndex | |
39 | |
40 __all__ = ('astral_evalues', 'astral_percent_identities', | |
41 'astral_evalues_filenames', 'normalize_letters', 'parse_domain', | |
42 'Raf', 'RafSeqMap', 'Res') | |
43 | |
44 # Percentage identity filtered ASTRAL SCOP genetic domain sequence subset | |
45 astral_percent_identities = [10,20,25,30,35,40,50,70,90,95,100] | |
46 | |
47 # E-value filtered ASTRAL SCOP genetic domain sequence subsets, based on PDB SEQRES records. | |
48 astral_evalues = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15,1e-20, 1e-25, 1e-50] | |
49 | |
50 # A map between evalues and astral filename suffixes. | |
51 astral_evalues_filenames = { | |
52 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', | |
53 0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', | |
54 1e-4: 'e-4', 1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', | |
55 1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' } | |
56 | |
57 | |
58 | |
59 def normalize_letters(one_letter_code) : | |
60 """Convert RAF one-letter amino acid codes into IUPAC standard codes. | |
61 Letters are uppercased, and "." ("Unknown") is converted to "X". | |
62 """ | |
63 if one_letter_code == '.' : | |
64 return 'X' | |
65 else : | |
66 return one_letter_code.upper() | |
67 | |
68 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") | |
69 def parse_domain(str) : | |
70 """Convert an ASTRAL fasta header string into a Scop domain. | |
71 | |
72 An ASTRAL (http://astral.stanford.edu/) header contains a concise | |
73 description of a SCOP domain. A very similar format is used when a | |
74 Domain object is converted into a string. The Domain returned by this | |
75 method contains most of the SCOP information, but it will not be located | |
76 within the SCOP hierarchy (i.e. The parent node will be None). The | |
77 description is composed of the SCOP protein and species descriptions. | |
78 | |
79 A typical ASTRAL header looks like -- | |
80 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} | |
81 """ | |
82 | |
83 m = _domain_re.match(str) | |
84 if (not m) : raise ValueError("Domain: "+ str) | |
85 | |
86 dom = Domain() | |
87 dom.sid = m.group(1) | |
88 dom.sccs = m.group(2) | |
89 dom.residues = Residues(m.group(3)) | |
90 if not dom.residues.pdbid : | |
91 dom.residues.pdbid= dom.sid[1:5] | |
92 dom.description = m.group(4).strip() | |
93 | |
94 return dom | |
95 | |
96 | |
97 class Raf(FileIndex) : | |
98 """ASTRAL RAF (Rapid Access Format) Sequence Maps. | |
99 | |
100 The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES | |
101 records (representing the sequence of the molecule used in an experiment) | |
102 and the ATOM records (representing the atoms experimentally observed). | |
103 | |
104 This data is derived from the Protein Data Bank CIF files. Known errors in | |
105 the CIF files are corrected manually, with the original PDB file serving as | |
106 the final arbiter in case of discrepancies. | |
107 | |
108 Residues are referenced by residue ID. This consists of a the PDB residue | |
109 sequence number (up to 4 digits) and an optional PDB insertion code (an | |
110 ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" | |
111 | |
112 See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html | |
113 | |
114 The RAF file itself is about 50 MB. Each line consists of a sequence map of | |
115 a different protein chain. This index provides rapid, random | |
116 access of RAF records without having to load the entire file into memory. | |
117 | |
118 This class does not load the entire RAF file into memory. Instead, it | |
119 reads the file once, noting the location and content of each RafSeqMap. | |
120 The index key is a concatenation of the PDB ID and chain ID. e.g | |
121 "2drcA", "155c_". RAF uses an underscore to indicate blank | |
122 chain IDs. Custom maps of subsequences or spanning multiple chains, can | |
123 be constructed with the get_seqmap method. | |
124 | |
125 """ | |
126 def __init__(self, raf_file) : | |
127 def linekey(line) : | |
128 if not line or len(line)<5 or line.isspace() or line[0]=='#': | |
129 return None | |
130 return line[0:5] | |
131 def parser( f) : return RafSeqMap(f.readline()) | |
132 | |
133 FileIndex.__init__(self, raf_file, linekey, parser) | |
134 | |
135 | |
136 def get_seqmap(self, residues) : | |
137 """Get the sequence map for a collection of residues. | |
138 | |
139 residues -- A SCOP style description of a collection of residues from a | |
140 PDB strucure, (e.g. '(1bba A:10-20,B:)'), as a string or a | |
141 scop.Residues instance. | |
142 """ | |
143 if type(residues)== str : | |
144 residues = Residues(residues) | |
145 | |
146 pdbid = residues.pdbid | |
147 frags = residues.fragments | |
148 if not frags: frags =(('_','',''),) # All residues of unnamed chain | |
149 | |
150 seqMap = None | |
151 for frag in frags : | |
152 chainid = frag[0] | |
153 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_': | |
154 chainid = '_' | |
155 sid = pdbid + chainid | |
156 | |
157 sm = self[sid] | |
158 | |
159 # Cut out fragment of interest | |
160 start = 0 | |
161 end = len(sm.res) | |
162 if frag[1] : start = int(sm.index(frag[1], chainid)) | |
163 if frag[2] : end = int(sm.index(frag[2], chainid)+1) | |
164 | |
165 sm = sm[start:end] | |
166 | |
167 if seqMap is None : | |
168 seqMap = sm | |
169 else : | |
170 seqMap += sm | |
171 | |
172 return seqMap | |
173 # End Raf | |
174 | |
175 class RafSeqMap(object) : | |
176 """ASTRAL RAF (Rapid Access Format) Sequence Maps. | |
177 | |
178 RafSeqMap is a list like object; You can find the location of particular | |
179 residues with index(), slice this RafSeqMap into fragments, and glue | |
180 fragments back together with extend(). | |
181 | |
182 - pdbid -- The PDB 4 character ID | |
183 - pdb_datestamp -- From the PDB file | |
184 - version -- The RAF format version. e.g. 0.01 | |
185 - flags -- RAF flags. (See release notes for more information.) | |
186 - res -- A list of Res objects, one for each residue in this sequence map | |
187 """ | |
188 | |
189 def __init__(self, raf_record=None) : | |
190 """Parses a RAF record into a RafSeqMap object.""" | |
191 | |
192 self.pdbid = '' | |
193 self.pdb_datestamp = '' | |
194 self.version = '' | |
195 self.flags = '' | |
196 self.res = [] | |
197 | |
198 if not raf_record : return | |
199 | |
200 header_len = 38 | |
201 line = raf_record.rstrip() # no trailing whitespace | |
202 | |
203 if len(line)<header_len: | |
204 raise ValueError("Incomplete header: "+line) | |
205 | |
206 self.pdbid = line[0:4] | |
207 chainid = line[4:5] | |
208 | |
209 self.version = line[6:10] | |
210 | |
211 # Raf format versions 0.01 and 0.02 are identical for practical purposes | |
212 if(self.version != "0.01" and self.version !="0.02") : | |
213 raise ValueError("Incompatible RAF version: "+self.version) | |
214 | |
215 self.pdb_datestamp = line[14:20] | |
216 self.flags = line[21:27] | |
217 | |
218 for i in range(header_len, len(line), 7) : | |
219 f = line[i : i+7] | |
220 if len(f)!=7: | |
221 raise ValueError("Corrupt Field: ("+f+")" ) | |
222 r = Res() | |
223 r.chainid = chainid | |
224 r.resid = f[0:5].strip() | |
225 r.atom = normalize_letters(f[5:6]) | |
226 r.seqres = normalize_letters(f[6:7]) | |
227 | |
228 self.res.append(r) | |
229 # end __init__ | |
230 | |
231 #@staticmethod | |
232 def records(raf_file) : | |
233 """Iterates over a Raf file, generating RafSeqMaps """ | |
234 for line in raf_file: | |
235 if line[0] =='#': continue # A comment | |
236 if line.isspace() : continue | |
237 yield RafSeqMap(line) | |
238 records = staticmethod(records) | |
239 | |
240 def index(self, resid, chainid="_") : | |
241 for i in range(0, len(self.res)) : | |
242 if self.res[i].resid == resid and self.res[i].chainid == chainid : | |
243 return i | |
244 raise KeyError("No such residue "+chainid+resid) | |
245 | |
246 def __getslice__(self, i, j) : | |
247 s = copy(self) | |
248 s.res = s.res[i:j] | |
249 return s | |
250 | |
251 def append(self, res) : | |
252 """Append another Res object onto the list of residue mappings.""" | |
253 self.res.append(res) | |
254 | |
255 def extend(self, other) : | |
256 """Append another RafSeqMap onto the end of self. | |
257 | |
258 Both RafSeqMaps must have the same PDB ID, PDB datestamp and | |
259 RAF version. The RAF flags are erased if they are inconsistent. This | |
260 may happen when fragments are taken from different chains. | |
261 """ | |
262 if not isinstance(other, RafSeqMap): | |
263 raise TypeError("Can only extend a RafSeqMap with a RafSeqMap.") | |
264 if self.pdbid != other.pdbid : | |
265 raise TypeError("Cannot add fragments from different proteins.") | |
266 if self.version != other.version : | |
267 raise TypeError("Incompatible rafs.") | |
268 if self.pdb_datestamp != other.pdb_datestamp : | |
269 raise TypeError("Different pdb dates!") | |
270 if self.flags != other.flags : | |
271 self.flags = '' | |
272 self.res += other.res | |
273 | |
274 def __iadd__(self, other) : | |
275 self.extend(other) | |
276 return self | |
277 | |
278 def __add__(self, other) : | |
279 s = copy(self) | |
280 s.extend(other) | |
281 return s | |
282 | |
283 def extract_atoms(self, pdb_handle, out_handle) : | |
284 """Extract all relevant ATOM and HETATOM records from a PDB file. | |
285 | |
286 The PDB file is scanned for ATOM and HETATOM records. If the | |
287 chain ID, residue ID (seqNum and iCode), and residue type match | |
288 a residue in this sequence map, then the record is echoed to the | |
289 output handle. | |
290 | |
291 This is typically used to find the coordinates of a domain, or other | |
292 residue subset. | |
293 | |
294 pdb_file -- A handle to the relevant PDB file. | |
295 out_file -- All output is written to this stream. | |
296 """ | |
297 resSet = {} | |
298 for r in self.res : | |
299 if r.atom=='X' : # Unknown residue type | |
300 continue | |
301 chainid = r.chainid | |
302 if chainid == '_': | |
303 chainid = ' ' | |
304 resid = r.resid | |
305 resSet[(chainid,resid)] = r | |
306 | |
307 resFound = {} | |
308 for line in pdb_handle : | |
309 if line.startswith("ATOM ") or line.startswith("HETATM") : | |
310 chainid = line[21:22] | |
311 resid = line[22:27].strip() | |
312 key = (chainid, resid) | |
313 if key in resSet: | |
314 res = resSet[key] | |
315 atom_aa = res.atom | |
316 resName = line[17:20].capitilize() | |
317 if resName in to_one_letter_code : | |
318 if to_one_letter_code[resName] == atom_aa : | |
319 out_handle.write(line) | |
320 resFound[key] = res | |
321 | |
322 if len(resSet) != len(resFound) : | |
323 raise RuntimeError('I could not find at least one ATOM or ' | |
324 'HETATM record for each and every residue in this sequence map.') | |
325 | |
326 | |
327 class Res(object) : | |
328 """ A single residue mapping from a RAF record. | |
329 | |
330 - chainid -- A single character chain ID. | |
331 - resid -- The residue ID. | |
332 - atom -- amino acid one-letter code from ATOM records. | |
333 - seqres -- amino acid one-letter code from SEQRES records. | |
334 """ | |
335 def __init__(self) : | |
336 self.chainid = '' | |
337 self.resid = '' | |
338 self.atom = '' | |
339 self.seqres = '' | |
340 | |
341 |