comparison corebio/resource/astral.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1
2 # Copyright 2000 by Jeffrey Chang. All rights reserved.
3 # Copyright 2001 by Gavin E. Crooks. All rights reserved.
4 # Modifications Copyright 2004/2005 James Casbon.
5 # Copyright 2005 by Regents of the University of California. All rights reserved
6 # (Major rewrite for conformance to corebio. Gavin Crooks)
7 #
8 # This code is derived from the Biopython distribution and is governed by it's
9 # license. Please see the LICENSE file that should have been included
10 # as part of this package.
11
12 """ ASTRAL dataset IO.
13
14 From http://astral.berkeley.edu/ :
15
16 The ASTRAL Compendium for Sequence and Structure Analysis
17
18 The ASTRAL compendium provides databases and tools useful for analyzing protein structures and their sequences. It is partially derived from, and augments the SCOP: Structural Classification of Proteins database. Most of the resources depend upon the coordinate files maintained and distributed by the Protein Data Bank.
19
20 * Classes :
21 - Raf -- A file ofASTRAL RAF (Rapid Access Format) Sequence Maps.
22 - RafSeqMap -- A sequence map, a RAF record.
23 - Res -- A single residue mapping from a RAF record.
24
25 * Functions :
26 - parse_domain -- Convert an ASTRAL fasta header string into a Scop domain.
27 - normalize_letters -- Bormalize RAF amino acid codes.
28
29 """
30
31 # TODO : Need to pull more of James Casbon's Astral code.
32
33 import re
34 from copy import copy
35
36 from corebio.resource.scop import Domain, Residues
37 from corebio.data import extended_three_to_one as to_one_letter_code
38 from corebio.utils import FileIndex
39
40 __all__ = ('astral_evalues', 'astral_percent_identities',
41 'astral_evalues_filenames', 'normalize_letters', 'parse_domain',
42 'Raf', 'RafSeqMap', 'Res')
43
44 # Percentage identity filtered ASTRAL SCOP genetic domain sequence subset
45 astral_percent_identities = [10,20,25,30,35,40,50,70,90,95,100]
46
47 # E-value filtered ASTRAL SCOP genetic domain sequence subsets, based on PDB SEQRES records.
48 astral_evalues = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15,1e-20, 1e-25, 1e-50]
49
50 # A map between evalues and astral filename suffixes.
51 astral_evalues_filenames = {
52 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1',
53 0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3',
54 1e-4: 'e-4', 1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15',
55 1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' }
56
57
58
59 def normalize_letters(one_letter_code) :
60 """Convert RAF one-letter amino acid codes into IUPAC standard codes.
61 Letters are uppercased, and "." ("Unknown") is converted to "X".
62 """
63 if one_letter_code == '.' :
64 return 'X'
65 else :
66 return one_letter_code.upper()
67
68 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)")
69 def parse_domain(str) :
70 """Convert an ASTRAL fasta header string into a Scop domain.
71
72 An ASTRAL (http://astral.stanford.edu/) header contains a concise
73 description of a SCOP domain. A very similar format is used when a
74 Domain object is converted into a string. The Domain returned by this
75 method contains most of the SCOP information, but it will not be located
76 within the SCOP hierarchy (i.e. The parent node will be None). The
77 description is composed of the SCOP protein and species descriptions.
78
79 A typical ASTRAL header looks like --
80 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli}
81 """
82
83 m = _domain_re.match(str)
84 if (not m) : raise ValueError("Domain: "+ str)
85
86 dom = Domain()
87 dom.sid = m.group(1)
88 dom.sccs = m.group(2)
89 dom.residues = Residues(m.group(3))
90 if not dom.residues.pdbid :
91 dom.residues.pdbid= dom.sid[1:5]
92 dom.description = m.group(4).strip()
93
94 return dom
95
96
97 class Raf(FileIndex) :
98 """ASTRAL RAF (Rapid Access Format) Sequence Maps.
99
100 The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES
101 records (representing the sequence of the molecule used in an experiment)
102 and the ATOM records (representing the atoms experimentally observed).
103
104 This data is derived from the Protein Data Bank CIF files. Known errors in
105 the CIF files are corrected manually, with the original PDB file serving as
106 the final arbiter in case of discrepancies.
107
108 Residues are referenced by residue ID. This consists of a the PDB residue
109 sequence number (up to 4 digits) and an optional PDB insertion code (an
110 ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1"
111
112 See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html
113
114 The RAF file itself is about 50 MB. Each line consists of a sequence map of
115 a different protein chain. This index provides rapid, random
116 access of RAF records without having to load the entire file into memory.
117
118 This class does not load the entire RAF file into memory. Instead, it
119 reads the file once, noting the location and content of each RafSeqMap.
120 The index key is a concatenation of the PDB ID and chain ID. e.g
121 "2drcA", "155c_". RAF uses an underscore to indicate blank
122 chain IDs. Custom maps of subsequences or spanning multiple chains, can
123 be constructed with the get_seqmap method.
124
125 """
126 def __init__(self, raf_file) :
127 def linekey(line) :
128 if not line or len(line)<5 or line.isspace() or line[0]=='#':
129 return None
130 return line[0:5]
131 def parser( f) : return RafSeqMap(f.readline())
132
133 FileIndex.__init__(self, raf_file, linekey, parser)
134
135
136 def get_seqmap(self, residues) :
137 """Get the sequence map for a collection of residues.
138
139 residues -- A SCOP style description of a collection of residues from a
140 PDB strucure, (e.g. '(1bba A:10-20,B:)'), as a string or a
141 scop.Residues instance.
142 """
143 if type(residues)== str :
144 residues = Residues(residues)
145
146 pdbid = residues.pdbid
147 frags = residues.fragments
148 if not frags: frags =(('_','',''),) # All residues of unnamed chain
149
150 seqMap = None
151 for frag in frags :
152 chainid = frag[0]
153 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_':
154 chainid = '_'
155 sid = pdbid + chainid
156
157 sm = self[sid]
158
159 # Cut out fragment of interest
160 start = 0
161 end = len(sm.res)
162 if frag[1] : start = int(sm.index(frag[1], chainid))
163 if frag[2] : end = int(sm.index(frag[2], chainid)+1)
164
165 sm = sm[start:end]
166
167 if seqMap is None :
168 seqMap = sm
169 else :
170 seqMap += sm
171
172 return seqMap
173 # End Raf
174
175 class RafSeqMap(object) :
176 """ASTRAL RAF (Rapid Access Format) Sequence Maps.
177
178 RafSeqMap is a list like object; You can find the location of particular
179 residues with index(), slice this RafSeqMap into fragments, and glue
180 fragments back together with extend().
181
182 - pdbid -- The PDB 4 character ID
183 - pdb_datestamp -- From the PDB file
184 - version -- The RAF format version. e.g. 0.01
185 - flags -- RAF flags. (See release notes for more information.)
186 - res -- A list of Res objects, one for each residue in this sequence map
187 """
188
189 def __init__(self, raf_record=None) :
190 """Parses a RAF record into a RafSeqMap object."""
191
192 self.pdbid = ''
193 self.pdb_datestamp = ''
194 self.version = ''
195 self.flags = ''
196 self.res = []
197
198 if not raf_record : return
199
200 header_len = 38
201 line = raf_record.rstrip() # no trailing whitespace
202
203 if len(line)<header_len:
204 raise ValueError("Incomplete header: "+line)
205
206 self.pdbid = line[0:4]
207 chainid = line[4:5]
208
209 self.version = line[6:10]
210
211 # Raf format versions 0.01 and 0.02 are identical for practical purposes
212 if(self.version != "0.01" and self.version !="0.02") :
213 raise ValueError("Incompatible RAF version: "+self.version)
214
215 self.pdb_datestamp = line[14:20]
216 self.flags = line[21:27]
217
218 for i in range(header_len, len(line), 7) :
219 f = line[i : i+7]
220 if len(f)!=7:
221 raise ValueError("Corrupt Field: ("+f+")" )
222 r = Res()
223 r.chainid = chainid
224 r.resid = f[0:5].strip()
225 r.atom = normalize_letters(f[5:6])
226 r.seqres = normalize_letters(f[6:7])
227
228 self.res.append(r)
229 # end __init__
230
231 #@staticmethod
232 def records(raf_file) :
233 """Iterates over a Raf file, generating RafSeqMaps """
234 for line in raf_file:
235 if line[0] =='#': continue # A comment
236 if line.isspace() : continue
237 yield RafSeqMap(line)
238 records = staticmethod(records)
239
240 def index(self, resid, chainid="_") :
241 for i in range(0, len(self.res)) :
242 if self.res[i].resid == resid and self.res[i].chainid == chainid :
243 return i
244 raise KeyError("No such residue "+chainid+resid)
245
246 def __getslice__(self, i, j) :
247 s = copy(self)
248 s.res = s.res[i:j]
249 return s
250
251 def append(self, res) :
252 """Append another Res object onto the list of residue mappings."""
253 self.res.append(res)
254
255 def extend(self, other) :
256 """Append another RafSeqMap onto the end of self.
257
258 Both RafSeqMaps must have the same PDB ID, PDB datestamp and
259 RAF version. The RAF flags are erased if they are inconsistent. This
260 may happen when fragments are taken from different chains.
261 """
262 if not isinstance(other, RafSeqMap):
263 raise TypeError("Can only extend a RafSeqMap with a RafSeqMap.")
264 if self.pdbid != other.pdbid :
265 raise TypeError("Cannot add fragments from different proteins.")
266 if self.version != other.version :
267 raise TypeError("Incompatible rafs.")
268 if self.pdb_datestamp != other.pdb_datestamp :
269 raise TypeError("Different pdb dates!")
270 if self.flags != other.flags :
271 self.flags = ''
272 self.res += other.res
273
274 def __iadd__(self, other) :
275 self.extend(other)
276 return self
277
278 def __add__(self, other) :
279 s = copy(self)
280 s.extend(other)
281 return s
282
283 def extract_atoms(self, pdb_handle, out_handle) :
284 """Extract all relevant ATOM and HETATOM records from a PDB file.
285
286 The PDB file is scanned for ATOM and HETATOM records. If the
287 chain ID, residue ID (seqNum and iCode), and residue type match
288 a residue in this sequence map, then the record is echoed to the
289 output handle.
290
291 This is typically used to find the coordinates of a domain, or other
292 residue subset.
293
294 pdb_file -- A handle to the relevant PDB file.
295 out_file -- All output is written to this stream.
296 """
297 resSet = {}
298 for r in self.res :
299 if r.atom=='X' : # Unknown residue type
300 continue
301 chainid = r.chainid
302 if chainid == '_':
303 chainid = ' '
304 resid = r.resid
305 resSet[(chainid,resid)] = r
306
307 resFound = {}
308 for line in pdb_handle :
309 if line.startswith("ATOM ") or line.startswith("HETATM") :
310 chainid = line[21:22]
311 resid = line[22:27].strip()
312 key = (chainid, resid)
313 if key in resSet:
314 res = resSet[key]
315 atom_aa = res.atom
316 resName = line[17:20].capitilize()
317 if resName in to_one_letter_code :
318 if to_one_letter_code[resName] == atom_aa :
319 out_handle.write(line)
320 resFound[key] = res
321
322 if len(resSet) != len(resFound) :
323 raise RuntimeError('I could not find at least one ATOM or '
324 'HETATM record for each and every residue in this sequence map.')
325
326
327 class Res(object) :
328 """ A single residue mapping from a RAF record.
329
330 - chainid -- A single character chain ID.
331 - resid -- The residue ID.
332 - atom -- amino acid one-letter code from ATOM records.
333 - seqres -- amino acid one-letter code from SEQRES records.
334 """
335 def __init__(self) :
336 self.chainid = ''
337 self.resid = ''
338 self.atom = ''
339 self.seqres = ''
340
341