0
|
1
|
|
2 # Copyright 2000 by Jeffrey Chang. All rights reserved.
|
|
3 # Copyright 2001 by Gavin E. Crooks. All rights reserved.
|
|
4 # Modifications Copyright 2004/2005 James Casbon.
|
|
5 # Copyright 2005 by Regents of the University of California. All rights reserved
|
|
6 # (Major rewrite for conformance to corebio. Gavin Crooks)
|
|
7 #
|
|
8 # This code is derived from the Biopython distribution and is governed by it's
|
|
9 # license. Please see the LICENSE file that should have been included
|
|
10 # as part of this package.
|
|
11
|
|
12 """ ASTRAL dataset IO.
|
|
13
|
|
14 From http://astral.berkeley.edu/ :
|
|
15
|
|
16 The ASTRAL Compendium for Sequence and Structure Analysis
|
|
17
|
|
18 The ASTRAL compendium provides databases and tools useful for analyzing protein structures and their sequences. It is partially derived from, and augments the SCOP: Structural Classification of Proteins database. Most of the resources depend upon the coordinate files maintained and distributed by the Protein Data Bank.
|
|
19
|
|
20 * Classes :
|
|
21 - Raf -- A file ofASTRAL RAF (Rapid Access Format) Sequence Maps.
|
|
22 - RafSeqMap -- A sequence map, a RAF record.
|
|
23 - Res -- A single residue mapping from a RAF record.
|
|
24
|
|
25 * Functions :
|
|
26 - parse_domain -- Convert an ASTRAL fasta header string into a Scop domain.
|
|
27 - normalize_letters -- Bormalize RAF amino acid codes.
|
|
28
|
|
29 """
|
|
30
|
|
31 # TODO : Need to pull more of James Casbon's Astral code.
|
|
32
|
|
33 import re
|
|
34 from copy import copy
|
|
35
|
|
36 from corebio.resource.scop import Domain, Residues
|
|
37 from corebio.data import extended_three_to_one as to_one_letter_code
|
|
38 from corebio.utils import FileIndex
|
|
39
|
|
40 __all__ = ('astral_evalues', 'astral_percent_identities',
|
|
41 'astral_evalues_filenames', 'normalize_letters', 'parse_domain',
|
|
42 'Raf', 'RafSeqMap', 'Res')
|
|
43
|
|
44 # Percentage identity filtered ASTRAL SCOP genetic domain sequence subset
|
|
45 astral_percent_identities = [10,20,25,30,35,40,50,70,90,95,100]
|
|
46
|
|
47 # E-value filtered ASTRAL SCOP genetic domain sequence subsets, based on PDB SEQRES records.
|
|
48 astral_evalues = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15,1e-20, 1e-25, 1e-50]
|
|
49
|
|
50 # A map between evalues and astral filename suffixes.
|
|
51 astral_evalues_filenames = {
|
|
52 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1',
|
|
53 0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3',
|
|
54 1e-4: 'e-4', 1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15',
|
|
55 1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' }
|
|
56
|
|
57
|
|
58
|
|
59 def normalize_letters(one_letter_code) :
|
|
60 """Convert RAF one-letter amino acid codes into IUPAC standard codes.
|
|
61 Letters are uppercased, and "." ("Unknown") is converted to "X".
|
|
62 """
|
|
63 if one_letter_code == '.' :
|
|
64 return 'X'
|
|
65 else :
|
|
66 return one_letter_code.upper()
|
|
67
|
|
68 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)")
|
|
69 def parse_domain(str) :
|
|
70 """Convert an ASTRAL fasta header string into a Scop domain.
|
|
71
|
|
72 An ASTRAL (http://astral.stanford.edu/) header contains a concise
|
|
73 description of a SCOP domain. A very similar format is used when a
|
|
74 Domain object is converted into a string. The Domain returned by this
|
|
75 method contains most of the SCOP information, but it will not be located
|
|
76 within the SCOP hierarchy (i.e. The parent node will be None). The
|
|
77 description is composed of the SCOP protein and species descriptions.
|
|
78
|
|
79 A typical ASTRAL header looks like --
|
|
80 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli}
|
|
81 """
|
|
82
|
|
83 m = _domain_re.match(str)
|
|
84 if (not m) : raise ValueError("Domain: "+ str)
|
|
85
|
|
86 dom = Domain()
|
|
87 dom.sid = m.group(1)
|
|
88 dom.sccs = m.group(2)
|
|
89 dom.residues = Residues(m.group(3))
|
|
90 if not dom.residues.pdbid :
|
|
91 dom.residues.pdbid= dom.sid[1:5]
|
|
92 dom.description = m.group(4).strip()
|
|
93
|
|
94 return dom
|
|
95
|
|
96
|
|
97 class Raf(FileIndex) :
|
|
98 """ASTRAL RAF (Rapid Access Format) Sequence Maps.
|
|
99
|
|
100 The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES
|
|
101 records (representing the sequence of the molecule used in an experiment)
|
|
102 and the ATOM records (representing the atoms experimentally observed).
|
|
103
|
|
104 This data is derived from the Protein Data Bank CIF files. Known errors in
|
|
105 the CIF files are corrected manually, with the original PDB file serving as
|
|
106 the final arbiter in case of discrepancies.
|
|
107
|
|
108 Residues are referenced by residue ID. This consists of a the PDB residue
|
|
109 sequence number (up to 4 digits) and an optional PDB insertion code (an
|
|
110 ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1"
|
|
111
|
|
112 See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html
|
|
113
|
|
114 The RAF file itself is about 50 MB. Each line consists of a sequence map of
|
|
115 a different protein chain. This index provides rapid, random
|
|
116 access of RAF records without having to load the entire file into memory.
|
|
117
|
|
118 This class does not load the entire RAF file into memory. Instead, it
|
|
119 reads the file once, noting the location and content of each RafSeqMap.
|
|
120 The index key is a concatenation of the PDB ID and chain ID. e.g
|
|
121 "2drcA", "155c_". RAF uses an underscore to indicate blank
|
|
122 chain IDs. Custom maps of subsequences or spanning multiple chains, can
|
|
123 be constructed with the get_seqmap method.
|
|
124
|
|
125 """
|
|
126 def __init__(self, raf_file) :
|
|
127 def linekey(line) :
|
|
128 if not line or len(line)<5 or line.isspace() or line[0]=='#':
|
|
129 return None
|
|
130 return line[0:5]
|
|
131 def parser( f) : return RafSeqMap(f.readline())
|
|
132
|
|
133 FileIndex.__init__(self, raf_file, linekey, parser)
|
|
134
|
|
135
|
|
136 def get_seqmap(self, residues) :
|
|
137 """Get the sequence map for a collection of residues.
|
|
138
|
|
139 residues -- A SCOP style description of a collection of residues from a
|
|
140 PDB strucure, (e.g. '(1bba A:10-20,B:)'), as a string or a
|
|
141 scop.Residues instance.
|
|
142 """
|
|
143 if type(residues)== str :
|
|
144 residues = Residues(residues)
|
|
145
|
|
146 pdbid = residues.pdbid
|
|
147 frags = residues.fragments
|
|
148 if not frags: frags =(('_','',''),) # All residues of unnamed chain
|
|
149
|
|
150 seqMap = None
|
|
151 for frag in frags :
|
|
152 chainid = frag[0]
|
|
153 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_':
|
|
154 chainid = '_'
|
|
155 sid = pdbid + chainid
|
|
156
|
|
157 sm = self[sid]
|
|
158
|
|
159 # Cut out fragment of interest
|
|
160 start = 0
|
|
161 end = len(sm.res)
|
|
162 if frag[1] : start = int(sm.index(frag[1], chainid))
|
|
163 if frag[2] : end = int(sm.index(frag[2], chainid)+1)
|
|
164
|
|
165 sm = sm[start:end]
|
|
166
|
|
167 if seqMap is None :
|
|
168 seqMap = sm
|
|
169 else :
|
|
170 seqMap += sm
|
|
171
|
|
172 return seqMap
|
|
173 # End Raf
|
|
174
|
|
175 class RafSeqMap(object) :
|
|
176 """ASTRAL RAF (Rapid Access Format) Sequence Maps.
|
|
177
|
|
178 RafSeqMap is a list like object; You can find the location of particular
|
|
179 residues with index(), slice this RafSeqMap into fragments, and glue
|
|
180 fragments back together with extend().
|
|
181
|
|
182 - pdbid -- The PDB 4 character ID
|
|
183 - pdb_datestamp -- From the PDB file
|
|
184 - version -- The RAF format version. e.g. 0.01
|
|
185 - flags -- RAF flags. (See release notes for more information.)
|
|
186 - res -- A list of Res objects, one for each residue in this sequence map
|
|
187 """
|
|
188
|
|
189 def __init__(self, raf_record=None) :
|
|
190 """Parses a RAF record into a RafSeqMap object."""
|
|
191
|
|
192 self.pdbid = ''
|
|
193 self.pdb_datestamp = ''
|
|
194 self.version = ''
|
|
195 self.flags = ''
|
|
196 self.res = []
|
|
197
|
|
198 if not raf_record : return
|
|
199
|
|
200 header_len = 38
|
|
201 line = raf_record.rstrip() # no trailing whitespace
|
|
202
|
|
203 if len(line)<header_len:
|
|
204 raise ValueError("Incomplete header: "+line)
|
|
205
|
|
206 self.pdbid = line[0:4]
|
|
207 chainid = line[4:5]
|
|
208
|
|
209 self.version = line[6:10]
|
|
210
|
|
211 # Raf format versions 0.01 and 0.02 are identical for practical purposes
|
|
212 if(self.version != "0.01" and self.version !="0.02") :
|
|
213 raise ValueError("Incompatible RAF version: "+self.version)
|
|
214
|
|
215 self.pdb_datestamp = line[14:20]
|
|
216 self.flags = line[21:27]
|
|
217
|
|
218 for i in range(header_len, len(line), 7) :
|
|
219 f = line[i : i+7]
|
|
220 if len(f)!=7:
|
|
221 raise ValueError("Corrupt Field: ("+f+")" )
|
|
222 r = Res()
|
|
223 r.chainid = chainid
|
|
224 r.resid = f[0:5].strip()
|
|
225 r.atom = normalize_letters(f[5:6])
|
|
226 r.seqres = normalize_letters(f[6:7])
|
|
227
|
|
228 self.res.append(r)
|
|
229 # end __init__
|
|
230
|
|
231 #@staticmethod
|
|
232 def records(raf_file) :
|
|
233 """Iterates over a Raf file, generating RafSeqMaps """
|
|
234 for line in raf_file:
|
|
235 if line[0] =='#': continue # A comment
|
|
236 if line.isspace() : continue
|
|
237 yield RafSeqMap(line)
|
|
238 records = staticmethod(records)
|
|
239
|
|
240 def index(self, resid, chainid="_") :
|
|
241 for i in range(0, len(self.res)) :
|
|
242 if self.res[i].resid == resid and self.res[i].chainid == chainid :
|
|
243 return i
|
|
244 raise KeyError("No such residue "+chainid+resid)
|
|
245
|
|
246 def __getslice__(self, i, j) :
|
|
247 s = copy(self)
|
|
248 s.res = s.res[i:j]
|
|
249 return s
|
|
250
|
|
251 def append(self, res) :
|
|
252 """Append another Res object onto the list of residue mappings."""
|
|
253 self.res.append(res)
|
|
254
|
|
255 def extend(self, other) :
|
|
256 """Append another RafSeqMap onto the end of self.
|
|
257
|
|
258 Both RafSeqMaps must have the same PDB ID, PDB datestamp and
|
|
259 RAF version. The RAF flags are erased if they are inconsistent. This
|
|
260 may happen when fragments are taken from different chains.
|
|
261 """
|
|
262 if not isinstance(other, RafSeqMap):
|
|
263 raise TypeError("Can only extend a RafSeqMap with a RafSeqMap.")
|
|
264 if self.pdbid != other.pdbid :
|
|
265 raise TypeError("Cannot add fragments from different proteins.")
|
|
266 if self.version != other.version :
|
|
267 raise TypeError("Incompatible rafs.")
|
|
268 if self.pdb_datestamp != other.pdb_datestamp :
|
|
269 raise TypeError("Different pdb dates!")
|
|
270 if self.flags != other.flags :
|
|
271 self.flags = ''
|
|
272 self.res += other.res
|
|
273
|
|
274 def __iadd__(self, other) :
|
|
275 self.extend(other)
|
|
276 return self
|
|
277
|
|
278 def __add__(self, other) :
|
|
279 s = copy(self)
|
|
280 s.extend(other)
|
|
281 return s
|
|
282
|
|
283 def extract_atoms(self, pdb_handle, out_handle) :
|
|
284 """Extract all relevant ATOM and HETATOM records from a PDB file.
|
|
285
|
|
286 The PDB file is scanned for ATOM and HETATOM records. If the
|
|
287 chain ID, residue ID (seqNum and iCode), and residue type match
|
|
288 a residue in this sequence map, then the record is echoed to the
|
|
289 output handle.
|
|
290
|
|
291 This is typically used to find the coordinates of a domain, or other
|
|
292 residue subset.
|
|
293
|
|
294 pdb_file -- A handle to the relevant PDB file.
|
|
295 out_file -- All output is written to this stream.
|
|
296 """
|
|
297 resSet = {}
|
|
298 for r in self.res :
|
|
299 if r.atom=='X' : # Unknown residue type
|
|
300 continue
|
|
301 chainid = r.chainid
|
|
302 if chainid == '_':
|
|
303 chainid = ' '
|
|
304 resid = r.resid
|
|
305 resSet[(chainid,resid)] = r
|
|
306
|
|
307 resFound = {}
|
|
308 for line in pdb_handle :
|
|
309 if line.startswith("ATOM ") or line.startswith("HETATM") :
|
|
310 chainid = line[21:22]
|
|
311 resid = line[22:27].strip()
|
|
312 key = (chainid, resid)
|
|
313 if key in resSet:
|
|
314 res = resSet[key]
|
|
315 atom_aa = res.atom
|
|
316 resName = line[17:20].capitilize()
|
|
317 if resName in to_one_letter_code :
|
|
318 if to_one_letter_code[resName] == atom_aa :
|
|
319 out_handle.write(line)
|
|
320 resFound[key] = res
|
|
321
|
|
322 if len(resSet) != len(resFound) :
|
|
323 raise RuntimeError('I could not find at least one ATOM or '
|
|
324 'HETATM record for each and every residue in this sequence map.')
|
|
325
|
|
326
|
|
327 class Res(object) :
|
|
328 """ A single residue mapping from a RAF record.
|
|
329
|
|
330 - chainid -- A single character chain ID.
|
|
331 - resid -- The residue ID.
|
|
332 - atom -- amino acid one-letter code from ATOM records.
|
|
333 - seqres -- amino acid one-letter code from SEQRES records.
|
|
334 """
|
|
335 def __init__(self) :
|
|
336 self.chainid = ''
|
|
337 self.resid = ''
|
|
338 self.atom = ''
|
|
339 self.seqres = ''
|
|
340
|
|
341 |