0
|
1
|
|
2 # Copyright 2000 by Jeffrey Chang. All rights reserved.
|
|
3 # Copyright 2001 by Gavin E. Crooks. All rights reserved.
|
|
4 # Modifications Copyright 2004/2005 James Casbon.
|
|
5 # Copyright 2005 by Regents of the University of California. All rights Reserved.
|
|
6 # (Major rewrite for conformance to corebio. Gavin Crooks)
|
|
7 #
|
|
8 # This code is derived from the Biopython distribution and is governed by its
|
|
9 # license. Please see the LICENSE file that should have been included
|
|
10 # as part of this package.
|
|
11
|
|
12
|
|
13 """ SCOP: Structural Classification of Proteins.
|
|
14
|
|
15 The SCOP database aims to provide a manually constructed classification of
|
|
16 all known protein structures into a hierarchy, the main levels of which
|
|
17 are family, superfamily and fold.
|
|
18
|
|
19 * SCOP: http://scop.mrc-lmb.cam.ac.uk/scop/
|
|
20 * Introduction: http://scop.mrc-lmb.cam.ac.uk/scop/intro.html
|
|
21 * SCOP parsable files: http://scop.mrc-lmb.cam.ac.uk/scop/parse/
|
|
22
|
|
23 The Scop object in this module represents the entire SCOP classification. It
|
|
24 can be built from the three SCOP parsable files (see DesRecord, HieRecord and
|
|
25 ClaRecord), modified if so desired, and converted back to the same file formats.
|
|
26 A single SCOP domain (represented by the Domain class) can be obtained from
|
|
27 Scop using the domain's SCOP identifier (sid).
|
|
28
|
|
29 Classes:
|
|
30 - Scop -- The entire SCOP hierarchy.
|
|
31 - Node -- A node in the SCOP hierarchy.
|
|
32 - Domain -- A SCOP domain.
|
|
33 - Residues -- A collection of residues from a PDB structure.
|
|
34 - HieRecord -- Handle the SCOP HIErarchy files.
|
|
35 - DesRecord -- Handle the SCOP DEScription file.
|
|
36 - ClaRecord -- Handle the SCOP CLAssification file.
|
|
37
|
|
38
|
|
39 nodeCodeDict -- A mapping between known 2 letter node codes and a longer
|
|
40 description. The known node types are 'cl' (class), 'cf'
|
|
41 (fold), 'sf' (superfamily), 'fa' (family), 'dm' (protein),
|
|
42 'sp' (species), 'px' (domain). Additional node types may
|
|
43 be added in the future.
|
|
44 """
|
|
45
|
|
46 import os, re
|
|
47
|
|
48
|
|
# Two-letter SCOP node codes mapped to their longer descriptions.
nodeCodeDict = {
    'cl': 'class',
    'cf': 'fold',
    'sf': 'superfamily',
    'fa': 'family',
    'dm': 'protein',
    'sp': 'species',
    'px': 'domain',
}

# Reverse lookup: long description -> two-letter node code.
_nodetype_to_code = dict(
    (description, code) for code, description in nodeCodeDict.items()
)

# Node codes listed from the root ('ro') down to the domain level ('px').
nodeCodeOrder = ['ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px']
|
|
57
|
|
58
|
|
def cmp_sccs(sccs1, sccs2):
    """Order SCOP concise classification strings (sccs).

    a.4.5.1 < a.4.5.11 < b.1.1.1

    A sccs (e.g. a.4.5.11) compactly represents a domain's classification.
    The letter represents the class, and the numbers are the fold,
    superfamily, and family, respectively.

    Returns a negative number, zero or a positive number when sccs1 sorts
    before, equal to, or after sccs2 (the classic ``cmp`` contract).

    Raises ValueError if both strings share the same class letter and a
    numeric component is not an integer.
    """
    parts1 = sccs1.split(".")
    parts2 = sccs2.split(".")

    # Different class letters decide the order on their own; the numeric
    # components are not even parsed in that case.
    if parts1[0] != parts2[0]:
        if parts1[0] < parts2[0]:
            return -1
        return 1

    # Compare the remaining components numerically so that 11 sorts after 2.
    numbers1 = [int(part) for part in parts1[1:]]
    numbers2 = [int(part) for part in parts2[1:]]

    # Portable replacement for the cmp() builtin, which Python 3 removed.
    return (numbers1 > numbers2) - (numbers1 < numbers2)
|
|
79
|
|
80
|
|
81
|
|
def _open_scop_file(scop_dir_path, version, filetype):
    """Open one SCOP parsable file for reading and return the file object.

    The file is looked up inside scop_dir_path under the name
    dir.FILETYPE.scop.txt_VERSION. The caller is responsible for closing
    the returned file.
    """
    basename = "dir.%s.scop.txt_%s" % (filetype, version)
    full_path = os.path.join(scop_dir_path, basename)
    return open(full_path)
|
|
86
|
|
87
|
|
class Scop(object):
    """The entire SCOP hierarchy.

    root           -- The root node of the hierarchy
    domains        -- A sequence of all domains
    nodes_by_sunid -- A dictionary of nodes indexed by SCOP unique
                      identifier (e.g. 14996)
    domains_by_sid -- A dictionary of domains indexed by SCOP identifier
                      (e.g. 'd1hbia_')
    """

    def __init__(self):
        """An empty Scop object.

        See also Scop.parse() and Scop.parse_files()
        """
        self.root = None
        self.domains = []
        self.nodes_by_sunid = dict()
        self.domains_by_sid = dict()

    @classmethod
    def parse(cls, dir_path, version):
        """Build the SCOP hierarchy from the SCOP parsable files.

         - dir_path -- A directory that contains the SCOP files
         - version  -- The SCOP version (as a string)

        The SCOP files are named dir.XXX.scop.txt_VERSION, where XXX
        is 'cla', 'des' or 'hie'.

        Returns the populated Scop object. Errors raised while opening or
        parsing the files propagate to the caller.
        """
        cla_file = None
        des_file = None
        hie_file = None
        try:
            cla_file = _open_scop_file(dir_path, version, 'cla')
            des_file = _open_scop_file(dir_path, version, 'des')
            hie_file = _open_scop_file(dir_path, version, 'hie')
            scop = cls.parse_files(cla_file, des_file, hie_file)
        finally:
            # We opened the files, so we close them -- including the ones
            # that were opened before a later step failed.
            for handle in (cla_file, des_file, hie_file):
                if handle is not None:
                    handle.close()

        return scop

    @classmethod
    def parse_files(cls, cla_file, des_file, hie_file):
        """Build the SCOP hierarchy from the SCOP parsable files.

         - cla_file -- the CLA classification file
         - des_file -- the DES description file
         - hie_file -- the HIE hierarchy file

        Each argument is an iterable of lines (for example an open file).

        Raises ValueError when the HIE file refers to a sunid that the DES
        file did not define.
        """
        scop = cls()

        # The root is not listed in the DES file, so it is created by hand.
        root = Node()
        root.sunid = 0
        root.type = 'ro'
        root.description = 'SCOP Root'

        nodes_by_sunid = {root.sunid: root}
        domains = []

        # Build the rest of the nodes using the DES file
        for rec in DesRecord.records(des_file):
            if rec.nodetype == 'px':
                node = Domain()
                node.sid = rec.name
                domains.append(node)
            else:
                node = Node()
            node.sunid = rec.sunid
            node.type = rec.nodetype
            node.sccs = rec.sccs
            node.description = rec.description

            nodes_by_sunid[node.sunid] = node

        # Glue all of the Nodes together using the HIE file
        for rec in HieRecord.records(hie_file):
            if rec.sunid not in nodes_by_sunid:
                raise ValueError(
                    "Incomplete data? HIE file refers to sunid %s, which "
                    "is not defined in the DES file" % rec.sunid)

            node = nodes_by_sunid[rec.sunid]
            if rec.parent != '':  # Not root node
                if rec.parent not in nodes_by_sunid:
                    raise ValueError("Incomplete data?")
                node.parent = nodes_by_sunid[rec.parent]

            for child_sunid in rec.children:
                if child_sunid not in nodes_by_sunid:
                    raise ValueError("Incomplete data?")
                node.children.append(nodes_by_sunid[child_sunid])

        # Fill in the gaps with information from the CLA file
        domains_by_sid = {}
        for rec in ClaRecord.records(cla_file):
            node = nodes_by_sunid[rec.sunid]
            # Consistency checks between the DES and CLA files.
            assert node.sccs == rec.sccs
            assert node.sid == rec.sid
            node.residues = rec.residues
            domains_by_sid[node.sid] = node

        # Publish the finished structures on the new object
        scop.root = root
        scop.nodes_by_sunid = nodes_by_sunid
        scop.domains_by_sid = domains_by_sid
        scop.domains = tuple(domains)

        return scop

    def write_hie(self, stream):
        """Build an HIE SCOP parsable file from this object"""
        # We order nodes by sunid to ease comparison with the original file
        nodes = sorted(self.nodes_by_sunid.values(),
                       key=lambda node: node.sunid)

        for node in nodes:
            stream.write(str(node.to_hie_record()))

    def write_des(self, stream):
        """Build a DES SCOP parsable file from this object"""
        # The original SCOP file may not be ordered; we write it by sunid.
        nodes = sorted(self.nodes_by_sunid.values(),
                       key=lambda node: node.sunid)

        for node in nodes:
            # The root node has no DES record.
            if node != self.root:
                stream.write(str(node.to_des_record()))

    def write_cla(self, stream):
        """Build a CLA SCOP parsable file from this object"""
        # We order domains by sunid to ease comparison with the original file
        nodes = sorted(self.domains_by_sid.values(),
                       key=lambda node: node.sunid)

        for node in nodes:
            stream.write(str(node.to_cla_record()))
|
|
236 # End Scop
|
|
237
|
|
238
|
|
239
|
|
class Node(object):
    """A node in the Scop hierarchy

    sunid       -- SCOP unique identifiers. e.g. '14986'
    parent      -- The parent node
    children    -- A list of child nodes
    sccs        -- SCOP concise classification string. e.g. 'a.1.1.2'
    type        -- A 2 letter node type code. e.g. 'px' for domains
    description -- A free-text description of the node
    """

    def __init__(self):
        """A new, uninitialized SCOP node."""
        self.sunid = ''
        self.parent = None
        self.children = []
        self.sccs = ''
        self.type = ''
        self.description = ''

    def __str__(self):
        fields = [str(self.sunid), self.sccs, self.type, self.description]
        return " ".join(fields)

    def to_hie_record(self):
        """Return a HieRecord describing this node, its parent and children."""
        rec = HieRecord()
        rec.sunid = str(self.sunid)
        if self.parent:  # Not root node
            rec.parent = str(self.parent.sunid)
        else:
            rec.parent = '-'
        for child in self.children:
            rec.children.append(str(child.sunid))
        return rec

    def to_des_record(self):
        """Return a DesRecord describing this node."""
        rec = DesRecord()
        rec.sunid = str(self.sunid)
        rec.nodetype = self.type
        rec.sccs = self.sccs
        rec.description = self.description
        return rec

    def descendents(self, node_type):
        """Return a list of all descendent nodes of the given type.

        Node type can be a two letter code or longer description,
        e.g. 'fa' or 'family'. The search descends one level at a time and
        inspects only the first node of each level, so it assumes every
        node on a level has the same type.

        Returns an empty list when no such descendents exist.
        """
        if node_type in _nodetype_to_code:
            node_type = _nodetype_to_code[node_type]

        nodes = [self]

        # Stop as soon as a level is empty: indexing nodes[0] on an empty
        # level would otherwise raise IndexError.
        while nodes and nodes[0].type != node_type:
            if nodes[0].type == 'px':
                return []  # Fell off the bottom of the hierarchy
            nodes = [child for node in nodes for child in node.children]

        return nodes

    def ascendent(self, node_type):
        """Return the ancestor node of the given type, or None.

        Node type can be a two letter code or longer description,
        e.g. 'fa' or 'family'. None is also returned when this node itself
        has the requested type.
        """
        if node_type in _nodetype_to_code:
            node_type = _nodetype_to_code[node_type]

        if self.type == node_type:
            return None

        node = self
        # A node that is not attached to a root eventually yields a parent
        # of None; treat that like falling off the top of the hierarchy.
        while node is not None and node.type != node_type:
            if node.type == 'ro':
                return None  # Fell off the top of the hierarchy
            node = node.parent

        return node
|
|
326 # End Node
|
|
327
|
|
328
|
|
class Domain(Node):
    """A SCOP domain. A leaf node in the Scop hierarchy.

     - sid      -- The SCOP domain identifier. e.g. 'd5hbib_'
     - residues -- A Residues object. It defines the collection
                   of PDB atoms that make up this domain.
    """

    def __init__(self):
        Node.__init__(self)
        self.sid = ''
        self.residues = None

    def __str__(self):
        parts = [self.sid, self.sccs, "(" + str(self.residues) + ")"]

        if self.parent:
            # The parent is read as the species node and the grandparent
            # as the protein node; both descriptions are reported.
            species = self.parent
            protein = species.parent
            parts.append(protein.description)
            parts.append("{" + species.description + "}")
        else:
            parts.append(self.description)

        return " ".join(parts)

    def to_des_record(self):
        """Return a DesRecord for this domain, with the sid as its name."""
        rec = Node.to_des_record(self)
        rec.name = self.sid
        return rec

    def to_cla_record(self):
        """Return a ClaRecord for this domain, including its lineage."""
        rec = ClaRecord()
        rec.sid = self.sid
        rec.residues = self.residues
        rec.sccs = self.sccs
        rec.sunid = self.sunid

        # Walk up from this domain until the root (sunid 0) is reached,
        # collecting (type, sunid) pairs from the bottom up.
        lineage = []
        node = self
        while node.sunid != 0:  # Not root node
            lineage.append((node.type, str(node.sunid)))
            node = node.parent

        # The record lists the lineage from the top of the hierarchy down.
        rec.hierarchy.extend(reversed(lineage))

        return rec
|
|
379 # End Domain
|
|
380
|
|
381
|
|
382
|
|
class DesRecord(object):
    """Handle the SCOP DEScription file.

    The file format is described in the scop
    "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
    The latest DES file can be found
    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/

    The DES file consists of one DES record per line. Each record
    holds information for one node in the SCOP hierarchy, and consists
    of 5 tab delimited fields,
    sunid, node type, sccs, node name, node description.

    For example ::

    21953   px      b.1.2.1 d1dan.1 1dan T:,U:91-106
    48724   cl      b       -       All beta proteins
    48725   cf      b.1     -       Immunoglobulin-like beta-sandwich
    49265   sf      b.1.2   -       Fibronectin type III
    49266   fa      b.1.2.1 -       Fibronectin type III

     - sunid       -- SCOP unique identifiers
     - nodetype    -- One of 'cl' (class), 'cf' (fold), 'sf' (superfamily),
                      'fa' (family), 'dm' (protein), 'sp' (species),
                      'px' (domain). Additional node types may be added.
     - sccs        -- SCOP concise classification strings. e.g. b.1.2.1
     - name        -- The SCOP ID (sid) for domains (e.g. d1anu1),
                      currently empty for other node types
     - description -- e.g. "All beta proteins","Fibronectin type III",
    """

    def __init__(self, record=None):
        """Create an empty record, or parse one line of a DES file.

        Raises ValueError if the line does not have exactly 5 tab
        delimited fields or if the sunid is not an integer.
        """
        if not record:
            self.sunid = ''
            self.nodetype = ''
            self.sccs = ''
            self.name = ''
            self.description = ''
            return

        entry = record.rstrip()        # no trailing whitespace
        columns = entry.split("\t")    # separate the tab-delimited cols
        if len(columns) != 5:
            raise ValueError("I don't understand the format of %s" % entry)

        (self.sunid, self.nodetype, self.sccs,
         self.name, self.description) = columns
        # A lone '-' in the file stands for "no name".
        if self.name == '-':
            self.name = ''
        self.sunid = int(self.sunid)

    def __str__(self):
        """Return the record as one tab delimited, newline terminated line."""
        fields = [
            self.sunid,
            self.nodetype,
            self.sccs,
            self.name or "-",   # an empty name is written as '-'
            self.description,
        ]
        return "\t".join(map(str, fields)) + "\n"

    @staticmethod
    def records(des_file):
        """Iterates over a DES file, generating DesRecords.

        Comment lines (starting with '#') and blank lines are skipped.
        """
        for line in des_file:
            if line.startswith('#'):
                continue  # A comment
            # Covers whitespace-only lines and empty strings alike; the
            # latter can appear when iterating a plain list of strings.
            if not line.strip():
                continue
            yield DesRecord(line)
|
|
453 # End DesRecord
|
|
454
|
|
class HieRecord(object):
    """Handle the SCOP HIErarchy files, which describe the SCOP hierarchy in
    terms of SCOP unique identifiers (sunid).

    The file format is described in the scop
    "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
    The latest HIE file can be found
    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/

    "Release 1.55":http://scop.berkeley.edu/parse/dir.hie.scop.txt_1.55
    Records consist of 3 tab delimited fields; node's sunid,
    parent's sunid, and a list of children's sunids. For example ::

    0       -       46456,48724,51349,53931,56572,56835,56992,57942
    21953   49268   -
    49267   49266   49268,49269

    Each record holds information for one node in the SCOP hierarchy.

    sunid    -- SCOP unique identifiers of this node
    parent   -- Parent's sunid ('' when the record has no parent)
    children -- Sequence of children's sunids
    """

    def __init__(self, record=None):
        """Create an empty record, or parse one line of an HIE file.

        Raises ValueError if the line does not have exactly 3 tab
        delimited fields or if an identifier is not an integer.
        """
        self.sunid = None
        self.parent = None
        self.children = []

        if not record:
            return

        # Parses HIE records.
        entry = record.rstrip()        # no trailing whitespace
        columns = entry.split('\t')    # separate the tab-delimited cols
        if len(columns) != 3:
            raise ValueError("I don't understand the format of %s" % entry)

        sunid, parent, children = columns

        # '-' marks a missing parent (the root) or an empty child list.
        if parent == '-':
            self.parent = ''
        else:
            self.parent = int(parent)

        if children == '-':
            self.children = ()
        else:
            # A real list, not a lazy map object: it is tested for
            # emptiness and iterated more than once.
            self.children = [int(child) for child in children.split(',')]

        self.sunid = int(sunid)

    def __str__(self):
        """Return the record as one tab delimited, newline terminated line."""
        fields = [str(self.sunid)]

        if self.parent:
            fields.append(str(self.parent))
        elif self.sunid != 0:
            # A parent equal to the integer 0 (the root's sunid) is falsy,
            # so children of the root are written out here.
            fields.append('0')
        else:
            fields.append('-')

        if self.children:
            fields.append(",".join(map(str, self.children)))
        else:
            fields.append('-')

        return "\t".join(fields) + "\n"

    @staticmethod
    def records(hie_file):
        """Iterates over an HIE file, generating HieRecords.

        Comment lines (starting with '#') and blank lines are skipped.
        """
        for line in hie_file:
            if line.startswith('#'):
                continue  # A comment
            # Covers whitespace-only lines and empty strings alike.
            if not line.strip():
                continue
            yield HieRecord(line)
|
|
534 # End HieRecord
|
|
535
|
|
536
|
|
537
|
|
class ClaRecord(object):
    """Handle the SCOP CLAssification file, which describes SCOP domains.

    The file format is described in the scop
    "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
    The latest CLA file can be found
    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/

    sid       -- SCOP identifier. e.g. d1danl2
    residues  -- The domain definition as a Residues object
    sccs      -- SCOP concise classification strings. e.g. b.1.2.1
    sunid     -- SCOP unique identifier for this domain
    hierarchy -- A sequence of (nodetype, sunid) pairs describing the
                 location of this domain in the SCOP hierarchy.
                 See the Scop module for a description of nodetypes.
    """

    def __init__(self, record=None):
        """Create an empty record, or parse one line of a CLA file.

        Raises ValueError if the line does not have exactly 6 tab
        delimited fields or if an identifier is not an integer.
        """
        self.sid = ''
        self.residues = None
        self.sccs = ''
        self.sunid = ''
        self.hierarchy = []

        if not record:
            return

        # Parse a tab-delimited CLA record.
        entry = record.rstrip()        # no trailing whitespace
        columns = entry.split('\t')    # separate the tab-delimited cols
        if len(columns) != 6:
            raise ValueError("I don't understand the format of %s" % entry)

        self.sid, pdbid, residues, self.sccs, self.sunid, hierarchy = columns
        self.residues = Residues(residues)
        self.residues.pdbid = pdbid
        self.sunid = int(self.sunid)

        # The hierarchy column looks like "cl=48724,cf=48725,...": split it
        # into [nodetype, sunid] pairs with integer sunids.
        pairs = []
        for item in hierarchy.split(","):
            pair = item.split('=')
            pair[1] = int(pair[1])
            pairs.append(pair)
        self.hierarchy = pairs

    def __str__(self):
        """Return the record as one tab delimited, newline terminated line."""
        fields = [self.sid]
        # str(residues) is "pdbid fragments"; they are separate columns.
        fields += str(self.residues).split(" ")
        fields.append(self.sccs)
        fields.append(self.sunid)
        fields.append(
            ",".join(["=".join(map(str, pair)) for pair in self.hierarchy]))

        return "\t".join(map(str, fields)) + "\n"

    @staticmethod
    def records(cla_file):
        """Iterates over a CLA file, generating ClaRecords.

        Comment lines (starting with '#') and blank lines are skipped.
        """
        for line in cla_file:
            if line.startswith('#'):
                continue  # A comment
            # Covers whitespace-only lines and empty strings alike.
            if not line.strip():
                continue
            yield ClaRecord(line)
|
|
603
|
|
604 # End ClaRecord
|
|
605
|
|
606
|
|
607
|
|
608
|
|
class DomRecord(object):
    """Handle the SCOP DOMain file.

    The DOM file has been officially deprecated. For more information see
    the SCOP "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
    The DOM files for older releases can be found
    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/

    DOM records consist of 4 tab delimited fields;
    sid, pdbid, residues, hierarchy
    For example ::

    d1sctg_ 1sct    g:      1.001.001.001.001.001
    d1scth_ 1sct    h:      1.001.001.001.001.001
    d1flp__ 1flp    -       1.001.001.001.001.002
    d1moh__ 1moh    -       1.001.001.001.001.002

    sid       -- The SCOP ID of the entry, e.g. d1anu1
    residues  -- The domain definition as a Residues object
    hierarchy -- A string specifying where this domain is in the hierarchy.
    """

    def __init__(self, record=None):
        """Create an empty record, or parse one line of a DOM file.

        Raises ValueError if the line does not have exactly 4 tab
        delimited fields.
        """
        self.sid = ''
        self.residues = []
        self.hierarchy = ''

        if not record:
            return

        entry = record.rstrip()        # no trailing whitespace
        columns = entry.split("\t")    # separate the tab-delimited cols
        if len(columns) != 4:
            raise ValueError("I don't understand the format of %s" % entry)

        self.sid, pdbid, res, self.hierarchy = columns
        self.residues = Residues(res)
        self.residues.pdbid = pdbid

    def __str__(self):
        """Return the record as one tab delimited, newline terminated line."""
        fields = [
            self.sid,
            # str(residues) is "pdbid fragments"; they are separate columns.
            str(self.residues).replace(" ", "\t"),
            self.hierarchy,
        ]
        return "\t".join(fields) + "\n"

    @staticmethod
    def records(dom_file):
        """Iterates over a DOM file, generating DomRecords.

        Comment lines (starting with '#') and blank lines are skipped.
        """
        for line in dom_file:
            if line.startswith('#'):
                continue  # A comment
            # Covers whitespace-only lines and empty strings alike.
            if not line.strip():
                continue
            yield DomRecord(line)
|
|
659 # End DomRecord
|
|
660
|
|
661
|
|
662
|
|
663
|
|
# A leading four character PDB id, followed by end of string, whitespace
# or an underscore; group 2 is whatever follows.
_pdbid_re = re.compile(r"^(\w\w\w\w)(?:$|\s+|_)(.*)")
# One chain fragment: optional bracket, optional "chain:", optional start
# and end residue ids, optional bracket; group 4 captures unparsed leftovers.
_fragment_re = re.compile(r"\(?(\w:)?(-?\w*)-?(-?\w*)\)?(.*)")
# A PDB id written inside the opening bracket, e.g. "(1bba A:10-20,B:)".
_bracketed_pdbid_re = re.compile(r"^\((\w\w\w\w)\s+(.*)")


class Residues(object):
    """A collection of residues from a PDB structure.

    This class provides code to work with SCOP domain definitions. These
    are concisely expressed as one or more chain fragments. For example,
    "(1bba A:10-20,B:)" indicates residue 10 through 20 (inclusive) of
    chain A, and every residue of chain B in the pdb structure 1bba. The pdb
    id and brackets are optional. In addition "-" indicates every residue of
    a pdb structure with one unnamed chain.

    Start and end residue ids consist of the residue sequence number and an
    optional single letter insertion code. e.g. "12", "-1", "1a", "1000"

    pdbid     -- An optional PDB id, e.g. "1bba"
    fragments -- A sequence of tuples (chainID, startResID, endResID)
    """

    def __init__(self, str=None):
        """Create an empty collection, or parse a domain definition string.

        Raises ValueError if the string cannot be understood.
        """
        self.pdbid = ''
        self.fragments = ()
        if str is not None:
            self._parse(str)

    def _parse(self, string):
        """Fill in pdbid and fragments from a domain definition string."""
        string = string.strip()

        # Is there a pdbid at the front? e.g. 1bba A:1-100
        m = _pdbid_re.match(string)
        if m is not None:
            self.pdbid = m.group(1)
            string = m.group(2)  # Everything else
        else:
            # Or inside the opening bracket? e.g. (1bba A:1-100)
            m = _bracketed_pdbid_re.match(string)
            if m is not None:
                self.pdbid = m.group(1)
                # Keep the bracket so the fragments parse as usual.
                string = "(" + m.group(2)

        if string in ('', '-', '(-)'):  # no fragments, whole sequence
            return

        fragments = []
        for piece in string.split(","):
            m = _fragment_re.match(piece)
            if m is None:
                raise ValueError("I don't understand the format of %s" % piece)
            chain, start, end, postfix = m.groups()

            # Anything the pattern could not consume means a bad fragment.
            if postfix != "":
                raise ValueError("I don't understand the format of %s" % piece)

            if chain:
                chain = chain[:-1]  # chop off the ':'
            else:
                chain = ""

            fragments.append((chain, start, end))
        self.fragments = tuple(fragments)

    def __str__(self):
        """Return the definition as "pdbid fragments" ("-" if no fragments)."""
        prefix = ""
        if self.pdbid:
            prefix = self.pdbid + ' '

        if not self.fragments:
            return prefix + '-'

        strs = []
        for chain, start, end in self.fragments:
            parts = []
            if chain:
                parts.append("%s:" % chain)
            if start:
                parts.append("%s-%s" % (start, end))
            strs.append("".join(parts))
        return prefix + ",".join(strs)
|
|
737 # End Residues
|
|
738
|
|
739
|
|
740
|
|
741
|
|
742
|
|
743
|
|
744
|
|
745
|
|
746
|
|
747
|
|
748
|
|
749
|
|
750
|
|
751
|
|
752
|
|
753
|
|
754
|
|
755
|
|
756
|
|
757
|