comparison corebio/resource/scop.py @ 7:8d676bbd1f2d

Uploaded
author davidmurphy
date Mon, 16 Jan 2012 07:03:36 -0500
parents c55bdc2fb9fa
children
comparison
equal deleted inserted replaced
6:4a4aca3d57c9 7:8d676bbd1f2d
1
2 # Copyright 2000 by Jeffrey Chang. All rights reserved.
3 # Copyright 2001 by Gavin E. Crooks. All rights reserved.
4 # Modifications Copyright 2004/2005 James Casbon.
5 # Copyright 2005 by Regents of the University of California. All rights Reserved.
6 # (Major rewrite for conformance to corebio. Gavin Crooks)
7 #
8 # This code is derived from the Biopython distribution and is governed by it's
9 # license. Please see the LICENSE file that should have been included
10 # as part of this package.
11
12
13 """ SCOP: Structural Classification of Proteins.
14
15 The SCOP database aims to provide a manually constructed classification of
16 all know protein structures into a hierarchy, the main levels of which
17 are family, superfamily and fold.
18
19 * SCOP: http://scop.mrc-lmb.cam.ac.uk/scop/
20 * Introduction: http://scop.mrc-lmb.cam.ac.uk/scop/intro.html
21 * SCOP parsable files: http://scop.mrc-lmb.cam.ac.uk/scop/parse/
22
23 The Scop object in this module represents the entire SCOP classification. It
24 can be built from the three SCOP parsable files (see DesRecord, HieRecord and
25 ClaRecord), modified is so desired, and converted back to the same file formats.
26 A single SCOP domain (represented by the Domain class) can be obtained from
27 Scop using the domain's SCOP identifier (sid).
28
29 Classes:
30 - Scop -- The entire SCOP hierarchy.
31 - Node -- A node in the SCOP hierarchy.
32 - Domain -- A SCOP domain.
33 - Residues -- A collection of residues from a PDB structure.
34 - HieRecord -- Handle the SCOP HIErarchy files.
35 - DesRecord -- Handle the SCOP DEScription file.
36 - ClaRecord -- Handle the SCOP CLAssification file.
37
38
39 nodeCodeDict -- A mapping between known 2 letter node codes and a longer
40 description. The known node types are 'cl' (class), 'cf'
41 (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain),
42 'sp' (species), 'px' (domain). Additional node types may
43 be added in the future.
44 """
45
46 import os, re
47
48
49 nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily',
50 'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'}
51
52
53 _nodetype_to_code= dict([[v,k] for k,v in nodeCodeDict.items()])
54
55
56 nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ]
57
58
59 def cmp_sccs(sccs1, sccs2) :
60 """Order SCOP concise classification strings (sccs).
61
62 a.4.5.1 < a.4.5.11 < b.1.1.1
63
64 A sccs (e.g. a.4.5.11) compactly represents a domain's classification.
65 The letter represents the class, and the numbers are the fold,
66 superfamily, and family, respectively.
67
68 """
69
70 s1 = sccs1.split(".")
71 s2 = sccs2.split(".")
72
73 if s1[0] != s2[0]: return cmp(s1[0], s2[0])
74
75 s1 = map(int, s1[1:])
76 s2 = map(int, s2[1:])
77
78 return cmp(s1,s2)
79
80
81
82 def _open_scop_file(scop_dir_path, version, filetype) :
83 filename = "dir.%s.scop.txt_%s" % (filetype,version)
84 afile = open(os.path.join( scop_dir_path, filename))
85 return afile
86
87
88 class Scop(object):
89 """The entire SCOP hierarchy.
90
91 root -- The root node of the hierarchy
92 domains -- A list of all domains
93 nodes_by_sid -- A dictionary of nodes indexed by SCOP identifier
94 (e.g. 'd1hbia_')
95 domains_by_sunid -- A dictionary of domains indexed by SCOP uniquie
96 identifiers (e.g. 14996)
97 """
98 def __init__(self):
99 """ An empty Scop object.
100
101 See also Scop.parse() and Scop.parse_files()
102 """
103 self.root = None
104 self.domains = []
105 self.nodes_by_sunid = dict()
106 self.domains_by_sid = dict()
107
108 #@classmethod
109 def parse(cls, dir_path, version) :
110 """Build the SCOP hierarchy from the SCOP parsable files.
111
112 - dir_path -- A directory that contains the SCOP files
113 - version -- The SCOP version (as a string)
114
115 The SCOP files are named dir.XXX.scop.txt_VERSION, where XXX
116 is 'cla', 'des' or 'hie'.
117 """
118 cla_file = None
119 des_file = None
120 hie_file = None
121 try :
122 cla_file = _open_scop_file( dir_path, version, 'cla')
123 des_file = _open_scop_file( dir_path, version, 'des')
124 hie_file = _open_scop_file( dir_path, version, 'hie')
125 scop = cls.parse_files(cla_file, des_file, hie_file)
126 finally :
127 # If we opened the files, we close the files
128 if cla_file : cla_file.close()
129 if des_file : des_file.close()
130 if hie_file : hie_file.close()
131
132 return scop
133 parse = classmethod(parse)
134
135 #@classmethod
136 def parse_files(cls, cla_file, des_file, hie_file):
137 """Build the SCOP hierarchy from the SCOP parsable files.
138
139 - cla_file -- the CLA clasification file
140 - des_file -- the DES description file
141 - hie_file -- the HIE hierarchy file
142 """
143
144 self = cls()
145
146 sunidDict = {}
147
148 root = Node()
149 domains = []
150 root.sunid=0
151 root.type='ro'
152 sunidDict[root.sunid] = root
153
154 root.description = 'SCOP Root'
155
156 # Build the rest of the nodes using the DES file
157 for rec in DesRecord.records(des_file):
158 if rec.nodetype =='px' :
159 n = Domain()
160 n.sid = rec.name
161 domains.append(n)
162 else :
163 n = Node()
164 n.sunid = rec.sunid
165 n.type = rec.nodetype
166 n.sccs = rec.sccs
167 n.description = rec.description
168
169 sunidDict[n.sunid] = n
170
171 # Glue all of the Nodes together using the HIE file
172 for rec in HieRecord.records(hie_file):
173 if not rec.sunid in sunidDict :
174 print rec.sunid #FIXME: HUH?
175
176 n = sunidDict[rec.sunid]
177 if rec.parent !='': # Not root node
178 if not rec.parent in sunidDict :
179 raise ValueError("Incomplete data?")
180 n.parent = sunidDict[rec.parent]
181
182 for c in rec.children:
183 if not c in sunidDict :
184 raise ValueError("Incomplete data?")
185 n.children.append(sunidDict[c])
186
187
188 # Fill in the gaps with information from the CLA file
189 sidDict = {}
190 for rec in ClaRecord.records(cla_file):
191 n = sunidDict[rec.sunid]
192 assert n.sccs == rec.sccs
193 assert n.sid == rec.sid
194 n.residues = rec.residues
195 sidDict[n.sid] = n
196
197 # Clean up
198 self.root = root
199 self.nodes_by_sunid = sunidDict
200 self.domains_by_sid = sidDict
201 self.domains = tuple(domains)
202
203 return self
204 parse_files = classmethod(parse_files)
205
206
207 def write_hie(self, stream) :
208 """Build an HIE SCOP parsable file from this object"""
209 nodes = self.nodes_by_sunid.values()
210 # We order nodes to ease comparison with original file
211 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid))
212
213 for n in nodes :
214 stream.write(str(n.to_hie_record()))
215
216
217 def write_des(self, stream) :
218 """Build a DES SCOP parsable file from this object"""
219 nodes = self.nodes_by_sunid.values()
220 # Origional SCOP file is not ordered?
221 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid))
222
223 for n in nodes :
224 if n != self.root :
225 stream.write(str(n.to_des_record()))
226
227
228 def write_cla(self, stream) :
229 """Build a CLA SCOP parsable file from this object"""
230 nodes = self.domains_by_sid.values()
231 # We order nodes to ease comparison with original file
232 nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid))
233
234 for n in nodes :
235 stream.write(str(n.to_cla_record()))
236 # End Scop
237
238
239
240 class Node(object) :
241 """ A node in the Scop hierarchy
242
243 sunid -- SCOP unique identifiers. e.g. '14986'
244 parent -- The parent node
245 children -- A list of child nodes
246 sccs -- SCOP concise classification string. e.g. 'a.1.1.2'
247 type -- A 2 letter node type code. e.g. 'px' for domains
248 description --
249
250 """
251 def __init__(self) :
252 """A new, uninitilized SCOP node."""
253 self.sunid=''
254 self.parent = None
255 self.children=[]
256 self.sccs = ''
257 self.type =''
258 self.description =''
259
260 def __str__(self) :
261 s = []
262 s.append(str(self.sunid))
263 s.append(self.sccs)
264 s.append(self.type)
265 s.append(self.description)
266
267 return " ".join(s)
268
269 def to_hie_record(self):
270 """Return an Hie.Record"""
271 rec = HieRecord()
272 rec.sunid = str(self.sunid)
273 if self.parent : # Not root node
274 rec.parent = str(self.parent.sunid)
275 else:
276 rec.parent = '-'
277 for c in self.children :
278 rec.children.append(str(c.sunid))
279 return rec
280
281 def to_des_record(self):
282 """Return a Des.Record"""
283 rec = DesRecord()
284 rec.sunid = str(self.sunid)
285 rec.nodetype = self.type
286 rec.sccs = self.sccs
287 rec.description = self.description
288 return rec
289
290 def descendents( self, node_type) :
291 """ Return a list of all decendent nodes of the given type. Node type
292 can be a two letter code or longer description. e.g. 'fa' or 'family'
293 """
294 if node_type in _nodetype_to_code:
295 node_type = _nodetype_to_code[node_type]
296
297 nodes = [self]
298
299 while nodes[0].type != node_type:
300 if nodes[0].type == 'px' :
301 return [] # Fell of the bottom of the hierarchy
302 child_list = []
303 for n in nodes:
304 for child in n.children:
305 child_list.append( child )
306 nodes = child_list
307
308 return nodes
309
310
311 def ascendent( self, node_type) :
312 """ Return the ancestor node of the given type, or None. Node type can
313 be a two letter code or longer description. e.g. 'fa' or 'family'
314 """
315 if node_type in _nodetype_to_code :
316 node_type = _nodetype_to_code[node_type]
317
318 n = self
319 if n.type == node_type: return None
320 while n.type != node_type:
321 if n.type == 'ro':
322 return None # Fell of the top of the hierarchy
323 n = n.parent
324
325 return n
326 # End Node
327
328
329 class Domain(Node) :
330 """ A SCOP domain. A leaf node in the Scop hierarchy.
331
332 - sid -- The SCOP domain identifier. e.g. 'd5hbib_'
333 - residues -- A Residue object. It defines the collection
334 of PDB atoms that make up this domain.
335 """
336 def __init__(self) :
337 Node.__init__(self)
338 self.sid = ''
339 self.residues = None
340
341 def __str__(self) :
342 s = []
343 s.append(self.sid)
344 s.append(self.sccs)
345 s.append("("+str(self.residues)+")")
346
347 if not self.parent :
348 s.append(self.description)
349 else :
350 sp = self.parent
351 dm = sp.parent
352 s.append(dm.description)
353 s.append("{"+sp.description+"}")
354
355 return " ".join(s)
356
357 def to_des_record(self):
358 """Return a des.Record"""
359 rec = Node.to_des_record(self)
360 rec.name = self.sid
361 return rec
362
363 def to_cla_record(self) :
364 """Return a cla.Record"""
365 rec = ClaRecord()
366 rec.sid = self.sid
367 rec.residues = self.residues
368 rec.sccs = self.sccs
369 rec.sunid = self.sunid
370
371 n = self
372 while n.sunid != 0: # Not root node
373 rec.hierarchy.append( (n.type, str(n.sunid)) )
374 n = n.parent
375
376 rec.hierarchy.reverse()
377
378 return rec
379 # End Domain
380
381
382
383 class DesRecord(object):
384 """ Handle the SCOP DEScription file.
385
386 The file format is described in the scop
387 "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
388 The latest DES file can be found
389 "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
390
391 The DES file consisnt of one DES record per line. Each record
392 holds information for one node in the SCOP hierarchy, and consist
393 of 5 tab deliminated fields,
394 sunid, node type, sccs, node name, node description.
395
396 For example ::
397
398 21953 px b.1.2.1 d1dan.1 1dan T:,U:91-106
399 48724 cl b - All beta proteins
400 48725 cf b.1 - Immunoglobulin-like beta-sandwich
401 49265 sf b.1.2 - Fibronectin type III
402 49266 fa b.1.2.1 - Fibronectin type III
403
404
405 - sunid -- SCOP unique identifiers
406 - nodetype -- One of 'cl' (class), 'cf' (fold), 'sf' (superfamily),
407 'fa' (family), 'dm' (protein), 'sp' (species),
408 'px' (domain). Additional node types may be added.
409 - sccs -- SCOP concise classification strings. e.g. b.1.2.1
410 - name -- The SCOP ID (sid) for domains (e.g. d1anu1),
411 currently empty for other node types
412 - description -- e.g. "All beta proteins","Fibronectin type III",
413 """
414 def __init__(self, record=None):
415
416 if not record :
417 self.sunid = ''
418 self.nodetype = ''
419 self.sccs = ''
420 self.name = ''
421 self.description =''
422 else :
423 entry = record.rstrip() # no trailing whitespace
424 columns = entry.split("\t") # separate the tab-delineated cols
425 if len(columns) != 5:
426 raise ValueError("I don't understand the format of %s" % entry)
427
428 self.sunid, self.nodetype, self.sccs, self.name, self.description \
429 = columns
430 if self.name == '-' : self.name =''
431 self.sunid = int(self.sunid)
432
433 def __str__(self):
434 s = []
435 s.append(self.sunid)
436 s.append(self.nodetype)
437 s.append(self.sccs)
438 if self.name :
439 s.append(self.name)
440 else :
441 s.append("-")
442 s.append(self.description)
443 return "\t".join(map(str,s)) + "\n"
444
445 #@staticmethod
446 def records(des_file):
447 """Iterates over a DES file, generating DesRecords """
448 for line in des_file:
449 if line[0] =='#': continue # A comment
450 if line.isspace() : continue
451 yield DesRecord(line)
452 records = staticmethod(records)
453 # End DesRecord
454
455 class HieRecord(object):
456 """Handle the SCOP HIErarchy files, which describe the SCOP hierarchy in
457 terms of SCOP unique identifiers (sunid).
458
459 The file format is described in the scop
460 "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
461 The latest HIE file can be found
462 "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
463
464 "Release 1.55":http://scop.berkeley.edu/parse/dir.hie.scop.txt_1.55
465 Records consist of 3 tab deliminated fields; node's sunid,
466 parent's sunid, and a list of children's sunids. For example ::
467
468 0 - 46456,48724,51349,53931,56572,56835,56992,57942
469 21953 49268 -
470 49267 49266 49268,49269
471
472 Each record holds information for one node in the SCOP hierarchy.
473
474 sunid -- SCOP unique identifiers of this node
475 parent -- Parents sunid
476 children -- Sequence of childrens sunids
477 """
478 def __init__(self, record = None):
479 self.sunid = None
480 self.parent = None
481 self.children = []
482
483 if not record : return
484
485 # Parses HIE records.
486 entry = record.rstrip() # no trailing whitespace
487 columns = entry.split('\t') # separate the tab-delineated cols
488 if len(columns) != 3:
489 raise ValueError("I don't understand the format of %s" % entry)
490
491 self.sunid, self.parent, children = columns
492
493 if self.sunid =='-' : self.sunid = ''
494 if self.parent =='-' : self.parent = ''
495 else : self.parent = int( self.parent )
496
497 if children =='-' :
498 self.children = ()
499 else :
500 self.children = children.split(',')
501 self.children = map ( int, self.children )
502
503 self.sunid = int(self.sunid)
504
505 def __str__(self):
506 s = []
507 s.append(str(self.sunid))
508
509 if self.parent:
510 s.append(str(self.parent))
511 else:
512 if self.sunid != 0:
513 s.append('0')
514 else:
515 s.append('-')
516
517 if self.children :
518 child_str = map(str, self.children)
519 s.append(",".join(child_str))
520 else:
521 s.append('-')
522
523 return "\t".join(s) + "\n"
524
525
526 #@staticmethod
527 def records(hie_file):
528 """Iterates over a DOM file, generating DomRecords """
529 for line in hie_file:
530 if line[0] =='#': continue # A comment
531 if line.isspace() : continue
532 yield HieRecord(line)
533 records = staticmethod(records)
534 # End HieRecord
535
536
537
538 class ClaRecord(object):
539 """Handle the SCOP CLAssification file, which describes SCOP domains.
540
541 The file format is described in the scop
542 "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
543 The latest CLA file can be found
544 "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
545
546 sid -- SCOP identifier. e.g. d1danl2
547 residues -- The domain definition as a Residues object
548 sccs -- SCOP concise classification strings. e.g. b.1.2.1
549 sunid -- SCOP unique identifier for this domain
550 hierarchy -- A sequence of tuples (nodetype, sunid) describing the
551 location of this domain in the SCOP hierarchy.
552 See the Scop module for a description of nodetypes.
553 """
554 def __init__(self, record=None):
555 self.sid = ''
556 self.residues = None
557 self.sccs = ''
558 self.sunid =''
559 self.hierarchy = []
560
561 if not record: return
562
563 # Parse a tab-deliminated CLA record.
564 entry = record.rstrip() # no trailing whitespace
565 columns = entry.split('\t') # separate the tab-delineated cols
566 if len(columns) != 6:
567 raise ValueError("I don't understand the format of %s" % entry)
568
569 self.sid, pdbid, residues, self.sccs, self.sunid, hierarchy = columns
570 self.residues = Residues(residues)
571 self.residues.pdbid = pdbid
572 self.sunid = int(self.sunid)
573
574 h = []
575 for ht in hierarchy.split(",") :
576 h.append( ht.split('='))
577 for ht in h:
578 ht[1] = int(ht[1])
579 self.hierarchy = h
580
581 def __str__(self):
582 s = []
583 s.append(self.sid)
584 s += str(self.residues).split(" ")
585 s.append(self.sccs)
586 s.append(self.sunid)
587
588 h=[]
589 for ht in self.hierarchy:
590 h.append("=".join(map(str,ht)))
591 s.append(",".join(h))
592
593 return "\t".join(map(str,s)) + "\n"
594
595 #@staticmethod
596 def records(cla_file):
597 """Iterates over a DOM file, generating DomRecords """
598 for line in cla_file:
599 if line[0] =='#': continue # A comment
600 if line.isspace() : continue
601 yield ClaRecord(line)
602 records = staticmethod(records)
603
604 # End ClaRecord
605
606
607
608
609 class DomRecord(object):
610 """Handle the SCOP DOMain file.
611
612 The DOM file has been officially deprecated. For more information see
613 the SCOP"release notes.":http://scop.berkeley.edu/release-notes-1.55.html
614 The DOM files for older releases can be found
615 "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
616
617 DOM records consist of 4 tab deliminated fields;
618 sid, pdbid, residues, hierarchy
619 For example ::
620
621 d1sctg_ 1sct g: 1.001.001.001.001.001
622 d1scth_ 1sct h: 1.001.001.001.001.001
623 d1flp__ 1flp - 1.001.001.001.001.002
624 d1moh__ 1moh - 1.001.001.001.001.002
625
626 sid -- The SCOP ID of the entry, e.g. d1anu1
627 residues -- The domain definition as a Residues object
628 hierarchy -- A string specifying where this domain is in the hierarchy.
629 """
630 def __init__(self, record= None):
631 self.sid = ''
632 self.residues = []
633 self.hierarchy = ''
634
635 if record:
636 entry = record.rstrip() # no trailing whitespace
637 columns = entry.split("\t") # separate the tab-delineated cols
638 if len(columns) != 4:
639 raise ValueError("I don't understand the format of %s" % entry)
640 self.sid, pdbid, res, self.hierarchy = columns
641 self.residues = Residues(res)
642 self.residues.pdbid = pdbid
643
644 def __str__(self):
645 s = []
646 s.append(self.sid)
647 s.append(str(self.residues).replace(" ","\t") )
648 s.append(self.hierarchy)
649 return "\t".join(s) + "\n"
650
651 #@staticmethod
652 def records(dom_file):
653 """Iterates over a DOM file, generating DomRecords """
654 for line in dom_file:
655 if line[0] =='#': continue # A comment
656 if line.isspace() : continue
657 yield DomRecord(line)
658 records = staticmethod(records)
659 # End DomRecord
660
661
662
663
664 _pdbid_re = re.compile(r"^(\w\w\w\w)(?:$|\s+|_)(.*)")
665 _fragment_re = re.compile(r"\(?(\w:)?(-?\w*)-?(-?\w*)\)?(.*)")
666
667 class Residues(object) :
668 """A collection of residues from a PDB structure.
669
670 This class provides code to work with SCOP domain definitions. These
671 are concisely expressed as a one or more chain fragments. For example,
672 "(1bba A:10-20,B:)" indicates residue 10 through 20 (inclusive) of
673 chain A, and every residue of chain B in the pdb structure 1bba. The pdb
674 id and brackets are optional. In addition "-" indicates every residue of
675 a pbd structure with one unnamed chain.
676
677 Start and end residue ids consist of the residue sequence number and an
678 optional single letter insertion code. e.g. "12", "-1", "1a", "1000"
679
680
681 pdbid -- An optional PDB id, e.g. "1bba"
682 fragments -- A sequence of tuples (chainID, startResID, endResID)
683 """
684
685
686 def __init__(self, str=None) :
687 self.pdbid = ''
688 self.fragments = ()
689 if str is not None : self._parse(str)
690
691
692 def _parse(self, string):
693 string = string.strip()
694
695 #Is there a pdbid at the front? e.g. 1bba A:1-100
696 m = _pdbid_re.match(string)
697 if m is not None :
698 self.pdbid = m.group(1)
699 string = m.group(2) # Everything else
700
701 if string=='' or string == '-' or string=='(-)': # no fragments, whole sequence
702 return
703
704 fragments = []
705 for l in string.split(",") :
706 m = _fragment_re.match(l)
707 if m is None:
708 raise ValueError("I don't understand the format of %s" % l)
709 chain, start, end, postfix = m.groups()
710
711 if postfix != "" :
712 raise ValueError("I don't understand the format of %s" % l )
713
714 if chain:
715 if chain[-1] != ':':
716 raise ValueError("I don't understand the chain in %s" % l)
717 chain = chain[:-1] # chop off the ':'
718 else :
719 chain =""
720
721 fragments.append((chain, start, end))
722 self.fragments = tuple(fragments)
723
724 def __str__(self):
725 prefix =""
726 if self.pdbid :
727 prefix =self.pdbid +' '
728
729 if not self.fragments: return prefix+'-'
730 strs = []
731 for chain, start, end in self.fragments:
732 s = []
733 if chain: s.append("%s:" % chain)
734 if start: s.append("%s-%s" % (start, end))
735 strs.append("".join(s))
736 return prefix+ ",".join(strs)
737 # End Residues
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757