diff corebio/resource/scop.py @ 4:4d47ab2b7bcc

Uploaded
author davidmurphy
date Fri, 13 Jan 2012 07:18:19 -0500
parents c55bdc2fb9fa
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/corebio/resource/scop.py	Fri Jan 13 07:18:19 2012 -0500
@@ -0,0 +1,757 @@
+
+# Copyright 2000 by Jeffrey Chang.  All rights reserved.
+# Copyright 2001 by Gavin E. Crooks.  All rights reserved.
+# Modifications Copyright 2004/2005 James Casbon. 
+# Copyright 2005 by Regents of the University of California. All rights Reserved.
+#   (Major rewrite for conformance to corebio. Gavin Crooks)
+#
+# This code is derived from the Biopython distribution and is governed by it's
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+
+
+""" SCOP: Structural Classification of Proteins.
+
+The SCOP database aims to provide a manually constructed classification of
+all know protein structures into a hierarchy, the main levels of which
+are family, superfamily and fold.
+
+* SCOP: http://scop.mrc-lmb.cam.ac.uk/scop/
+* Introduction: http://scop.mrc-lmb.cam.ac.uk/scop/intro.html
+* SCOP parsable files: http://scop.mrc-lmb.cam.ac.uk/scop/parse/
+
+The Scop object in this module represents the entire SCOP classification. It
+can be built from the three SCOP parsable files (see DesRecord, HieRecord and
+ClaRecord), modified is so desired, and converted back to the same file formats.
+A single SCOP domain (represented by the Domain class) can be obtained from
+Scop using the domain's SCOP identifier (sid).
+
+Classes:
+ - Scop     -- The entire SCOP hierarchy.
+ - Node     -- A node in the SCOP hierarchy.
+ - Domain   -- A SCOP domain.
+ - Residues -- A collection of residues from a PDB structure.
+ - HieRecord -- Handle the SCOP HIErarchy files.
+ - DesRecord -- Handle the SCOP DEScription file.
+ - ClaRecord -- Handle the SCOP CLAssification file.
+
+
+nodeCodeDict  -- A mapping between known 2 letter node codes and a longer
+                  description. The known node types are 'cl' (class), 'cf'
+                  (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain), 
+                  'sp' (species), 'px' (domain). Additional node types may
+                  be added in the future.
+"""
+
+import os, re
+
+
+nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily',
+                 'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'}
+
+
+_nodetype_to_code= dict([[v,k] for k,v in nodeCodeDict.items()])
+
+
+nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ] 
+
+
+def cmp_sccs(sccs1, sccs2) :
+    """Order SCOP concise classification strings (sccs).
+
+    a.4.5.1 < a.4.5.11 < b.1.1.1 
+
+    A sccs (e.g. a.4.5.11) compactly represents a domain's classification.
+    The letter represents the class, and the numbers are the fold,
+    superfamily, and family, respectively.
+
+    """
+
+    s1 = sccs1.split(".")
+    s2 = sccs2.split(".")
+
+    if s1[0] != s2[0]: return cmp(s1[0], s2[0])
+
+    s1 = map(int, s1[1:])
+    s2 = map(int, s2[1:])
+
+    return cmp(s1,s2)
+
+
+
+def _open_scop_file(scop_dir_path, version, filetype) :
+    filename = "dir.%s.scop.txt_%s" % (filetype,version)
+    afile = open(os.path.join( scop_dir_path, filename))
+    return afile
+
+
+class Scop(object):
+    """The entire SCOP hierarchy.
+
+    root             -- The root node of the hierarchy 
+    domains          -- A list of all domains
+    nodes_by_sid     -- A dictionary of nodes indexed by SCOP identifier 
+                        (e.g. 'd1hbia_')
+    domains_by_sunid -- A dictionary of domains indexed by SCOP uniquie
+                        identifiers (e.g. 14996)
+    """
+    def __init__(self):
+        """ An empty Scop object.
+        
+        See also Scop.parse() and Scop.parse_files()
+        """
+        self.root = None
+        self.domains = []
+        self.nodes_by_sunid = dict()
+        self.domains_by_sid = dict()
+
+    #@classmethod
+    def parse(cls, dir_path, version) :
+        """Build the SCOP hierarchy from the SCOP parsable files.
+            
+         - dir_path -- A directory that contains the SCOP files
+         - version  -- The SCOP version (as a string)
+         
+        The SCOP files are named dir.XXX.scop.txt_VERSION, where XXX
+        is 'cla', 'des' or 'hie'.
+        """
+        cla_file = None
+        des_file = None
+        hie_file = None
+        try :
+            cla_file = _open_scop_file( dir_path, version, 'cla')
+            des_file = _open_scop_file( dir_path, version, 'des')
+            hie_file = _open_scop_file( dir_path, version, 'hie')
+            scop = cls.parse_files(cla_file, des_file, hie_file) 
+        finally :
+            # If we opened the files, we close the files
+            if cla_file : cla_file.close()
+            if des_file : des_file.close()
+            if hie_file : hie_file.close()
+        
+        return scop
+    parse = classmethod(parse)
+    
+    #@classmethod
+    def parse_files(cls, cla_file, des_file, hie_file):
+        """Build the SCOP hierarchy from the SCOP parsable files.
+        
+            - cla_file -- the CLA clasification file
+            - des_file -- the DES description file
+            - hie_file -- the HIE hierarchy file
+        """
+
+        self = cls()
+
+        sunidDict = {}
+
+        root = Node()
+        domains = []
+        root.sunid=0
+        root.type='ro'
+        sunidDict[root.sunid] = root
+
+        root.description = 'SCOP Root'
+
+        # Build the rest of the nodes using the DES file
+        for rec in DesRecord.records(des_file):
+            if rec.nodetype =='px' :
+                n = Domain()
+                n.sid = rec.name
+                domains.append(n)
+            else : 
+                n = Node()
+            n.sunid = rec.sunid
+            n.type = rec.nodetype
+            n.sccs = rec.sccs
+            n.description = rec.description
+                
+            sunidDict[n.sunid] = n
+ 
+        # Glue all of the Nodes together using the HIE file
+        for rec in HieRecord.records(hie_file):
+            if not rec.sunid in sunidDict :
+                print rec.sunid                     #FIXME: HUH?
+                    
+            n = sunidDict[rec.sunid]
+            if rec.parent !='': # Not root node
+                if not rec.parent in sunidDict :
+                    raise ValueError("Incomplete data?")
+                n.parent = sunidDict[rec.parent]
+                
+            for c in rec.children:
+                if not c in sunidDict :
+                    raise ValueError("Incomplete data?")
+                n.children.append(sunidDict[c])
+
+                        
+        # Fill in the gaps with information from the CLA file
+        sidDict = {}
+        for rec in ClaRecord.records(cla_file):
+            n = sunidDict[rec.sunid]
+            assert n.sccs == rec.sccs
+            assert n.sid == rec.sid
+            n.residues = rec.residues
+            sidDict[n.sid] = n
+
+        # Clean up
+        self.root = root
+        self.nodes_by_sunid = sunidDict
+        self.domains_by_sid = sidDict
+        self.domains = tuple(domains)
+
+        return self
+    parse_files = classmethod(parse_files)
+
+
+    def write_hie(self, stream) :
+        """Build an HIE SCOP parsable file from this object"""
+        nodes = self.nodes_by_sunid.values()
+        # We order nodes to ease comparison with original file
+        nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid))
+
+        for n in nodes :
+            stream.write(str(n.to_hie_record()))
+
+
+    def write_des(self, stream) :
+        """Build a DES SCOP parsable file from this object""" 
+        nodes = self.nodes_by_sunid.values()
+        # Origional SCOP file is not ordered?
+        nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid))
+
+        for n in nodes :
+            if n != self.root :
+                stream.write(str(n.to_des_record()))
+
+
+    def write_cla(self, stream) :
+        """Build a CLA SCOP parsable file from this object"""                
+        nodes = self.domains_by_sid.values()
+        # We order nodes to ease comparison with original file
+        nodes.sort(lambda n1,n2: cmp(n1.sunid, n2.sunid))
+
+        for n in nodes :
+            stream.write(str(n.to_cla_record()))
+# End Scop
+     
+
+  
+class Node(object) :
+    """ A node in the Scop hierarchy
+
+    sunid  -- SCOP unique identifiers. e.g. '14986'
+    parent -- The parent node
+    children -- A list of child nodes
+    sccs     -- SCOP concise classification string. e.g. 'a.1.1.2'
+    type     -- A 2 letter node type code. e.g. 'px' for domains
+    description -- 
+        
+    """
+    def __init__(self) :
+        """A new, uninitilized SCOP node."""
+        self.sunid=''    
+        self.parent = None
+        self.children=[]
+        self.sccs = ''   
+        self.type =''    
+        self.description =''
+
+    def __str__(self) :
+        s = []
+        s.append(str(self.sunid))
+        s.append(self.sccs)
+        s.append(self.type)
+        s.append(self.description)
+
+        return " ".join(s)
+
+    def to_hie_record(self):
+        """Return an Hie.Record"""
+        rec = HieRecord()
+        rec.sunid = str(self.sunid)
+        if self.parent : # Not root node
+            rec.parent = str(self.parent.sunid)
+        else:
+            rec.parent = '-'
+        for c in self.children :
+            rec.children.append(str(c.sunid))
+        return rec
+    
+    def to_des_record(self):
+        """Return a Des.Record"""        
+        rec = DesRecord()
+        rec.sunid = str(self.sunid)
+        rec.nodetype = self.type
+        rec.sccs = self.sccs
+        rec.description = self.description
+        return rec
+        
+    def descendents( self, node_type) :
+        """ Return a list of all decendent nodes of the given type. Node type
+        can be a two letter code or longer description. e.g. 'fa' or 'family'
+        """
+        if node_type in _nodetype_to_code:
+            node_type = _nodetype_to_code[node_type]
+            
+        nodes = [self]
+
+        while nodes[0].type != node_type:
+            if nodes[0].type == 'px' : 
+                return [] # Fell of the bottom of the hierarchy
+            child_list = []
+            for n in nodes:
+                for child in n.children:
+                    child_list.append( child )
+                nodes = child_list
+                
+        return nodes
+                    
+
+    def ascendent( self, node_type) :
+        """ Return the ancestor node of the given type, or None. Node type can 
+        be a two letter code or longer description. e.g. 'fa' or 'family'
+        """
+        if node_type in _nodetype_to_code :
+            node_type = _nodetype_to_code[node_type]
+
+        n = self
+        if n.type == node_type: return None
+        while n.type != node_type:
+            if n.type == 'ro': 
+                return None # Fell of the top of the hierarchy
+            n = n.parent            
+                
+        return n
+# End Node                                                            
+    
+
+class Domain(Node) :
+    """ A SCOP domain. A leaf node in the Scop hierarchy.
+
+    - sid      -- The SCOP domain identifier. e.g. 'd5hbib_'
+    - residues -- A Residue object. It defines the collection
+                  of PDB atoms that make up this domain.
+    """
+    def __init__(self) :
+        Node.__init__(self)
+        self.sid = ''         
+        self.residues = None
+
+    def __str__(self) :
+        s = []
+        s.append(self.sid)
+        s.append(self.sccs)
+        s.append("("+str(self.residues)+")")
+
+        if not self.parent :
+            s.append(self.description)
+        else :
+            sp = self.parent
+            dm = sp.parent
+            s.append(dm.description)
+            s.append("{"+sp.description+"}")
+
+        return " ".join(s)
+
+    def to_des_record(self):
+        """Return a des.Record"""
+        rec = Node.to_des_record(self)
+        rec.name = self.sid
+        return rec
+
+    def to_cla_record(self) :
+        """Return a cla.Record"""        
+        rec = ClaRecord()
+        rec.sid = self.sid
+        rec.residues = self.residues
+        rec.sccs = self.sccs
+        rec.sunid = self.sunid
+        
+        n = self
+        while n.sunid != 0: # Not root node
+            rec.hierarchy.append( (n.type, str(n.sunid)) )
+            n = n.parent
+
+        rec.hierarchy.reverse()
+       
+        return rec
+# End Domain
+
+
+
+class DesRecord(object):
+    """ Handle the SCOP DEScription file.
+
+    The file format is described in the scop
+    "release notes.":http://scop.berkeley.edu/release-notes-1.55.html 
+    The latest DES file can be found
+    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
+  
+    The DES file consisnt of one DES record per line. Each record 
+    holds information for one node in the SCOP hierarchy, and consist
+    of 5 tab deliminated fields,
+    sunid, node type, sccs, node name, node description.
+
+    For example ::
+    
+    21953   px      b.1.2.1 d1dan.1 1dan T:,U:91-106
+    48724   cl      b       -       All beta proteins
+    48725   cf      b.1     -       Immunoglobulin-like beta-sandwich
+    49265   sf      b.1.2   -       Fibronectin type III
+    49266   fa      b.1.2.1 -       Fibronectin type III
+
+   
+    - sunid       -- SCOP unique identifiers
+    - nodetype    -- One of 'cl' (class), 'cf' (fold), 'sf' (superfamily),
+                   'fa' (family), 'dm' (protein), 'sp' (species),
+                   'px' (domain). Additional node types may be added.
+    - sccs        -- SCOP concise classification strings. e.g. b.1.2.1
+    - name        -- The SCOP ID (sid) for domains (e.g. d1anu1),
+                   currently empty for other node types
+    - description --  e.g. "All beta proteins","Fibronectin type III", 
+    """
+    def __init__(self, record=None):
+        
+        if not record :
+            self.sunid = ''
+            self.nodetype = ''
+            self.sccs = ''
+            self.name = ''
+            self.description =''
+        else :
+            entry = record.rstrip()  # no trailing whitespace
+            columns = entry.split("\t")  # separate the tab-delineated cols
+            if len(columns) != 5:
+                raise ValueError("I don't understand the format of %s" % entry)
+        
+            self.sunid, self.nodetype, self.sccs, self.name, self.description \
+                = columns
+            if self.name == '-' : self.name =''
+            self.sunid = int(self.sunid)
+        
+    def __str__(self):
+        s = []
+        s.append(self.sunid)
+        s.append(self.nodetype)        
+        s.append(self.sccs)        
+        if self.name :
+            s.append(self.name)
+        else :
+            s.append("-")
+        s.append(self.description)        
+        return "\t".join(map(str,s)) + "\n"
+
+    #@staticmethod
+    def records(des_file):
+        """Iterates over a DES file, generating DesRecords """
+        for line in des_file:
+            if line[0] =='#':  continue  # A comment 
+            if line.isspace() : continue
+            yield DesRecord(line)
+    records = staticmethod(records)
+# End DesRecord        
+
+class HieRecord(object):
+    """Handle the SCOP HIErarchy files, which describe the SCOP hierarchy in
+    terms of SCOP unique identifiers (sunid).
+
+    The file format is described in the scop
+    "release notes.":http://scop.berkeley.edu/release-notes-1.55.html 
+    The latest HIE file can be found
+    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
+  
+    "Release 1.55":http://scop.berkeley.edu/parse/dir.hie.scop.txt_1.55     
+    Records consist of 3 tab deliminated fields; node's sunid,
+    parent's sunid, and a list of children's sunids. For example ::
+    
+    0       -       46456,48724,51349,53931,56572,56835,56992,57942
+    21953   49268   -
+    49267   49266   49268,49269
+    
+    Each record holds information for one node in the SCOP hierarchy.
+
+    sunid      -- SCOP unique identifiers of this node
+    parent     -- Parents sunid
+    children   -- Sequence of childrens sunids
+    """
+    def __init__(self, record = None):
+        self.sunid = None
+        self.parent = None
+        self.children = []
+        
+        if not record : return
+        
+        # Parses HIE records.
+        entry = record.rstrip()        # no trailing whitespace
+        columns = entry.split('\t')   # separate the tab-delineated cols
+        if len(columns) != 3:
+            raise ValueError("I don't understand the format of %s" % entry)
+        
+        self.sunid, self.parent, children = columns
+
+        if self.sunid =='-' : self.sunid = ''
+        if self.parent =='-' : self.parent = ''
+        else : self.parent = int( self.parent )
+
+        if children =='-' :
+            self.children = ()
+        else :
+            self.children = children.split(',')
+            self.children = map ( int, self.children )
+
+        self.sunid = int(self.sunid)
+        
+    def __str__(self):
+        s = []
+        s.append(str(self.sunid))
+
+        if self.parent:
+            s.append(str(self.parent))
+        else:
+            if self.sunid != 0:
+                s.append('0')
+            else:
+                s.append('-')
+                
+        if self.children :
+            child_str = map(str, self.children)
+            s.append(",".join(child_str))
+        else:
+            s.append('-')
+
+        return "\t".join(s) + "\n"
+
+
+    #@staticmethod
+    def records(hie_file):
+        """Iterates over a DOM file, generating DomRecords """
+        for line in hie_file:
+            if line[0] =='#':  continue  # A comment 
+            if line.isspace() : continue
+            yield HieRecord(line)
+    records = staticmethod(records)
+# End HieRecord
+
+
+
+class ClaRecord(object):
+    """Handle the SCOP CLAssification file, which describes SCOP domains.
+
+    The file format is described in the scop
+    "release notes.":http://scop.berkeley.edu/release-notes-1.55.html 
+    The latest CLA file can be found
+    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
+
+    sid         --  SCOP identifier. e.g. d1danl2
+    residues    --  The domain definition as a Residues object
+    sccs        --  SCOP concise classification strings.  e.g. b.1.2.1
+    sunid       --  SCOP unique identifier for this domain
+    hierarchy   --  A sequence of tuples (nodetype, sunid) describing the
+                    location of this domain in the SCOP hierarchy.
+                    See the Scop module for a description of nodetypes.
+    """
+    def __init__(self, record=None):
+        self.sid = ''
+        self.residues = None 
+        self.sccs = ''
+        self.sunid =''
+        self.hierarchy = []
+        
+        if not record: return
+        
+        # Parse a tab-deliminated CLA record.
+        entry = record.rstrip()        # no trailing whitespace
+        columns = entry.split('\t')   # separate the tab-delineated cols
+        if len(columns) != 6:
+            raise ValueError("I don't understand the format of %s" % entry)
+
+        self.sid, pdbid, residues, self.sccs, self.sunid, hierarchy = columns
+        self.residues = Residues(residues)
+        self.residues.pdbid = pdbid
+        self.sunid = int(self.sunid)
+        
+        h = []
+        for ht in hierarchy.split(",") :
+            h.append( ht.split('='))        
+        for ht in h:
+            ht[1] = int(ht[1])
+        self.hierarchy = h
+    
+    def __str__(self):
+        s = []
+        s.append(self.sid)
+        s += str(self.residues).split(" ")
+        s.append(self.sccs)
+        s.append(self.sunid)
+
+        h=[]
+        for ht in self.hierarchy:
+             h.append("=".join(map(str,ht))) 
+        s.append(",".join(h))
+       
+        return "\t".join(map(str,s)) + "\n"
+
+    #@staticmethod
+    def records(cla_file):
+        """Iterates over a DOM file, generating DomRecords """
+        for line in cla_file:
+            if line[0] =='#':  continue  # A comment 
+            if line.isspace() : continue
+            yield ClaRecord(line)
+    records = staticmethod(records)
+
+# End ClaRecord
+   
+   
+
+
+class DomRecord(object):
+    """Handle the SCOP DOMain file.
+
+    The DOM file has been officially deprecated. For more information see
+    the SCOP"release notes.":http://scop.berkeley.edu/release-notes-1.55.html 
+    The DOM files for older releases can be found 
+    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
+
+    DOM records consist of 4 tab deliminated fields;
+    sid, pdbid, residues, hierarchy
+    For example ::
+    
+    d1sctg_ 1sct    g:      1.001.001.001.001.001
+    d1scth_ 1sct    h:      1.001.001.001.001.001
+    d1flp__ 1flp    -       1.001.001.001.001.002
+    d1moh__ 1moh    -       1.001.001.001.001.002
+
+    sid -- The SCOP ID of the entry, e.g. d1anu1
+    residues -- The domain definition as a Residues object
+    hierarchy -- A string specifying where this domain is in the hierarchy.
+    """
+    def __init__(self, record= None):
+        self.sid = ''
+        self.residues = []
+        self.hierarchy = ''
+        
+        if record:
+            entry = record.rstrip()  # no trailing whitespace
+            columns = entry.split("\t")  # separate the tab-delineated cols
+            if len(columns) != 4:
+                raise ValueError("I don't understand the format of %s" % entry)
+            self.sid, pdbid, res, self.hierarchy = columns
+            self.residues = Residues(res)
+            self.residues.pdbid = pdbid
+        
+    def __str__(self):
+        s = []
+        s.append(self.sid)
+        s.append(str(self.residues).replace(" ","\t") )
+        s.append(self.hierarchy)
+        return "\t".join(s) + "\n"
+        
+    #@staticmethod
+    def records(dom_file):
+        """Iterates over a DOM file, generating DomRecords """
+        for line in dom_file:
+            if line[0] =='#':  continue  # A comment 
+            if line.isspace() : continue
+            yield DomRecord(line)
+    records = staticmethod(records)
+# End DomRecord
+    
+
+    
+
+_pdbid_re = re.compile(r"^(\w\w\w\w)(?:$|\s+|_)(.*)")
+_fragment_re = re.compile(r"\(?(\w:)?(-?\w*)-?(-?\w*)\)?(.*)")
+
+class Residues(object) :
+    """A collection of residues from a PDB structure.
+
+    This class provides code to work with SCOP domain definitions. These
+    are concisely expressed as a one or more chain fragments. For example,
+    "(1bba A:10-20,B:)" indicates residue 10 through 20 (inclusive) of
+    chain A, and every residue of chain B in the pdb structure 1bba. The pdb
+    id and brackets are optional. In addition "-" indicates every residue of
+    a pbd structure with one unnamed chain.
+
+    Start and end residue ids consist of the residue sequence number and an
+    optional single letter insertion code. e.g. "12", "-1", "1a", "1000"
+
+
+    pdbid -- An optional PDB id, e.g. "1bba"
+    fragments -- A sequence of tuples (chainID, startResID, endResID)
+    """
+
+
+    def __init__(self, str=None) :
+        self.pdbid = ''
+        self.fragments = ()
+        if str is not None : self._parse(str)
+
+
+    def _parse(self, string):
+        string = string.strip()
+
+        #Is there a pdbid at the front? e.g. 1bba A:1-100
+        m = _pdbid_re.match(string)
+        if m is not None :
+            self.pdbid = m.group(1)
+            string = m.group(2) # Everything else
+
+        if string=='' or string == '-' or string=='(-)':  # no fragments, whole sequence
+            return
+    
+        fragments = []
+        for l in string.split(",") :
+            m = _fragment_re.match(l)
+            if m is None:
+                raise ValueError("I don't understand the format of %s" % l)
+            chain, start, end, postfix = m.groups()
+
+            if postfix != "" :
+                 raise ValueError("I don't understand the format of %s" % l )
+
+            if chain:
+                if chain[-1] != ':':
+                    raise ValueError("I don't understand the chain in %s" % l)
+                chain = chain[:-1]   # chop off the ':'
+            else :
+                chain ="" 
+            
+            fragments.append((chain, start, end))
+        self.fragments = tuple(fragments)
+            
+    def __str__(self):
+        prefix =""
+        if self.pdbid :
+            prefix =self.pdbid +' '
+            
+        if not self.fragments: return prefix+'-'
+        strs = []
+        for chain, start, end in self.fragments:
+            s = []
+            if chain: s.append("%s:" % chain)
+            if start: s.append("%s-%s" % (start, end))
+            strs.append("".join(s))
+        return prefix+ ",".join(strs)
+# End Residues
+    
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+