Mercurial > repos > davidmurphy > codonlogo
comparison corebio/resource/astral.py @ 7:8d676bbd1f2d
Uploaded
| author | davidmurphy |
|---|---|
| date | Mon, 16 Jan 2012 07:03:36 -0500 |
| parents | c55bdc2fb9fa |
| children |
comparison
equal
deleted
inserted
replaced
| 6:4a4aca3d57c9 | 7:8d676bbd1f2d |
|---|---|
| 1 | |
| 2 # Copyright 2000 by Jeffrey Chang. All rights reserved. | |
| 3 # Copyright 2001 by Gavin E. Crooks. All rights reserved. | |
| 4 # Modifications Copyright 2004/2005 James Casbon. | |
| 5 # Copyright 2005 by Regents of the University of California. All rights reserved | |
| 6 # (Major rewrite for conformance to corebio. Gavin Crooks) | |
| 7 # | |
| 8 # This code is derived from the Biopython distribution and is governed by it's | |
| 9 # license. Please see the LICENSE file that should have been included | |
| 10 # as part of this package. | |
| 11 | |
| 12 """ ASTRAL dataset IO. | |
| 13 | |
| 14 From http://astral.berkeley.edu/ : | |
| 15 | |
| 16 The ASTRAL Compendium for Sequence and Structure Analysis | |
| 17 | |
| 18 The ASTRAL compendium provides databases and tools useful for analyzing protein structures and their sequences. It is partially derived from, and augments the SCOP: Structural Classification of Proteins database. Most of the resources depend upon the coordinate files maintained and distributed by the Protein Data Bank. | |
| 19 | |
| 20 * Classes : | |
| 21 - Raf -- A file ofASTRAL RAF (Rapid Access Format) Sequence Maps. | |
| 22 - RafSeqMap -- A sequence map, a RAF record. | |
| 23 - Res -- A single residue mapping from a RAF record. | |
| 24 | |
| 25 * Functions : | |
| 26 - parse_domain -- Convert an ASTRAL fasta header string into a Scop domain. | |
| 27 - normalize_letters -- Bormalize RAF amino acid codes. | |
| 28 | |
| 29 """ | |
| 30 | |
| 31 # TODO : Need to pull more of James Casbon's Astral code. | |
| 32 | |
| 33 import re | |
| 34 from copy import copy | |
| 35 | |
| 36 from corebio.resource.scop import Domain, Residues | |
| 37 from corebio.data import extended_three_to_one as to_one_letter_code | |
| 38 from corebio.utils import FileIndex | |
| 39 | |
| 40 __all__ = ('astral_evalues', 'astral_percent_identities', | |
| 41 'astral_evalues_filenames', 'normalize_letters', 'parse_domain', | |
| 42 'Raf', 'RafSeqMap', 'Res') | |
| 43 | |
| 44 # Percentage identity filtered ASTRAL SCOP genetic domain sequence subset | |
| 45 astral_percent_identities = [10,20,25,30,35,40,50,70,90,95,100] | |
| 46 | |
| 47 # E-value filtered ASTRAL SCOP genetic domain sequence subsets, based on PDB SEQRES records. | |
| 48 astral_evalues = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15,1e-20, 1e-25, 1e-50] | |
| 49 | |
| 50 # A map between evalues and astral filename suffixes. | |
| 51 astral_evalues_filenames = { | |
| 52 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', | |
| 53 0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', | |
| 54 1e-4: 'e-4', 1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', | |
| 55 1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' } | |
| 56 | |
| 57 | |
| 58 | |
| 59 def normalize_letters(one_letter_code) : | |
| 60 """Convert RAF one-letter amino acid codes into IUPAC standard codes. | |
| 61 Letters are uppercased, and "." ("Unknown") is converted to "X". | |
| 62 """ | |
| 63 if one_letter_code == '.' : | |
| 64 return 'X' | |
| 65 else : | |
| 66 return one_letter_code.upper() | |
| 67 | |
| 68 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") | |
| 69 def parse_domain(str) : | |
| 70 """Convert an ASTRAL fasta header string into a Scop domain. | |
| 71 | |
| 72 An ASTRAL (http://astral.stanford.edu/) header contains a concise | |
| 73 description of a SCOP domain. A very similar format is used when a | |
| 74 Domain object is converted into a string. The Domain returned by this | |
| 75 method contains most of the SCOP information, but it will not be located | |
| 76 within the SCOP hierarchy (i.e. The parent node will be None). The | |
| 77 description is composed of the SCOP protein and species descriptions. | |
| 78 | |
| 79 A typical ASTRAL header looks like -- | |
| 80 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} | |
| 81 """ | |
| 82 | |
| 83 m = _domain_re.match(str) | |
| 84 if (not m) : raise ValueError("Domain: "+ str) | |
| 85 | |
| 86 dom = Domain() | |
| 87 dom.sid = m.group(1) | |
| 88 dom.sccs = m.group(2) | |
| 89 dom.residues = Residues(m.group(3)) | |
| 90 if not dom.residues.pdbid : | |
| 91 dom.residues.pdbid= dom.sid[1:5] | |
| 92 dom.description = m.group(4).strip() | |
| 93 | |
| 94 return dom | |
| 95 | |
| 96 | |
| 97 class Raf(FileIndex) : | |
| 98 """ASTRAL RAF (Rapid Access Format) Sequence Maps. | |
| 99 | |
| 100 The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES | |
| 101 records (representing the sequence of the molecule used in an experiment) | |
| 102 and the ATOM records (representing the atoms experimentally observed). | |
| 103 | |
| 104 This data is derived from the Protein Data Bank CIF files. Known errors in | |
| 105 the CIF files are corrected manually, with the original PDB file serving as | |
| 106 the final arbiter in case of discrepancies. | |
| 107 | |
| 108 Residues are referenced by residue ID. This consists of a the PDB residue | |
| 109 sequence number (up to 4 digits) and an optional PDB insertion code (an | |
| 110 ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1" | |
| 111 | |
| 112 See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html | |
| 113 | |
| 114 The RAF file itself is about 50 MB. Each line consists of a sequence map of | |
| 115 a different protein chain. This index provides rapid, random | |
| 116 access of RAF records without having to load the entire file into memory. | |
| 117 | |
| 118 This class does not load the entire RAF file into memory. Instead, it | |
| 119 reads the file once, noting the location and content of each RafSeqMap. | |
| 120 The index key is a concatenation of the PDB ID and chain ID. e.g | |
| 121 "2drcA", "155c_". RAF uses an underscore to indicate blank | |
| 122 chain IDs. Custom maps of subsequences or spanning multiple chains, can | |
| 123 be constructed with the get_seqmap method. | |
| 124 | |
| 125 """ | |
| 126 def __init__(self, raf_file) : | |
| 127 def linekey(line) : | |
| 128 if not line or len(line)<5 or line.isspace() or line[0]=='#': | |
| 129 return None | |
| 130 return line[0:5] | |
| 131 def parser( f) : return RafSeqMap(f.readline()) | |
| 132 | |
| 133 FileIndex.__init__(self, raf_file, linekey, parser) | |
| 134 | |
| 135 | |
| 136 def get_seqmap(self, residues) : | |
| 137 """Get the sequence map for a collection of residues. | |
| 138 | |
| 139 residues -- A SCOP style description of a collection of residues from a | |
| 140 PDB strucure, (e.g. '(1bba A:10-20,B:)'), as a string or a | |
| 141 scop.Residues instance. | |
| 142 """ | |
| 143 if type(residues)== str : | |
| 144 residues = Residues(residues) | |
| 145 | |
| 146 pdbid = residues.pdbid | |
| 147 frags = residues.fragments | |
| 148 if not frags: frags =(('_','',''),) # All residues of unnamed chain | |
| 149 | |
| 150 seqMap = None | |
| 151 for frag in frags : | |
| 152 chainid = frag[0] | |
| 153 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_': | |
| 154 chainid = '_' | |
| 155 sid = pdbid + chainid | |
| 156 | |
| 157 sm = self[sid] | |
| 158 | |
| 159 # Cut out fragment of interest | |
| 160 start = 0 | |
| 161 end = len(sm.res) | |
| 162 if frag[1] : start = int(sm.index(frag[1], chainid)) | |
| 163 if frag[2] : end = int(sm.index(frag[2], chainid)+1) | |
| 164 | |
| 165 sm = sm[start:end] | |
| 166 | |
| 167 if seqMap is None : | |
| 168 seqMap = sm | |
| 169 else : | |
| 170 seqMap += sm | |
| 171 | |
| 172 return seqMap | |
| 173 # End Raf | |
| 174 | |
| 175 class RafSeqMap(object) : | |
| 176 """ASTRAL RAF (Rapid Access Format) Sequence Maps. | |
| 177 | |
| 178 RafSeqMap is a list like object; You can find the location of particular | |
| 179 residues with index(), slice this RafSeqMap into fragments, and glue | |
| 180 fragments back together with extend(). | |
| 181 | |
| 182 - pdbid -- The PDB 4 character ID | |
| 183 - pdb_datestamp -- From the PDB file | |
| 184 - version -- The RAF format version. e.g. 0.01 | |
| 185 - flags -- RAF flags. (See release notes for more information.) | |
| 186 - res -- A list of Res objects, one for each residue in this sequence map | |
| 187 """ | |
| 188 | |
| 189 def __init__(self, raf_record=None) : | |
| 190 """Parses a RAF record into a RafSeqMap object.""" | |
| 191 | |
| 192 self.pdbid = '' | |
| 193 self.pdb_datestamp = '' | |
| 194 self.version = '' | |
| 195 self.flags = '' | |
| 196 self.res = [] | |
| 197 | |
| 198 if not raf_record : return | |
| 199 | |
| 200 header_len = 38 | |
| 201 line = raf_record.rstrip() # no trailing whitespace | |
| 202 | |
| 203 if len(line)<header_len: | |
| 204 raise ValueError("Incomplete header: "+line) | |
| 205 | |
| 206 self.pdbid = line[0:4] | |
| 207 chainid = line[4:5] | |
| 208 | |
| 209 self.version = line[6:10] | |
| 210 | |
| 211 # Raf format versions 0.01 and 0.02 are identical for practical purposes | |
| 212 if(self.version != "0.01" and self.version !="0.02") : | |
| 213 raise ValueError("Incompatible RAF version: "+self.version) | |
| 214 | |
| 215 self.pdb_datestamp = line[14:20] | |
| 216 self.flags = line[21:27] | |
| 217 | |
| 218 for i in range(header_len, len(line), 7) : | |
| 219 f = line[i : i+7] | |
| 220 if len(f)!=7: | |
| 221 raise ValueError("Corrupt Field: ("+f+")" ) | |
| 222 r = Res() | |
| 223 r.chainid = chainid | |
| 224 r.resid = f[0:5].strip() | |
| 225 r.atom = normalize_letters(f[5:6]) | |
| 226 r.seqres = normalize_letters(f[6:7]) | |
| 227 | |
| 228 self.res.append(r) | |
| 229 # end __init__ | |
| 230 | |
| 231 #@staticmethod | |
| 232 def records(raf_file) : | |
| 233 """Iterates over a Raf file, generating RafSeqMaps """ | |
| 234 for line in raf_file: | |
| 235 if line[0] =='#': continue # A comment | |
| 236 if line.isspace() : continue | |
| 237 yield RafSeqMap(line) | |
| 238 records = staticmethod(records) | |
| 239 | |
| 240 def index(self, resid, chainid="_") : | |
| 241 for i in range(0, len(self.res)) : | |
| 242 if self.res[i].resid == resid and self.res[i].chainid == chainid : | |
| 243 return i | |
| 244 raise KeyError("No such residue "+chainid+resid) | |
| 245 | |
| 246 def __getslice__(self, i, j) : | |
| 247 s = copy(self) | |
| 248 s.res = s.res[i:j] | |
| 249 return s | |
| 250 | |
| 251 def append(self, res) : | |
| 252 """Append another Res object onto the list of residue mappings.""" | |
| 253 self.res.append(res) | |
| 254 | |
| 255 def extend(self, other) : | |
| 256 """Append another RafSeqMap onto the end of self. | |
| 257 | |
| 258 Both RafSeqMaps must have the same PDB ID, PDB datestamp and | |
| 259 RAF version. The RAF flags are erased if they are inconsistent. This | |
| 260 may happen when fragments are taken from different chains. | |
| 261 """ | |
| 262 if not isinstance(other, RafSeqMap): | |
| 263 raise TypeError("Can only extend a RafSeqMap with a RafSeqMap.") | |
| 264 if self.pdbid != other.pdbid : | |
| 265 raise TypeError("Cannot add fragments from different proteins.") | |
| 266 if self.version != other.version : | |
| 267 raise TypeError("Incompatible rafs.") | |
| 268 if self.pdb_datestamp != other.pdb_datestamp : | |
| 269 raise TypeError("Different pdb dates!") | |
| 270 if self.flags != other.flags : | |
| 271 self.flags = '' | |
| 272 self.res += other.res | |
| 273 | |
| 274 def __iadd__(self, other) : | |
| 275 self.extend(other) | |
| 276 return self | |
| 277 | |
| 278 def __add__(self, other) : | |
| 279 s = copy(self) | |
| 280 s.extend(other) | |
| 281 return s | |
| 282 | |
| 283 def extract_atoms(self, pdb_handle, out_handle) : | |
| 284 """Extract all relevant ATOM and HETATOM records from a PDB file. | |
| 285 | |
| 286 The PDB file is scanned for ATOM and HETATOM records. If the | |
| 287 chain ID, residue ID (seqNum and iCode), and residue type match | |
| 288 a residue in this sequence map, then the record is echoed to the | |
| 289 output handle. | |
| 290 | |
| 291 This is typically used to find the coordinates of a domain, or other | |
| 292 residue subset. | |
| 293 | |
| 294 pdb_file -- A handle to the relevant PDB file. | |
| 295 out_file -- All output is written to this stream. | |
| 296 """ | |
| 297 resSet = {} | |
| 298 for r in self.res : | |
| 299 if r.atom=='X' : # Unknown residue type | |
| 300 continue | |
| 301 chainid = r.chainid | |
| 302 if chainid == '_': | |
| 303 chainid = ' ' | |
| 304 resid = r.resid | |
| 305 resSet[(chainid,resid)] = r | |
| 306 | |
| 307 resFound = {} | |
| 308 for line in pdb_handle : | |
| 309 if line.startswith("ATOM ") or line.startswith("HETATM") : | |
| 310 chainid = line[21:22] | |
| 311 resid = line[22:27].strip() | |
| 312 key = (chainid, resid) | |
| 313 if key in resSet: | |
| 314 res = resSet[key] | |
| 315 atom_aa = res.atom | |
| 316 resName = line[17:20].capitilize() | |
| 317 if resName in to_one_letter_code : | |
| 318 if to_one_letter_code[resName] == atom_aa : | |
| 319 out_handle.write(line) | |
| 320 resFound[key] = res | |
| 321 | |
| 322 if len(resSet) != len(resFound) : | |
| 323 raise RuntimeError('I could not find at least one ATOM or ' | |
| 324 'HETATM record for each and every residue in this sequence map.') | |
| 325 | |
| 326 | |
| 327 class Res(object) : | |
| 328 """ A single residue mapping from a RAF record. | |
| 329 | |
| 330 - chainid -- A single character chain ID. | |
| 331 - resid -- The residue ID. | |
| 332 - atom -- amino acid one-letter code from ATOM records. | |
| 333 - seqres -- amino acid one-letter code from SEQRES records. | |
| 334 """ | |
| 335 def __init__(self) : | |
| 336 self.chainid = '' | |
| 337 self.resid = '' | |
| 338 self.atom = '' | |
| 339 self.seqres = '' | |
| 340 | |
| 341 |
