diff corebio/ssearch_io/__init__.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/corebio/ssearch_io/__init__.py	Thu Oct 27 12:09:09 2011 -0400
@@ -0,0 +1,207 @@
+
+#  Copyright (c) 2006 John Gilman
+#
+#  This software is distributed under the MIT Open Source License.
+#  <http://www.opensource.org/licenses/mit-license.html>
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a 
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
+#  THE SOFTWARE.
+
+""" Parse the output of BLAST and similar sequence search analysis reports. 
+
+The result of a sequence database search is represented by the Report class.
+    o Each Report contains one or more results, one for each database query.
+    o Each Result contains one or more hits
+    o Each Hit may contain one or more Alignments (High-scoring Segment Pairs)
+
+CoreBio is often capable of guessing the correct format:
+>>> from corebio import ssearch_io
+>>> afile = open("test_corebio/data/ssearch/ssearch_out.txt")
+>>> report = ssearch_io.read(afile)
+>>> print report
+
+Alternatively, each report type has a separate module. Each module defines a
+read(fin) function that can parse that report format.
+
+>>> from corebio.ssearch_io import fasta
+>>> report = fasta.read( open("test_corebio/data/ssearch/ssearch_out.txt") )
+>>> print report
+
+Module          Application            Comments  
+---------------------------------------------------------------------------
+fasta           FASTA / SSEARCH     Default (-m 1) or compact (-m 9 -d 0)
+blastxml        NCBI Blast          NCBI XML format                 
+
+Status: Beta 
+"""
+# Dev. References :
+#   Inspired by BioPerl's SearchIO system
+#       http://www.bioperl.org/wiki/HOWTO:SearchIO
+
+__all__ = ['read', 'Report', 'Result', 
+            'Hit','Annotation', 'Alignment']
+
+
+from corebio.utils import stdrepr
+
+def read(fin) :
+    """ Read and parse an analysis report. 
+    
+    returns :
+        A database search Report.
+    raises :
+        ValueError - If the file cannot be parsed
+    """
+
+    import fasta    
+    import blastxml
+    parsers = (fasta, blastxml)
+    for p in parsers:
+        try:    
+            return p.read(fin)
+        except ValueError, e:
+            pass
+        fin.seek(0)             # FIXME: Non-seekable stdin?
+            
+    raise ValueError("Cannot parse sequence file: Tried fasta and blastxml")
+      
+
+
+class Report(object) :
+    """The results of a database search. The Report contains a list of 1 or more
+    Results, one for each query. Each query result contains a list of hits. 
+    Each Hit contains a list of HSP's (High scoring segment pairs).
+    
+    The structure of the report will vary somewhat depending on the source.
+    
+	 algorithm	          -- e.g. 'BLASTX'
+	 algorithm_version	  -- e.g. '2.2.4 [Aug-26-2002]'
+     algorithm_reference	  -- 	 
+	 database_name	      -- e.g. 'test.fa'
+	 database_letters	  -- number of residues in database e.g. 1291	 
+	 database_entries	  -- number of database entries
+
+     parameters           -- Dictionary of parameters used in search
+	 
+	 results              -- A list of Results, one per query
+	 """
+    __slots__ = ['algorithm', 'algorithm_version', 'algorithm_reference','database_name', 
+                'database_letters', 'database_entries', 'parameters', 'results']
+
+    def __init__(self) :
+        for name in self.__slots__ : setattr(self, name, None)
+        self.parameters = {}
+        self.results = []
+
+    def __repr__(self):
+        return stdrepr(self)
+
+
+class Result(object) :
+    """ The result from searching a database with a single query sequence.
+    
+    query        -- Information about the query sequence
+    statistics     -- A dictionary of search statistics
+    hits         -- A list of Hits
+    """
+    __slots__ = ['query', 'statistics', 'hits']
+
+    def __init__(self) :
+        for name in self.__slots__ : setattr(self, name, None)
+        self.query = Annotation() 
+        self.statistics = {}
+        self.hits = []
+
+    def __repr__(self):
+        return stdrepr(self)        
+
+        
+class Hit(object) :
+    """ A search hit between a query sequence and a subject sequence.
+    Each hit may have one or more Alignments
+    
+    target       -- Information about the target sequence. 
+    raw_score	 -- Typically the significance of the hit in bits, e.g. 92.0
+    significance -- Typically evalue. e.g. '2e-022' 
+    alignments   -- A list of alignments between query and target
+    """
+    __slots__ =['target', 'raw_score', 'bit_score', 'significance', 
+                'alignments']
+    def __init__(self) :
+        for name in self.__slots__ : setattr(self, name, None)
+        self.target      = Annotation()
+        self.alignments	 = []
+        
+    def __repr__(self):
+        return stdrepr(self) 
+
+class Annotation(object) :
+    """ Information about a subject or query sequence.
+    
+    name	     -- subject sequence name, e.g. '443893|124775'
+    description	 -- e.g.  'LaForas sequence'
+    length	     -- subject sequence length, e.g. 331
+    locus	     -- e.g. '124775'
+    accession	 -- e.g. '443893'
+    """
+    # Fixme: change into generic sequence annotation class?
+    __slots__ = ['name', 'description', 'length', 'locus', 'accession', ]
+
+    def __init__(self):
+        for name in self.__slots__ :
+            setattr(self, name, None)
+             
+    def __repr__(self):
+        return stdrepr(self) 
+
+class Alignment(object):
+    """An alignment between query and subject sequences. 
+    For BLAST, these are High scoring Segment pairs (HSPs)
+  
+    raw_score	     -- Typically significance of the hit in bits, e.g. 92.0
+    significance     -- Typically evalue. e.g. '2e-022' 
+
+    similar	          -- number of conserved residues #FIXME either frac or num
+    identical	      -- number of identical residues
+    gaps              -- number of gaps    
+    length            -- length of the alignment
+    
+    query_seq	      -- query string from alignment
+    target_seq	      -- hit string from alignment
+    mid_seq	          --
+    
+    query_start       --
+    query_frame       --
+
+    target_start      --
+    target_frame      --
+    
+    """
+    __slots__ = ['raw_score', 'bit_score', 'significance', 'similar',
+     'identical', 'gaps', 'length', 'query_seq', 'target_seq', 'mid_seq',
+      'query_start', 'query_frame', 'target_start', 
+      'target_frame']
+      
+    def __init__(self):
+        for name in self.__slots__ :
+            setattr(self, name, None)
+    
+    def __repr__(self):
+        return stdrepr(self)
+        
+                
\ No newline at end of file