diff corebio/ssearch_io/blastxml.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/corebio/ssearch_io/blastxml.py	Thu Oct 27 12:09:09 2011 -0400
@@ -0,0 +1,249 @@
+
+#  Copyright (c) 2006 John Gilman
+#
+#  This software is distributed under the MIT Open Source License.
+#  <http://www.opensource.org/licenses/mit-license.html>
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a 
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
+#  THE SOFTWARE.
+
+
+"""Read BLAST XML output.
+
+The DTD is available at
+http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.mod.dtd
+
+"""
+
+# See also
+# 
+# http://bugzilla.open-bio.org/show_bug.cgi?id=1933
+#http://portal.open-bio.org/pipermail/biojava-dev/2004-December/002513.html
+
+
+from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment
+
+import xml.sax
+from xml.sax.handler import ContentHandler
+
+__all__ = ['read']  # must be a sequence of names, not a bare string
+
+def read(fin):
+    """Read BLAST XML output from `fin` and return the resulting Report.
+
+    Raises ValueError if the SAX parser reports a parse error.
+    """
+    parser = xml.sax.make_parser()
+    handler = _BlastHandler()
+    parser.setContentHandler(handler)
+    # To avoid "ValueError: unknown url type: NCBI_BlastOutput.dtd".
+    parser.setFeature(xml.sax.handler.feature_validation, 0)
+    parser.setFeature(xml.sax.handler.feature_namespaces, 0)
+    parser.setFeature(xml.sax.handler.feature_external_pes, 0)
+    parser.setFeature(xml.sax.handler.feature_external_ges, 0)
+    try:
+        parser.parse(fin)
+    except xml.sax.SAXParseException as e:
+        raise ValueError("Cannot parse file; " + str(e))
+    return handler.report
+
+class _BlastHandler(ContentHandler):
+    """SAX content handler that builds a Report from BLAST XML.
+
+    Iteration, Hit and Hsp start tags create a Result, Hit and Alignment
+    and append each one to its parent. Other elements are handled when
+    they close: the text buffered since the previous end tag is stripped,
+    converted where a converter is listed, and stored on the report,
+    result, hit or alignment being built. Element names found in none of
+    the tables below are ignored.
+
+    startDocument() creates the Report and keeps it in the `report`
+    attribute, where the caller picks it up after parsing.
+    """
+
+    # Element name -> (attribute of the report, converter or None).
+    _REPORT_FIELDS = {
+        'BlastOutput_program': ('algorithm', None),
+        'BlastOutput_reference': ('algorithm_reference', None),
+        'BlastOutput_db': ('database_name', None),
+    }
+
+    # Element name -> (attribute of result.query, converter or None).
+    _QUERY_FIELDS = {
+        'Iteration_query-ID': ('name', None),
+        'Iteration_query-def': ('description', None),
+        'Iteration_query-len': ('length', int),
+    }
+
+    # Element name -> (attribute of hit.target, converter or None).
+    _TARGET_FIELDS = {
+        'Hit_id': ('name', None),
+        'Hit_def': ('description', None),
+        'Hit_accession': ('accession', None),
+        'Hit_len': ('length', int),
+    }
+
+    # Element name -> (attribute of the alignment, converter or None).
+    # The two "-from" positions are stored as the XML value minus one.
+    _HSP_FIELDS = {
+        'Hsp_bit-score': ('bit_score', float),
+        'Hsp_score': ('raw_score', float),
+        'Hsp_evalue': ('significance', float),
+        'Hsp_query-from': ('query_start', lambda text: int(text) - 1),
+        'Hsp_hit-from': ('target_start', lambda text: int(text) - 1),
+        'Hsp_query-frame': ('query_frame', int),
+        'Hsp_hit-frame': ('target_frame', int),
+        'Hsp_identity': ('identical', int),
+        'Hsp_positive': ('similar', int),
+        'Hsp_gaps': ('gaps', int),
+        'Hsp_align-len': ('length', int),
+        'Hsp_qseq': ('query_seq', None),
+        'Hsp_hseq': ('target_seq', None),
+        'Hsp_midline': ('mid_seq', None),
+    }
+
+    # Keys of 'Parameters_<key>' elements, stored as text in
+    # report.parameters[key].
+    _PARAMETER_KEYS = (
+        'matrix',
+        'expect',
+        'include',
+        'sc-match',
+        'sc-mismatch',
+        'gap-open',
+        'gap-extend',
+        'filter',
+        'pattern',
+        'entrez-query',
+    )
+
+    # Keys of 'Statistics_<key>' elements -> converter applied before the
+    # value is stored in result.statistics[key].
+    _STATISTIC_TYPES = {
+        'db-num': int,
+        'db-len': int,
+        'hsp-len': int,
+        'eff-space': float,
+        'kappa': float,
+        'lambda': float,
+        'entropy': float,
+    }
+
+    # Element names that may occur in the input but are not stored:
+    #   BlastOutput_query-ID, BlastOutput_query-def, BlastOutput_query-len,
+    #   BlastOutput_query-seq, BlastOutput_param, BlastOutput_iterations,
+    #   BlastOutput_mbstat, Iteration_iter-num, Iteration_hits,
+    #   Iteration_stat, Iteration_message, Hit_num, Hit_hsps, Hsp_num,
+    #   Hsp_query-to, Hsp_hit-to, Hsp_pattern-from, Hsp_pattern-to,
+    #   Hsp_density.
+
+    def __init__(self):
+        """Create a handler with no report and no element in progress."""
+        ContentHandler.__init__(self)
+        self._content = []    # text chunks received since the last end tag
+        self.report = None    # replaced by a new Report in startDocument()
+        self._result = None   # Result of the most recently opened Iteration
+        self._hit = None      # Hit being built; None once its element closes
+        self._hsp = None      # Alignment being built; None once Hsp closes
+
+    def characters(self, ch):
+        """Buffer one chunk of character data until an element closes."""
+        self._content.append(ch)
+
+    def startDocument(self):
+        """Start a new, empty Report for the document being parsed."""
+        self.report = Report()
+
+    def endDocument(self):
+        """Do nothing; the report needs no finishing step."""
+        pass
+
+    def startElement(self, name, attr):
+        """Create the object for an Iteration, Hit or Hsp and link it in.
+
+        The attributes in `attr` are not used, and every other element
+        name is ignored here.
+        """
+        if name == 'Iteration':
+            self._result = Result()
+            self.report.results.append(self._result)
+        elif name == 'Hit':
+            self._hit = Hit()
+            self._result.hits.append(self._hit)
+        elif name == 'Hsp':
+            self._hsp = Alignment()
+            self._hit.alignments.append(self._hsp)
+
+    def endElement(self, name):
+        """Store the buffered text of the element that just closed.
+
+        Raises ValueError when text listed with an int or float converter
+        cannot be converted.
+        """
+        # Join the chunks delivered by characters() and empty the buffer so
+        # the next element starts clean.
+        content = ''.join(self._content).strip()
+        self._content = []
+
+        if name == 'Hit':
+            self._hit = None
+        elif name == 'Hsp':
+            self._hsp = None
+        elif name == 'BlastOutput_version':
+            # The version is the second whitespace-separated token. Fall
+            # back to the whole text instead of raising IndexError when
+            # the element holds fewer than two tokens.
+            tokens = content.split()
+            if len(tokens) > 1:
+                self.report.algorithm_version = tokens[1]
+            else:
+                self.report.algorithm_version = content
+        elif name in self._REPORT_FIELDS:
+            self._store(self.report, self._REPORT_FIELDS[name], content)
+        elif name in self._QUERY_FIELDS:
+            self._store(self._result.query, self._QUERY_FIELDS[name], content)
+        elif name in self._TARGET_FIELDS:
+            self._store(self._hit.target, self._TARGET_FIELDS[name], content)
+        elif name in self._HSP_FIELDS:
+            self._store(self._hsp, self._HSP_FIELDS[name], content)
+        else:
+            # Remaining stored elements are named <group>_<key>.
+            group, _, key = name.partition('_')
+            if group == 'Parameters' and key in self._PARAMETER_KEYS:
+                self.report.parameters[key] = content
+            elif group == 'Statistics' and key in self._STATISTIC_TYPES:
+                # NOTE(review): self._result is not cleared when an
+                # Iteration closes, so statistics met after the last
+                # Iteration would land on that Result - confirm intended.
+                convert = self._STATISTIC_TYPES[key]
+                self._result.statistics[key] = convert(content)
+
+    def _store(self, target, field, content):
+        """Set the attribute of `target` described by `field`.
+
+        `field` is an (attribute name, converter or None) table entry and
+        `content` is the stripped text of the element.
+        """
+        attribute, convert = field
+        # A None converter means the stripped text is stored unchanged.
+        if convert is not None:
+            content = convert(content)
+        setattr(target, attribute, content)
+                
+
+