diff corebio/seq_io/genbank_io.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/corebio/seq_io/genbank_io.py	Thu Oct 27 12:09:09 2011 -0400
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+ 
+
+"""Read GenBank flat files. 
+
+Currently only reads sequence data and not annotations.
+
+"""
+from corebio.utils import *
+from corebio.seq import *
+
+  
+names = ( 'genbank',)  # presumably the format name(s) this reader is registered under -- TODO confirm
+extensions = ('gb','genbank', 'gbk')  # presumably filename extensions tied to this format -- TODO confirm
+
+
+
+def read(fin, alphabet=None):
+    """Parse every GenBank record in a stream and collect the sequences.
+
+    Args:
+    fin -- A stream or file to read
+    alphabet -- The expected alphabet of the data, if given
+
+    Returns:
+    SeqList -- The sequences, in the order iterseq() produces them
+
+    Raises:
+    ValueError -- If the file is unparsable
+    """
+    records = list(iterseq(fin, alphabet))
+    return SeqList(records)
+
+    
+def iterseq(fin, alphabet=None):
+    """Iterate over GenBank records, yielding the sequence of each one.
+    
+    Args:
+    fin -- A stream or file to read
+    alphabet -- The expected alphabet of the data, if given    
+    
+    Yields: 
+    Seq -- One sequence per record; empty if '//' is reached before ORIGIN.
+    
+    Raises: 
+    ValueError -- If the next non-blank line does not start a record ('LOCUS')
+    """
+    alphabet = Alphabet(alphabet)
+    # Fragments of the sequence currently being collected; joined at '//'.
+    seq = []
+    
+    def notblank(string) :
+        return not isblank(string)
+    # Reiterate is not defined here; only its filter() and index() are used.
+    lines = Reiterate(iter(fin))
+    # One record per pass. NOTE(review): no explicit exit -- presumably ends when
+    # lines.filter() runs out of input; an ORIGIN block lacking '//' yields nothing.
+    while True :
+        line = lines.filter( notblank )
+        if not line.startswith('LOCUS') :
+            raise ValueError(
+                "Cannot find start of record at line %d"% lines.index() )
+        # Presumably advances to the next ORIGIN or '//' line, skipping the rest.
+        line = lines.filter(lambda s : s.startswith('ORIGIN') 
+                                            or  s.startswith('//') )
+
+        if line.startswith('//') :
+            # No sequence data    
+            yield Seq( '', alphabet)
+        else:
+            for line in lines :
+                if line.startswith('//') :
+                    yield Seq( ''.join(seq), alphabet)
+                    seq = []
+                    break    
+                seq.extend( line.split()[1:] )  # drop 1st field (presumably the position number)
+       
+    
+        
+    
+
+
+     
+     
+     
\ No newline at end of file