Mercurial > repos > davidmurphy > codonlogo
diff corebio/seq_io/genbank_io.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python


"""Read GenBank flat files.

Currently only reads sequence data and not annotations.

"""
from corebio.utils import *
from corebio.seq import *


# Format names and file extensions advertised by this parser module.
names = ('genbank',)
extensions = ('gb', 'genbank', 'gbk')


def read(fin, alphabet=None):
    """Read and parse a file of genbank records.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Returns:
    SeqList -- A list of sequences

    Raises:
    ValueError -- If the file is unparsable
    """
    # Drain the record generator completely before building the SeqList, so
    # that any parse error surfaces here rather than lazily later on.
    return SeqList(list(iterseq(fin, alphabet)))


def iterseq(fin, alphabet=None):
    """Iterate over genbank records.

    Each record must begin with a 'LOCUS' line (blank lines before it are
    skipped) and end with a '//' line. Everything between 'LOCUS' and the
    'ORIGIN' line is ignored. After 'ORIGIN', the first whitespace-separated
    token of every line is discarded (presumably the base-position counter)
    and the remaining tokens are concatenated to form the sequence. A record
    that reaches '//' without an 'ORIGIN' line yields an empty sequence.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Yields:
    Seq -- One alphabetic sequence at a time.

    Raises:
    ValueError -- If the file is unparsable: a record does not start with
                  'LOCUS', or the input ends before the record's
                  terminating '//' line.
    """
    alphabet = Alphabet(alphabet)

    def notblank(string):
        return not isblank(string)

    lines = Reiterate(iter(fin))

    while True:
        # NOTE(review): Reiterate.filter is expected to raise StopIteration
        # once the input is exhausted -- confirm against corebio.utils.
        # Letting that exception escape a generator body is a RuntimeError
        # under PEP 479 (Python 3.7+), so end the iteration explicitly.
        try:
            line = lines.filter(notblank)
        except StopIteration:
            return

        if not line.startswith('LOCUS'):
            raise ValueError(
                "Cannot find start of record at line %d" % lines.index())

        # Skip the annotation section: advance to the start of the sequence
        # data ('ORIGIN') or to the end of the record ('//').
        try:
            line = lines.filter(
                lambda s: s.startswith('ORIGIN') or s.startswith('//'))
        except StopIteration:
            # Input ended inside a record; report it instead of silently
            # dropping the record.
            raise ValueError(
                "Unexpected end of file in record at line %d"
                % lines.index())

        if line.startswith('//'):
            # No sequence data
            yield Seq('', alphabet)
            continue

        # Fresh accumulator for every record, so nothing can leak from one
        # record into the next.
        seq = []
        for line in lines:
            if line.startswith('//'):
                break
            # Drop the first token of the line and keep the sequence chunks.
            seq.extend(line.split()[1:])
        else:
            # The input ran out before the closing '//' was seen.
            raise ValueError(
                "Unexpected end of file in record at line %d"
                % lines.index())

        yield Seq(''.join(seq), alphabet)