comparison corebio/seq_io/genbank_io.py @ 0:c55bdc2fb9fa

Uploaded
author davidmurphy
date Thu, 27 Oct 2011 12:09:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c55bdc2fb9fa
1 #!/usr/bin/env python
2
3
4 """Read GenBank flat files.
5
6 Currently only reads sequence data and not annotations.
7
8 """
9 from corebio.utils import *
10 from corebio.seq import *
11
12
13 names = ( 'genbank',)  # Name(s) for this format -- presumably used by the seq_io format registry; TODO confirm
14 extensions = ('gb','genbank', 'gbk')  # Filename extensions associated with this format (no leading dot)
15
16
17
def read(fin, alphabet=None):
    """Parse every genbank record in a file and collect the sequences.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given

    Returns:
        SeqList -- A list of sequences

    Raises:
        ValueError -- If the file is unparsable
    """
    # Drain the record iterator completely before wrapping the result.
    records = list(iterseq(fin, alphabet))
    return SeqList(records)
33
34
def iterseq(fin, alphabet=None):
    """Iterate over genbank records.

    Only the sequence data of each record is read; annotations are skipped.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data, if given

    Yields:
        Seq -- One alphabetic sequence at a time.

    Raises:
        ValueError -- If the file is unparsable
    """
    alphabet = Alphabet(alphabet)

    def notblank(string):
        return not isblank(string)

    def record_boundary(string):
        # 'ORIGIN' opens the sequence section; '//' terminates the record.
        return string.startswith(('ORIGIN', '//'))

    lines = Reiterate(iter(fin))

    while True:
        # Running out of input is the only way this loop ends. The exhausted
        # iterator is assumed to signal that with StopIteration (TODO confirm
        # against Reiterate.filter). Catch it and return explicitly: letting
        # StopIteration escape a generator body is a RuntimeError under
        # PEP 479 (Python 3.7+).
        try:
            line = lines.filter(notblank)
        except StopIteration:
            return

        if not line.startswith('LOCUS'):
            raise ValueError(
                "Cannot find start of record at line %d" % lines.index())

        # Skip the header/annotation lines of the record.
        try:
            line = lines.filter(record_boundary)
        except StopIteration:
            # Input ended before the record's sequence section or terminator.
            return

        if line.startswith('//'):
            # No sequence data
            yield Seq('', alphabet)
            continue

        # Sequence lines look like "<position> <chunk> <chunk> ..."; drop the
        # leading position field and keep the residue chunks.
        seq = []
        for line in lines:
            if line.startswith('//'):
                yield Seq(''.join(seq), alphabet)
                break
            seq.extend(line.split()[1:])
        # NOTE: if the input ends before '//', the partial sequence is
        # dropped and the next pass through the loop ends the iteration
        # (original behaviour, kept).
77
78
79
80
81
82
83
84
85