comparison corebio/seq_io/fasta_io.py @ 7:8d676bbd1f2d

Uploaded
author davidmurphy
date Mon, 16 Jan 2012 07:03:36 -0500
parents c55bdc2fb9fa
children
comparison
equal deleted inserted replaced
6:4a4aca3d57c9 7:8d676bbd1f2d
1 #!/usr/bin/env python
2
3 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
4 #
5 # This software is distributed under the MIT Open Source License.
6 # <http://www.opensource.org/licenses/mit-license.html>
7 #
8 # Permission is hereby granted, free of charge, to any person obtaining a
9 # copy of this software and associated documentation files (the "Software"),
10 # to deal in the Software without restriction, including without limitation
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 # and/or sell copies of the Software, and to permit persons to whom the
13 # Software is furnished to do so, subject to the following conditions:
14 #
15 # The above copyright notice and this permission notice shall be included
16 # in all copies or substantial portions of the Software.
17 #
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 # THE SOFTWARE.
25 #
26
27 """Read and write sequence information in FASTA format.
28
29 This is a very common format for unannotated biological sequence data,
30 accepted by many multiple sequence alignment programs. Each sequence
31 consists of a single-line description, followed by lines of sequence data.
32 The first character of the description line is a greater-than (">") symbol
33 in the first column. The first word of the description is often the name or
34 ID of the sequence. Fasta files containing multiple sequences have one
35 sequence listed right after another.
36
37
38 Example Fasta File ::
39
40 >Lamprey GLOBIN V - SEA LAMPREY
41 PIVDTGSVA-P------------------LSAAEKTKIRSAWAPVYSTY---ETSGVDILVKFFTSTPAAQEFFPKFKGL
42 TT-----ADQLKKSA---DVRWHA-ERIINAVNDAVASMDDTEKMS--MKL-RDLSGKH----AKSFQV-----DPQYFK
43 VLAAVI-AD-TVAAGD--AGFEKLMSM------I---CILLR----S-----A-----Y------------
44 >Hagfish GLOBIN III - ATLANTIC HAGFISH
45 PITDHGQPP-T------------------LSEGDKKAIRESWPQIYKNF---EQNSLAVLLEFLKKFPKAQDSFPKFSAK
46 KS-------HLEQDP---AVKLQA-EVIINAVNHTIGLMDKEAAMK--KYL-KDLSTKH----STEFQV-----NPDMFK
47 ELSAVF-VS-TMG-GK--AAYEKLFSI------I---ATLLR----S-----T-----YDA----------
48 >Frog HEMOGLOBIN BETA CHAIN - EDIBLE FROG
49 ----------GS-----------------------DLVSGFWGKV--DA---HKIGGEALARLLVVYPWTQRYFTTFGNL
50 GSADAIC-----HNA---KVLAHG-EKVLAAIGEGLKHPENLKAHY--AKL-SEYHSNK----LHVDPANFRLLGNVFIT
51 VLARHF-QH-EFTPELQ-HALEAHFCA------V---GDALA----K-----A-----YH-----------
52
53
54 """
55 import re
56 from corebio.utils import *
57 from corebio.seq import *
58 from corebio.seq_io import *
59
60
61 names = ( 'fasta', 'pearson', 'fa')
62 extensions = ('fa', 'fasta', 'fast', 'seq', 'fsa', 'fst', 'nt', 'aa','fna','mpfa', 'faa', 'fnn','mfasta')
63
64
65 example = """
66 >Lamprey GLOBIN V - SEA LAMPREY
67 PIVDTGSVA-P------------------LSAAEKTKIRSAWAPVYSTY---ETSGVDILVKFFTSTPAAQEFFPKFKGL
68 TT-----ADQLKKSA---DVRWHA-ERIINAVNDAVASMDDTEKMS--MKL-RDLSGKH----AKSFQV-----DPQYFK
69 VLAAVI-AD-TVAAGD--AGFEKLMSM------I---CILLR----S-----A-----Y------------
70
71 >Hagfish GLOBIN III - ATLANTIC HAGFISH
72 PITDHGQPP-T------------------LSEGDKKAIRESWPQIYKNF---EQNSLAVLLEFLKKFPKAQDSFPKFSAK
73 KS-------HLEQDP---AVKLQA-EVIINAVNHTIGLMDKEAAMK--KYL-KDLSTKH----STEFQV-----NPDMFK
74 ELSAVF-VS-TMG-GK--AAYEKLFSI------I---ATLLR----S-----T-----YDA----------
75
76 >Frog HEMOGLOBIN BETA CHAIN - EDIBLE FROG
77 ----------GS-----------------------DLVSGFWGKV--DA---HKIGGEALARLLVVYPWTQRYFTTFGNL
78 GSADAIC-----HNA---KVLAHG-EKVLAAIGEGLKHPENLKAHY--AKL-SEYHSNK----LHVDPANFRLLGNVFIT
79 VLARHF-QH-EFTPELQ-HALEAHFCA------V---GDALA----K-----A-----YH-----------
80
81 """
82
83
84 def read(fin, alphabet=None):
85 """Read and parse a fasta file.
86
87 Args:
88 fin -- A stream or file to read
89 alphabet -- The expected alphabet of the data, if given
90 Returns:
91 SeqList -- A list of sequences
92 Raises:
93 ValueError -- If the file is unparsable
94 """
95 seqs = [ s for s in iterseq(fin, alphabet)]
96 name = names[0]
97 if hasattr(fin, "name") : name = fin.name
98 return SeqList(seqs, name=name)
99
100
101 def readseq(fin, alphabet=None) :
102 """Read one sequence from the file, starting
103 from the current file position."""
104 return iterseq(fin, alphabet).next()
105
106
107 def iterseq(fin, alphabet=None):
108 """ Parse a fasta file and generate sequences.
109
110 Args:
111 fin -- A stream or file to read
112 alphabet -- The expected alphabet of the data, if given
113 Yeilds:
114 Seq -- One alphabetic sequence at a time.
115 Raises:
116 ValueError -- If the file is unparsable
117 """
118 alphabet = Alphabet(alphabet)
119
120 seqs = []
121 comments = [] # FIXME: comments before first sequence are lost.
122 header = None
123 header_lineno = -1
124
125 def build_seq(seqs,alphabet, header, header_lineno,comments) :
126 try :
127 name = header.split(' ',1)[0]
128 if comments :
129 header += '\n' + '\n'.join(comments)
130 s = Seq( "".join(seqs), alphabet, name=name, description=header)
131 except ValueError:
132 raise ValueError(
133 "Parsed failed with sequence starting at line %d: "
134 "Character not in alphabet: %s" % (header_lineno, alphabet) )
135 return s
136
137 for lineno, line in enumerate(fin) :
138 line = line.strip()
139 if line == '' : continue
140 if line.startswith('>') :
141 if header is not None :
142 yield build_seq(seqs,alphabet, header, header_lineno, comments)
143 header = None
144 seqs = []
145 header = line[1:]
146 header_lineno = lineno
147 comments = []
148 elif line.startswith(';') :
149 # Optional (and unusual) comment line
150 comments.append(line[1:])
151 else :
152 if header is None :
153 raise ValueError (
154 "Parse failed on line %d: sequence before header"
155 % (lineno) )
156 seqs.append(line)
157
158 if not seqs: return
159 yield build_seq(seqs,alphabet, header, header_lineno, comments)
160
161
162 def write(fout, seqs):
163 """Write a fasta file.
164
165 Args:
166 fout -- A writable stream.
167 seqs -- A list of Seq's
168 """
169 if seqs.description :
170 for line in seqs.description.splitlines():
171 print >>fout, ';'+ line
172 for s in seqs :
173 writeseq(fout, s)
174
175
176 def writeseq(afile, seq):
177 """ Write a single sequence in fasta format.
178
179 Args:
180 afile -- A writable stream.
181 seq -- A Seq instance
182 """
183
184 header = seq.description or seq.name or ''
185
186 # We prepend '>' to the first header line
187 # Additional lines start with ';' to indicate comment lines
188 if header :
189 header = header.splitlines()
190 print >>afile, '>'+header[0]
191 if len(header) > 1 :
192 for h in header[1:] :
193 print >>afile, ';' +h
194 else :
195 print >>afile, '>'
196
197 L = len(seq)
198 line_length = 80
199 for n in range (1+ L/line_length) :
200 print >>afile, seq[n * line_length: (n+1) * line_length]
201 print >>afile
202
203
204 def index(afile, alphabet=None) :
205 """Return a FileIndex for the fasta file. Sequences can be retrieved
206 by item number or name.
207 """
208 def parser( afile) :
209 return readseq(afile, alphabet)
210
211 key = re.compile(r"^>\s*(\S*)")
212 def linekey( line):
213 k = key.search(line)
214 if k is None : return None
215 return k.group(1)
216
217 return FileIndex(afile, linekey, parser)
218
219
220
221
222
223
224
225
226
227
228
229
230