Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/fasta_io.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55bdc2fb9fa |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com> | |
4 # | |
5 # This software is distributed under the MIT Open Source License. | |
6 # <http://www.opensource.org/licenses/mit-license.html> | |
7 # | |
8 # Permission is hereby granted, free of charge, to any person obtaining a | |
9 # copy of this software and associated documentation files (the "Software"), | |
10 # to deal in the Software without restriction, including without limitation | |
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
12 # and/or sell copies of the Software, and to permit persons to whom the | |
13 # Software is furnished to do so, subject to the following conditions: | |
14 # | |
15 # The above copyright notice and this permission notice shall be included | |
16 # in all copies or substantial portions of the Software. | |
17 # | |
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
24 # THE SOFTWARE. | |
25 # | |
26 | |
27 """Read and write sequence information in FASTA format. | |
28 | |
29 This is a very common format for unannotated biological sequence data, | |
30 accepted by many multiple sequence alignment programs. Each sequence | |
31 consists of a single-line description, followed by lines of sequence data. | |
32 The first character of the description line is a greater-than (">") symbol | |
33 in the first column. The first word of the description is often the name or | |
34 ID of the sequence. Fasta files containing multiple sequences have one | |
35 sequence listed right after another. | |
36 | |
37 | |
38 Example Fasta File :: | |
39 | |
40 >Lamprey GLOBIN V - SEA LAMPREY | |
41 PIVDTGSVA-P------------------LSAAEKTKIRSAWAPVYSTY---ETSGVDILVKFFTSTPAAQEFFPKFKGL | |
42 TT-----ADQLKKSA---DVRWHA-ERIINAVNDAVASMDDTEKMS--MKL-RDLSGKH----AKSFQV-----DPQYFK | |
43 VLAAVI-AD-TVAAGD--AGFEKLMSM------I---CILLR----S-----A-----Y------------ | |
44 >Hagfish GLOBIN III - ATLANTIC HAGFISH | |
45 PITDHGQPP-T------------------LSEGDKKAIRESWPQIYKNF---EQNSLAVLLEFLKKFPKAQDSFPKFSAK | |
46 KS-------HLEQDP---AVKLQA-EVIINAVNHTIGLMDKEAAMK--KYL-KDLSTKH----STEFQV-----NPDMFK | |
47 ELSAVF-VS-TMG-GK--AAYEKLFSI------I---ATLLR----S-----T-----YDA---------- | |
48 >Frog HEMOGLOBIN BETA CHAIN - EDIBLE FROG | |
49 ----------GS-----------------------DLVSGFWGKV--DA---HKIGGEALARLLVVYPWTQRYFTTFGNL | |
50 GSADAIC-----HNA---KVLAHG-EKVLAAIGEGLKHPENLKAHY--AKL-SEYHSNK----LHVDPANFRLLGNVFIT | |
51 VLARHF-QH-EFTPELQ-HALEAHFCA------V---GDALA----K-----A-----YH----------- | |
52 | |
53 | |
54 """ | |
55 import re | |
56 from corebio.utils import * | |
57 from corebio.seq import * | |
58 from corebio.seq_io import * | |
59 | |
60 | |
61 names = ( 'fasta', 'pearson', 'fa') | |
62 extensions = ('fa', 'fasta', 'fast', 'seq', 'fsa', 'fst', 'nt', 'aa','fna','mpfa', 'faa', 'fnn','mfasta') | |
63 | |
64 | |
65 example = """ | |
66 >Lamprey GLOBIN V - SEA LAMPREY | |
67 PIVDTGSVA-P------------------LSAAEKTKIRSAWAPVYSTY---ETSGVDILVKFFTSTPAAQEFFPKFKGL | |
68 TT-----ADQLKKSA---DVRWHA-ERIINAVNDAVASMDDTEKMS--MKL-RDLSGKH----AKSFQV-----DPQYFK | |
69 VLAAVI-AD-TVAAGD--AGFEKLMSM------I---CILLR----S-----A-----Y------------ | |
70 | |
71 >Hagfish GLOBIN III - ATLANTIC HAGFISH | |
72 PITDHGQPP-T------------------LSEGDKKAIRESWPQIYKNF---EQNSLAVLLEFLKKFPKAQDSFPKFSAK | |
73 KS-------HLEQDP---AVKLQA-EVIINAVNHTIGLMDKEAAMK--KYL-KDLSTKH----STEFQV-----NPDMFK | |
74 ELSAVF-VS-TMG-GK--AAYEKLFSI------I---ATLLR----S-----T-----YDA---------- | |
75 | |
76 >Frog HEMOGLOBIN BETA CHAIN - EDIBLE FROG | |
77 ----------GS-----------------------DLVSGFWGKV--DA---HKIGGEALARLLVVYPWTQRYFTTFGNL | |
78 GSADAIC-----HNA---KVLAHG-EKVLAAIGEGLKHPENLKAHY--AKL-SEYHSNK----LHVDPANFRLLGNVFIT | |
79 VLARHF-QH-EFTPELQ-HALEAHFCA------V---GDALA----K-----A-----YH----------- | |
80 | |
81 """ | |
82 | |
83 | |
84 def read(fin, alphabet=None): | |
85 """Read and parse a fasta file. | |
86 | |
87 Args: | |
88 fin -- A stream or file to read | |
89 alphabet -- The expected alphabet of the data, if given | |
90 Returns: | |
91 SeqList -- A list of sequences | |
92 Raises: | |
93 ValueError -- If the file is unparsable | |
94 """ | |
95 seqs = [ s for s in iterseq(fin, alphabet)] | |
96 name = names[0] | |
97 if hasattr(fin, "name") : name = fin.name | |
98 return SeqList(seqs, name=name) | |
99 | |
100 | |
101 def readseq(fin, alphabet=None) : | |
102 """Read one sequence from the file, starting | |
103 from the current file position.""" | |
104 return iterseq(fin, alphabet).next() | |
105 | |
106 | |
107 def iterseq(fin, alphabet=None): | |
108 """ Parse a fasta file and generate sequences. | |
109 | |
110 Args: | |
111 fin -- A stream or file to read | |
112 alphabet -- The expected alphabet of the data, if given | |
113 Yeilds: | |
114 Seq -- One alphabetic sequence at a time. | |
115 Raises: | |
116 ValueError -- If the file is unparsable | |
117 """ | |
118 alphabet = Alphabet(alphabet) | |
119 | |
120 seqs = [] | |
121 comments = [] # FIXME: comments before first sequence are lost. | |
122 header = None | |
123 header_lineno = -1 | |
124 | |
125 def build_seq(seqs,alphabet, header, header_lineno,comments) : | |
126 try : | |
127 name = header.split(' ',1)[0] | |
128 if comments : | |
129 header += '\n' + '\n'.join(comments) | |
130 s = Seq( "".join(seqs), alphabet, name=name, description=header) | |
131 except ValueError: | |
132 raise ValueError( | |
133 "Parsed failed with sequence starting at line %d: " | |
134 "Character not in alphabet: %s" % (header_lineno, alphabet) ) | |
135 return s | |
136 | |
137 for lineno, line in enumerate(fin) : | |
138 line = line.strip() | |
139 if line == '' : continue | |
140 if line.startswith('>') : | |
141 if header is not None : | |
142 yield build_seq(seqs,alphabet, header, header_lineno, comments) | |
143 header = None | |
144 seqs = [] | |
145 header = line[1:] | |
146 header_lineno = lineno | |
147 comments = [] | |
148 elif line.startswith(';') : | |
149 # Optional (and unusual) comment line | |
150 comments.append(line[1:]) | |
151 else : | |
152 if header is None : | |
153 raise ValueError ( | |
154 "Parse failed on line %d: sequence before header" | |
155 % (lineno) ) | |
156 seqs.append(line) | |
157 | |
158 if not seqs: return | |
159 yield build_seq(seqs,alphabet, header, header_lineno, comments) | |
160 | |
161 | |
162 def write(fout, seqs): | |
163 """Write a fasta file. | |
164 | |
165 Args: | |
166 fout -- A writable stream. | |
167 seqs -- A list of Seq's | |
168 """ | |
169 if seqs.description : | |
170 for line in seqs.description.splitlines(): | |
171 print >>fout, ';'+ line | |
172 for s in seqs : | |
173 writeseq(fout, s) | |
174 | |
175 | |
176 def writeseq(afile, seq): | |
177 """ Write a single sequence in fasta format. | |
178 | |
179 Args: | |
180 afile -- A writable stream. | |
181 seq -- A Seq instance | |
182 """ | |
183 | |
184 header = seq.description or seq.name or '' | |
185 | |
186 # We prepend '>' to the first header line | |
187 # Additional lines start with ';' to indicate comment lines | |
188 if header : | |
189 header = header.splitlines() | |
190 print >>afile, '>'+header[0] | |
191 if len(header) > 1 : | |
192 for h in header[1:] : | |
193 print >>afile, ';' +h | |
194 else : | |
195 print >>afile, '>' | |
196 | |
197 L = len(seq) | |
198 line_length = 80 | |
199 for n in range (1+ L/line_length) : | |
200 print >>afile, seq[n * line_length: (n+1) * line_length] | |
201 print >>afile | |
202 | |
203 | |
204 def index(afile, alphabet=None) : | |
205 """Return a FileIndex for the fasta file. Sequences can be retrieved | |
206 by item number or name. | |
207 """ | |
208 def parser( afile) : | |
209 return readseq(afile, alphabet) | |
210 | |
211 key = re.compile(r"^>\s*(\S*)") | |
212 def linekey( line): | |
213 k = key.search(line) | |
214 if k is None : return None | |
215 return k.group(1) | |
216 | |
217 return FileIndex(afile, linekey, parser) | |
218 | |
219 | |
220 | |
221 | |
222 | |
223 | |
224 | |
225 | |
226 | |
227 | |
228 | |
229 | |
230 |