0
|
1
|
|
2 # Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
|
|
3 # Copyright (c) 2006, The Regents of the University of California, through
|
|
4 # Lawrence Berkeley National Laboratory (subject to receipt of any required
|
|
5 # approvals from the U.S. Dept. of Energy). All rights reserved.
|
|
6
|
|
7 # This software is distributed under the new BSD Open Source License.
|
|
8 # <http://www.opensource.org/licenses/bsd-license.html>
|
|
9 #
|
|
10 # Redistribution and use in source and binary forms, with or without
|
|
11 # modification, are permitted provided that the following conditions are met:
|
|
12 #
|
|
13 # (1) Redistributions of source code must retain the above copyright notice,
|
|
14 # this list of conditions and the following disclaimer.
|
|
15 #
|
|
16 # (2) Redistributions in binary form must reproduce the above copyright
|
|
17 # notice, this list of conditions and the following disclaimer in the
|
|
18 # documentation and or other materials provided with the distribution.
|
|
19 #
|
|
20 # (3) Neither the name of the University of California, Lawrence Berkeley
|
|
21 # National Laboratory, U.S. Dept. of Energy nor the names of its contributors
|
|
22 # may be used to endorse or promote products derived from this software
|
|
23 # without specific prior written permission.
|
|
24 #
|
|
25 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
26 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
27 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
28 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
29 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
30 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
31 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
32 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
33 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
34 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
35 # POSSIBILITY OF SUCH DAMAGE.
|
|
36
|
|
37
|
|
38
|
|
39
|
|
40
|
|
41 """ Sequence file reading and writing.
|
|
42
|
|
43 Biological sequence data is stored and transmitted using a wide variety of
|
|
44 different file formats. This package provides convenient methods to read and
|
|
45 write several of these file formats.
|
|
46
|
|
47 CoreBio is often capable of guessing the correct file type, either from the
|
|
48 file extension or the structure of the file:
|
|
49 >>> import corebio.seq_io
|
|
50 >>> afile = open("test_corebio/data/cap.fa")
|
|
51 >>> seqs = corebio.seq_io.read(afile)
|
|
52
|
|
53 Alternatively, each sequence file type has a separate module named FILETYPE_io
|
|
54 (e.g. fasta_io, clustal_io).
|
|
55 >>> import corebio.seq_io.fasta_io
|
|
56 >>> afile = open("test_corebio/data/cap.fa")
|
|
57 >>> seqs = corebio.seq_io.fasta_io.read( afile )
|
|
58
|
|
59 Sequence data can also be written back to files:
|
|
60 >>> fout = open("out.fa", "w")
|
|
61 >>> corebio.seq_io.fasta_io.write( fout, seqs )
|
|
62
|
|
63
|
|
64 Supported File Formats
|
|
65 ----------------------
|
|
66
|
|
67 Module Name Extension read write features
|
|
68 ---------------------------------------------------------------------------
|
|
69 array_io array, flatfile yes yes none
|
|
70 clustal_io clustalw aln yes yes
|
|
71 fasta_io fasta, Pearson fa yes yes none
|
|
72 genbank_io genbank gb yes
|
|
73 intelligenetics_io intelligenetics ig yes yes
|
|
74 msf_io msf msf yes
|
|
75 nbrf_io nbrf, pir pir yes
|
|
76 nexus_io nexus nexus yes
|
|
77 phylip_io phylip phy yes
|
|
78 plain_io plain, raw txt yes yes none
|
|
79 table_io table tbl yes yes none
|
|
80
|
|
81 Each IO module defines one or more of the following functions and variables:
|
|
82
|
|
83 read(afile, alphabet=None)
|
|
84 Read a file of sequence data and return a SeqList, a collection
|
|
85 of Seq's (Alphabetic strings) and features.
|
|
86
|
|
87 read_seq(afile, alphabet=None)
|
|
88 Read a single sequence from a file.
|
|
89
|
|
90 iter_seq(afile, alphabet =None)
|
|
91 Iterate over the sequences in a file.
|
|
92
|
|
93 index(afile, alphabet = None)
|
|
94 Instead of loading all of the sequences into memory, scan the file and
|
|
95 return an index map that will load sequences on demand. Typically not
|
|
96 implemented for formats with interleaved sequences.
|
|
97
|
|
98 write(afile, seqlist)
|
|
99 Write a collection of sequences to the specified file.
|
|
100
|
|
101 write_seq(afile, seq)
|
|
102 Write one sequence to the file. Only implemented for non-interleaved,
|
|
103 headerless formats, such as fasta and plain.
|
|
104
|
|
105 example
|
|
106 A string containing a short example of the file format
|
|
107
|
|
108 names
|
|
109 A list of synonyms for the file format. e.g. for fasta_io, ( 'fasta',
|
|
110 'pearson', 'fa'). The first entry is the preferred format name.
|
|
111
|
|
112 extensions
|
|
113 A list of file name extensions used for this file format. e.g.
|
|
114 fasta_io.extensions is ('fa', 'fasta', 'fast', 'seq', 'fsa', 'fst', 'nt',
|
|
115 'aa','fna','mpfa'). The preferred or standard extension is first in the
|
|
116 list.
|
|
117
|
|
118
|
|
119 Attributes :
|
|
120 - formats -- Available seq_io format parsers
|
|
121 - format_names -- A map between format names and format parsers.
|
|
122 - format_extensions -- A map between filename extensions and parsers.
|
|
123
|
|
124 """
|
|
125
|
|
126 # Dev. References :
|
|
127 #
|
|
128 # - http://iubio.bio.indiana.edu/soft/molbio/readseq/java/Readseq2-help.html
|
|
129 # - http://www.ebi.ac.uk/help/formats_frame.html
|
|
130 # - http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html
|
|
131 # - http://bioperl.org/HOWTOs/html/SeqIO.html
|
|
132 # - http://emboss.sourceforge.net/docs/themes/SequenceFormats.html
|
|
133 # - http://www.cse.ucsc.edu/research/compbio/a2m-desc.html (a2m)
|
|
134 # - http://www.genomatix.de/online_help/help/sequence_formats.html
|
|
135
|
|
136 from corebio.seq import *
|
|
137
|
|
138 import clustal_io
|
|
139 import fasta_io
|
|
140 import msf_io
|
|
141 import nbrf_io
|
|
142 import nexus_io
|
|
143 import plain_io
|
|
144 import phylip_io
|
|
145 #import null_io
|
|
146 import stockholm_io
|
|
147 import intelligenetics_io
|
|
148 import table_io
|
|
149 import array_io
|
|
150 import genbank_io
|
|
151
|
|
# Public names of the seq_io package: the per-format IO modules first,
# followed by the package-level reader and the format lookup helpers.
# NOTE(review): "null_io" is listed here although its import in this file is
# commented out -- confirm that the module is still meant to be exported.
__all__ = [
    # Format modules
    "clustal_io",
    "fasta_io",
    "msf_io",
    "nbrf_io",
    "nexus_io",
    "plain_io",
    "phylip_io",
    "null_io",
    "stockholm_io",
    "intelligenetics_io",
    "table_io",
    "array_io",
    "genbank_io",
    # Package-level functions and data
    "read",
    "formats",
    "format_names",
    "format_extensions",
]
|
|
171
|
|
# Every format module handled by this package. format_names() and
# format_extensions() build their lookup tables by walking this tuple.
formats = (
    clustal_io,
    fasta_io,
    plain_io,
    msf_io,
    genbank_io,
    nbrf_io,
    nexus_io,
    phylip_io,
    stockholm_io,
    intelligenetics_io,
    table_io,
    array_io,
)
"""Available seq_io formats"""
|
|
174
|
|
175
|
|
def format_names():
    """Build a lookup table from format name to format module.

    Each module in ``formats`` contributes every entry of its ``names``
    attribute as a key that maps back to that module.

    returns :
        dict - format name -> format module
    raises :
        AssertionError - If the same name is registered twice (only
                         checked while assertions are enabled).
    """
    name_map = {}
    for module in formats:
        for alias in module.names:
            # Sanity check: a format name must identify exactly one module.
            assert alias not in name_map
            name_map[alias] = module
    return name_map
|
|
185
|
|
def format_extensions():
    """Build a lookup table from filename extension to format module.

    Each module in ``formats`` contributes every entry of its ``extensions``
    attribute as a key that maps back to that module.

    returns :
        dict - filename extension -> format module
    raises :
        AssertionError - If the same extension is registered twice (only
                         checked while assertions are enabled).
    """
    extension_map = {}
    for module in formats:
        for suffix in module.extensions:
            # Sanity check: an extension must identify exactly one module.
            assert suffix not in extension_map
            extension_map[suffix] = module
    return extension_map
|
|
195
|
|
196
|
|
# Ordered fallback list of parsers, tried one after another on files whose
# format is unknown. For this to work, each parser has to raise an exception
# when it is handed a file in one of the formats listed after it.
#
# Roughly ordered from the most to the least common file format, with two
# exceptions: 'nbrf_io' comes before 'fasta_io' because nbrf looks like fasta
# with extras, and 'array_io' comes last because it is very general.
_parsers = (
    nbrf_io,
    fasta_io,
    clustal_io,
    phylip_io,
    genbank_io,
    stockholm_io,
    msf_io,
    nexus_io,
    table_io,
    array_io,
)
|
|
205
|
|
206
|
|
def _get_parsers(fin):
    """Return the parsers to try on ``fin``, most promising first.

    The result is a fresh list holding the modules of ``_parsers``. When
    ``fin`` has a ``name`` containing a '.', the text after the last '.' is
    looked up first among the format names and then among the filename
    extensions; a match is moved (or added) to the front of the list.
    Without a match the order of ``_parsers`` is kept.
    """
    by_name = format_names()
    by_extension = format_extensions()
    ordered = list(_parsers)
    preferred = ordered[0]

    # If a filename is available, let its suffix pick the first parser.
    if hasattr(fin, "name") and '.' in fin.name:
        suffix = fin.name.split('.')[-1]
        # Format names take precedence over filename extensions.
        for table in (by_name, by_extension):
            if suffix in table:
                preferred = table[suffix]
                break

    # The preferred parser may not be part of the fallback list at all;
    # either way it ends up at the front exactly once.
    if preferred in ordered:
        ordered.remove(preferred)
    return [preferred] + ordered
|
|
228
|
|
229
|
|
230
|
|
def read(fin, alphabet=None):
    """ Read a sequence file and attempt to guess its format.

    First the filename extension (if available) is used to infer the format.
    If that fails, then we attempt to parse the file using several common
    formats, rewinding the file between attempts.

    arguments :
        fin      - A file-like object. It must support seek(0), because the
                   file is rewound after every failed parsing attempt.
        alphabet - Passed through Alphabet() and handed to each parser.
    returns :
        SeqList
    raises :
        ValueError - If none of the candidate parsers can parse the file.
                     A ValueError raised by an individual parser only marks
                     that attempt as failed. NOTE(review): presumably this
                     includes sequences that do not conform to the alphabet.
    """
    alphabet = Alphabet(alphabet)

    # Compute the candidate list once; it drives both the parsing loop and
    # the error message below.
    parsers = _get_parsers(fin)

    for parser in parsers:
        try:
            return parser.read(fin, alphabet)
        except ValueError:
            # Wrong guess: fall through, rewind, and try the next parser.
            pass
        fin.seek(0)  # FIXME: Non-seekable streams such as stdin?

    names = ", ".join([p.names[0] for p in parsers])
    raise ValueError("Cannot parse sequence file: Tried %s " % names)
|
|
256
|
|
257
|
|
258
|
|
259
|
|
260
|
|
261
|