1
|
1 /*
|
|
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
|
|
3
|
|
4 This file is part of the EggLib library.
|
|
5
|
|
6 EggLib is free software: you can redistribute it and/or modify
|
|
7 it under the terms of the GNU General Public License as published by
|
|
8 the Free Software Foundation, either version 3 of the License, or
|
|
9 (at your option) any later version.
|
|
10
|
|
11 EggLib is distributed in the hope that it will be useful,
|
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 GNU General Public License for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
|
|
18 */
|
|
19
|
|
20 #ifndef EGGLIB_FASTA_HPP
|
|
21 #define EGGLIB_FASTA_HPP
|
|
22
|
|
23 #include <istream>
|
|
24 #include <iostream>
|
|
25 #include <string>
|
|
26 #include "Container.hpp"
|
|
27
|
|
28 namespace egglib {
|
|
29
|
|
30 /** \brief Fasta parser/formatted
|
|
31 *
|
|
32 * \ingroup core
|
|
33 *
|
|
34 * Reads a multifasta sequence file from a string, a stream or a file
|
|
35 * and returns a Container. See the description of the format below.
|
|
36 * Formats a fasta string from a sequence container object and places
|
|
37 * it in a string, a stream of a file. All methods are static and the
|
|
38 * class cannot be instantiated. The methods parsef and formatf will
|
|
39 * open the file for you while the others will read/write directly
|
|
40 * in a string.
|
|
41 *
|
|
42 * Specifications of the fasta format:
|
|
43 *
|
|
44 * - The number of sequences is not limited.
|
|
45 *
|
|
46 * - Each sequence is preceded by a header limited to a single
|
|
47 * line and starting by a ">" character.
|
|
48 *
|
|
49 * - The header length is not limited and all characters are
|
|
50 * allowed but white spaces and special characters are
|
|
51 * discouraged.
|
|
52 *
|
|
53 * - Group indices are specified by \@0, \@1, \@2... strings
|
|
54 * appearing at the end of the header string (just before the
|
|
55 * carriage return). Note that group labels are ignored by
|
|
56 * default.
|
|
57 *
|
|
58 * - Group indices are ignored unless specifically specified in a
|
|
59 * parser's options.
|
|
60 *
|
|
61 * - The sequence itself continues on following lines until the
|
|
62 * next ">" character or the end of the file.
|
|
63 *
|
|
64 * - White spaces, tab and carriage returns are allowed at any
|
|
65 * position There is no limitation in length and different
|
|
66 * sequences can have different lengths.
|
|
67 *
|
|
68 * - Although the standard is lower case characters, Fasta
|
|
69 * assumes upper case characters and only supports lower case
|
|
70 * characters (and converts them to upper case characters).
|
|
71 * Information coded by change in case is lost.
|
|
72 *
|
|
73 */
|
|
74 class Fasta {
|
|
75
|
|
76 public:
|
|
77
|
|
78 /** \brief Imports a fasta file
|
|
79 *
|
|
80 * Imports the content of the file as is. Calls the method
|
|
81 * pase(std::istream*, bool) by creating its own istream.
|
|
82 *
|
|
83 * \param fname the name of a fasta file.
|
|
84 *
|
|
85 * \param importGroupLabels if set to true, scan automatically
|
|
86 * for groups. The format is @ followed by an integer, placed
|
|
87 * at the end of the header string(sequences without labels
|
|
88 * will be treated as \@0).
|
|
89 *
|
|
90 * \return A Container object containing the sequences.
|
|
91 *
|
|
92 */
|
|
93 static Container parsef(const char* fname, bool importGroupLabels=false);
|
|
94
|
|
95
|
|
96 /** \brief Imports a fasta file
|
|
97 *
|
|
98 * Imports the content of the file as is. Calls the method
|
|
99 * pase(std::istream*, bool) by creating its own istream. This
|
|
100 * method expects a reference to a Container to which the
|
|
101 * sequences will be appended.
|
|
102 *
|
|
103 * \param fname the name of a fasta file.
|
|
104 *
|
|
105 * \param container a Container instance, empty or not.
|
|
106 *
|
|
107 * \param importGroupLabels if set to true, scan automatically
|
|
108 * for groups. The format is @ followed by an integer, placed
|
|
109 * at the end of the header string(sequences without labels
|
|
110 * will be treated as \@0).
|
|
111 *
|
|
112 * \return Nothings: the new sequences are appended to the
|
|
113 * Container passed as argument.
|
|
114 *
|
|
115 */
|
|
116 static void parsef(const char* fname, Container& container, bool importGroupLabels=false);
|
|
117
|
|
118
|
|
119 /** \brief Imports a fasta file
|
|
120 *
|
|
121 * Imports the content of the file as is. Calls the method
|
|
122 * pase(std::istream*, bool) by creating its own istream.
|
|
123 *
|
|
124 * \param str a string containing the data.
|
|
125 *
|
|
126 * \param importGroupLabels if set to true, scan automatically
|
|
127 * for groups. The format is @ followed by an integer, placed
|
|
128 * at the end of the header string(sequences without labels
|
|
129 * will be treated as \@0).
|
|
130 *
|
|
131 * \return A Container object containing the sequences.
|
|
132 *
|
|
133 */
|
|
134 static Container parse(const std::string& str, bool importGroupLabels=false);
|
|
135
|
|
136
|
|
137 /** \brief Imports a fasta file
|
|
138 *
|
|
139 * Imports the content of the file as is. Calls the method
|
|
140 * pase(std::istream*, bool) by creating its own istream. This
|
|
141 * method expects a reference to a Container to which the
|
|
142 * sequences will be appended.
|
|
143 *
|
|
144 * \param str a string containing the data.
|
|
145 *
|
|
146 * \param container a Container instance, empty or not.
|
|
147 *
|
|
148 * \param importGroupLabels if set to true, scan automatically
|
|
149 * for groups. The format is @ followed by an integer, placed
|
|
150 * at the end of the header string(sequences without labels
|
|
151 * will be treated as \@0).
|
|
152 *
|
|
153 * \return Nothing: new sequences are appended to the Container
|
|
154 * passed as argument.
|
|
155 *
|
|
156 */
|
|
157 static void parse(const std::string& str, Container& container, bool importGroupLabels=false);
|
|
158
|
|
159
|
|
160 /** \brief Imports a fasta file from an open stream
|
|
161 *
|
|
162 * Imports the content of the file as is.
|
|
163 *
|
|
164 * \param stream an open stream (file or string) containing the
|
|
165 * data.
|
|
166 *
|
|
167 * \param importGroupLabels if set to true, scan automatically
|
|
168 * for groups. The format is @ followed by an integer, placed
|
|
169 * at the end of the header string(sequences without labels
|
|
170 * will be treated as \@0).
|
|
171 *
|
|
172 * \return A Container object containing the sequences.
|
|
173 *
|
|
174 */
|
|
175 static Container parse(std::istream& stream, bool importGroupLabels=false);
|
|
176
|
|
177
|
|
178 /** \brief Imports a fasta file from an open stream
|
|
179 *
|
|
180 * Imports the content of the file as is. This
|
|
181 * method expects a reference to a Container to which the
|
|
182 * sequences will be appended.
|
|
183 *
|
|
184 * \param stream an open stream (file or string) containing the
|
|
185 * data.
|
|
186 *
|
|
187 * \param container a Container instance, empty or not.
|
|
188 *
|
|
189 * \param importGroupLabels if set to true, scan automatically
|
|
190 * for groups. The format is @ followed by an integer, placed
|
|
191 * at the end of the header string(sequences without labels
|
|
192 * will be treated as \@0).
|
|
193 *
|
|
194 * \return Nothing: the new sequences are appended to the
|
|
195 * Container passed as argument.
|
|
196 *
|
|
197 */
|
|
198 static void parse(std::istream& stream, Container& container, bool importGroupLabels=false);
|
|
199
|
|
200
|
|
201 /** \brief Export sequences as fasta
|
|
202 *
|
|
203 * \param fname the name of the file where to place the result.
|
|
204 *
|
|
205 * \param container Container object to export.
|
|
206 *
|
|
207 * \param exportGroupLabels if set to true, exports group
|
|
208 * indices as a \@x at the end of the sequence name, where x is
|
|
209 * the group index. Otherwise, this information is discarded.
|
|
210 *
|
|
211 * \param lineLength the number of characters to place on a
|
|
212 * single line. If zero, no newlines are inserted within
|
|
213 * sequences.
|
|
214 *
|
|
215 */
|
|
216 static void formatf(const char* fname, const Container& container, bool exportGroupLabels=false, unsigned int lineLength=50);
|
|
217
|
|
218
|
|
219 /** \brief Export sequences as fasta
|
|
220 *
|
|
221 * \param file an open stream.
|
|
222 *
|
|
223 * \param container Container object to export.
|
|
224 *
|
|
225 * \param exportGroupLabels if set to true, exports group
|
|
226 * indices as a \@x at the end of the sequence name, where x is
|
|
227 * the group index. Otherwise, this information is discarded.
|
|
228 *
|
|
229 * \param lineLength the number of characters to place on a
|
|
230 * single line. If zero, no newlines are inserted within
|
|
231 * sequences.
|
|
232 *
|
|
233 */
|
|
234 static void format(std::ostream& file, const Container& container, bool exportGroupLabels=false, unsigned int lineLength=50);
|
|
235
|
|
236
|
|
237 /** \brief Export sequences as fasta
|
|
238 *
|
|
239 * This medod creates internally an ostringstream, calls the
|
|
240 * method format(ostream, container, bool) and returns the
|
|
241 * resulting string.
|
|
242 *
|
|
243 * \param container Container object to export.
|
|
244 *
|
|
245 * \param exportGroupLabels if set to true, exports group
|
|
246 * indices as a \@x at the end of the sequence name, where x is
|
|
247 * the group index. Otherwise, this information is discarded.
|
|
248 *
|
|
249 * \param lineLength the number of characters to place on a
|
|
250 * single line. If zero, no newlines are inserted within
|
|
251 * sequences.
|
|
252 *
|
|
253 * \return The formatted string.
|
|
254 *
|
|
255 */
|
|
256 static std::string format(const Container& container, bool exportGroupLabels=false, unsigned int lineLength=50);
|
|
257
|
|
258
|
|
259
|
|
260 protected:
|
|
261
|
|
262 /// This class cannot be instantiated
|
|
263 Fasta() { }
|
|
264
|
|
265 /// This class cannot be instantiated
|
|
266 Fasta(const Fasta& source) { }
|
|
267
|
|
268 /// This class cannot be or copied
|
|
269 Fasta& operator=(const Fasta& source) { return *this; }
|
|
270
|
|
271 /// This class cannot be instantiated
|
|
272 virtual ~Fasta() { }
|
|
273
|
|
274
|
|
275 };
|
|
276 }
|
|
277
|
|
278 #endif
|