1
|
1 /*
|
|
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
|
|
3
|
|
4 This file is part of the EggLib library.
|
|
5
|
|
6 EggLib is free software: you can redistribute it and/or modify
|
|
7 it under the terms of the GNU General Public License as published by
|
|
8 the Free Software Foundation, either version 3 of the License, or
|
|
9 (at your option) any later version.
|
|
10
|
|
11 EggLib is distributed in the hope that it will be useful,
|
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 GNU General Public License for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
|
|
18 */
|
|
19
|
|
20
|
|
21 #ifndef EGGLIB_ALIGN_HPP
|
|
22 #define EGGLIB_ALIGN_HPP
|
|
23
|
|
24 #include "Container.hpp"
|
|
25 #include "CharMatrix.hpp"
|
|
26 #include <vector>
|
|
27
|
|
28 /** \mainpage Summary
|
|
29 *
|
|
30 * This is the automatically-generated reference manual of the C++
|
|
31 * egglib-cpp library. The library is presented as several modules, but
|
|
32 * note that they are only used to structure the documentation.
|
|
33 *
|
|
34 * There is a single namespace (egglib) in which all classes are
|
|
35 * defined. See an example of programming with egglib-cpp in the
|
|
36 * EggLib package main documentation. Use "Modules" or "Classes" above
|
|
37 * to navigate in the library reference manual.
|
|
38 *
|
|
39 */
|
|
40
|
|
41
|
|
42 /** \defgroup core core
|
|
43 *
|
|
44 * \brief Central core of the C++ library of EggLib
|
|
45 *
|
|
46 * Data storage classes, parsers/formatters and tools, plus exception
|
|
47 * types.
|
|
48 *
|
|
49 */
|
|
50
|
|
51 namespace egglib {
|
|
52
|
|
53
|
|
54 /** \brief Handles a sequence alignment
|
|
55 *
|
|
56 * \ingroup core
|
|
57 *
|
|
58 * Creation from a file or string stream should be performed using
|
|
59 * the class Fasta. Align objects can be created by deep copy from
|
|
60 * both Align and Container type. In the latter case, the length are
|
|
61 * artificially equalized by "?" characters. Align objects can be
|
|
62 * created from a DataMatrix object (and all the way arround) using
|
|
63 * the specific class DMAConverter.
|
|
64 *
|
|
65 * Sequences are represented by two strings (name and sequence) and
|
|
66 * an integer (group) that can be accessed or modified by index.The
|
|
67 * order of sequences is guaranteed to be conserved, as if Align was
|
|
68 * a list of triplets (name, sequence, group).
|
|
69 *
|
|
70 * The data matrix is implemented as continuous array (char**) and
|
|
71 * allows efficient access and modification of data. For very large
|
|
72 * data matrices you might claim immediately the required memory
|
|
73 * using the constructor Align(unsigned int, char**).
|
|
74 *
|
|
75 */
|
|
76 class Align : public Container, public CharMatrix {
|
|
77 public:
|
|
78
|
|
79 /** \brief Creates an empty alignment
|
|
80 *
|
|
81 */
|
|
82 Align();
|
|
83
|
|
84
|
|
85 /** \brief Creates an alignment from a data matrix.
|
|
86 *
|
|
87 * Allows you to create an object from data stored in a char*
|
|
88 * array. The array's dimensions must be passed to the
|
|
89 * constructor, and as a result there is not need to
|
|
90 * terminate each sequence by a NULL character.
|
|
91 *
|
|
92 * \param number_of_sequences the number of sequences (the
|
|
93 * length of the first dimension of the array).
|
|
94 *
|
|
95 * \param alignment_length the length of sequences (the
|
|
96 * length of all lines of the array).
|
|
97 *
|
|
98 * \param cstring_array the pointer to the data matrix.
|
|
99 *
|
|
100 */
|
|
101 Align(unsigned int number_of_sequences, unsigned int alignment_length, char const * const * const cstring_array);
|
|
102
|
|
103
|
|
104 /** \brief Creates an alignment with given dimensions
|
|
105 *
|
|
106 * Allows you to allocate directly a data matrix of a given
|
|
107 * size. Names are empty strings, groups 0, and all
|
|
108 * characters are ?.
|
|
109 *
|
|
110 * \param number_of_sequences the number of sequences (the
|
|
111 * length of the first dimension of the array).
|
|
112 *
|
|
113 * \param alignment_length the length of sequences (the
|
|
114 * length of all lines of the array).
|
|
115 *
|
|
116 */
|
|
117 Align(unsigned int number_of_sequences, unsigned int alignment_length);
|
|
118
|
|
119
|
|
120 /** \brief Copy constructor
|
|
121 *
|
|
122 */
|
|
123 Align(const Align& align);
|
|
124
|
|
125
|
|
126 /** \brief Copy constructor accepting a Container object
|
|
127 *
|
|
128 * All but the longest sequences are padded with ? to match
|
|
129 * the longest sequence's length.
|
|
130 *
|
|
131 */
|
|
132 Align(const Container& container);
|
|
133
|
|
134
|
|
135 /** \brief Copy operator
|
|
136 *
|
|
137 */
|
|
138 Align& operator=(const Align& align);
|
|
139
|
|
140
|
|
141 /** \brief Copy operator accepting a Container object
|
|
142 *
|
|
143 * All but the longest sequences are padded with ? to match
|
|
144 * the longest sequence's length.
|
|
145 *
|
|
146 */
|
|
147 Align& operator=(const Container& container);
|
|
148
|
|
149
|
|
150 /** \brief Destructor
|
|
151 *
|
|
152 */
|
|
153 virtual ~Align();
|
|
154
|
|
155
|
|
156 /** \brief Adds a sequence
|
|
157 *
|
|
158 * If the object already contains at least one sequence, the
|
|
159 * new sequence must have the same length. Otherwise, a
|
|
160 * EggUnalignedError is raised.
|
|
161 *
|
|
162 * \param name the name of the sequence.
|
|
163 * \param sequence the sequence string.
|
|
164 * \param group the group index of the sequence.
|
|
165 * \return The new number of sequences.
|
|
166 *
|
|
167 */
|
|
168 virtual unsigned int append(const char* name, const char* sequence, unsigned int group=0);
|
|
169
|
|
170
|
|
171 /** \brief Removes a position (column) of the alignment
|
|
172 *
|
|
173 * \param pos the position to remove in the alignment.
|
|
174 * \return The new length of the alignment.
|
|
175 *
|
|
176 */
|
|
177 virtual unsigned int removePosition(unsigned int pos);
|
|
178
|
|
179
|
|
180 /** \brief Removes a sequence from the alignment
|
|
181 *
|
|
182 * \param pos the index of the sequence to remove.
|
|
183 * \return The new number of sequences.
|
|
184 *
|
|
185 */
|
|
186 virtual unsigned int remove(unsigned int pos);
|
|
187
|
|
188
|
|
189 /** \brief Replace a sequence string
|
|
190 *
|
|
191 * The new sequence must have the same length than the
|
|
192 * alignment. Otherwise, a EggUnalignedError is raised.
|
|
193 *
|
|
194 * \param seq the index of the sequence to change.
|
|
195 * \param sequence the new sequence.
|
|
196 *
|
|
197 */
|
|
198 virtual void sequence(unsigned int seq, const char* sequence);
|
|
199
|
|
200
|
|
201 /** \brief Gets the name of a given sequence
|
|
202 *
|
|
203 * \param pos the index of the sequence.
|
|
204 *
|
|
205 * \return The sequence string for that particular sequence.
|
|
206 *
|
|
207 */
|
|
208 virtual inline const char* sequence(unsigned int pos) const { return Container::sequence(pos); }
|
|
209
|
|
210
|
|
211 /** \brief Alignment length
|
|
212 *
|
|
213 * Returns 0 if the alignment is empty.
|
|
214 *
|
|
215 */
|
|
216 virtual unsigned int ls() const;
|
|
217
|
|
218
|
|
219 /** \brief Length of a given sequence
|
|
220 *
|
|
221 * Calling this function is exactly the same as calling ls()
|
|
222 * (without arguments), regardless of the index provided,
|
|
223 * except that an exception is thrown if the index is out of
|
|
224 * bounds. Provided for compatibility with Container.
|
|
225 *
|
|
226 * \param pos the index of the sequence.
|
|
227 * \return the length of the alignment.
|
|
228 *
|
|
229 */
|
|
230 virtual unsigned int ls(unsigned int pos) const;
|
|
231
|
|
232
|
|
233 /** \brief Fast and unsecure accessor
|
|
234 *
|
|
235 * This accessor doesn't perform out-of-bound checking!
|
|
236 *
|
|
237 * \param s the index of the sequence (line).
|
|
238 * \param p the position in the alignment (column).
|
|
239 * \return The character at the given position.
|
|
240 *
|
|
241 */
|
|
242 inline char character(unsigned int s, unsigned int p) const { return sequences[s][p]; }
|
|
243
|
|
244
|
|
245 /** \brief Gets a nucleotide
|
|
246 *
|
|
247 * This modifier does perform out-of-bound checking.
|
|
248 * The specified position must exist.
|
|
249 *
|
|
250 * \param sequence the index of the sequence (line).
|
|
251 * \param position the position in the alignment (column).
|
|
252 * \return the character at the given position.
|
|
253 *
|
|
254 */
|
|
255 virtual char get(unsigned int sequence, unsigned int position) const;
|
|
256
|
|
257
|
|
258 /** \brief Sets a matrix position to a new character
|
|
259 *
|
|
260 * This modifier does perform out-of-bound checking.
|
|
261 * The specified position must exist.
|
|
262 *
|
|
263 * \param sequence the index of the sequence (line).
|
|
264 * \param position the position in the alignment (column).
|
|
265 * \param ch the new character value.
|
|
266 */
|
|
267 virtual void set(unsigned int sequence, unsigned position, char ch);
|
|
268
|
|
269
|
|
270 /** \brief Reverse a given column in binary data
|
|
271 *
|
|
272 * The specified column must contain only "0" ans "1" characters.
|
|
273 * "0" is replaced by "1" and all the way around
|
|
274 *
|
|
275 */
|
|
276 void binSwitch(unsigned int pos);
|
|
277
|
|
278
|
|
279 /** \brief Extracts specified positions (columns) of the alignment
|
|
280 *
|
|
281 * All the specified sites are extracted in the specified
|
|
282 * order. This function is suitable for bootstrap (resample
|
|
283 * allowing redrawing the same site) and permutations.
|
|
284 *
|
|
285 * This function doesn't perform out-of-bound checking.
|
|
286 *
|
|
287 * \param list_of_sites a vector containing alignment
|
|
288 * positions.
|
|
289 *
|
|
290 * \return A copy of the object containing the specified
|
|
291 * set of positions.
|
|
292 *
|
|
293 */
|
|
294 Align vslice(std::vector<unsigned int> list_of_sites);
|
|
295
|
|
296
|
|
297 /** \brief Extracts a range of positions (columns)
|
|
298 *
|
|
299 * \param a the first position.
|
|
300 *
|
|
301 * \param b the index immediately passed the last sequence to
|
|
302 * extract.
|
|
303 *
|
|
304 * \return A copy of the object containing the specified
|
|
305 * range of sequences.
|
|
306 *
|
|
307 * Positions a to b-1 are extracted, provided that the
|
|
308 * indices fit in the current length of sequences. To extract
|
|
309 * all sequences, use align.vslice(0, align.ls()).
|
|
310 *
|
|
311 * Note: invalid ranges will be silently supported. If
|
|
312 * a>=ls or b<=a, an empty object is returned. If b>ns,
|
|
313 * ls will be substituted to a.
|
|
314 */
|
|
315 Align vslice(unsigned int a, unsigned int b);
|
|
316
|
|
317
|
|
318 /** \brief Deletes all the content of the object
|
|
319 *
|
|
320 */
|
|
321 virtual void clear();
|
|
322
|
|
323
|
|
324 /** \brief Same as ns()
|
|
325 *
|
|
326 */
|
|
327 inline unsigned int numberOfSequences() const {
|
|
328 return _ns;
|
|
329 }
|
|
330
|
|
331
|
|
332 /** \brief Same as ls()
|
|
333 *
|
|
334 */
|
|
335 inline unsigned int numberOfSites() const {
|
|
336 return _ls;
|
|
337 }
|
|
338
|
|
339
|
|
340 /** \brief Gets a group label (insecure)
|
|
341 *
|
|
342 */
|
|
343 inline unsigned int populationLabel(unsigned int sequenceIndex) const {
|
|
344 return groups[sequenceIndex];
|
|
345 }
|
|
346
|
|
347
|
|
348 /** \brief Just return the passed value
|
|
349 *
|
|
350 */
|
|
351 inline double sitePosition(unsigned int position) const {
|
|
352 return (double) position;
|
|
353 }
|
|
354
|
|
355
|
|
356 protected:
|
|
357
|
|
358 /// This function is not available for alignments
|
|
359 virtual void appendSequence(unsigned int pos, const char* sequence) {}
|
|
360
|
|
361 // Initializer (creates a valid empty alignment)
|
|
362 virtual void init();
|
|
363
|
|
364 // Makes a deep copy of the specified data matrix - if cstring_array is NULL, then ignores it and pads with ?'s
|
|
365 virtual void setFromSource(unsigned int number_of_sequences, unsigned int alignment_length, const char* const * const cstring_array);
|
|
366
|
|
367 // Copies from a Container
|
|
368 virtual void copyObject(const Container&);
|
|
369
|
|
370 // Copies from an Align
|
|
371 virtual void copyObject(const Align&);
|
|
372
|
|
373 // Alignment length
|
|
374 unsigned int _ls;
|
|
375 };
|
|
376 }
|
|
377
|
|
378 #endif
|