comparison egglib/egglib-2.1.5/include/egglib-cpp/Container.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
comparison
equal deleted inserted replaced
0:3e19d0dfcf3e 1:420b57c3c185
1 /*
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20
21 #ifndef EGGLIB_CONTAINER_HPP
22 #define EGGLIB_CONTAINER_HPP
23
24
25 namespace egglib {
26
27 /** \brief Handles a set of sequences (aligned or not)
28 *
29 * \ingroup core
30 *
31 * Creation from a file or string stream should be performed using
32 * the class Fasta.
33 *
34 * Sequences are represented by two strings (name and sequence) and
35 * an integer (group) that can be accessed or modified by index. The
36 * order of sequences is guaranteed to be conserved, as if Container
37 * was a list of triplets (name, sequence, group).
38 *
39 * The data matrix is implemented as continuous arrays (char**) and
40 * allows efficient access and modification of data. For very large
41 * data matrices you might claim immediately the required memory
42 * using the constructor Container(unsigned int, char const* const*).
43 *
44 */
45 class Container {
46
47 public:
48
49 /** \brief Creates an empty object
50 *
51 */
52 Container();
53
54
55 /** \brief Copy constructor
56 *
57 */
58 Container(const Container& source);
59
60
61 /** \brief Assignment operator
62 *
63 */
64 Container& operator= (const Container& source);
65
66
67 /** \brief Creates an object from a data matrix
68 *
69 * Allows you to create an object from data stored in a char*
70 * array. The array's size must be passed to the constructor.
71 * Since sequences can have different lengths, you need to
72 * terminate each sequence by a null character. This constructor
73 * is dedicated to very performance-critical tasks. For usual
74 * tasks, using the default constructor and subsequently adding
75 * sequences with append should be enough.
76 *
77 * \param number_of_sequences the number of sequences (the length
78 * of the first dimension of the array).
79 *
80 * \param cstring_array the pointer to the data matrix.
81 *
82 */
83 Container(unsigned int number_of_sequences, char const* const* const cstring_array);
84
85
86 /** \brief Destructor
87 *
88 */
89 virtual ~Container();
90
91
92 /** \brief Clears all content of the object
93 *
94 */
95 virtual void clear();
96
97
98 /** \brief Adds a sequence to the object
99 *
100 * \param name the name of the sequence, as a c-string.
101 * \param sequence the sequence string, as a c-string.
102 * \param group the group index of the sequence.
103 *
104 * \return The new number of sequences.
105 *
106 */
107 virtual unsigned int append(const char* name, const char* sequence, unsigned int group=0);
108
109
110 /** \brief Removes a sequence from the object
111 *
112 * \param pos the index of the sequence to remove.
113 *
114 * \return The new number of sequences.
115 */
116 virtual unsigned int remove(unsigned int pos);
117
118
119 /** \brief Changes the name of a given sequence
120 *
121 * \param pos the sequence index.
122 * \param name the new name as a C-like string.
123 *
124 */
125 virtual void name(unsigned int pos, const char* name);
126
127
128 /** \brief Changes the sequence string of a given sequence
129 *
130 * \param pos the sequence index.
131 * \param sequence the new sequence as a C-like string.
132 *
133 */
134 virtual void sequence(unsigned int pos, const char* sequence);
135
136
137 /** \brief Appends a string to a given sequence
138 *
139 * \param pos the sequence index.
140 * \param sequence the sequence to append at the end of the
141 * current one.
142 *
143 */
144 virtual void appendSequence(unsigned int pos, const char* sequence);
145
146
147 /** \brief Changes a character
148 *
149 * \param sequence the sequence index.
150 * \param position the character index.
151 * \param ch the new character value.
152 *
153 * The positions must fit in the current ranges.
154 *
155 */
156 virtual void set(unsigned int sequence, unsigned position, char ch);
157
158
159 /** \brief Gets a given character
160 *
161 * \param s the sequence index.
162 * \param p the character index.
163 *
164 * \return the character value.
165 *
166 * The positions must fit in the current ranges.
167 *
168 */
169 virtual char get(unsigned int s, unsigned int p) const;
170
171
172 /** \brief Changes the group index of a given sequence
173 *
174 * \param pos the sequence index.
175 * \param group the new group index value.
176 *
177 */
178 virtual void group(unsigned int pos, unsigned int group);
179
180
181 /** \brief Extracts a range of sequences
182 *
183 * \param a the index of the first sequence.
184 *
185 * \param b the index immediately past the last sequence to
186 * extract.
187 *
188 * \return A copy of the object containing the specified
189 * range of sequences.
190 *
191 * Sequences a to b-1 are extracted, provided that the
192 * indices fit in the current number of sequences. To extract
193 * all sequences, use container.hslice(0, container.ns()).
194 *
195 * Note: invalid ranges will be silently supported. If
196 * a>=ns or b<=a, an empty object is returned. If b>ns,
197 * ns will be substituted for b.
198 * NOTE(review): bounds read as ns/b here by context -- confirm.
199 */
200 Container hslice(unsigned int a, unsigned int b) const;
201
202
203 /** \brief Gets the number of sequences
204 *
205 */
206 unsigned int ns() const;
207
208
209 /** \brief Gets the length of a given sequence
210 *
211 * \param pos the index of the sequence.
212 *
213 * \return The length of that particular sequence.
214 *
215 */
216 virtual unsigned int ls(unsigned int pos) const ;
217
218
219 /** \brief Gets the name of a given sequence
220 *
221 * \param pos the index of the sequence.
222 *
223 * \return The name of that particular sequence.
224 *
225 */
226 virtual const char* name(unsigned int pos) const;
227
228
229 /** \brief Gets the sequence string of a given sequence
230 *
231 * \param pos the index of the sequence.
232 *
233 * \return The sequence string for that particular sequence.
234 *
235 */
236 virtual const char* sequence(unsigned int pos) const;
237
238
239
240 /** \brief Gets the group index of a given sequence
241 *
242 * \param pos the index of the sequence.
243 *
244 * \return The group index of that particular sequence.
245 *
246 */
247 virtual unsigned int group(unsigned int pos) const;
248
249
250 /** \brief Checks if all lengths are equal
251 *
252 * Returns true if the lengths of all sequences are equal or
253 * if there are fewer than two sequences.
254 *
255 */
256 bool isEqual() const;
257
258
259 /** \brief Equalizes sequence lengths
260 *
261 * Extends sequences as needed to ensure that all sequences
262 * have the same length.
263 *
264 * \param ch the character to use for padding.
265 *
266 * \return The final length obtained, which is the length of
267 * the longest sequence before the operation.
268 *
269 */
270 unsigned int equalize(char ch='?');
271
272
273 /** \brief Finds a sequence by its name
274 *
275 * Gets the position of the first sequence with the specified
276 * name.
277 *
278 * \param string a sequence name.
279 *
280 * \param strict if true, seeks an exact match. If false,
281 * compares only until the end of the requested name (for
282 * example: ATCFF will match ATCFF_01 if strict is false).
283 *
284 * \return The lowest index where the name matches, -1 if no
285 * sequence has such name.
286 *
287 */
288 int find(const char* string, bool strict=true) const;
289
290
291 protected:
292 // The number of sequences
293 unsigned int _ns;
294
295 // The array of name lengths
296 unsigned int* lnames;
297
298 // The array of names
299 char** names;
300
301 // The array of sequences (as c-strings)
302 char** sequences;
303
304 // The array of groups
305 unsigned int* groups;
306
307 // Imports an array of c-strings
308 virtual void setFromSource(unsigned int number_of_sequences, const char* const* const cstring_array);
309
310 // Constructor helper
311 virtual void copyObject(const Container&);
312
313 // Constructor partial helper
314 virtual void getNamesAndGroups(const Container&);
315
316 private:
317
318 // The array of sequence lengths
319 unsigned int* lsequences;
320
321 // Setup a valid empty object
322 virtual void init();
323 };
324 }
325
326 #endif