comparison egglib/egglib-2.1.5/include/egglib-cpp/Consensus.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
comparison
equal deleted inserted replaced
0:3e19d0dfcf3e 1:420b57c3c185
1 /*
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef EGGLIB_CONSENSUS_HPP
21 #define EGGLIB_CONSENSUS_HPP
22
23 #include "Align.hpp"
24 #include <sstream>
25 #include <string>
26 #include <vector>
27
28 namespace egglib {
29
30 /** \brief Generates consensus sequences
31 *
32 * \ingroup polymorphism
33 *
34 *
35 * A consensus is generated when two sequences have the same name,
36 * ignoring everything after the first separator character (by
37 * default, "_"). Hence, the names "foo", "foo_goo" and "foo_third"
38 * will be treated as identical and the root will be "foo". The root
39 * will be used to name the resulting sequence. Note that the
40 * class works only for DNA sequences.
41 *
42 * Symbol convention:
43 * - A: adenosine
44 * - C: cytosine
45 * - G: guanine
46 * - T: thymine
47 * - M: A or C
48 * - R: A or G
49 * - W: A or T (weak)
50 * - S: C or G (strong)
51 * - Y: C or T
52 * - K: G or T
53 * - B: C or G or T(not A)
54 * - D: A or G or T (not C)
55 * - H: A or C or T (not G)
56 * - V: A or C or G (not T)
57 * - N: A or C or G or T
58 * - ?: nonsequenced position
59 *
60 * Other symbols will be treated as ? (lowercase are supported).
61 *
62 * Rigorous (alias liberal or strong) mode:
63 * - If two characters are the same, it is retained whatever it is
64 * (A + A = A)
65 * - Otherwise:
66 * - If one is the missing character (?) the other is retained
67 * whatever it is (A + ? = A).
68 * - If characters are consistent, that is one contains
69 * more information, that one is retained (A + M = A).
70 * - If characters are not consistent, the closest
71 * generic symbol is retained (A + C = M).
72 * .
73 * Note that the feedback of inconsistent characters in the
74 * outcome is not garanteed.
75 * In fact, (A + A + G) will result in R (as expected) but (A +
76 * G + A) will result in A, masking the problem.
77 * However, the position will indeed be counted as inconsistent.
78 *
79 * Not rigorous (conservative/weak) mode:
80 * - If two characters are the same, it is retained whatever it
81 * is (A + A = A).
82 * - Otherwise:
83 * - If one is ? the other is retained whatever it is (A + ?
84 * = A).
85 * - Otherwise an inconsistent character (by default, Z) is
86 * retained (A + C = Z).
87 *
88 * Iterative process of consensus:
89 * - Each sequence is taken in turn.
90 * - Each pair involving the focus sequence is processed and a
91 * consensus is generated.
92 * - When all pair have been processsed, the consensus already
93 * generated are themselves iteratively processed until only one
94 * remains.
95 * - Note that at each time the last two are taken first.
96 *
97 * A transparent interface gives access to the data for all steps of
98 * the consensus process, as vectors that covers all pairs (including
99 * intermediate steps of the iterative procedure described above) as
100 * well as singleton sequences. For the latter, the second name is
101 * not filled and all counts are set to 0. Note also that the name of
102 * such singleton sequence is shortened to the separator as well.
103 *
104 */
105 class Consensus {
106 public:
107 /** \brief Constructor
108 *
109 */
110 Consensus();
111
112 /** \brief Destructor
113 *
114 */
115 virtual ~Consensus() {}
116
117 /// Sets the character interpreted as missing (default: ?)
118 void setMissing(char);
119
120 /// Sets the character used to point to disagreements (default: Z)
121 void setDisagreement(char);
122
123 /// Checks all the characters
124 bool check_sequences(Align& align);
125
126 /** \brief Reduces the sequence alignment by making consensus sequences
127 *
128 * \param align the original alignment.
129 *
130 * \param separator the character used to separated the root
131 * name of sequences to the variable part, as in (for the
132 * default value: "sequence_read1".
133 *
134 * \param rigorous consensus mode.
135 *
136 * \return An Align instance with duplicated sequences consensed.
137 *
138 */
139 Align consensus(Align& align, char separator='_', bool rigorous=true);
140
141 /// First name of consensed pairs
142 const std::vector<std::string>& firstSequenceNames();
143
144 /// Second names of consensed pairs
145 const std::vector<std::string>& secondSequenceNames();
146
147 /// Root names of consensed pairs
148 const std::vector<std::string>& roots();
149
150 /// Number of consistent positions for all consensed pairs
151 const std::vector<int>& consistentPositions();
152
153 /// Number of complementary positions for all consensed pairs
154 const std::vector<int>& complementaryPositions();
155
156 /// Number of uninformative positions for all consensed pairs
157 const std::vector<int>& uninformativePositions();
158
159 /// Number of ambiguous positions for all consensed pairs
160 const std::vector<int>& ambiguousPositions();
161
162 /// Number of at least partially resolved ambiguities for all consensed pairs
163 const std::vector<int>& atLeastPartiallyResolvedAmbiguities();
164
165 /// Vector of inconsistent positions ofr all consensed pairs
166 const std::vector<std::vector<int> >& inconsistentPositions();
167
168
169 private:
170
171 /** \brief Copying this class is not allowed
172 *
173 */
174 Consensus(const Consensus& source) { }
175
176 /** \brief Copying this class is not allowed
177 *
178 */
179 Consensus& operator=(const Consensus& source) { return *this; }
180
181 // A private helper
182 class PairwiseConsensus;
183
184 // report data
185 std::vector<std::string> t_firstSequenceNames;
186 std::vector<std::string> t_secondSequenceNames;
187 std::vector<std::string> t_roots;
188 std::vector<int> t_consistentPositions;
189 std::vector<int> t_complementaryPositions;
190 std::vector<int> t_uninformativePositions;
191 std::vector<int> t_ambiguousPositions;
192 std::vector<int> t_atLeastPartiallyResolvedAmbiguities;
193 std::vector<std::vector<int> > t_inconsistentPositions;
194
195 // Code for missing data (usually ?)
196 char MISSING;
197
198 // Code for disgrement
199 char DISAGREEMENT;
200
201 // Helper class managing a single pair
202 class PairwiseConsensus {
203 public:
204 // Default object creation
205 PairwiseConsensus();
206
207 // Object destruction
208 virtual ~PairwiseConsensus() {}
209
210 // Usual object creation
211 PairwiseConsensus(std::string, std::string);
212
213 // Fills an object created with the default constructor
214 void load(std::string,std::string);
215
216 // Changes the MISSING character
217 void setUndeterminedCharacter(char);
218
219 // Changes the DISAGREEMENT character
220 void setDisagreementCharacter(char);
221
222 /* Uses the conservative mode of consensus
223 *
224 * Tries to avoid to make decisions, and adds the
225 * character set by DISAGREEMENT upon inconsistencies
226 *
227 * return The number of inconsistencies.
228 */
229 int generateSoftConsensus();
230
231 /* Strict mode of consensus
232 *
233 * The number of inconsistencies.
234 */
235 int generateHardConsensus();
236
237 // Two fully resolved (including gap) and identical characters
238 int getConsistentPositions();
239
240 // One informative (including gap) and one missing
241 int getComplementaryPositions();
242
243 // None missing, but different and incompatible
244 int getInconsistentPositions();
245
246 // Both are missing
247 int getUninformativePositions();
248
249 // Both identical or one missing, but not fully resolved
250 int getAmbiguousPositions();
251
252 // Different, not missing, complementary.
253 int getAtLeastPartiallyResolvedAmbiguities();
254
255 // Accessor
256 int getThisInconsistentPosition(unsigned int);
257
258 // Generates the consensus sequence
259 std::string getConsensus();
260
261 private:
262
263 inline bool isValid(char c) {
264 switch (c) {
265 case 'A': case 'a':
266 case 'C': case 'c':
267 case 'G': case 'g':
268 case 'T': case 't':
269 return true;
270 default:
271 return false;
272 }
273 }
274
275 // This initiates a series of embedded objects
276 void setCharacterContainers();
277
278 // The first sequence
279 std::string seqA;
280
281 // The second sequence
282 std::string seqB;
283
284 // The resulting consensus
285 std::string cons;
286
287 // The vecotr storing the inconsistent positions
288 std::vector<int> posIncons;
289
290 // The length of the sequences
291 unsigned int ls;
292
293 // Counter
294 int cntConsistentPositions;
295
296 // Counter
297 int cntComplementaryPositions;
298
299 // Counter
300 int cntAmbiguousPositions;
301
302 // Counter
303 int cntInconsistentPositions;
304
305 // Counter
306 int cntUninformativePositions;
307
308 // Counter
309 int cntAtLeastPartiallyResolvedAmbiguities;
310
311 // Code for missing data (usually ?)
312 char MISSING;
313
314 // Code for disgrement
315 char DISAGREEMENT;
316
317 public:
318 // This class manages relationships different symbols
319 class CharacterContainer {
320 public:
321 // Default value: @
322 CharacterContainer();
323
324 // Initiates to a given symbol
325 CharacterContainer(const char&);
326
327 // Assignment operator
328 CharacterContainer& operator=(const char&);
329
330 // Sets the symbol
331 void setValue(char);
332
333 // Set the descendants
334 void setSons(std::vector<CharacterContainer>);
335
336 // Tests whether the symbol is the same
337 bool is(CharacterContainer);
338
339 // Tests if the query is contained amongst the sons
340 bool has(CharacterContainer);
341
342 // Tests if the query is contained amongst the sons
343 bool has(char);
344
345 /* Tests whether the left character has the left one
346 * Should be called on the N object only.
347 */
348 char lhas(CharacterContainer,CharacterContainer);
349
350 /* Creates the object with the proper sons
351 * Should be called on the N object only.
352 */
353 CharacterContainer init(char);
354
355 // The symbol
356 char value;
357
358 // The descendants
359 std::vector<CharacterContainer> sons;
360 };
361
362 private:
363 // Symbol ?
364 CharacterContainer ccQ;
365
366 // Symbol A
367 CharacterContainer ccA;
368
369 // Symbol C
370 CharacterContainer ccC;
371
372 // Symbol G
373 CharacterContainer ccG;
374
375 // Symbol T
376 CharacterContainer ccT;
377
378 // Symbol U
379 CharacterContainer ccU;
380
381 // Symbol M
382 CharacterContainer ccM;
383
384 // Symbol R
385 CharacterContainer ccR;
386
387 // Symbol W
388 CharacterContainer ccW;
389
390 // Symbol S
391 CharacterContainer ccS;
392
393 // Symbol Y
394 CharacterContainer ccY;
395
396 // Symbol K
397 CharacterContainer ccK;
398
399 // Symbol B
400 CharacterContainer ccB;
401
402 // Symbol D
403 CharacterContainer ccD;
404
405 // Symbol H
406 CharacterContainer ccH;
407
408 // Symbol V
409 CharacterContainer ccV;
410
411 // Symbol N
412 CharacterContainer ccN;
413
414 // Symbol -
415 CharacterContainer ccGAP;
416 };
417 };
418 }
419
420 #endif
421