1
|
1 /*
|
|
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
|
|
3
|
|
4 This file is part of the EggLib library.
|
|
5
|
|
6 EggLib is free software: you can redistribute it and/or modify
|
|
7 it under the terms of the GNU General Public License as published by
|
|
8 the Free Software Foundation, either version 3 of the License, or
|
|
9 (at your option) any later version.
|
|
10
|
|
11 EggLib is distributed in the hope that it will be useful,
|
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 GNU General Public License for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
|
|
18 */
|
|
19
|
|
20 #ifndef EGGLIB_CONSENSUS_HPP
|
|
21 #define EGGLIB_CONSENSUS_HPP
|
|
22
|
|
23 #include "Align.hpp"
|
|
24 #include <sstream>
|
|
25 #include <string>
|
|
26 #include <vector>
|
|
27
|
|
28 namespace egglib {
|
|
29
|
|
30 /** \brief Generates consensus sequences
|
|
31 *
|
|
32 * \ingroup polymorphism
|
|
33 *
|
|
34 *
|
|
35 * A consensus is generated when two sequences have the same name,
|
|
36 * ignoring everything after the first separator character (by
|
|
37 * default, "_"). Hence, the names "foo", "foo_goo" and "foo_third"
|
|
38 * will be treated as identical and the root will be "foo". The root
|
|
39 * will be used to name the resulting sequence. Note that the
|
|
40 * class works only for DNA sequences.
|
|
41 *
|
|
42 * Symbol convention:
|
|
43 * - A: adenosine
|
|
44 * - C: cytosine
|
|
45 * - G: guanine
|
|
46 * - T: thymine
|
|
47 * - M: A or C
|
|
48 * - R: A or G
|
|
49 * - W: A or T (weak)
|
|
50 * - S: C or G (strong)
|
|
51 * - Y: C or T
|
|
52 * - K: G or T
|
|
53 * - B: C or G or T(not A)
|
|
54 * - D: A or G or T (not C)
|
|
55 * - H: A or C or T (not G)
|
|
56 * - V: A or C or G (not T)
|
|
57 * - N: A or C or G or T
|
|
58 * - ?: nonsequenced position
|
|
59 *
|
|
60 * Other symbols will be treated as ? (lowercase are supported).
|
|
61 *
|
|
62 * Rigorous (alias liberal or strong) mode:
|
|
63 * - If two characters are the same, it is retained whatever it is
|
|
64 * (A + A = A)
|
|
65 * - Otherwise:
|
|
66 * - If one is the missing character (?) the other is retained
|
|
67 * whatever it is (A + ? = A).
|
|
68 * - If characters are consistent, that is one contains
|
|
69 * more information, that one is retained (A + M = A).
|
|
70 * - If characters are not consistent, the closest
|
|
71 * generic symbol is retained (A + C = M).
|
|
72 * .
|
|
73 * Note that the feedback of inconsistent characters in the
|
|
74 * outcome is not garanteed.
|
|
75 * In fact, (A + A + G) will result in R (as expected) but (A +
|
|
76 * G + A) will result in A, masking the problem.
|
|
77 * However, the position will indeed be counted as inconsistent.
|
|
78 *
|
|
79 * Not rigorous (conservative/weak) mode:
|
|
80 * - If two characters are the same, it is retained whatever it
|
|
81 * is (A + A = A).
|
|
82 * - Otherwise:
|
|
83 * - If one is ? the other is retained whatever it is (A + ?
|
|
84 * = A).
|
|
85 * - Otherwise an inconsistent character (by default, Z) is
|
|
86 * retained (A + C = Z).
|
|
87 *
|
|
88 * Iterative process of consensus:
|
|
89 * - Each sequence is taken in turn.
|
|
90 * - Each pair involving the focus sequence is processed and a
|
|
91 * consensus is generated.
|
|
92 * - When all pair have been processsed, the consensus already
|
|
93 * generated are themselves iteratively processed until only one
|
|
94 * remains.
|
|
95 * - Note that at each time the last two are taken first.
|
|
96 *
|
|
97 * A transparent interface gives access to the data for all steps of
|
|
98 * the consensus process, as vectors that covers all pairs (including
|
|
99 * intermediate steps of the iterative procedure described above) as
|
|
100 * well as singleton sequences. For the latter, the second name is
|
|
101 * not filled and all counts are set to 0. Note also that the name of
|
|
102 * such singleton sequence is shortened to the separator as well.
|
|
103 *
|
|
104 */
|
|
105 class Consensus {
|
|
106 public:
|
|
107 /** \brief Constructor
|
|
108 *
|
|
109 */
|
|
110 Consensus();
|
|
111
|
|
112 /** \brief Destructor
|
|
113 *
|
|
114 */
|
|
115 virtual ~Consensus() {}
|
|
116
|
|
117 /// Sets the character interpreted as missing (default: ?)
|
|
118 void setMissing(char);
|
|
119
|
|
120 /// Sets the character used to point to disagreements (default: Z)
|
|
121 void setDisagreement(char);
|
|
122
|
|
123 /// Checks all the characters
|
|
124 bool check_sequences(Align& align);
|
|
125
|
|
126 /** \brief Reduces the sequence alignment by making consensus sequences
|
|
127 *
|
|
128 * \param align the original alignment.
|
|
129 *
|
|
130 * \param separator the character used to separated the root
|
|
131 * name of sequences to the variable part, as in (for the
|
|
132 * default value: "sequence_read1".
|
|
133 *
|
|
134 * \param rigorous consensus mode.
|
|
135 *
|
|
136 * \return An Align instance with duplicated sequences consensed.
|
|
137 *
|
|
138 */
|
|
139 Align consensus(Align& align, char separator='_', bool rigorous=true);
|
|
140
|
|
141 /// First name of consensed pairs
|
|
142 const std::vector<std::string>& firstSequenceNames();
|
|
143
|
|
144 /// Second names of consensed pairs
|
|
145 const std::vector<std::string>& secondSequenceNames();
|
|
146
|
|
147 /// Root names of consensed pairs
|
|
148 const std::vector<std::string>& roots();
|
|
149
|
|
150 /// Number of consistent positions for all consensed pairs
|
|
151 const std::vector<int>& consistentPositions();
|
|
152
|
|
153 /// Number of complementary positions for all consensed pairs
|
|
154 const std::vector<int>& complementaryPositions();
|
|
155
|
|
156 /// Number of uninformative positions for all consensed pairs
|
|
157 const std::vector<int>& uninformativePositions();
|
|
158
|
|
159 /// Number of ambiguous positions for all consensed pairs
|
|
160 const std::vector<int>& ambiguousPositions();
|
|
161
|
|
162 /// Number of at least partially resolved ambiguities for all consensed pairs
|
|
163 const std::vector<int>& atLeastPartiallyResolvedAmbiguities();
|
|
164
|
|
165 /// Vector of inconsistent positions ofr all consensed pairs
|
|
166 const std::vector<std::vector<int> >& inconsistentPositions();
|
|
167
|
|
168
|
|
169 private:
|
|
170
|
|
171 /** \brief Copying this class is not allowed
|
|
172 *
|
|
173 */
|
|
174 Consensus(const Consensus& source) { }
|
|
175
|
|
176 /** \brief Copying this class is not allowed
|
|
177 *
|
|
178 */
|
|
179 Consensus& operator=(const Consensus& source) { return *this; }
|
|
180
|
|
181 // A private helper
|
|
182 class PairwiseConsensus;
|
|
183
|
|
184 // report data
|
|
185 std::vector<std::string> t_firstSequenceNames;
|
|
186 std::vector<std::string> t_secondSequenceNames;
|
|
187 std::vector<std::string> t_roots;
|
|
188 std::vector<int> t_consistentPositions;
|
|
189 std::vector<int> t_complementaryPositions;
|
|
190 std::vector<int> t_uninformativePositions;
|
|
191 std::vector<int> t_ambiguousPositions;
|
|
192 std::vector<int> t_atLeastPartiallyResolvedAmbiguities;
|
|
193 std::vector<std::vector<int> > t_inconsistentPositions;
|
|
194
|
|
195 // Code for missing data (usually ?)
|
|
196 char MISSING;
|
|
197
|
|
198 // Code for disgrement
|
|
199 char DISAGREEMENT;
|
|
200
|
|
201 // Helper class managing a single pair
|
|
202 class PairwiseConsensus {
|
|
203 public:
|
|
204 // Default object creation
|
|
205 PairwiseConsensus();
|
|
206
|
|
207 // Object destruction
|
|
208 virtual ~PairwiseConsensus() {}
|
|
209
|
|
210 // Usual object creation
|
|
211 PairwiseConsensus(std::string, std::string);
|
|
212
|
|
213 // Fills an object created with the default constructor
|
|
214 void load(std::string,std::string);
|
|
215
|
|
216 // Changes the MISSING character
|
|
217 void setUndeterminedCharacter(char);
|
|
218
|
|
219 // Changes the DISAGREEMENT character
|
|
220 void setDisagreementCharacter(char);
|
|
221
|
|
222 /* Uses the conservative mode of consensus
|
|
223 *
|
|
224 * Tries to avoid to make decisions, and adds the
|
|
225 * character set by DISAGREEMENT upon inconsistencies
|
|
226 *
|
|
227 * return The number of inconsistencies.
|
|
228 */
|
|
229 int generateSoftConsensus();
|
|
230
|
|
231 /* Strict mode of consensus
|
|
232 *
|
|
233 * The number of inconsistencies.
|
|
234 */
|
|
235 int generateHardConsensus();
|
|
236
|
|
237 // Two fully resolved (including gap) and identical characters
|
|
238 int getConsistentPositions();
|
|
239
|
|
240 // One informative (including gap) and one missing
|
|
241 int getComplementaryPositions();
|
|
242
|
|
243 // None missing, but different and incompatible
|
|
244 int getInconsistentPositions();
|
|
245
|
|
246 // Both are missing
|
|
247 int getUninformativePositions();
|
|
248
|
|
249 // Both identical or one missing, but not fully resolved
|
|
250 int getAmbiguousPositions();
|
|
251
|
|
252 // Different, not missing, complementary.
|
|
253 int getAtLeastPartiallyResolvedAmbiguities();
|
|
254
|
|
255 // Accessor
|
|
256 int getThisInconsistentPosition(unsigned int);
|
|
257
|
|
258 // Generates the consensus sequence
|
|
259 std::string getConsensus();
|
|
260
|
|
261 private:
|
|
262
|
|
263 inline bool isValid(char c) {
|
|
264 switch (c) {
|
|
265 case 'A': case 'a':
|
|
266 case 'C': case 'c':
|
|
267 case 'G': case 'g':
|
|
268 case 'T': case 't':
|
|
269 return true;
|
|
270 default:
|
|
271 return false;
|
|
272 }
|
|
273 }
|
|
274
|
|
275 // This initiates a series of embedded objects
|
|
276 void setCharacterContainers();
|
|
277
|
|
278 // The first sequence
|
|
279 std::string seqA;
|
|
280
|
|
281 // The second sequence
|
|
282 std::string seqB;
|
|
283
|
|
284 // The resulting consensus
|
|
285 std::string cons;
|
|
286
|
|
287 // The vecotr storing the inconsistent positions
|
|
288 std::vector<int> posIncons;
|
|
289
|
|
290 // The length of the sequences
|
|
291 unsigned int ls;
|
|
292
|
|
293 // Counter
|
|
294 int cntConsistentPositions;
|
|
295
|
|
296 // Counter
|
|
297 int cntComplementaryPositions;
|
|
298
|
|
299 // Counter
|
|
300 int cntAmbiguousPositions;
|
|
301
|
|
302 // Counter
|
|
303 int cntInconsistentPositions;
|
|
304
|
|
305 // Counter
|
|
306 int cntUninformativePositions;
|
|
307
|
|
308 // Counter
|
|
309 int cntAtLeastPartiallyResolvedAmbiguities;
|
|
310
|
|
311 // Code for missing data (usually ?)
|
|
312 char MISSING;
|
|
313
|
|
314 // Code for disgrement
|
|
315 char DISAGREEMENT;
|
|
316
|
|
317 public:
|
|
318 // This class manages relationships different symbols
|
|
319 class CharacterContainer {
|
|
320 public:
|
|
321 // Default value: @
|
|
322 CharacterContainer();
|
|
323
|
|
324 // Initiates to a given symbol
|
|
325 CharacterContainer(const char&);
|
|
326
|
|
327 // Assignment operator
|
|
328 CharacterContainer& operator=(const char&);
|
|
329
|
|
330 // Sets the symbol
|
|
331 void setValue(char);
|
|
332
|
|
333 // Set the descendants
|
|
334 void setSons(std::vector<CharacterContainer>);
|
|
335
|
|
336 // Tests whether the symbol is the same
|
|
337 bool is(CharacterContainer);
|
|
338
|
|
339 // Tests if the query is contained amongst the sons
|
|
340 bool has(CharacterContainer);
|
|
341
|
|
342 // Tests if the query is contained amongst the sons
|
|
343 bool has(char);
|
|
344
|
|
345 /* Tests whether the left character has the left one
|
|
346 * Should be called on the N object only.
|
|
347 */
|
|
348 char lhas(CharacterContainer,CharacterContainer);
|
|
349
|
|
350 /* Creates the object with the proper sons
|
|
351 * Should be called on the N object only.
|
|
352 */
|
|
353 CharacterContainer init(char);
|
|
354
|
|
355 // The symbol
|
|
356 char value;
|
|
357
|
|
358 // The descendants
|
|
359 std::vector<CharacterContainer> sons;
|
|
360 };
|
|
361
|
|
362 private:
|
|
363 // Symbol ?
|
|
364 CharacterContainer ccQ;
|
|
365
|
|
366 // Symbol A
|
|
367 CharacterContainer ccA;
|
|
368
|
|
369 // Symbol C
|
|
370 CharacterContainer ccC;
|
|
371
|
|
372 // Symbol G
|
|
373 CharacterContainer ccG;
|
|
374
|
|
375 // Symbol T
|
|
376 CharacterContainer ccT;
|
|
377
|
|
378 // Symbol U
|
|
379 CharacterContainer ccU;
|
|
380
|
|
381 // Symbol M
|
|
382 CharacterContainer ccM;
|
|
383
|
|
384 // Symbol R
|
|
385 CharacterContainer ccR;
|
|
386
|
|
387 // Symbol W
|
|
388 CharacterContainer ccW;
|
|
389
|
|
390 // Symbol S
|
|
391 CharacterContainer ccS;
|
|
392
|
|
393 // Symbol Y
|
|
394 CharacterContainer ccY;
|
|
395
|
|
396 // Symbol K
|
|
397 CharacterContainer ccK;
|
|
398
|
|
399 // Symbol B
|
|
400 CharacterContainer ccB;
|
|
401
|
|
402 // Symbol D
|
|
403 CharacterContainer ccD;
|
|
404
|
|
405 // Symbol H
|
|
406 CharacterContainer ccH;
|
|
407
|
|
408 // Symbol V
|
|
409 CharacterContainer ccV;
|
|
410
|
|
411 // Symbol N
|
|
412 CharacterContainer ccN;
|
|
413
|
|
414 // Symbol -
|
|
415 CharacterContainer ccGAP;
|
|
416 };
|
|
417 };
|
|
418 }
|
|
419
|
|
420 #endif
|
|
421
|