1
|
1 /*
|
|
2 Copyright 2009, 2010, 2012 Stéphane De Mita, Mathieu Siol
|
|
3
|
|
4 This file is part of the EggLib library.
|
|
5
|
|
6 EggLib is free software: you can redistribute it and/or modify
|
|
7 it under the terms of the GNU General Public License as published by
|
|
8 the Free Software Foundation, either version 3 of the License, or
|
|
9 (at your option) any later version.
|
|
10
|
|
11 EggLib is distributed in the hope that it will be useful,
|
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 GNU General Public License for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
|
|
18 */
|
|
19
|
|
20 #ifndef EGGLIB_MUTATOR_HPP
|
|
21 #define EGGLIB_MUTATOR_HPP
|
|
22
|
|
23
|
|
24 #include "DataMatrix.hpp"
|
|
25 #include "Random.hpp"
|
|
26 #include "Arg.hpp"
|
|
27 #include "Mutation.hpp"
|
|
28
|
|
29
|
|
30 namespace egglib {
|
|
31
|
|
32
|
|
33 /** \brief Implements mutation models
|
|
34 *
|
|
35 * \ingroup coalesce
|
|
36 *
|
|
37 * Works with a previously built Ancestral Reconbination Graph. The
|
|
38 * user must sets options using the setter-based interface. After
|
|
39 * that he or she can call the method mute() that will generates
|
|
40 * a DataMatrix object.
|
|
41 *
|
|
42 * Genotype data are represented by integer numbers. Regardless of
|
|
43 * the mutation model, the ancestral state is always 0. The user can
|
|
44 * set the rate of mutation (or, alternatively, fix the number of
|
|
45 * mutations that occurred - which is the number of segregating sites
|
|
46 * only with an infinite site model).
|
|
47 *
|
|
48 * Other options fall into two separate groups: the positions of the
|
|
49 * mutated sites and the process of mutation (how new alleles are
|
|
50 * generated).
|
|
51 *
|
|
52 * Concerning allele generation, several mutation models are available
|
|
53 * (coded by single letters):
|
|
54 * - F: fixed number of alleles. Among other markers, this model is
|
|
55 * appropriate for simulating nucleotides. The user is able
|
|
56 * to choose the number of alleles (where 2 is the standard
|
|
57 * for an infinite site model and 4 for a finite site model).
|
|
58 * Mutator allows assigning independent weights between all
|
|
59 * different transition types and can draw randomly the
|
|
60 * ancestral states, providing a way to emulate evolution of
|
|
61 * nucleotides with multiple mutations at the same site and
|
|
62 * reversion.
|
|
63 * - I: infinite number of alleles. At a given site, each mutation
|
|
64 * raises a new allele. The value of the alleles is therefore
|
|
65 * irrelevant (it only denotes its order of appearance). This
|
|
66 * model does not permit homoplasy.
|
|
67 * - S: stepwise mutation model. In this model the value of the
|
|
68 * alleles are interpreted as a size (typically for simulating
|
|
69 * a microsatellite marker). Each mutation either increases
|
|
70 * or decreases the allele size by an increment of one.
|
|
71 * - T: two-phase mutation model. This model is a generalization
|
|
72 * of the stepwise mutation model (S). For a mutation, the
|
|
73 * increment (either increase or decrease) is 1 with the
|
|
74 * probability given by the parameter (1-TPMproba). With
|
|
75 * probability TPMproba, the increment is drawn from a
|
|
76 * geometric distribution of parameter given by the other
|
|
77 * parameter (TPMparam).
|
|
78 *
|
|
79 * By default, the program will assume an infinite site model (ISM).
|
|
80 * Each mutation will occur to a new position drawn from the interval
|
|
81 * [0,1]. It is possible to set any mutation model with an ISM
|
|
82 * (including microsatellite-like models I, S and T). Alternatively,
|
|
83 * the user can specify a finite number of sites available for
|
|
84 * mutation. For a microsatellite marker, the user will want to
|
|
85 * specify a single site. It is possible to set a finite number of
|
|
86 * sites for all mutation models. In all cases, the mutations will
|
|
87 * be forced to target these sites. It is possible to apply weights
|
|
88 * independently to all sites. The higher the weight value
|
|
89 * (comparatively to the other sites), the higher the probability
|
|
90 * that this site mutes. The weights needs not to be relative. In
|
|
91 * addition, the user can set the positions of the different sites.
|
|
92 * Nothings forces him or her to place them in order. Note that this
|
|
93 * does not affect the mutation process, but on the amount of
|
|
94 * recombination that will be allowed between sites.
|
|
95 *
|
|
96 */
|
|
97 class Mutator {
|
|
98
|
|
99 public:
|
|
100
|
|
101 /** \brief Initializes with default values
|
|
102 *
|
|
103 * List of default values:
|
|
104 * - theta = 0
|
|
105 * - fixedNumberOfMutations = 0
|
|
106 * - model = F (fixed number of alleles)
|
|
107 * - fixed number of alleles = 2
|
|
108 * - infinite site model
|
|
109 * - TPM parameters are both preset to 0.5
|
|
110 *
|
|
111 */
|
|
112 Mutator();
|
|
113
|
|
114
|
|
115 /** \brief Destroys the object
|
|
116 *
|
|
117 */
|
|
118 ~Mutator();
|
|
119
|
|
120
|
|
121 /** \brief Copy constructor
|
|
122 *
|
|
123 */
|
|
124 Mutator(const Mutator&);
|
|
125
|
|
126
|
|
127 /** \brief Assignement operator
|
|
128 *
|
|
129 */
|
|
130 Mutator& operator=(const Mutator&);
|
|
131
|
|
132
|
|
133 /** \brief Restores default values of all parameters
|
|
134 *
|
|
135 */
|
|
136 void reset();
|
|
137
|
|
138
|
|
139 /** \brief Gets the fixed number of mutations
|
|
140 *
|
|
141 */
|
|
142 unsigned int fixedNumberOfMutations() const;
|
|
143
|
|
144
|
|
145 /** \brief Sets the fixed number of mutations
|
|
146 *
|
|
147 * The value can be 0. It is not allowed to set both the
|
|
148 * fixed number of mutations and the mutation rate at
|
|
149 * non-zero value
|
|
150 *
|
|
151 */
|
|
152 void fixedNumberOfMutations(unsigned int);
|
|
153
|
|
154
|
|
155 /** \brief Gets the mutation rate
|
|
156 *
|
|
157 */
|
|
158 double mutationRate() const;
|
|
159
|
|
160
|
|
161 /** \brief Sets the mutation rate
|
|
162 *
|
|
163 * The value cannot be negative. The value can be 0. It is
|
|
164 * not allowed to set both the fixed number of mutations and
|
|
165 * the mutation rate at non-zero value
|
|
166 *
|
|
167 */
|
|
168 void mutationRate(double);
|
|
169
|
|
170
|
|
171 /** \brief Gets the mutation model
|
|
172 *
|
|
173 * See the class documentation for the signification of the
|
|
174 * different one-letter codes.
|
|
175 *
|
|
176 */
|
|
177 char mutationModel() const;
|
|
178
|
|
179
|
|
180 /** \brief Sets the mutation model
|
|
181 *
|
|
182 * The passed character must be one of F, I, S and T. See the
|
|
183 * class documentation for their signification.
|
|
184 *
|
|
185 */
|
|
186 void mutationModel(char);
|
|
187
|
|
188
|
|
189 /** \brief Gets the fixed number of possible alleles
|
|
190 *
|
|
191 */
|
|
192 unsigned int numberOfAlleles() const;
|
|
193
|
|
194
|
|
195 /** \brief Sets the fixed number of possible alleles
|
|
196 *
|
|
197 * The value must be larger than 1. This parameter is
|
|
198 * meaningful only for the fixed number allele model of
|
|
199 * mutation, and ignored otherwise.
|
|
200 *
|
|
201 */
|
|
202 void numberOfAlleles(unsigned int);
|
|
203
|
|
204
|
|
205 /** \brief Sets a transition weight
|
|
206 *
|
|
207 * \param i row (previous allele index).
|
|
208 * \param j column (next allele index).
|
|
209 * \param value weight to apply.
|
|
210 *
|
|
211 * Indices i and j must be different. Weights can be any
|
|
212 * strictly positive value.
|
|
213 *
|
|
214 */
|
|
215 void transitionWeight(unsigned int i, unsigned int j, double value);
|
|
216
|
|
217
|
|
218 /** \brief Gets a transition weight
|
|
219 *
|
|
220 * \param i row (previous allele index).
|
|
221 * \param j column (next allele index).
|
|
222 *
|
|
223 * Indices i and j must be different.
|
|
224 *
|
|
225 */
|
|
226 double transitionWeight(unsigned int i, unsigned int j);
|
|
227
|
|
228
|
|
229 /** \brief Set to true to draw ancestral alleles randomly
|
|
230 *
|
|
231 * By default, the ancestral allele is always 0. If this
|
|
232 * variable is set to true, the ancestral allele will be
|
|
233 * randomly drawn from the defined number of alleles. This
|
|
234 * option is always ignored unless in combination with the
|
|
235 * Fixed Allele Number model.
|
|
236 *
|
|
237 */
|
|
238 void randomAncestralAllele(bool flag);
|
|
239
|
|
240
|
|
241 /** \brief true if ancestral alleles must be drawn randomly
|
|
242 *
|
|
243 */
|
|
244 bool randomAncestralAllele() const;
|
|
245
|
|
246
|
|
247 /** \brief Gets the TPM probability parameter
|
|
248 *
|
|
249 */
|
|
250 double TPMproba() const;
|
|
251
|
|
252
|
|
253 /** \brief Sets the TPM probability parameter
|
|
254 *
|
|
255 * This parameter is considered only if the mutation model
|
|
256 * is T (two-phase mutation model). It gives the probability
|
|
257 * that a mutation step is not fixed to be 1. If TPMproba is
|
|
258 * 0, the mutation model is SMM.
|
|
259 *
|
|
260 * The value must be >=0. and <=1.
|
|
261 *
|
|
262 */
|
|
263 void TPMproba(double value);
|
|
264
|
|
265
|
|
266 /** \brief Gets the TPM distribution parameter
|
|
267 *
|
|
268 */
|
|
269 double TPMparam() const;
|
|
270
|
|
271
|
|
272 /** \brief Sets the TPM distribution parameter
|
|
273 *
|
|
274 * This parameter is considered only if the mutation model
|
|
275 * is T (two-phase mutation model). It gives the parameter
|
|
276 * of the geometric distribution which is used to generate
|
|
277 * the mutation step (if it is not one).
|
|
278 *
|
|
279 * The value must be >=0. and <=1.
|
|
280 *
|
|
281 */
|
|
282 void TPMparam(double value);
|
|
283
|
|
284
|
|
285 /** \brief Gets the number of mutable sites
|
|
286 *
|
|
287 * A value a zero must be interpreted as the infinite site
|
|
288 * model. Note that after all calls all data from the tables
|
|
289 * sitePositions and siteWeights will be reset.
|
|
290 *
|
|
291 */
|
|
292 unsigned int numberOfSites() const;
|
|
293
|
|
294
|
|
295 /** \brief Sets the number of mutable sites
|
|
296 *
|
|
297 * The value of zero is accepted and imposed the infinite
|
|
298 * site model.
|
|
299 *
|
|
300 */
|
|
301 void numberOfSites(unsigned int);
|
|
302
|
|
303
|
|
304 /** \brief Gets the position of a given site
|
|
305 *
|
|
306 */
|
|
307 double sitePosition(unsigned int siteIndex) const;
|
|
308
|
|
309
|
|
310 /** \brief Set the position of a given site
|
|
311 *
|
|
312 * The position must be >=0 and <=1
|
|
313 *
|
|
314 */
|
|
315 void sitePosition(unsigned int siteIndex, double position);
|
|
316
|
|
317
|
|
318 /** \brief Gets the mutation weight of a given site
|
|
319 *
|
|
320 */
|
|
321 double siteWeight(unsigned int siteIndex) const;
|
|
322
|
|
323
|
|
324 /** \brief Set the site weight of a given site
|
|
325 *
|
|
326 * The weight must be strictly positive.
|
|
327 *
|
|
328 */
|
|
329 void siteWeight(unsigned int siteIndex, double weight);
|
|
330
|
|
331
|
|
332 /** \brief Performs mutation
|
|
333 *
|
|
334 * \param arg Ancestral recombination graph instance. If the
|
|
335 * ARG is partially built or not a all, or improperly so,
|
|
336 * the behaviour of this method is not defined.
|
|
337 *
|
|
338 * \param random The address of a Random instance to be
|
|
339 * used for generating random numbers.
|
|
340 *
|
|
341 * \return A DataMatrix instance containing simulated data.
|
|
342 *
|
|
343 */
|
|
344 DataMatrix mute(Arg* arg, Random* random);
|
|
345
|
|
346
|
|
347 /** \brief Gets the last number of mutations
|
|
348 *
|
|
349 * Returns the number of mutations of the last call of mute( ).
|
|
350 * By default, this method returns 0.
|
|
351 *
|
|
352 */
|
|
353 unsigned int numberOfMutations() const;
|
|
354
|
|
355
|
|
356 private:
|
|
357
|
|
358 void clear();
|
|
359 void init();
|
|
360 void copy(const Mutator&);
|
|
361
|
|
362 //int nextAllele(int allele, Random* random);
|
|
363 int TPMstep(double inTPMproba, Random* random);
|
|
364 void apply_mutation(unsigned int matrixIndex,
|
|
365 unsigned int actualSite, DataMatrix& data,
|
|
366 const Edge* edge, int allele,
|
|
367 unsigned int segment, Random* random);
|
|
368
|
|
369
|
|
370 char _model;
|
|
371 double _mutationRate;
|
|
372 unsigned int _fixedNumberOfMutations;
|
|
373 unsigned int _numberOfAlleles;
|
|
374 double** _transitionWeights;
|
|
375 bool _randomAncestralAllele;
|
|
376 unsigned int _numberOfSites;
|
|
377 double* _sitePositions;
|
|
378 double* _siteWeights;
|
|
379 double _TPMproba;
|
|
380 double _TPMparam;
|
|
381 int maxAllele;
|
|
382 unsigned int _numberOfMutations;
|
|
383 std::vector<Mutation> _cache_mutations;
|
|
384 unsigned int _cache_mutations_reserved;
|
|
385
|
|
386 };
|
|
387
|
|
388
|
|
389 bool compare(Mutation mutation1, Mutation mutation2); // returns True if mutation1 is older
|
|
390
|
|
391 }
|
|
392
|
|
393
|
|
394
|
|
395
|
|
396 #endif
|
|
397
|