1
|
1 /*
|
|
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
|
|
3
|
|
4 This file is part of the EggLib library.
|
|
5
|
|
6 EggLib is free software: you can redistribute it and/or modify
|
|
7 it under the terms of the GNU General Public License as published by
|
|
8 the Free Software Foundation, either version 3 of the License, or
|
|
9 (at your option) any later version.
|
|
10
|
|
11 EggLib is distributed in the hope that it will be useful,
|
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 GNU General Public License for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
|
|
18 */
|
|
19
|
|
20
|
|
21 #ifndef EGGLIB_NUCLEOTIDEDIVERSITY_HPP
|
|
22 #define EGGLIB_NUCLEOTIDEDIVERSITY_HPP
|
|
23
|
|
24
|
|
25 #include "BaseDiversity.hpp"
|
|
26 #include <string>
|
|
27 #include <vector>
|
|
28
|
|
29
|
|
30
|
|
31 namespace egglib {
|
|
32
|
|
33
|
|
34 /** \brief Performs analyzes of population genetics
|
|
35 *
|
|
36 * \ingroup polymorphism
|
|
37 *
|
|
38 * This class computes several summary statistics based on
|
|
39 * nucleotide analysis. Note that it is possible to use the same
|
|
40 * object to analyze different data set. Calling the load() method
|
|
41 * erases all data preivously computed (if any). Calling the load()
|
|
42 * method is absolutely required to compute any statistics. Some
|
|
43 * statistics are not computed by default, but are if the
|
|
44 * corresponding accessor is used (only load() is required).
|
|
45 *
|
|
46 * Note that "unsecure" accessors don't perform out-of-bound checks.
|
|
47 *
|
|
48 * S is the number of varying sites (only in sites that were not
|
|
49 * rejected).
|
|
50 *
|
|
51 * eta is the minimum number of mutations, that is the sum of the
|
|
52 * number of alleles minus 1 for each varying site. eta = S if all
|
|
53 * sites have no variant or 2 alleles. eta is computed independently
|
|
54 * of the option multiple and IS NOT computed over lseff sites.
|
|
55 *
|
|
56 * Pi is the average number of pairwise differences between sequences
|
|
57 * (expressed here per site) or (as computed here) the mean per site
|
|
58 * (unbiased) heterozygosity. Pi is zero if no polymorphic sites.
|
|
59 *
|
|
60 * D is the Tajima's test of neutrality
|
|
61 * Ref. Tajima F.: Statistical method for testing the neutral
|
|
62 * mutation hypothesis by DNA polymorphism. Genetics 1989, 123:585-595.
|
|
63 * It is arbitrary set to 0 if no polymorphic sites.
|
|
64 *
|
|
65 * tW: thetaW: estimator of theta based on polymorphic sites (ref.
|
|
66 * e.g. Watterson 1975 Theor. Pop. Biol.).
|
|
67 * Both D and thetaW are computed assuming that rounded nseff samples
|
|
68 * have been sampled.
|
|
69 * The variance of D is computed using rounded nseff instead of ns.
|
|
70 *
|
|
71 * H is the Fay and Wu's test of neutrality.
|
|
72 * Z is the standardized version and E a similar test.
|
|
73 * Ref. Fay J. C., Wu C.-I.: Hitchhiking under positive Darwinian
|
|
74 * selection. Genetics 2000, 155:1405-1413. and Zeng K., Fu Y. X.,
|
|
75 * Shi S., Wu C.-I.: Statistical tests for detecting positive
|
|
76 * selection by utilizing high-frequency variants. Genetics 2006,
|
|
77 * 174:1431-9. Both are arbitrary set to 0 if no polymorphic or
|
|
78 * orientable sites.
|
|
79 *
|
|
80 * tH and tL: theta H: estimators of theta based on derived
|
|
81 * polymorphic sites (ref in Fay and Wu and Zeng al.). The variance
|
|
82 * of H/Z are computed assuming that rounded nseff samples have
|
|
83 * been sampled.
|
|
84 *
|
|
85 */
|
|
86 class NucleotideDiversity : public BaseDiversity {
|
|
87
|
|
88 public:
|
|
89
|
|
90 /** \brief Builds an object
|
|
91 *
|
|
92 */
|
|
93 NucleotideDiversity();
|
|
94
|
|
95
|
|
96 /** \brief Destroys an object
|
|
97 *
|
|
98 */
|
|
99 virtual ~NucleotideDiversity();
|
|
100
|
|
101
|
|
102 /** \brief Identifies polymorphic sites and computes basis
|
|
103 * statistics
|
|
104 *
|
|
105 * \param data an alignment object (subclass of CharMatrix).
|
|
106 * The presence of outgroup or of different populations will
|
|
107 * be detected based on the populationLabel members of the
|
|
108 * passed object. The populationLabel 999 will be interpreted
|
|
109 * as outgroups. If several outgroups are passed, sites were
|
|
110 * the outgroups are not consistent will be treated as "non-
|
|
111 * orientable".
|
|
112 *
|
|
113 * \param allowMultipleMutations if true, sites with more
|
|
114 * than two alleles will not be ignored. The sum of the
|
|
115 * frequencies of all alleles not matching the outgroup will
|
|
116 * treated as the derived allele frequency (for orientable
|
|
117 * sites).
|
|
118 *
|
|
119 * \param minimumExploitableData sites where the non-missing
|
|
120 * data (as defined by characterMapping) are at a frequency
|
|
121 * larger than this value will be removed from the analysis.
|
|
122 * Use 1. to take only 'complete' sites into account and 0.
|
|
123 * to use all sites. (The outgroup is not considered in this
|
|
124 * computation.)
|
|
125 *
|
|
126 * \param ignoreFrequency removes sites that are polymorph
|
|
127 * because of an allele at absolute frequency smaller than or
|
|
128 * equal to this value. If ignoreFrequency=1, no sites are
|
|
129 * removed, if ignoreFrequency=0, singleton sites are
|
|
130 * ignored. Such sites are completely removed from the
|
|
131 * analysis (not counted in lseff). Note that if more than
|
|
132 * one mutation is allowed, the site is removed only if all
|
|
133 * the alleles but one are smaller than or equal to this
|
|
134 * value. For example, an alignment column AAAAAAGAAT is
|
|
135 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
|
|
136 * conserved (including the third allele T which is a
|
|
137 * singleton).
|
|
138 *
|
|
139 * \param characterMapping a string giving the list of
|
|
140 * characters that should be considered as valid data. If a
|
|
141 * space is present in the string, the characters left of the
|
|
142 * space will be treated as valid data and the characters
|
|
143 * right of the space will be treated as missing data, that
|
|
144 * is tolerated but ignored. All characters not in the string
|
|
145 * will cause an EggInvalidCharacterError to be raised.
|
|
146 *
|
|
147 * \param useZeroAsAncestral if true, all outgroups (if
|
|
148 * present) will be ignored and the character "0" will be
|
|
149 * considered as ancestral for all sites, whatever the
|
|
150 * character mapping.
|
|
151 *
|
|
152 */
|
|
153 virtual void load(
|
|
154 CharMatrix& data,
|
|
155 bool allowMultipleMutations=false,
|
|
156 double minimumExploitableData=1.,
|
|
157 unsigned int ignoreFrequency=0,
|
|
158 std::string characterMapping=dnaMapping,
|
|
159 bool useZeroAsAncestral=false
|
|
160 );
|
|
161
|
|
162
|
|
163 // accessors for the "site analysis" section
|
|
164
|
|
165 /// Number of polymorphic sites
|
|
166 unsigned int S() const;
|
|
167
|
|
168 /// Number of polymorphic orientable sites
|
|
169 unsigned int So() const;
|
|
170
|
|
171 /// Minimum number of mutations
|
|
172 unsigned int eta() const;
|
|
173
|
|
174 /// Average of per-site number of sequences effectively used
|
|
175 double nseff() const;
|
|
176
|
|
177 /// Number of sites effectively used
|
|
178 unsigned int lseff() const;
|
|
179
|
|
180 /// Average of number of sequences effectively used at orientable sites
|
|
181 double nseffo() const;
|
|
182
|
|
183 /// Number of orientable sites
|
|
184 unsigned int lseffo() const;
|
|
185
|
|
186 /// Number of detected populations
|
|
187 unsigned int npop() const;
|
|
188
|
|
189 /// Label of the population with given index (unsecure)
|
|
190 unsigned int popLabel(unsigned int popIndex) const; // no check!
|
|
191
|
|
192
|
|
193 // accessors for the "diversity" section
|
|
194
|
|
195 /// Nucleotide diversity
|
|
196 double Pi();
|
|
197
|
|
198 /// Watterson estimator of theta
|
|
199 double thetaW();
|
|
200
|
|
201 /// Average of Pi over populations
|
|
202 double average_Pi();
|
|
203
|
|
204 /// Pi of a given population (unsecure)
|
|
205 double pop_Pi(unsigned int popIndex); // no check!
|
|
206
|
|
207 /// Tajima's D
|
|
208 double D();
|
|
209
|
|
210 // accessors for the "outgroup diversity" section
|
|
211
|
|
212 /// Fay and Wu estimator of theta
|
|
213 double thetaH();
|
|
214
|
|
215 /// Zeng et al. estimator of theta
|
|
216 double thetaL();
|
|
217
|
|
218 /// Fay and Wu's H
|
|
219 double H();
|
|
220
|
|
221 /// Standardized H
|
|
222 double Z();
|
|
223
|
|
224 /// Zeng et al.'s E
|
|
225 double E();
|
|
226
|
|
227 // accessors for the "differentiation" section
|
|
228
|
|
229 /// Number of sites with at least one fixed difference
|
|
230 unsigned int FixedDifferences();
|
|
231
|
|
232 /// Number of sites with at least one allele shared among at least two populations
|
|
233 unsigned int CommonAlleles();
|
|
234
|
|
235 /// Number of sites with at least one non-fixed allele shared among at least two populations
|
|
236 unsigned int SharedAlleles();
|
|
237
|
|
238 /// Number of sites with at least one allele specific to one population
|
|
239 unsigned int SpecificAlleles();
|
|
240
|
|
241 /// Number of sites with at least one derived allele specific to one population
|
|
242 unsigned int SpecificDerivedAlleles();
|
|
243
|
|
244 /// Number of polymorphisms in a given population (unsecure)
|
|
245 unsigned int Polymorphisms(unsigned int pop);
|
|
246
|
|
247 /// Number of specific alleles for a given population (unsecure)
|
|
248 unsigned int SpecificAlleles(unsigned int pop);
|
|
249
|
|
250 /// Number of specific derived allele for a given population (unsecure)
|
|
251 unsigned int SpecificDerivedAlleles(unsigned int pop);
|
|
252
|
|
253 /// Number of fixed differences between a given pair of populations (unsecure; pop2 must be larger than pop1)
|
|
254 unsigned int FixedDifferences(unsigned int pop1, unsigned int pop2);
|
|
255
|
|
256 /// Number of common alleles between a given pair of populations (unsecure; pop2 must be larger than pop1)
|
|
257 unsigned int CommonAlleles(unsigned int pop1, unsigned int pop2);
|
|
258
|
|
259 /// Number of shared non-fixed alleles between a given pair of populations (unsecure; pop2 must be larger than pop1)
|
|
260 unsigned int SharedAlleles(unsigned int pop1, unsigned int pop2);
|
|
261
|
|
262
|
|
263 // accessor for the "triConfigurations" section
|
|
264
|
|
265 /** \brief Number falling into one of the possible site configurations
|
|
266 *
|
|
267 * The statistics are limited to three populations.
|
|
268 * Assuming an unrooted A/G polymorphism (A and G can be
|
|
269 * substitued), the site configurations are:
|
|
270 * - 0: A&G A A specific 1
|
|
271 * - 1: A&G A G specific 1 + fixed 2-3
|
|
272 * - 2: A A&G A specific 2
|
|
273 * - 3: A A&G G specific 2 + fixed 1-3
|
|
274 * - 4: A A A&G specific 3
|
|
275 * - 5: A G A&G specific 3 + fixed 1-2
|
|
276 * - 6: A&G A&G A shared 1-2
|
|
277 * - 7: A&G A A&G shared 1-3
|
|
278 * - 8: A A&G A&G shared 2-3
|
|
279 * - 9: A&G A&G A&G shared 1-2-3
|
|
280 * - 10: A G G fixed 1
|
|
281 * - 11: A G A fixed 2
|
|
282 * - 12: A A G fixed 3
|
|
283 *
|
|
284 * \param index must be an index from 0 to 12.
|
|
285 *
|
|
286 */
|
|
287 unsigned int triConfiguration(unsigned int index);
|
|
288
|
|
289
|
|
290 /// Builds and returns the vector of positions of all polymorphic sites
|
|
291 std::vector<unsigned int> polymorphic_positions() const;
|
|
292
|
|
293
|
|
294 /** \brief Builds and returns the vector of positions of all singleton sites
|
|
295 *
|
|
296 * A site singleton when it is polymorphic according to
|
|
297 * parameter of the diversity analysis, when it has exactly two
|
|
298 * alleles and one of them is at absolute frequency 1 (one
|
|
299 * copy) disregarding the outgroup.
|
|
300 *
|
|
301 */
|
|
302 std::vector<unsigned int> singleton_positions() const;
|
|
303
|
|
304
|
|
305 protected:
|
|
306
|
|
307 /** \brief This class cannot be copied
|
|
308 *
|
|
309 */
|
|
310 NucleotideDiversity(const NucleotideDiversity& source) { }
|
|
311
|
|
312
|
|
313 /** \brief This class cannot be copied
|
|
314 *
|
|
315 */
|
|
316 NucleotideDiversity& operator=(const NucleotideDiversity& source) { return *this; }
|
|
317
|
|
318
|
|
319 void init(); // initializes values
|
|
320 void clear(); // free memory but doesn't initializes
|
|
321
|
|
322 // diversity (without outgroup)
|
|
323 void diversity();
|
|
324
|
|
325 // diversity with outgroup
|
|
326 void outgroupDiversity();
|
|
327
|
|
328 // site patterns
|
|
329 void differentiation();
|
|
330
|
|
331 // triconfigurations
|
|
332 void triConfigurations();
|
|
333
|
|
334
|
|
335 // holders for statistics, with booleans flagging groups of stats
|
|
336
|
|
337 bool b_analysisSites;
|
|
338
|
|
339 bool b_diversity;
|
|
340
|
|
341 double v_Pi; // nucleotide diversity
|
|
342 double v_thetaW; // theta (Watterson estimator)
|
|
343 double v_average_Pi; // average diversity across populations
|
|
344 double *v_pop_Pi; // diversity per population
|
|
345 double v_D; // Tajima's D
|
|
346
|
|
347 bool b_outgroupDiversity;
|
|
348
|
|
349 double v_thetaH; // theta (Fay and Wu estimator)
|
|
350 double v_thetaL; // theta (Zeng estimator)
|
|
351 double v_H; // Fay and Wu's H
|
|
352 double v_Z; // normalized Fay and Wu's H
|
|
353 double v_E; // Zeng et al.'s E
|
|
354
|
|
355 bool b_differentiation;
|
|
356
|
|
357 unsigned int *v_pairwiseFixedDifferences;
|
|
358 unsigned int *v_pairwiseCommonAlleles;
|
|
359 unsigned int *v_pairwiseSharedAlleles;
|
|
360 unsigned int *v_popPolymorphic;
|
|
361 unsigned int *v_popSpecific;
|
|
362 unsigned int *v_popSpecificDerived;
|
|
363 unsigned int v_countFixedDifferences;
|
|
364 unsigned int v_countCommonAlleles;
|
|
365 unsigned int v_countSharedAlleles;
|
|
366 unsigned int v_countSpecificAlleles;
|
|
367 unsigned int v_countSpecificDerivedAlleles;
|
|
368
|
|
369
|
|
370 bool b_triConfigurations;
|
|
371
|
|
372 unsigned int *v_triConfigurations;
|
|
373
|
|
374 };
|
|
375 }
|
|
376
|
|
377 #endif
|