comparison egglib/egglib-2.1.5/include/egglib-cpp/FStatistics.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
comparison
equal deleted inserted replaced
0:3e19d0dfcf3e 1:420b57c3c185
1 /*
2 Copyright 2009 Stéphane De Mita, Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef EGGLIB_FSTATISTICS_HPP
21 #define EGGLIB_FSTATISTICS_HPP
22
23
24
25 namespace egglib {
26
27
/** \brief Computes Fis, Fst and Fit from diploid data
 *
 * The class requires loading data. Data are loaded by individual
 * (two genotypes per individual). The analyses are cached: they are
 * performed upon the first call to statistics accessors. The cache
 * is emptied whenever a datum is loaded.
 *
 * The computations are performed after Weir and Cockerham. The
 * statistics F, theta and f are generalized for multiple alleles.
 * To allow computation of multi-locus statistics, variance
 * components are also available. The three components of the
 * variance are Vpopulation (between-population), Vindividual
 * (within-population, between-individual) and Vallele (within-
 * individual). The formulas to compute the F-statistics are as
 * follows:
 *   - 1-F = Vallele/(Vpopulation+Vindividual+Vallele)
 *   - theta = Vpopulation/(Vpopulation+Vindividual+Vallele)
 *   - 1-f = Vallele/(Vindividual+Vallele).
 *
 * Instances cannot be copied or assigned (the copy operations are
 * private and left undefined).
 *
 * \ingroup polymorphism
 *
 */
class FStatistics {

public:

    /** \brief Constructor
     *
     */
    FStatistics();


    /** \brief Destructor
     *
     */
    virtual ~FStatistics();


    /** \brief Reserve sufficient memory for a given number of
     * individuals.
     *
     * This method makes the load function faster by allocating
     * all required memory at once.
     *
     * \param numberOfIndividuals a strictly positive integer.
     *
     */
    void reserve(unsigned int numberOfIndividuals);


    /** \brief Loads the data for one individual
     *
     * \param genotype1 an integer giving the first allele.
     * \param genotype2 an integer giving the second allele.
     * \param populationLabel an integer indicating which population
     * the individual belongs to.
     *
     * Genotypes and population labels are not required to be
     * consecutive (both are labels, not indices). They are
     * internally mapped to indices (the mapping can be obtained
     * through the accessors populationLabel() and alleleValue()).
     *
     * All genotypes are considered to be valid (no missing data).
     * If statistics were computed prior to a call to this function,
     * they are discarded (the cache is emptied whenever a datum is
     * loaded).
     *
     */
    void loadIndividual(unsigned int genotype1,
                        unsigned int genotype2,
                        unsigned int populationLabel);


    /** \brief Label of a population
     *
     * The index corresponds to the local mapping of populations
     * regardless of the ranking of population labels. (No out
     * of bound checking.)
     *
     * \param populationIndex index of the population in the local
     * mapping.
     * \return the label associated with that index.
     *
     */
    unsigned int populationLabel(unsigned int populationIndex);


    /** \brief Value of an allele
     *
     * The index corresponds to the local mapping of alleles
     * regardless of the ranking of allele values. (No out of
     * bound checking.)
     *
     * \param alleleIndex index of the allele in the local mapping.
     * \return the allele value associated with that index.
     *
     */
    unsigned int alleleValue(unsigned int alleleIndex);


    /// First allele of a given individual (no checking)
    unsigned int firstAllele(unsigned int individualIndex) const;

    /// Second allele of a given individual (no checking)
    unsigned int secondAllele(unsigned int individualIndex) const;

    /// Population label of a given individual (no checking)
    unsigned int individualLabel(unsigned int individualIndex) const;


    /** \brief Number of alleles
     *
     */
    unsigned int numberOfAlleles();


    /** \brief Number of populations
     *
     */
    unsigned int numberOfPopulations();


    /** \brief Number of loaded genotypes
     *
     */
    unsigned int numberOfGenotypes() const;


    /** \brief Absolute total allele frequency
     *
     * \param alleleIndex index of the allele in the local mapping.
     *
     */
    unsigned int alleleFrequencyTotal(unsigned int alleleIndex);


    /** \brief Absolute allele frequency in a population
     *
     * \param populationIndex index of the population in the local
     * mapping.
     * \param alleleIndex index of the allele in the local mapping.
     *
     */
    unsigned int alleleFrequencyPerPopulation(unsigned int populationIndex,
                                              unsigned int alleleIndex);


    /** \brief Absolute genotype frequency
     *
     * Note that genotype AB is considered different from BA (this
     * means that values can be accessed on both sides of the
     * diagonal).
     *
     * \param alleleIndex1 index of the first allele.
     * \param alleleIndex2 index of the second allele.
     *
     */
    unsigned int genotypeFrequencyTotal(unsigned int alleleIndex1,
                                        unsigned int alleleIndex2);


    /** \brief Absolute genotype frequency in a population
     *
     * Note that genotype AB is considered different from BA (this
     * means that values can be accessed on both sides of the
     * diagonal).
     *
     * \param populationIndex index of the population in the local
     * mapping.
     * \param alleleIndex1 index of the first allele.
     * \param alleleIndex2 index of the second allele.
     *
     */
    unsigned int genotypeFrequencyPerPopulation(unsigned int populationIndex,
                                                unsigned int alleleIndex1,
                                                unsigned int alleleIndex2);


    /** \brief Sample size of a population
     *
     * \param populationIndex index of the population in the local
     * mapping.
     *
     */
    unsigned int populationFrequency(unsigned int populationIndex);


    /** \brief Weir-Cockerham F-statistic
     *
     * Note: equivalent to Fit.
     *
     */
    double F();


    /** \brief Weir-Cockerham theta-statistic
     *
     * Note: equivalent to Fst.
     *
     */
    double theta();


    /** \brief Weir-Cockerham f-statistic
     *
     * Note: equivalent to Fis.
     *
     */
    double f();


    /** \brief Between-population component of variance
     *
     */
    double Vpopulation();


    /** \brief Within-population, between-individual component of variance
     *
     */
    double Vindividual();


    /** \brief Within-individual component of variance
     *
     */
    double Vallele();


protected:

    // NOTE(review): the grouping comments below are inferred from the
    // member names and the public accessors only; the implementation is
    // not visible from this header. Confirm against the source file.
    // The declaration order is part of the object layout: do not reorder.

    // "d_" members: presumably the raw data loaded per individual.
    bool d_flag;
    void d_init();
    void d_clear();
    unsigned int d_reserved;
    unsigned int d_numberOfGenotypes;
    unsigned int *d_genotypes;
    unsigned int *d_populationLabels;

    // "s_" members: presumably the label/allele mappings and the
    // frequency tables exposed by the frequency accessors.
    bool s_flag;
    void s_init();
    void s_clear();
    void s_compute();
    void processPopulations();
    void processAlleles();
    unsigned int getPopulationIndex(unsigned int) const;
    unsigned int getAlleleIndex(unsigned int) const;
    unsigned int s_numberOfAlleles;
    unsigned int *s_alleleValueMapping;
    unsigned int s_numberOfPopulations;
    unsigned int *s_populationLabelMapping;
    unsigned int *s_populationFrequencies;
    unsigned int *s_alleleFrequenciesTotal;
    unsigned int **s_alleleFrequenciesPerPopulation;
    unsigned int **s_genotypeFrequenciesTotal;
    unsigned int ***s_genotypeFrequenciesPerPopulation;

    // "w_" members: presumably the Weir-Cockerham intermediate terms
    // and results returned by F(), theta(), f() and the V*() accessors.
    bool w_flag;
    void w_init();
    void w_clear();
    void w_compute();
    double w_F;
    double w_T;
    double w_f;
    double *w_a;
    double *w_b;
    double *w_c;
    double w_nbar;
    double w_nc;
    double *w_pbar;
    double *w_ssquare;
    double *w_hbar;
    double w_sum_a;
    double w_sum_b;
    double w_sum_c;
    double w_sum_abc;
    double w_sum_bc;


private:

    /// Copying is not supported. Declared private and intentionally
    /// left undefined: an accidental copy from inside the class then
    /// fails at link time rather than producing an object whose raw
    /// pointer members were never initialized.
    FStatistics(const FStatistics& source);

    /// Assignment is not supported (private and intentionally undefined).
    FStatistics& operator=(const FStatistics& source);

};
286 }
287
288 #endif