Mercurial > repos > dereeper > sniplay
diff egglib/egglib-2.1.5/include/egglib-cpp/FStatistics.hpp @ 1:420b57c3c185 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 10 Jul 2015 04:39:30 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/egglib/egglib-2.1.5/include/egglib-cpp/FStatistics.hpp Fri Jul 10 04:39:30 2015 -0400 @@ -0,0 +1,288 @@ +/* + Copyright 2009 Stéphane De Mita, Mathieu Siol + + This file is part of the EggLib library. + + EggLib is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + EggLib is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with EggLib. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef EGGLIB_FSTATISTICS_HPP +#define EGGLIB_FSTATISTICS_HPP + + + +namespace egglib { + + + /** \brief Computes Fis, Fst and Fit from diploid data + * + * The class requires loading data. Data are loaded by individual + * (two genotypes per individual). The analyses are cached: they are + * performed upon the first call to statistics accessors. The cache + * is emptied whenever a datum is loaded. + * + * The computations are performed after Weir and Cockerham. The + * statistics F, theta and f are generalized for multiple alleles. + * To allow computation of multi-locus statistics, variance + * components are also available. The three components of the + * variance are Vpopulation (between-population), Vindividual + * (within-population, between-individual) and Vallele (within- + * individual). The formulas to compute the F-statistics are as + * follows: + * - 1-F = Vallele/(Vpopulation+Vindividual+Vallele) + * - theta = Vpopulation/(Vpopulation+Vindividual+Vallele) + * - 1-f = Vallele/(Vindividual+Vallele). + * + * \ingroup polymorphism + * + */ + class FStatistics { + + public: + + /** \brief Constructor + * + */ + FStatistics(); + + + /** \brief Destructor + * + */ + virtual ~FStatistics(); + + + /** \brief Reserve sufficient memory for a given number of + * individuals. + * + * This method makes the load function faster by allocating + * all required memory at once. + * + * \param numberOfIndividuals a strictly positive integer. + * + */ + void reserve(unsigned int numberOfIndividuals); + + + /** \brief Loads the data for one individual + * + * \param genotype1 an integer giving the first allele. + * \param genotype2 an integer giving the second allele. + * \param populationLabel an integer indication belonging to + * a population. + * + * Genotypes and population labels are not required to be + * consecutive (both are labels, not indices). They are + * internally mapped to indices (the mapping can be obtained + * by accessors populationLabel and allele). + * + * All genotypes are considered to be valid (no missing data). + * If statistics were computed previous to call to this + * function, all data will be erase. + * + */ + void loadIndividual(unsigned int genotype1, + unsigned int genotype2, unsigned int populationLabel); + + + /** \brief Label of a population + * + * The index corresponds to the local mapping of populations + * regardless of the ranking of population labels. (No out + * of bound checking.) + * + */ + unsigned int populationLabel(unsigned int populationIndex); + + + /** \brief Value of an allele + * + * The index corresponds to the local mapping of alleles + * regardless of the ranking of allele values. (No out of + * bound checking.) + * + */ + unsigned int alleleValue(unsigned int alleleIndex); + + + /// First allele of a given individual (no checking) + unsigned int firstAllele(unsigned int individualIndex) const; + + /// Second allele of a given individual (no checking) + unsigned int secondAllele(unsigned int individualIndex) const; + + /// Population label of a given individual (no checking) + unsigned int individualLabel(unsigned int individualIndex) const; + + + /** \brief Number of alleles + * + */ + unsigned int numberOfAlleles(); + + + /** \brief Number of populations + * + */ + unsigned int numberOfPopulations(); + + + /** \brief Number of loaded genotypes + * + */ + unsigned int numberOfGenotypes() const; + + + /** \brief Absolute total allele frequency + * + */ + unsigned int alleleFrequencyTotal(unsigned int alleleIndex); + + + /** \brief Absolute allele frequency in a population + * + */ + unsigned int alleleFrequencyPerPopulation(unsigned int populationIndex, unsigned int alleleIndex); + + + /** \brief Absolute genotype frequency + * + * Note that allele AB is considered different to BA (this + * means that values can be accessed both sides of the + * diagonal. + * + */ + unsigned int genotypeFrequencyTotal(unsigned int alleleIndex1, unsigned int alleleIndex2); + + + /** \brief Absolute genotype frequency in a population + * + * Note that allele AB is considered different to BA (this + * means that values can be accessed both sides of the + * diagonal. + * + */ + unsigned int genotypeFrequencyPerPopulation(unsigned int populationIndex, unsigned int alleleIndex1, unsigned int alleleIndex2); + + + /** \brief Sample size of a population + * + */ + unsigned int populationFrequency(unsigned int populationIndex); + + + /** \brief Weir-Cockerham F-statistic + * + * Note: equivalent to Fit. + * + */ + double F(); + + + /** \brief Weir-Cockerham theta-statistic + * + * Note: equivalent to Fst. + * + */ + double theta(); + + + /** \brief Weir-Cockerham f-statistic + * + * Note: equivalent to Fis. + * + */ + double f(); + + + /** \brief Between-population component of variance + * + */ + double Vpopulation(); + + + /** \brief Within-population, between-individual component of variance + * + */ + double Vindividual(); + + + /** \brief Within-individual component of variance + * + */ + double Vallele(); + + + protected: + + bool d_flag; + void d_init(); + void d_clear(); + unsigned int d_reserved; + unsigned int d_numberOfGenotypes; + unsigned int *d_genotypes; + unsigned int *d_populationLabels; + + bool s_flag; + void s_init(); + void s_clear(); + void s_compute(); + void processPopulations(); + void processAlleles(); + unsigned int getPopulationIndex(unsigned int) const; + unsigned int getAlleleIndex(unsigned int) const; + unsigned int s_numberOfAlleles; + unsigned int *s_alleleValueMapping; + unsigned int s_numberOfPopulations; + unsigned int *s_populationLabelMapping; + unsigned int *s_populationFrequencies; + unsigned int *s_alleleFrequenciesTotal; + unsigned int **s_alleleFrequenciesPerPopulation; + unsigned int **s_genotypeFrequenciesTotal; + unsigned int ***s_genotypeFrequenciesPerPopulation; + + bool w_flag; + void w_init(); + void w_clear(); + void w_compute(); + double w_F; + double w_T; + double w_f; + double *w_a; + double *w_b; + double *w_c; + double w_nbar; + double w_nc; + double *w_pbar; + double *w_ssquare; + double *w_hbar; + double w_sum_a; + double w_sum_b; + double w_sum_c; + double w_sum_abc; + double w_sum_bc; + + + private: + + FStatistics(const FStatistics& source) { } + + FStatistics& operator=(const FStatistics& source) { + return *this; + } + + }; +} + +#endif