diff egglib/egglib-2.1.5/include/egglib-cpp/FStatistics.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/egglib/egglib-2.1.5/include/egglib-cpp/FStatistics.hpp	Fri Jul 10 04:39:30 2015 -0400
@@ -0,0 +1,288 @@
+/*
+    Copyright 2009 Stéphane De Mita, Mathieu Siol
+
+    This file is part of the EggLib library.
+
+    EggLib is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    EggLib is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef EGGLIB_FSTATISTICS_HPP
+#define EGGLIB_FSTATISTICS_HPP
+
+
+
+namespace egglib {
+
+
+    /** \brief Computes Fis, Fst and Fit from diploid data
+    *
+    * The class requires loading data. Data are loaded by individual
+    * (two genotypes per individual). The analyses are cached: they are
+    * performed upon the first call to statistics accessors. The cache
+    * is emptied whenever a datum is loaded.
+    * 
+    * The computations are performed after Weir and Cockerham. The
+    * statistics F, theta and f are generalized for multiple alleles.
+    * To allow computation of multi-locus statistics, variance
+    * components are also available. The three components of the
+    * variance are Vpopulation (between-population), Vindividual
+    * (within-population, between-individual) and Vallele (within-
+    * individual). The formulas to compute the F-statistics are as
+    * follows:
+    *       - 1-F = Vallele/(Vpopulation+Vindividual+Vallele)
+    *       - theta = Vpopulation/(Vpopulation+Vindividual+Vallele)
+    *       - 1-f = Vallele/(Vindividual+Vallele).
+    * 
+    * \ingroup polymorphism
+    *
+    */
+    class FStatistics {
+    
+        public:
+    
+           /** \brief Constructor
+            * 
+            */ 
+            FStatistics();
+
+            
+           /** \brief Destructor
+            * 
+            */ 
+            virtual ~FStatistics();
+
+            
+           /** \brief Reserve sufficient memory for a given number of
+            * individuals.
+            * 
+            * This method makes the load function faster by allocating
+            * all required memory at once.
+            * 
+            * \param numberOfIndividuals a strictly positive integer.
+            * 
+            */
+            void reserve(unsigned int numberOfIndividuals);
+
+
+           /** \brief Loads the data for one individual
+            * 
+            * \param genotype1 an integer giving the first allele.
+            * \param genotype2 an integer giving the second allele.
+            * \param populationLabel an integer indication belonging to
+            * a population.
+            * 
+            * Genotypes and population labels are not required to be
+            * consecutive (both are labels, not indices). They are
+            * internally mapped to indices (the mapping can be obtained
+            * by accessors populationLabel and allele).
+            * 
+            * All genotypes are considered to be valid (no missing data).
+            * If statistics were computed previous to call to this
+            * function, all data will be erase.
+            * 
+            */
+            void loadIndividual(unsigned int genotype1,
+                    unsigned int genotype2, unsigned int populationLabel);
+
+
+           /** \brief Label of a population
+            * 
+            * The index corresponds to the local mapping of populations
+            * regardless of the ranking of population labels. (No out
+            * of bound checking.)
+            * 
+            */
+            unsigned int populationLabel(unsigned int populationIndex);
+
+
+           /** \brief Value of an allele
+            * 
+            * The index corresponds to the local mapping of alleles
+            * regardless of the ranking of allele values. (No out of
+            * bound checking.)
+            * 
+            */
+            unsigned int alleleValue(unsigned int alleleIndex);
+
+
+            /// First allele of a given individual (no checking)
+            unsigned int firstAllele(unsigned int individualIndex) const;
+
+            /// Second allele of a given individual (no checking)
+            unsigned int secondAllele(unsigned int individualIndex) const;
+
+            /// Population label of a given individual (no checking)
+            unsigned int individualLabel(unsigned int individualIndex) const;
+
+
+           /** \brief Number of alleles
+            * 
+            */
+            unsigned int numberOfAlleles();
+
+
+           /** \brief Number of populations
+            * 
+            */
+            unsigned int numberOfPopulations();
+
+
+           /** \brief Number of loaded genotypes
+            * 
+            */
+            unsigned int numberOfGenotypes() const;
+
+
+           /** \brief Absolute total allele frequency
+            * 
+            */
+            unsigned int alleleFrequencyTotal(unsigned int alleleIndex);
+            
+
+           /** \brief Absolute allele frequency in a population
+            * 
+            */
+            unsigned int alleleFrequencyPerPopulation(unsigned int populationIndex, unsigned int alleleIndex);
+
+
+           /** \brief Absolute genotype frequency
+            * 
+            * Note that allele AB is considered different to BA (this
+            * means that values can be accessed both sides of the
+            * diagonal.
+            * 
+            */
+            unsigned int genotypeFrequencyTotal(unsigned int alleleIndex1, unsigned int alleleIndex2);
+
+
+           /** \brief Absolute genotype frequency in a population
+            * 
+            * Note that allele AB is considered different to BA (this
+            * means that values can be accessed both sides of the
+            * diagonal.
+            * 
+            */
+            unsigned int genotypeFrequencyPerPopulation(unsigned int populationIndex, unsigned int alleleIndex1, unsigned int alleleIndex2);
+
+            
+           /** \brief Sample size of a population
+            * 
+            */
+            unsigned int populationFrequency(unsigned int populationIndex);
+
+
+           /** \brief Weir-Cockerham F-statistic
+            * 
+            * Note: equivalent to Fit.
+            * 
+            */
+            double F();
+
+
+           /** \brief Weir-Cockerham theta-statistic
+            * 
+            * Note: equivalent to Fst.
+            * 
+            */
+            double theta();
+
+
+           /** \brief Weir-Cockerham f-statistic
+            * 
+            * Note: equivalent to Fis.
+            * 
+            */
+            double f();
+            
+
+           /** \brief Between-population component of variance
+            * 
+            */
+            double Vpopulation();
+
+
+           /** \brief Within-population, between-individual component of variance
+            * 
+            */
+            double Vindividual();
+            
+            
+           /** \brief Within-individual component of variance
+            * 
+            */
+            double Vallele();
+
+
+        protected:
+    
+            bool d_flag;
+            void d_init();
+            void d_clear();
+            unsigned int  d_reserved;
+            unsigned int  d_numberOfGenotypes;
+            unsigned int *d_genotypes;
+            unsigned int *d_populationLabels;
+
+            bool s_flag;
+            void s_init();
+            void s_clear();
+            void s_compute();
+            void processPopulations();
+            void processAlleles();
+            unsigned int getPopulationIndex(unsigned int) const;
+            unsigned int getAlleleIndex(unsigned int) const;
+            unsigned int    s_numberOfAlleles;
+            unsigned int   *s_alleleValueMapping;
+            unsigned int    s_numberOfPopulations;
+            unsigned int   *s_populationLabelMapping;
+            unsigned int   *s_populationFrequencies;
+            unsigned int   *s_alleleFrequenciesTotal;
+            unsigned int  **s_alleleFrequenciesPerPopulation;
+            unsigned int  **s_genotypeFrequenciesTotal;
+            unsigned int ***s_genotypeFrequenciesPerPopulation;
+
+            bool w_flag;
+            void w_init();
+            void w_clear();
+            void w_compute();
+            double  w_F;
+            double  w_T;
+            double  w_f;
+            double *w_a;
+            double *w_b;
+            double *w_c;
+            double  w_nbar;
+            double  w_nc;
+            double *w_pbar;
+            double *w_ssquare;
+            double *w_hbar;
+            double  w_sum_a;
+            double  w_sum_b;
+            double  w_sum_c;
+            double  w_sum_abc;
+            double  w_sum_bc;
+
+    
+        private:
+            
+            FStatistics(const FStatistics& source) { }
+            
+            FStatistics& operator=(const FStatistics& source) {
+                return *this;
+            }
+
+    };
+}
+
+#endif