diff egglib/egglib-2.1.5/include/egglib-cpp/HaplotypeDiversity.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/egglib/egglib-2.1.5/include/egglib-cpp/HaplotypeDiversity.hpp	Fri Jul 10 04:39:30 2015 -0400
@@ -0,0 +1,176 @@
+/*
+    Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
+
+    This file is part of the EggLib library.
+
+    EggLib is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    EggLib is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef EGGLIB_HAPLOTYPEDIVERSITY_HPP
+#define EGGLIB_HAPLOTYPEDIVERSITY_HPP
+
+#include "BaseDiversity.hpp"
+
+namespace egglib {
+
+
+   /** \brief Computes diversity based on haplotype analysis
+    *
+    * \ingroup polymorphism
+    * 
+    * This class relies on detection of polymorphic sites, as does
+    * NucleotideDiversity, with the exception that sites with missing
+    * data cannot be processed (minimumExploitableData is enforced to
+    * 1.).
+    * 
+    * Like NucleotideDiversity, the same object can be used to analyze
+    * different data sets. Only the call to load() is required before
+    * accessing the data.
+    * 
+    * Hst, Gst and Kst are between population differenciation indices.
+    * They are respectively defined in equations 2, 5-6 and 9 of Hudson
+    * et al. 1992a (Molecular Biology and Evolution 9:138-151). Also,
+    * Fst is defined in equation 3 of Hudson et al. 1992b (Genetics
+    * 132:583-589). Finally, Snn is from Hudson 2000 Genetics. It is
+    * computed as the average of Xi for all sequences. Where Xi is the
+    * ratio of nearest neighbours from the same group to the number of
+    * nearest neighbours. Nearest neigbours are all the sequences with
+    * the lowest number of differences to the focal sequence. NOTE: 
+    * Gst/Hst are quite similar, but Fst and Kst are more different. Snn
+    * is a different statistic. Gst and Hst are two ways to estimate the
+    * between-population fraction of haplotypic diversity.
+    * 
+    */
+    class HaplotypeDiversity : public BaseDiversity {
+
+        public:
+
+           /** \brief Constructor
+            * 
+            */
+            HaplotypeDiversity();
+            
+           /** \brief Destructor
+            * 
+            */ 
+            virtual ~HaplotypeDiversity();
+
+           /** \brief Identifies polymorphic sites and computes basis
+            * statistics
+            * 
+            * \param data an alignment object (subclass of CharMatrix).
+            * The presence of outgroup or of different populations will
+            * be detected based on the populationLabel members of the
+            * passed object. The populationLabel 999 will be interpreted
+            * as outgroups. If several outgroups are passed, sites were
+            * the outgroups are not consistent will be treated as "non-
+            * orientable".
+            * 
+            * \param allowMultipleMutations if true, sites with more
+            * than two alleles will not be ignored. The sum of the
+            * frequencies of all alleles not matching the outgroup will
+            * treated as the derived allele frequency (for orientable
+            * sites).
+            * 
+            * \param ignoreFrequency removes sites that are polymorph
+            * because of an allele at absolute frequency smaller than or
+            * equal to this value. If ignoreFrequency=1, no sites are
+            * removed, if ignoreFrequency=1, singleton sites are
+            * ignored. Such sites are completely removed from the
+            * analysis (not counted in lseff). Note that if more than
+            * one mutation is allowed, the site is removed only if all
+            * the alleles but one are smaller than or equal to this
+            * value. For example, an alignment column AAAAAAGAAT is
+            * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
+            * conserved (including the third allele T which is a
+            * singleton).
+            * 
+            * \param characterMapping a string giving the list of
+            * characters that should be considered as valid data. If a
+            * space is present in the string, the characters left of the
+            * space will be treated as valid data and the characters
+            * right of the space will be treated as missing data, that
+            * is tolerated but ignored. All characters not in the string
+            * will cause an EggInvalidCharacterError to be raised.
+            * 
+            */
+            void load(CharMatrix& data,
+                bool allowMultipleMutations=false,
+                unsigned int ignoreFrequency=0,
+                std::string characterMapping=dnaMapping
+            );
+            
+            /// Number of distinct haplotypes
+            unsigned int K() const;
+            
+            /// Haplotype diversity (unbiased)
+            double He() const;
+            
+            /** \brief Returns the allele number of a given sequence
+             * 
+             * The passed index must be given ignoring any outgroup
+             * sequence.
+             * 
+             */
+            unsigned int haplotypeIndex(unsigned int) const;
+            
+            /// Population differenciation, based on nucleotides (Hudson 1992a)
+            double Kst() const;
+
+            /// Population differenciation, based on nucleotides (Hudson 1992b)
+            double Fst() const;
+
+            /// Population differenciation, based on haplotypes (Nei version)
+            double Gst() const;
+
+            /// Population differenciation, based on haplotypes (Hudson et al. version)
+            double Hst() const;
+            
+            /// Hudson's Snn (nearest neighbor statistics)
+            double Snn() const;
+
+
+        protected:
+        
+            void init();
+            void clear();
+                        
+            inline unsigned int diff(CharMatrix& data, unsigned int ind1, unsigned int ind2) const;
+
+            bool m_loaded;
+            unsigned int m_K;
+            double m_He;
+            double m_Kst;
+            double m_Fst;
+            double m_Gst;
+            double m_Hst;
+            double m_Snn;
+            unsigned int *m_haplotypeIndex;
+
+
+        private:
+        
+            HaplotypeDiversity(const HaplotypeDiversity& source) {
+                
+            }
+            
+            HaplotypeDiversity& operator=(const HaplotypeDiversity& source) {
+                return *this;
+            }
+
+    };
+}
+
+#endif