comparison egglib/egglib-2.1.5/include/egglib-cpp/HaplotypeDiversity.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
comparison
equal deleted inserted replaced
0:3e19d0dfcf3e 1:420b57c3c185
1 /*
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20
21 #ifndef EGGLIB_HAPLOTYPEDIVERSITY_HPP
22 #define EGGLIB_HAPLOTYPEDIVERSITY_HPP
23
24 #include "BaseDiversity.hpp"
25
26 namespace egglib {
27
28
29 /** \brief Computes diversity based on haplotype analysis
30 *
31 * \ingroup polymorphism
32 *
33 * This class relies on detection of polymorphic sites, as does
34 * NucleotideDiversity, with the exception that sites with missing
35 * data cannot be processed (minimumExploitableData is enforced to
36 * 1.).
37 *
38 * Like NucleotideDiversity, the same object can be used to analyze
39 * different data sets. Only the call to load() is required before
40 * accessing the data.
41 *
42 * Hst, Gst and Kst are between population differenciation indices.
43 * They are respectively defined in equations 2, 5-6 and 9 of Hudson
44 * et al. 1992a (Molecular Biology and Evolution 9:138-151). Also,
45 * Fst is defined in equation 3 of Hudson et al. 1992b (Genetics
46 * 132:583-589). Finally, Snn is from Hudson 2000 Genetics. It is
47 * computed as the average of Xi for all sequences. Where Xi is the
48 * ratio of nearest neighbours from the same group to the number of
49 * nearest neighbours. Nearest neigbours are all the sequences with
50 * the lowest number of differences to the focal sequence. NOTE:
51 * Gst/Hst are quite similar, but Fst and Kst are more different. Snn
52 * is a different statistic. Gst and Hst are two ways to estimate the
53 * between-population fraction of haplotypic diversity.
54 *
55 */
56 class HaplotypeDiversity : public BaseDiversity {
57
58 public:
59
60 /** \brief Constructor
61 *
62 */
63 HaplotypeDiversity();
64
65 /** \brief Destructor
66 *
67 */
68 virtual ~HaplotypeDiversity();
69
70 /** \brief Identifies polymorphic sites and computes basis
71 * statistics
72 *
73 * \param data an alignment object (subclass of CharMatrix).
74 * The presence of outgroup or of different populations will
75 * be detected based on the populationLabel members of the
76 * passed object. The populationLabel 999 will be interpreted
77 * as outgroups. If several outgroups are passed, sites were
78 * the outgroups are not consistent will be treated as "non-
79 * orientable".
80 *
81 * \param allowMultipleMutations if true, sites with more
82 * than two alleles will not be ignored. The sum of the
83 * frequencies of all alleles not matching the outgroup will
84 * treated as the derived allele frequency (for orientable
85 * sites).
86 *
87 * \param ignoreFrequency removes sites that are polymorph
88 * because of an allele at absolute frequency smaller than or
89 * equal to this value. If ignoreFrequency=1, no sites are
90 * removed, if ignoreFrequency=1, singleton sites are
91 * ignored. Such sites are completely removed from the
92 * analysis (not counted in lseff). Note that if more than
93 * one mutation is allowed, the site is removed only if all
94 * the alleles but one are smaller than or equal to this
95 * value. For example, an alignment column AAAAAAGAAT is
96 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
97 * conserved (including the third allele T which is a
98 * singleton).
99 *
100 * \param characterMapping a string giving the list of
101 * characters that should be considered as valid data. If a
102 * space is present in the string, the characters left of the
103 * space will be treated as valid data and the characters
104 * right of the space will be treated as missing data, that
105 * is tolerated but ignored. All characters not in the string
106 * will cause an EggInvalidCharacterError to be raised.
107 *
108 */
109 void load(CharMatrix& data,
110 bool allowMultipleMutations=false,
111 unsigned int ignoreFrequency=0,
112 std::string characterMapping=dnaMapping
113 );
114
115 /// Number of distinct haplotypes
116 unsigned int K() const;
117
118 /// Haplotype diversity (unbiased)
119 double He() const;
120
121 /** \brief Returns the allele number of a given sequence
122 *
123 * The passed index must be given ignoring any outgroup
124 * sequence.
125 *
126 */
127 unsigned int haplotypeIndex(unsigned int) const;
128
129 /// Population differenciation, based on nucleotides (Hudson 1992a)
130 double Kst() const;
131
132 /// Population differenciation, based on nucleotides (Hudson 1992b)
133 double Fst() const;
134
135 /// Population differenciation, based on haplotypes (Nei version)
136 double Gst() const;
137
138 /// Population differenciation, based on haplotypes (Hudson et al. version)
139 double Hst() const;
140
141 /// Hudson's Snn (nearest neighbor statistics)
142 double Snn() const;
143
144
145 protected:
146
147 void init();
148 void clear();
149
150 inline unsigned int diff(CharMatrix& data, unsigned int ind1, unsigned int ind2) const;
151
152 bool m_loaded;
153 unsigned int m_K;
154 double m_He;
155 double m_Kst;
156 double m_Fst;
157 double m_Gst;
158 double m_Hst;
159 double m_Snn;
160 unsigned int *m_haplotypeIndex;
161
162
163 private:
164
165 HaplotypeDiversity(const HaplotypeDiversity& source) {
166
167 }
168
169 HaplotypeDiversity& operator=(const HaplotypeDiversity& source) {
170 return *this;
171 }
172
173 };
174 }
175
176 #endif