1
|
1 /*
|
|
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
|
|
3
|
|
4 This file is part of the EggLib library.
|
|
5
|
|
6 EggLib is free software: you can redistribute it and/or modify
|
|
7 it under the terms of the GNU General Public License as published by
|
|
8 the Free Software Foundation, either version 3 of the License, or
|
|
9 (at your option) any later version.
|
|
10
|
|
11 EggLib is distributed in the hope that it will be useful,
|
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 GNU General Public License for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
|
|
18 */
|
|
19
|
|
20
|
|
21 #ifndef EGGLIB_HAPLOTYPEDIVERSITY_HPP
|
|
22 #define EGGLIB_HAPLOTYPEDIVERSITY_HPP
|
|
23
|
|
24 #include "BaseDiversity.hpp"
|
|
25
|
|
26 namespace egglib {
|
|
27
|
|
28
|
|
29 /** \brief Computes diversity based on haplotype analysis
|
|
30 *
|
|
31 * \ingroup polymorphism
|
|
32 *
|
|
33 * This class relies on detection of polymorphic sites, as does
|
|
34 * NucleotideDiversity, with the exception that sites with missing
|
|
35 * data cannot be processed (minimumExploitableData is enforced to
|
|
36 * 1.).
|
|
37 *
|
|
38 * Like NucleotideDiversity, the same object can be used to analyze
|
|
39 * different data sets. Only the call to load() is required before
|
|
40 * accessing the data.
|
|
41 *
|
|
42 * Hst, Gst and Kst are between population differenciation indices.
|
|
43 * They are respectively defined in equations 2, 5-6 and 9 of Hudson
|
|
44 * et al. 1992a (Molecular Biology and Evolution 9:138-151). Also,
|
|
45 * Fst is defined in equation 3 of Hudson et al. 1992b (Genetics
|
|
46 * 132:583-589). Finally, Snn is from Hudson 2000 Genetics. It is
|
|
47 * computed as the average of Xi for all sequences. Where Xi is the
|
|
48 * ratio of nearest neighbours from the same group to the number of
|
|
49 * nearest neighbours. Nearest neigbours are all the sequences with
|
|
50 * the lowest number of differences to the focal sequence. NOTE:
|
|
51 * Gst/Hst are quite similar, but Fst and Kst are more different. Snn
|
|
52 * is a different statistic. Gst and Hst are two ways to estimate the
|
|
53 * between-population fraction of haplotypic diversity.
|
|
54 *
|
|
55 */
|
|
56 class HaplotypeDiversity : public BaseDiversity {
|
|
57
|
|
58 public:
|
|
59
|
|
60 /** \brief Constructor
|
|
61 *
|
|
62 */
|
|
63 HaplotypeDiversity();
|
|
64
|
|
65 /** \brief Destructor
|
|
66 *
|
|
67 */
|
|
68 virtual ~HaplotypeDiversity();
|
|
69
|
|
70 /** \brief Identifies polymorphic sites and computes basis
|
|
71 * statistics
|
|
72 *
|
|
73 * \param data an alignment object (subclass of CharMatrix).
|
|
74 * The presence of outgroup or of different populations will
|
|
75 * be detected based on the populationLabel members of the
|
|
76 * passed object. The populationLabel 999 will be interpreted
|
|
77 * as outgroups. If several outgroups are passed, sites were
|
|
78 * the outgroups are not consistent will be treated as "non-
|
|
79 * orientable".
|
|
80 *
|
|
81 * \param allowMultipleMutations if true, sites with more
|
|
82 * than two alleles will not be ignored. The sum of the
|
|
83 * frequencies of all alleles not matching the outgroup will
|
|
84 * treated as the derived allele frequency (for orientable
|
|
85 * sites).
|
|
86 *
|
|
87 * \param ignoreFrequency removes sites that are polymorph
|
|
88 * because of an allele at absolute frequency smaller than or
|
|
89 * equal to this value. If ignoreFrequency=1, no sites are
|
|
90 * removed, if ignoreFrequency=1, singleton sites are
|
|
91 * ignored. Such sites are completely removed from the
|
|
92 * analysis (not counted in lseff). Note that if more than
|
|
93 * one mutation is allowed, the site is removed only if all
|
|
94 * the alleles but one are smaller than or equal to this
|
|
95 * value. For example, an alignment column AAAAAAGAAT is
|
|
96 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
|
|
97 * conserved (including the third allele T which is a
|
|
98 * singleton).
|
|
99 *
|
|
100 * \param characterMapping a string giving the list of
|
|
101 * characters that should be considered as valid data. If a
|
|
102 * space is present in the string, the characters left of the
|
|
103 * space will be treated as valid data and the characters
|
|
104 * right of the space will be treated as missing data, that
|
|
105 * is tolerated but ignored. All characters not in the string
|
|
106 * will cause an EggInvalidCharacterError to be raised.
|
|
107 *
|
|
108 */
|
|
109 void load(CharMatrix& data,
|
|
110 bool allowMultipleMutations=false,
|
|
111 unsigned int ignoreFrequency=0,
|
|
112 std::string characterMapping=dnaMapping
|
|
113 );
|
|
114
|
|
115 /// Number of distinct haplotypes
|
|
116 unsigned int K() const;
|
|
117
|
|
118 /// Haplotype diversity (unbiased)
|
|
119 double He() const;
|
|
120
|
|
121 /** \brief Returns the allele number of a given sequence
|
|
122 *
|
|
123 * The passed index must be given ignoring any outgroup
|
|
124 * sequence.
|
|
125 *
|
|
126 */
|
|
127 unsigned int haplotypeIndex(unsigned int) const;
|
|
128
|
|
129 /// Population differenciation, based on nucleotides (Hudson 1992a)
|
|
130 double Kst() const;
|
|
131
|
|
132 /// Population differenciation, based on nucleotides (Hudson 1992b)
|
|
133 double Fst() const;
|
|
134
|
|
135 /// Population differenciation, based on haplotypes (Nei version)
|
|
136 double Gst() const;
|
|
137
|
|
138 /// Population differenciation, based on haplotypes (Hudson et al. version)
|
|
139 double Hst() const;
|
|
140
|
|
141 /// Hudson's Snn (nearest neighbor statistics)
|
|
142 double Snn() const;
|
|
143
|
|
144
|
|
145 protected:
|
|
146
|
|
147 void init();
|
|
148 void clear();
|
|
149
|
|
150 inline unsigned int diff(CharMatrix& data, unsigned int ind1, unsigned int ind2) const;
|
|
151
|
|
152 bool m_loaded;
|
|
153 unsigned int m_K;
|
|
154 double m_He;
|
|
155 double m_Kst;
|
|
156 double m_Fst;
|
|
157 double m_Gst;
|
|
158 double m_Hst;
|
|
159 double m_Snn;
|
|
160 unsigned int *m_haplotypeIndex;
|
|
161
|
|
162
|
|
163 private:
|
|
164
|
|
165 HaplotypeDiversity(const HaplotypeDiversity& source) {
|
|
166
|
|
167 }
|
|
168
|
|
169 HaplotypeDiversity& operator=(const HaplotypeDiversity& source) {
|
|
170 return *this;
|
|
171 }
|
|
172
|
|
173 };
|
|
174 }
|
|
175
|
|
176 #endif
|