Mercurial > repos > dereeper > sniplay
comparison egglib/egglib-2.1.5/include/egglib-cpp/HaplotypeDiversity.hpp @ 1:420b57c3c185 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 10 Jul 2015 04:39:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3e19d0dfcf3e | 1:420b57c3c185 |
---|---|
1 /* | |
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol | |
3 | |
4 This file is part of the EggLib library. | |
5 | |
6 EggLib is free software: you can redistribute it and/or modify | |
7 it under the terms of the GNU General Public License as published by | |
8 the Free Software Foundation, either version 3 of the License, or | |
9 (at your option) any later version. | |
10 | |
11 EggLib is distributed in the hope that it will be useful, | |
12 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 GNU General Public License for more details. | |
15 | |
16 You should have received a copy of the GNU General Public License | |
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>. | |
18 */ | |
19 | |
20 | |
21 #ifndef EGGLIB_HAPLOTYPEDIVERSITY_HPP | |
22 #define EGGLIB_HAPLOTYPEDIVERSITY_HPP | |
23 | |
24 #include "BaseDiversity.hpp" | |
25 | |
26 namespace egglib { | |
27 | |
28 | |
29 /** \brief Computes diversity based on haplotype analysis | |
30 * | |
31 * \ingroup polymorphism | |
32 * | |
33 * This class relies on detection of polymorphic sites, as does | |
34 * NucleotideDiversity, with the exception that sites with missing | |
35 * data cannot be processed (minimumExploitableData is enforced to | |
36 * 1.). | |
37 * | |
38 * Like NucleotideDiversity, the same object can be used to analyze | |
39 * different data sets. Only the call to load() is required before | |
40 * accessing the data. | |
41 * | |
42 * Hst, Gst and Kst are between population differenciation indices. | |
43 * They are respectively defined in equations 2, 5-6 and 9 of Hudson | |
44 * et al. 1992a (Molecular Biology and Evolution 9:138-151). Also, | |
45 * Fst is defined in equation 3 of Hudson et al. 1992b (Genetics | |
46 * 132:583-589). Finally, Snn is from Hudson 2000 Genetics. It is | |
47 * computed as the average of Xi for all sequences. Where Xi is the | |
48 * ratio of nearest neighbours from the same group to the number of | |
49 * nearest neighbours. Nearest neigbours are all the sequences with | |
50 * the lowest number of differences to the focal sequence. NOTE: | |
51 * Gst/Hst are quite similar, but Fst and Kst are more different. Snn | |
52 * is a different statistic. Gst and Hst are two ways to estimate the | |
53 * between-population fraction of haplotypic diversity. | |
54 * | |
55 */ | |
56 class HaplotypeDiversity : public BaseDiversity { | |
57 | |
58 public: | |
59 | |
60 /** \brief Constructor | |
61 * | |
62 */ | |
63 HaplotypeDiversity(); | |
64 | |
65 /** \brief Destructor | |
66 * | |
67 */ | |
68 virtual ~HaplotypeDiversity(); | |
69 | |
70 /** \brief Identifies polymorphic sites and computes basis | |
71 * statistics | |
72 * | |
73 * \param data an alignment object (subclass of CharMatrix). | |
74 * The presence of outgroup or of different populations will | |
75 * be detected based on the populationLabel members of the | |
76 * passed object. The populationLabel 999 will be interpreted | |
77 * as outgroups. If several outgroups are passed, sites were | |
78 * the outgroups are not consistent will be treated as "non- | |
79 * orientable". | |
80 * | |
81 * \param allowMultipleMutations if true, sites with more | |
82 * than two alleles will not be ignored. The sum of the | |
83 * frequencies of all alleles not matching the outgroup will | |
84 * treated as the derived allele frequency (for orientable | |
85 * sites). | |
86 * | |
87 * \param ignoreFrequency removes sites that are polymorph | |
88 * because of an allele at absolute frequency smaller than or | |
89 * equal to this value. If ignoreFrequency=1, no sites are | |
90 * removed, if ignoreFrequency=1, singleton sites are | |
91 * ignored. Such sites are completely removed from the | |
92 * analysis (not counted in lseff). Note that if more than | |
93 * one mutation is allowed, the site is removed only if all | |
94 * the alleles but one are smaller than or equal to this | |
95 * value. For example, an alignment column AAAAAAGAAT is | |
96 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is | |
97 * conserved (including the third allele T which is a | |
98 * singleton). | |
99 * | |
100 * \param characterMapping a string giving the list of | |
101 * characters that should be considered as valid data. If a | |
102 * space is present in the string, the characters left of the | |
103 * space will be treated as valid data and the characters | |
104 * right of the space will be treated as missing data, that | |
105 * is tolerated but ignored. All characters not in the string | |
106 * will cause an EggInvalidCharacterError to be raised. | |
107 * | |
108 */ | |
109 void load(CharMatrix& data, | |
110 bool allowMultipleMutations=false, | |
111 unsigned int ignoreFrequency=0, | |
112 std::string characterMapping=dnaMapping | |
113 ); | |
114 | |
115 /// Number of distinct haplotypes | |
116 unsigned int K() const; | |
117 | |
118 /// Haplotype diversity (unbiased) | |
119 double He() const; | |
120 | |
121 /** \brief Returns the allele number of a given sequence | |
122 * | |
123 * The passed index must be given ignoring any outgroup | |
124 * sequence. | |
125 * | |
126 */ | |
127 unsigned int haplotypeIndex(unsigned int) const; | |
128 | |
129 /// Population differenciation, based on nucleotides (Hudson 1992a) | |
130 double Kst() const; | |
131 | |
132 /// Population differenciation, based on nucleotides (Hudson 1992b) | |
133 double Fst() const; | |
134 | |
135 /// Population differenciation, based on haplotypes (Nei version) | |
136 double Gst() const; | |
137 | |
138 /// Population differenciation, based on haplotypes (Hudson et al. version) | |
139 double Hst() const; | |
140 | |
141 /// Hudson's Snn (nearest neighbor statistics) | |
142 double Snn() const; | |
143 | |
144 | |
145 protected: | |
146 | |
147 void init(); | |
148 void clear(); | |
149 | |
150 inline unsigned int diff(CharMatrix& data, unsigned int ind1, unsigned int ind2) const; | |
151 | |
152 bool m_loaded; | |
153 unsigned int m_K; | |
154 double m_He; | |
155 double m_Kst; | |
156 double m_Fst; | |
157 double m_Gst; | |
158 double m_Hst; | |
159 double m_Snn; | |
160 unsigned int *m_haplotypeIndex; | |
161 | |
162 | |
163 private: | |
164 | |
165 HaplotypeDiversity(const HaplotypeDiversity& source) { | |
166 | |
167 } | |
168 | |
169 HaplotypeDiversity& operator=(const HaplotypeDiversity& source) { | |
170 return *this; | |
171 } | |
172 | |
173 }; | |
174 } | |
175 | |
176 #endif |