annotate egglib/egglib-2.1.5/include/egglib-cpp/HaplotypeDiversity.hpp @ 13:734a3572c1d6 draft

Uploaded
author dereeper
date Tue, 08 Jan 2019 08:45:34 -0500
parents 420b57c3c185
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
1 /*
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
2 Copyright 2008-2009 Stéphane De Mita, Mathieu Siol
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
3
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
4 This file is part of the EggLib library.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
5
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
6 EggLib is free software: you can redistribute it and/or modify
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
7 it under the terms of the GNU General Public License as published by
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
8 the Free Software Foundation, either version 3 of the License, or
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
9 (at your option) any later version.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
10
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
11 EggLib is distributed in the hope that it will be useful,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
14 GNU General Public License for more details.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
15
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
16 You should have received a copy of the GNU General Public License
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
18 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
19
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
20
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
21 #ifndef EGGLIB_HAPLOTYPEDIVERSITY_HPP
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
22 #define EGGLIB_HAPLOTYPEDIVERSITY_HPP
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
23
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
24 #include "BaseDiversity.hpp"
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
25
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
26 namespace egglib {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
27
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
28
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
29 /** \brief Computes diversity based on haplotype analysis
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
30 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
31 * \ingroup polymorphism
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
32 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
33 * This class relies on detection of polymorphic sites, as does
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
34 * NucleotideDiversity, with the exception that sites with missing
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
35 * data cannot be processed (minimumExploitableData is enforced to
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
36 * 1.).
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
37 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
38 * Like NucleotideDiversity, the same object can be used to analyze
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
39 * different data sets. Only the call to load() is required before
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
40 * accessing the data.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
41 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
42 * Hst, Gst and Kst are between-population differentiation indices.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
43 * They are respectively defined in equations 2, 5-6 and 9 of Hudson
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
44 * et al. 1992a (Molecular Biology and Evolution 9:138-151). Also,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
45 * Fst is defined in equation 3 of Hudson et al. 1992b (Genetics
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
46 * 132:583-589). Finally, Snn is from Hudson 2000 Genetics. It is
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
47 * computed as the average of Xi for all sequences, where Xi is the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
48 * ratio of nearest neighbours from the same group to the number of
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
49 * nearest neighbours. Nearest neighbours are all the sequences with
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
50 * the lowest number of differences to the focal sequence. NOTE:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
51 * Gst/Hst are quite similar, but Fst and Kst are more different. Snn
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
52 * is a different statistic. Gst and Hst are two ways to estimate the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
53 * between-population fraction of haplotypic diversity.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
54 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
55 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
56 class HaplotypeDiversity : public BaseDiversity {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
57
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
58 public:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
59
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
60 /** \brief Constructor
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
61 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
62 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
63 HaplotypeDiversity();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
64
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
65 /** \brief Destructor
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
66 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
67 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
68 virtual ~HaplotypeDiversity();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
69
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
70 /** \brief Identifies polymorphic sites and computes basic
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
71 * statistics
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
72 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
73 * \param data an alignment object (subclass of CharMatrix).
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
74 * The presence of outgroup or of different populations will
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
75 * be detected based on the populationLabel members of the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
76 * passed object. The populationLabel 999 will be interpreted
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
77 * as outgroups. If several outgroups are passed, sites where
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
78 * the outgroups are not consistent will be treated as "non-
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
79 * orientable".
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
80 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
81 * \param allowMultipleMutations if true, sites with more
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
82 * than two alleles will not be ignored. The sum of the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
83 * frequencies of all alleles not matching the outgroup will
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
84 * be treated as the derived allele frequency (for orientable
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
85 * sites).
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
86 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
87 * \param ignoreFrequency removes sites that are polymorphic
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
88 * because of an allele at absolute frequency smaller than or
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
89 * equal to this value. If ignoreFrequency=0, no sites are
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
90 * removed, if ignoreFrequency=1, singleton sites are
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
91 * ignored. Such sites are completely removed from the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
92 * analysis (not counted in lseff). Note that if more than
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
93 * one mutation is allowed, the site is removed only if all
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
94 * the alleles but one are smaller than or equal to this
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
95 * value. For example, an alignment column AAAAAAGAAT is
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
96 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
97 * conserved (including the third allele T which is a
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
98 * singleton).
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
99 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
100 * \param characterMapping a string giving the list of
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
101 * characters that should be considered as valid data. If a
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
102 * space is present in the string, the characters left of the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
103 * space will be treated as valid data and the characters
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
104 * right of the space will be treated as missing data, that
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
105 * is tolerated but ignored. All characters not in the string
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
106 * will cause an EggInvalidCharacterError to be raised.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
107 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
108 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
109 void load(CharMatrix& data,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
110 bool allowMultipleMutations=false,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
111 unsigned int ignoreFrequency=0,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
112 std::string characterMapping=dnaMapping
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
113 );
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
114
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
115 /// Number of distinct haplotypes
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
116 unsigned int K() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
117
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
118 /// Haplotype diversity (unbiased)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
119 double He() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
120
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
121 /** \brief Returns the allele number of a given sequence
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
122 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
123 * The passed index must be given ignoring any outgroup
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
124 * sequence.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
125 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
126 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
127 unsigned int haplotypeIndex(unsigned int) const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
128
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
129 /// Population differentiation, based on nucleotides (Hudson 1992a)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
130 double Kst() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
131
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
132 /// Population differentiation, based on nucleotides (Hudson 1992b)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
133 double Fst() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
134
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
135 /// Population differentiation, based on haplotypes (Nei version)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
136 double Gst() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
137
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
138 /// Population differentiation, based on haplotypes (Hudson et al. version)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
139 double Hst() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
140
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
141 /// Hudson's Snn (nearest neighbor statistics)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
142 double Snn() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
143
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
144
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
145 protected:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
146
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
147 void init();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
148 void clear();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
149
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
150 inline unsigned int diff(CharMatrix& data, unsigned int ind1, unsigned int ind2) const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
151
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
152 bool m_loaded;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
153 unsigned int m_K;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
154 double m_He;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
155 double m_Kst;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
156 double m_Fst;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
157 double m_Gst;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
158 double m_Hst;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
159 double m_Snn;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
160 unsigned int *m_haplotypeIndex;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
161
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
162
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
163 private:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
164
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
165 HaplotypeDiversity(const HaplotypeDiversity& source) {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
166
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
167 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
168
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
169 HaplotypeDiversity& operator=(const HaplotypeDiversity& source) {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
170 return *this;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
171 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
172
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
173 };
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
174 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
175
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
176 #endif