annotate egglib/egglib-2.1.5/include/egglib-cpp/LinkageDisequilibrium.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
1 /*
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
2 Copyright 2009 Stéphane De Mita, Mathieu Siol
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
3
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
4 This file is part of the EggLib library.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
5
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
6 EggLib is free software: you can redistribute it and/or modify
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
7 it under the terms of the GNU General Public License as published by
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
8 the Free Software Foundation, either version 3 of the License, or
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
9 (at your option) any later version.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
10
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
11 EggLib is distributed in the hope that it will be useful,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
14 GNU General Public License for more details.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
15
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
16 You should have received a copy of the GNU General Public License
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
18 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
19
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
20 #ifndef EGGLIB_LINKAGEDISEQUILIBRUM_HPP
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
21 #define EGGLIB_LINKAGEDISEQUILIBRUM_HPP
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
22
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
23
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
24 #include "BaseDiversity.hpp"
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
25 #include "EggException.hpp"
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
26
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
27
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
28 namespace egglib {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
29
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
30 /** \brief Analyzes linkage disequilibrium per pair of polymorphic sites
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
31 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
32 * \ingroup polymorphism
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
33 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
34 * The class considers an alignment and detects polymorphic sites
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
35 * using the BaseDiversity functionality (shared with other classes
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
36 * of the module). Only sites with exactly two alleles are
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
37 * considered. Statistics of pairwise linkage disequilibrium can
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
38 * be accessed by pair index (note that out-of-range errors are not
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
39 * checked). Population labels are ignored (but outgroups are
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
40 * excluded from the analysis).
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
41 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
42 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
43 class LinkageDisequilibrium : public BaseDiversity {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
44
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
45 public:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
46
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
47 /// Default constructor
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
48 LinkageDisequilibrium();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
49
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
50 /// Destructor
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
51 virtual ~LinkageDisequilibrium();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
52
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
53 /** \brief Analyzes polymorphic sites of an alignment
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
54 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
55 * \param data an alignment object (subclass of CharMatrix).
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
56 * The presence of outgroup or of different populations will
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
57 * be detected based on the populationLabel members of the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
58 * passed object. The populationLabel 999 will be interpreted
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
59 * as outgroups. If several outgroups are passed, sites were
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
60 * the outgroups are not consistent will be treated as "non-
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
61 * orientable".
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
62 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
63 * \param minimumExploitableData site where the non-missing
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
64 * data (as defined by characterMapping) are at a frequency
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
65 * larger than this value will be removed from the analysis.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
66 * Use 1. to take only 'complete' sites into account and 0.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
67 * to use all sites.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
68 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
69 * \param ignoreFrequency removes sites that are polymorphic
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
70 * because of an allele at absolute frequency smaller than or
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
71 * equal to this value. If ignoreFrequency=1, no sites are
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
72 * removed, if ignoreFrequency=1, singleton sites are
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
73 * ignored. Such sites are completely removed from the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
74 * analysis (not counted in lseff). Note that if more than
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
75 * one mutation is allowed, the site is removed only if all
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
76 * the alleles but one are smaller than or equal to this
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
77 * value. For example, an alignment column AAAAAAGAAT is
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
78 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
79 * conserved (including the third allele T which is a
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
80 * singleton).
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
81 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
82 * \param characterMapping a string giving the list of
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
83 * characters that should be considered as valid data. If a
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
84 * space is present in the string, the characters left of the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
85 * space will be treated as valid data and the characters
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
86 * right of the space will be treated as missing data, that
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
87 * is tolerated but ignored. All characters not in the string
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
88 * will cause an EggInvalidCharacterError to be raised.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
89 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
90 void load(CharMatrix& data,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
91 double minimumExploitableData=1.,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
92 unsigned int ignoreFrequency=0,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
93 std::string characterMapping=dnaMapping);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
94
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
95
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
96 /// Number of pairs contained in the instance
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
97 unsigned int numberOfPairs() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
98
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
99 /// Alignment distance between a given pair
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
100 int d(unsigned int pair_index);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
101
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
102 /// D statistic for a given pair
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
103 double D(unsigned int pair_index);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
104
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
105 /// D' statistic for a given pair
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
106 double Dp(unsigned int pair_index);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
107
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
108 /// r statistic for a given pair
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
109 double r(unsigned int pair_index);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
110
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
111 /// r2 statistic for a given pair
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
112 double r2(unsigned int pair_index);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
113
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
114 /// position of the first site for a given pair
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
115 unsigned int site1(unsigned int pair_index);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
116
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
117 /// position of the second site for a given pair
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
118 unsigned int site2(unsigned int pair_index);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
119
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
120 /// correlation coefficient between r2 and distance
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
121 double correl() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
122
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
123 /** \brief Computes the minimal number of recombination events
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
124 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
125 * The computation is performed as described in Hudson, RR and
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
126 * NL Kaplan. 1985. Statistical properties of the number of
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
127 * recombination events in the history of a sample of DNA
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
128 * sequences. Genetics 111: 147-164. The returned parameter is
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
129 * the minimal number of recombination events, given by the
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
130 * number of non-overlapping pairs of segregating sites violating
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
131 * the rule of the four gamete. Only sites with two alleles are
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
132 * considered. Note that homoplasy (multiple mutations) mimicks
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
133 * recombination. The result of this function is not stored
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
134 * in this instance, and re-computed at each call.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
135 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
136 * \param data the same CharMatrix instance as passed to the load
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
137 * method. The instance must not have been modified.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
138 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
139 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
140 unsigned int Rmin(CharMatrix& data) const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
141
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
142
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
143
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
144 protected:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
145
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
146 // adds a pair of polymorphic sites
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
147 // assume position2>position1,
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
148 // sites are polymorphic with exactly 2 alleles
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
149 void add(CharMatrix& data, unsigned int position1, unsigned int position2);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
150
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
151 // Constructor help
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
152 void init();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
153
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
154 // Destructor helper
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
155 void clear();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
156
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
157 // Resizes arrays
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
158 void reset();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
159
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
160 // Small helper
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
161 inline double min(double a, double b) { return (a>b)?a:b;}
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
162
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
163 // Small helper
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
164 inline double max(double a, double b) { return (a>b)?b:a;}
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
165
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
166 // Small helper
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
167 inline void check(unsigned int pos) { if (pos>=_n) throw EggRuntimeError("tried to access an invalid index"); }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
168
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
169 /* Performs correlation
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
170 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
171 * This function works independently from the rest of the class.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
172 *
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
173 * \param n length of data arrays.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
174 * \param x first data vector.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
175 * \param y second data vector.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
176 * \param r variable to receive the correlation coefficient.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
177 * \param a variable to receive the regression slope.
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
178 */
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
179 static void _correl(unsigned int n, const int* x, const double* y, double& r, double& a);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
180
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
181 // Distance between pairs
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
182 int* _d;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
183
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
184 // D (classical) measure of LD
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
185 double *_D;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
186
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
187 // D'
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
188 double *_Dp;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
189
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
190 // r, correlation coefficient
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
191 double *_r;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
192
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
193 // square r
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
194 double *_r2;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
195
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
196 // Data array (not managed by the instance)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
197 unsigned int *_site1;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
198
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
199 // Data array (not managed by the instance)
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
200 unsigned int *_site2;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
201
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
202 // Number of pairs
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
203 unsigned int _n;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
204
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
205 private:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
206
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
207 /// Copy constructor not available
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
208 LinkageDisequilibrium(const LinkageDisequilibrium&) { }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
209
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
210 /// Assignment operator not available
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
211 LinkageDisequilibrium& operator=(const LinkageDisequilibrium&) {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
212 return *this;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
213 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
214
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
215
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
216 class Interval {
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
217 public:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
218 Interval(unsigned int, unsigned int);
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
219 unsigned int a() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
220 unsigned int b() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
221 bool good() const;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
222 void set_false();
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
223 private:
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
224 unsigned int _a;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
225 unsigned int _b;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
226 unsigned int _good;
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
227 };
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
228
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
229
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
230 };
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
231 }
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
232
420b57c3c185 Uploaded
dereeper
parents:
diff changeset
233 #endif