1
|
1 /*
|
|
2 Copyright 2009 Stéphane De Mita, Mathieu Siol
|
|
3
|
|
4 This file is part of the EggLib library.
|
|
5
|
|
6 EggLib is free software: you can redistribute it and/or modify
|
|
7 it under the terms of the GNU General Public License as published by
|
|
8 the Free Software Foundation, either version 3 of the License, or
|
|
9 (at your option) any later version.
|
|
10
|
|
11 EggLib is distributed in the hope that it will be useful,
|
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 GNU General Public License for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
|
|
18 */
|
|
19
|
|
20 #ifndef EGGLIB_LINKAGEDISEQUILIBRUM_HPP
|
|
21 #define EGGLIB_LINKAGEDISEQUILIBRUM_HPP
|
|
22
|
|
23
|
|
24 #include "BaseDiversity.hpp"
|
|
25 #include "EggException.hpp"
|
|
26
|
|
27
|
|
28 namespace egglib {
|
|
29
|
|
30 /** \brief Analyzes linkage disequilibrium per pair of polymorphic sites
|
|
31 *
|
|
32 * \ingroup polymorphism
|
|
33 *
|
|
34 * The class considers an alignment and detects polymorphic sites
|
|
35 * using the BaseDiversity functionality (shared with other classes
|
|
36 * of the module). Only sites with exactly two alleles are
|
|
37 * considered. Statistics of pairwise linkage disequilibrium can
|
|
38 * be accessed by pair index (note that out-of-range errors are not
|
|
39 * checked). Population labels are ignored (but outgroups are
|
|
40 * excluded from the analysis).
|
|
41 *
|
|
42 */
|
|
43 class LinkageDisequilibrium : public BaseDiversity {
|
|
44
|
|
45 public:
|
|
46
|
|
47 /// Default constructor
|
|
48 LinkageDisequilibrium();
|
|
49
|
|
50 /// Destructor
|
|
51 virtual ~LinkageDisequilibrium();
|
|
52
|
|
53 /** \brief Analyzes polymorphic sites of an alignment
|
|
54 *
|
|
55 * \param data an alignment object (subclass of CharMatrix).
|
|
56 * The presence of outgroup or of different populations will
|
|
57 * be detected based on the populationLabel members of the
|
|
58 * passed object. The populationLabel 999 will be interpreted
|
|
59 * as outgroups. If several outgroups are passed, sites were
|
|
60 * the outgroups are not consistent will be treated as "non-
|
|
61 * orientable".
|
|
62 *
|
|
63 * \param minimumExploitableData site where the non-missing
|
|
64 * data (as defined by characterMapping) are at a frequency
|
|
65 * larger than this value will be removed from the analysis.
|
|
66 * Use 1. to take only 'complete' sites into account and 0.
|
|
67 * to use all sites.
|
|
68 *
|
|
69 * \param ignoreFrequency removes sites that are polymorphic
|
|
70 * because of an allele at absolute frequency smaller than or
|
|
71 * equal to this value. If ignoreFrequency=1, no sites are
|
|
72 * removed, if ignoreFrequency=1, singleton sites are
|
|
73 * ignored. Such sites are completely removed from the
|
|
74 * analysis (not counted in lseff). Note that if more than
|
|
75 * one mutation is allowed, the site is removed only if all
|
|
76 * the alleles but one are smaller than or equal to this
|
|
77 * value. For example, an alignment column AAAAAAGAAT is
|
|
78 * ignored with an ignoreFrequency of 1, but AAAAAAGGAT is
|
|
79 * conserved (including the third allele T which is a
|
|
80 * singleton).
|
|
81 *
|
|
82 * \param characterMapping a string giving the list of
|
|
83 * characters that should be considered as valid data. If a
|
|
84 * space is present in the string, the characters left of the
|
|
85 * space will be treated as valid data and the characters
|
|
86 * right of the space will be treated as missing data, that
|
|
87 * is tolerated but ignored. All characters not in the string
|
|
88 * will cause an EggInvalidCharacterError to be raised.
|
|
89 */
|
|
90 void load(CharMatrix& data,
|
|
91 double minimumExploitableData=1.,
|
|
92 unsigned int ignoreFrequency=0,
|
|
93 std::string characterMapping=dnaMapping);
|
|
94
|
|
95
|
|
96 /// Number of pairs contained in the instance
|
|
97 unsigned int numberOfPairs() const;
|
|
98
|
|
99 /// Alignment distance between a given pair
|
|
100 int d(unsigned int pair_index);
|
|
101
|
|
102 /// D statistic for a given pair
|
|
103 double D(unsigned int pair_index);
|
|
104
|
|
105 /// D' statistic for a given pair
|
|
106 double Dp(unsigned int pair_index);
|
|
107
|
|
108 /// r statistic for a given pair
|
|
109 double r(unsigned int pair_index);
|
|
110
|
|
111 /// r2 statistic for a given pair
|
|
112 double r2(unsigned int pair_index);
|
|
113
|
|
114 /// position of the first site for a given pair
|
|
115 unsigned int site1(unsigned int pair_index);
|
|
116
|
|
117 /// position of the second site for a given pair
|
|
118 unsigned int site2(unsigned int pair_index);
|
|
119
|
|
120 /// correlation coefficient between r2 and distance
|
|
121 double correl() const;
|
|
122
|
|
123 /** \brief Computes the minimal number of recombination events
|
|
124 *
|
|
125 * The computation is performed as described in Hudson, RR and
|
|
126 * NL Kaplan. 1985. Statistical properties of the number of
|
|
127 * recombination events in the history of a sample of DNA
|
|
128 * sequences. Genetics 111: 147-164. The returned parameter is
|
|
129 * the minimal number of recombination events, given by the
|
|
130 * number of non-overlapping pairs of segregating sites violating
|
|
131 * the rule of the four gamete. Only sites with two alleles are
|
|
132 * considered. Note that homoplasy (multiple mutations) mimicks
|
|
133 * recombination. The result of this function is not stored
|
|
134 * in this instance, and re-computed at each call.
|
|
135 *
|
|
136 * \param data the same CharMatrix instance as passed to the load
|
|
137 * method. The instance must not have been modified.
|
|
138 *
|
|
139 */
|
|
140 unsigned int Rmin(CharMatrix& data) const;
|
|
141
|
|
142
|
|
143
|
|
144 protected:
|
|
145
|
|
146 // adds a pair of polymorphic sites
|
|
147 // assume position2>position1,
|
|
148 // sites are polymorphic with exactly 2 alleles
|
|
149 void add(CharMatrix& data, unsigned int position1, unsigned int position2);
|
|
150
|
|
151 // Constructor help
|
|
152 void init();
|
|
153
|
|
154 // Destructor helper
|
|
155 void clear();
|
|
156
|
|
157 // Resizes arrays
|
|
158 void reset();
|
|
159
|
|
160 // Small helper
|
|
161 inline double min(double a, double b) { return (a>b)?a:b;}
|
|
162
|
|
163 // Small helper
|
|
164 inline double max(double a, double b) { return (a>b)?b:a;}
|
|
165
|
|
166 // Small helper
|
|
167 inline void check(unsigned int pos) { if (pos>=_n) throw EggRuntimeError("tried to access an invalid index"); }
|
|
168
|
|
169 /* Performs correlation
|
|
170 *
|
|
171 * This function works independently from the rest of the class.
|
|
172 *
|
|
173 * \param n length of data arrays.
|
|
174 * \param x first data vector.
|
|
175 * \param y second data vector.
|
|
176 * \param r variable to receive the correlation coefficient.
|
|
177 * \param a variable to receive the regression slope.
|
|
178 */
|
|
179 static void _correl(unsigned int n, const int* x, const double* y, double& r, double& a);
|
|
180
|
|
181 // Distance between pairs
|
|
182 int* _d;
|
|
183
|
|
184 // D (classical) measure of LD
|
|
185 double *_D;
|
|
186
|
|
187 // D'
|
|
188 double *_Dp;
|
|
189
|
|
190 // r, correlation coefficient
|
|
191 double *_r;
|
|
192
|
|
193 // square r
|
|
194 double *_r2;
|
|
195
|
|
196 // Data array (not managed by the instance)
|
|
197 unsigned int *_site1;
|
|
198
|
|
199 // Data array (not managed by the instance)
|
|
200 unsigned int *_site2;
|
|
201
|
|
202 // Number of pairs
|
|
203 unsigned int _n;
|
|
204
|
|
205 private:
|
|
206
|
|
207 /// Copy constructor not available
|
|
208 LinkageDisequilibrium(const LinkageDisequilibrium&) { }
|
|
209
|
|
210 /// Assignment operator not available
|
|
211 LinkageDisequilibrium& operator=(const LinkageDisequilibrium&) {
|
|
212 return *this;
|
|
213 }
|
|
214
|
|
215
|
|
216 class Interval {
|
|
217 public:
|
|
218 Interval(unsigned int, unsigned int);
|
|
219 unsigned int a() const;
|
|
220 unsigned int b() const;
|
|
221 bool good() const;
|
|
222 void set_false();
|
|
223 private:
|
|
224 unsigned int _a;
|
|
225 unsigned int _b;
|
|
226 unsigned int _good;
|
|
227 };
|
|
228
|
|
229
|
|
230 };
|
|
231 }
|
|
232
|
|
233 #endif
|