/*
    Copyright 2009 Stéphane De Mita, Mathieu Siol

    This file is part of the EggLib library.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib. If not, see <http://www.gnu.org/licenses/>.
*/
|
|
19
|
|
20 #ifndef EGGLIB_BASEDIVERSITY_HPP
|
|
21 #define EGGLIB_BASEDIVERSITY_HPP
|
|
22
|
|
23 #include "CharMatrix.hpp"
|
|
24 #include "SitePolymorphism.hpp"
|
|
25 #include <string>
|
|
26
|
|
/** \defgroup polymorphism polymorphism
 *
 * \brief Diversity analyses
 *
 * Two classes are contained in this module: NucleotideDiversity, which
 * performs site-centered polymorphism analyses, and HaplotypeDiversity,
 * which performs haplotype-centered analyses. The detection of
 * polymorphic sites is common to both, through the base class
 * BaseDiversity. However, this phase must be repeated when statistics
 * from the two classes are needed. To reduce the computational burden,
 * the function reserve() can be used; it directly allocates the needed
 * memory when the eventual number of polymorphic sites is known prior
 * to analysis (even if not precisely). For both classes, a set of
 * statistics is computed immediately upon load of a data set. For
 * NucleotideDiversity, additional statistics are computed per group
 * upon use of the corresponding accessors. The number of operations
 * performed several times is strictly limited. This is particularly
 * useful when different statistics are needed for a given alignment.
 * However, this system allows not computing unnecessary statistics to
 * a certain extent.
 *
 */
|
|
49
|
|
50 namespace egglib {
|
|
51
|
|
52 /** \brief Base class of diversity classes
|
|
53 *
|
|
54 * Mutualizes the analysis of polymorphic sites through the method
|
|
55 * importSites() and related accessors.
|
|
56 *
|
|
57 * \ingroup polymorphism
|
|
58 *
|
|
59 */
|
|
60 class BaseDiversity {
|
|
61
|
|
62 public:
|
|
63
|
|
64 /** \brief Constructor
|
|
65 *
|
|
66 */
|
|
67 BaseDiversity();
|
|
68
|
|
69 /** \brief Destructor
|
|
70 *
|
|
71 */
|
|
72 virtual ~BaseDiversity();
|
|
73
|
|
74 /** \brief Reserve sufficient memory for a given number of
|
|
75 * polymorphic sites.
|
|
76 *
|
|
77 * This method makes importSite function faster when you
|
|
78 * already know how many polymorphic sites to expect, since
|
|
79 * the necessary memory will be allocated prior the screening
|
|
80 * of data. It is possible to use reserve() even if with a
|
|
81 * number of sites that is not matching what importSites()
|
|
82 * will find.
|
|
83 *
|
|
84 * \param numberOfSites a strictly positive integer.
|
|
85 *
|
|
86 */
|
|
87 virtual void reserve(unsigned int numberOfSites);
|
|
88
|
|
89 /// Gets a site
|
|
90 const SitePolymorphism* get_site(unsigned int index) const;
|
|
91
|
|
92 /// Gets a site position
|
|
93 unsigned int get_position(unsigned int index) const;
|
|
94
|
|
95 /** \brief Predefined mapping string for DNA data
|
|
96 *
|
|
97 */
|
|
98 static const std::string dnaMapping;
|
|
99
|
|
100
|
|
101 /** \brief Predefined mapping string for RNA data
|
|
102 *
|
|
103 */
|
|
104 static const std::string rnaMapping;
|
|
105
|
|
106
|
|
107 /** \brief Predefined mapping string for amino acid data
|
|
108 *
|
|
109 */
|
|
110 static const std::string aaMapping;
|
|
111
|
|
112
|
|
113 /// Clears and re-initializes object
|
|
114 virtual void reset();
|
|
115
|
|
116
|
|
117 protected:
|
|
118
|
|
119 virtual void init();
|
|
120 virtual void clear();
|
|
121
|
|
122 //
|
|
123 void importSites(CharMatrix& data, bool allowMultipleMutations,
|
|
124 double minimumExploitableData, unsigned int ignoreFrequency,
|
|
125 std::string characterMapping, bool useZeroAsAncestral,
|
|
126 bool ignoreOutgroup);
|
|
127
|
|
128 //
|
|
129 void analyzeSite(CharMatrix& data, unsigned int index, double maxMissingData, bool ignoreOutgroup); // analyzes a site, adds a Site to the Site container if the site is polymorphic
|
|
130 unsigned int getPopIndex(unsigned int label) const; // returns v_npop if not found
|
|
131
|
|
132 SitePolymorphism** v_sites; // holder of polymorphic site addresses
|
|
133 bool* v_orientables; // stores whether the sites are orientable or not
|
|
134 unsigned int* v_sitePositions; // stores position of sites
|
|
135
|
|
136 unsigned int v_reserved;
|
|
137 unsigned int v_ns; // maximum number of sequences analyzed (max of sites' ns)
|
|
138 unsigned int v_S; // number of polymorphic sites
|
|
139 unsigned int v_So; // number of orientable sites
|
|
140 unsigned int v_eta; // number of mutation (whatever multiple)
|
|
141 double v_nseff; // average number of analyzed sequence
|
|
142 unsigned int v_lseff; // number of analyzed sites
|
|
143 double v_nseffo; // average number of analyzed sequences for analyzes with outgroup
|
|
144 unsigned int v_lseffo; // number of analyzed sites for analyzes with outgroup
|
|
145 unsigned int v_npop; // number of populations
|
|
146 unsigned int *v_popLabel; // label of each pop
|
|
147
|
|
148 // options
|
|
149 bool p_allowMultipleMutations;
|
|
150 double p_minimumExploitableData;
|
|
151 std::string p_characterMapping;
|
|
152 unsigned int p_pos_sep_mapping;
|
|
153 bool p_useZeroAsAncestral;
|
|
154 unsigned int p_ignoreFrequency;
|
|
155
|
|
156
|
|
157
|
|
158 private:
|
|
159
|
|
160 BaseDiversity(const BaseDiversity& source) { }
|
|
161
|
|
162 BaseDiversity& operator=(const BaseDiversity& source) {
|
|
163 return *this;
|
|
164 }
|
|
165
|
|
166 };
|
|
167 }
|
|
168
|
|
169 #endif
|