/*
    Copyright 2009 Stéphane De Mita, Mathieu Siol

    This file is part of the EggLib library.

    EggLib is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EggLib is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef EGGLIB_CONVERT_HPP
#define EGGLIB_CONVERT_HPP


#include "DataMatrix.hpp"
#include "Align.hpp"
#include "EggException.hpp"
#include "Random.hpp"
#include <cstddef>
#include <string>

#include "config.h"

#ifdef HAVE_LIBBPP_SEQ
#include <Bpp/Seq/Alphabet.all>
#include <Bpp/Seq/Sequence.h>
#include <Bpp/Seq/Container.all>
#endif


41 namespace egglib {
|
|
42
|
|
43
|
|
44 /** \brief Performs conversion between sequence holder types
|
|
45 *
|
|
46 * \ingroup core
|
|
47 *
|
|
48 * Static methods of this class allows conversion between sequence
|
|
49 * holder types implying parametrizable modifications.
|
|
50 *
|
|
51 */
|
|
52 class Convert {
|
|
53
|
|
54 public:
|
|
55
|
|
56 /** \brief DataMatrix to Align conversion
|
|
57 *
|
|
58 * By defaut, this method generates an Align instance
|
|
59 * containing only the polymorphic sites. The integers of
|
|
60 * the DataMatrix will be converted as follow: 0 to A, 1 to
|
|
61 * C, 2 to G and 3 to T. This behaviour can be largely
|
|
62 * modified using options.
|
|
63 *
|
|
64 * \param dataMatrix DataMatrix instance.
|
|
65 *
|
|
66 * \param length length of the desired alignment. Non-varying
|
|
67 * stretches of data will be introduced to reach the
|
|
68 * specified length. By default the positions of segregating
|
|
69 * sites will be determined from the positions given by the
|
|
70 * DataMatrix object. Those positions are expressed in a
|
|
71 * continuous range, and will be discretized. Mutations
|
|
72 * falling on the same site will be moved of one position
|
|
73 * left or right (always preserving the order of mutation
|
|
74 * sites). If positions are all zero (the default of the
|
|
75 * DataMatrix class) and if length is larger than the number
|
|
76 * of segregating sites, then all segregating sites will
|
|
77 * cluster on the left-hand side of the alignment.
|
|
78 *
|
|
79 * \param random the address to a Random object allowing to
|
|
80 * draw random numbers (for randomizing positions and/or
|
|
81 * non-varying states). If an address is provided but no
|
|
82 * random numbers are required, it is ignored. If no address
|
|
83 * if provided and random numbers are required, a Random
|
|
84 * instance is built internally.
|
|
85 *
|
|
86 * \param randomizePositions if true, the positions specified
|
|
87 * in the DataMatrix objects are ignored and the positions of
|
|
88 * mutations are drawn randomly along the interval (only if
|
|
89 * the specified length is larger than the number of
|
|
90 * segregating sites). If randomizePositions and false and
|
|
91 * positions are not
|
|
92 *
|
|
93 * \param enforceLength specify whether a
|
|
94 * EggRuntimeError should be thrown when the number of
|
|
95 * polymorphic sites is larger than the specified length. If
|
|
96 * false (the default) and in cases where the specified
|
|
97 * length is too short to harbor all polymorphic sites, the
|
|
98 * alignment length will be increased as needed.
|
|
99 *
|
|
100 * \param randomizeNonVaryingStates if true, the stretches of
|
|
101 * conserved positions (between segregating sites) will be
|
|
102 * randomly drawn from the current symbol mapping. Otherwise,
|
|
103 * the symbol given by fixed will be used.
|
|
104 *
|
|
105 * \param randomizeAlleles if true, alleles will be drawn
|
|
106 * randomly from the mapped characters. Note that if a
|
|
107 * genotype value is larger than the size of the mapping, it
|
|
108 * will be replaced by the character given by unknown,
|
|
109 * without randomization. In other words, with the mapping
|
|
110 * "ACGT", alleles 0, 1, 2 and 3 will be randomly assigned
|
|
111 * to these four characters, but larger and negative alleles
|
|
112 * will be assigned to the unknown character.
|
|
113 *
|
|
114 * \param mapping a string given the character to assign to
|
|
115 * different character values read from the DataMatrix. If
|
|
116 * the read value is 0, the first character of the string
|
|
117 * will used, the the value is 1, the second character will
|
|
118 * be used, and so on. If the integer read is out of range
|
|
119 * (in particular, for any negative value), then the
|
|
120 * character given by unknown will be used. An empty string
|
|
121 * will always lead to alignments containing only the
|
|
122 * character given by unknown. The string "01" is suitable
|
|
123 * for binary data.
|
|
124 *
|
|
125 * \param unknown the character to use if an integer genotype
|
|
126 * value is not mapped in the mapping string (that is, if
|
|
127 * the mapping string is too short).
|
|
128 *
|
|
129 * \param nonVaryingState character to use for conserved
|
|
130 * stretches of data. It doesn't have to be included in the
|
|
131 * mapping. If randomizeNonVaryingState is true, this
|
|
132 * argument is ignored.
|
|
133 *
|
|
134 * \return The resulting Align object.
|
|
135 *
|
|
136 */
|
|
137 static Align align(
|
|
138 DataMatrix& dataMatrix,
|
|
139 unsigned int length=0,
|
|
140 Random* random=NULL,
|
|
141 bool randomizePositions=false,
|
|
142 bool randomizeNonVaryingStates=false,
|
|
143 bool randomizeAlleles=false,
|
|
144 bool enforceLength=false,
|
|
145 std::string mapping="ACGT",
|
|
146 char unknown='?',
|
|
147 char nonVaryingState='A'
|
|
148 );
|
|
149
|
|
150
|
|
151 #ifdef HAVE_LIBBPP_SEQ
|
|
152
|
|
153 /** \brief Converts an alignment to the equivalent Bio++ type
|
|
154 *
|
|
155 * During conversion, name information is lost (arbitrary
|
|
156 * names are generated in order toprevent duplicate names).
|
|
157 * The object is attached to an alphabet matching the passed
|
|
158 * integer. The names are bare rank integers (starting at the
|
|
159 * value giving by *offset*).
|
|
160 *
|
|
161 * \param align the source alignment object.
|
|
162 *
|
|
163 * \param alphabetID an integer indicating which alphabet to
|
|
164 * use:
|
|
165 * - 1 for DNA
|
|
166 * - 2 for RNA
|
|
167 * - 3 for proteins
|
|
168 * - 4 for standard codon
|
|
169 * - 5 for vertebrate mitochondrial codon
|
|
170 * - 6 for invertebrate mitochondrial codon
|
|
171 * - 7 for echinoderm mitochondrial codon
|
|
172 * .
|
|
173 * Other values will result in an exception.
|
|
174 *
|
|
175 * \param outgroupFlag an integer indicating whether to
|
|
176 * include outgroup sequences:
|
|
177 * - 0 use all sequences
|
|
178 * - 1 use only sequences without 999 label (ingroup)
|
|
179 * - 2 use only sequences with 999 label (outgroup)
|
|
180 * .
|
|
181 * Other values will result in an exception.
|
|
182 *
|
|
183 * \param offset enter an integer to shift the names of the
|
|
184 * resulting alignment (useful to merge alignment and ensure
|
|
185 * that names are not duplicated).
|
|
186 *
|
|
187 * \return A Bio++ alignment.
|
|
188 *
|
|
189 */
|
|
190 static bpp::AlignedSequenceContainer egglib2bpp(Align& align, unsigned int alphabetID, unsigned int outgroupFlag, unsigned int offset=0);
|
|
191
|
|
192 #endif
|
|
193
|
|
194
|
|
195
|
|
196 protected:
|
|
197
|
|
198 /** \brief This class cannot be instantiated
|
|
199 *
|
|
200 */
|
|
201 Convert() { }
|
|
202
|
|
203
|
|
204 /** \brief This class cannot be instantiated
|
|
205 *
|
|
206 */
|
|
207 Convert(const Convert& source) { }
|
|
208
|
|
209
|
|
210 /** \brief This class cannot be instantiated
|
|
211 *
|
|
212 */
|
|
213 Convert& operator=(const Convert& source) { return *this; }
|
|
214
|
|
215
|
|
216 /** \brief This class cannot be instantiated
|
|
217 *
|
|
218 */
|
|
219 virtual ~Convert() { }
|
|
220
|
|
221 #ifdef HAVE_LIBBPP_SEQ
|
|
222 static bpp::DNA dnaAlphabet;
|
|
223 static bpp::RNA rnaAlphabet;
|
|
224 static bpp::ProteicAlphabet proteicAlphabet;
|
|
225 static bpp::StandardCodonAlphabet standardCodonAlphabet;
|
|
226 static bpp::VertebrateMitochondrialCodonAlphabet vertebrateMitochondrialCodonAlphabet;
|
|
227 static bpp::InvertebrateMitochondrialCodonAlphabet invertebrateMitochondrialCodonAlphabet;
|
|
228 static bpp::EchinodermMitochondrialCodonAlphabet echinodermMitochondrialCodonAlphabet;
|
|
229 #endif
|
|
230
|
|
231 };
|
|
232 }
|
|

#endif // EGGLIB_CONVERT_HPP