comparison egglib/egglib-2.1.5/include/egglib-cpp/Convert.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
comparison
equal deleted inserted replaced
0:3e19d0dfcf3e 1:420b57c3c185
1 /*
2 Copyright 2009 Stéphane De Mita, Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20
21 #ifndef EGGLIB_CONVERT_HPP
22 #define EGGLIB_CONVERT_HPP
23
24
25 #include "DataMatrix.hpp"
26 #include "Align.hpp"
27 #include "EggException.hpp"
28 #include "Random.hpp"
29 #include <string>
30
31 #include "config.h"
32
33 #ifdef HAVE_LIBBPP_SEQ
34 #include <Bpp/Seq/Alphabet.all>
35 #include <Bpp/Seq/Sequence.h>
36 #include <Bpp/Seq/Container.all>
37 #endif
38
39
40
41 namespace egglib {
42
43
44 /** \brief Performs conversion between sequence holder types
45 *
46 * \ingroup core
47 *
48 * Static methods of this class allow conversion between sequence
49 * holder types, applying parametrizable modifications.
50 *
51 */
52 class Convert {
53
54 public:
55
56 /** \brief DataMatrix to Align conversion
57 *
58 * By default, this method generates an Align instance
59 * containing only the polymorphic sites. The integers of
60 * the DataMatrix will be converted as follows: 0 to A, 1 to
61 * C, 2 to G and 3 to T. This behaviour can be largely
62 * modified using options.
63 *
64 * \param dataMatrix DataMatrix instance.
65 *
66 * \param length length of the desired alignment. Non-varying
67 * stretches of data will be introduced to reach the
68 * specified length. By default the positions of segregating
69 * sites will be determined from the positions given by the
70 * DataMatrix object. Those positions are expressed in a
71 * continuous range, and will be discretized. Mutations
72 * falling on the same site will be moved by one position
73 * left or right (always preserving the order of mutation
74 * sites). If positions are all zero (the default of the
75 * DataMatrix class) and if length is larger than the number
76 * of segregating sites, then all segregating sites will
77 * cluster on the left-hand side of the alignment.
78 *
79 * \param random the address of a Random object used to
80 * draw random numbers (for randomizing positions and/or
81 * non-varying states). If an address is provided but no
82 * random numbers are required, it is ignored. If no address
83 * is provided and random numbers are required, a Random
84 * instance is built internally.
85 *
86 * \param randomizePositions if true, the positions specified
87 * in the DataMatrix object are ignored and the positions of
88 * mutations are drawn randomly along the interval (only if
89 * the specified length is larger than the number of
90 * segregating sites). If randomizePositions is false, the
91 * positions given by the DataMatrix object are used.
92 *
93 * \param enforceLength specifies whether an
94 * EggRuntimeError should be thrown when the number of
95 * polymorphic sites is larger than the specified length. If
96 * false (the default) and in cases where the specified
97 * length is too short to harbor all polymorphic sites, the
98 * alignment length will be increased as needed.
99 *
100 * \param randomizeNonVaryingStates if true, the stretches of
101 * conserved positions (between segregating sites) will be
102 * randomly drawn from the current symbol mapping. Otherwise,
103 * the symbol given by nonVaryingState will be used.
104 *
105 * \param randomizeAlleles if true, alleles will be drawn
106 * randomly from the mapped characters. Note that if a
107 * genotype value falls outside the range of the mapping, it
108 * will be replaced by the character given by unknown,
109 * without randomization. In other words, with the mapping
110 * "ACGT", alleles 0, 1, 2 and 3 will be randomly assigned
111 * to these four characters, but larger and negative alleles
112 * will be assigned to the unknown character.
113 *
114 * \param mapping a string giving the characters to assign to
115 * the different integer values read from the DataMatrix. If
116 * the read value is 0, the first character of the string
117 * will be used; if the value is 1, the second character will
118 * be used, and so on. If the integer read is out of range
119 * (in particular, for any negative value), then the
120 * character given by unknown will be used. An empty string
121 * will always lead to alignments containing only the
122 * character given by unknown. The string "01" is suitable
123 * for binary data.
124 *
125 * \param unknown the character to use if an integer genotype
126 * value is not mapped in the mapping string (that is, if
127 * the mapping string is too short).
128 *
129 * \param nonVaryingState character to use for conserved
130 * stretches of data. It doesn't have to be included in the
131 * mapping. If randomizeNonVaryingStates is true, this
132 * argument is ignored.
133 *
134 * \return The resulting Align object.
135 *
136 */
137 static Align align(
138 DataMatrix& dataMatrix,
139 unsigned int length=0,
140 Random* random=NULL,
141 bool randomizePositions=false,
142 bool randomizeNonVaryingStates=false,
143 bool randomizeAlleles=false,
144 bool enforceLength=false,
145 std::string mapping="ACGT",
146 char unknown='?',
147 char nonVaryingState='A'
148 );
149
150
151 #ifdef HAVE_LIBBPP_SEQ
152
153 /** \brief Converts an alignment to the equivalent Bio++ type
154 *
155 * During conversion, name information is lost (arbitrary
156 * names are generated in order to prevent duplicate names).
157 * The object is attached to an alphabet matching the passed
158 * integer. The names are bare rank integers (starting at the
159 * value given by *offset*).
160 *
161 * \param align the source alignment object.
162 *
163 * \param alphabetID an integer indicating which alphabet to
164 * use:
165 * - 1 for DNA
166 * - 2 for RNA
167 * - 3 for proteins
168 * - 4 for standard codon
169 * - 5 for vertebrate mitochondrial codon
170 * - 6 for invertebrate mitochondrial codon
171 * - 7 for echinoderm mitochondrial codon
172 * .
173 * Other values will result in an exception.
174 *
175 * \param outgroupFlag an integer indicating whether to
176 * include outgroup sequences:
177 * - 0 use all sequences
178 * - 1 use only sequences without 999 label (ingroup)
179 * - 2 use only sequences with 999 label (outgroup)
180 * .
181 * Other values will result in an exception.
182 *
183 * \param offset an integer used to shift the names of the
184 * resulting alignment (useful to merge alignments and ensure
185 * that names are not duplicated).
186 *
187 * \return A Bio++ alignment.
188 *
189 */
190 static bpp::AlignedSequenceContainer egglib2bpp(Align& align, unsigned int alphabetID, unsigned int outgroupFlag, unsigned int offset=0);
191
192 #endif
193
194
195
196 protected:
197
198 /** \brief Protected default constructor: this class is not meant to be instantiated
199 *
200 */
201 Convert() { }
202
203
204 /** \brief Protected copy constructor: this class is not meant to be instantiated
205 *
206 */
207 Convert(const Convert& source) { }
208
209
210 /** \brief Protected assignment operator: this class is not meant to be instantiated
211 *
212 */
213 Convert& operator=(const Convert& source) { return *this; }
214
215
216 /** \brief Protected destructor: this class is not meant to be instantiated
217 *
218 */
219 virtual ~Convert() { }
220 // Static Bio++ alphabet objects (only declared when HAVE_LIBBPP_SEQ is defined); presumably the alphabets selected by egglib2bpp's alphabetID - confirm in the implementation file.
221 #ifdef HAVE_LIBBPP_SEQ
222 static bpp::DNA dnaAlphabet;
223 static bpp::RNA rnaAlphabet;
224 static bpp::ProteicAlphabet proteicAlphabet;
225 static bpp::StandardCodonAlphabet standardCodonAlphabet;
226 static bpp::VertebrateMitochondrialCodonAlphabet vertebrateMitochondrialCodonAlphabet;
227 static bpp::InvertebrateMitochondrialCodonAlphabet invertebrateMitochondrialCodonAlphabet;
228 static bpp::EchinodermMitochondrialCodonAlphabet echinodermMitochondrialCodonAlphabet;
229 #endif
230
231 };
232 }
233
234 #endif