1
|
1 /*
|
|
2 Copyright 2008,2009,2011 Stéphane De Mita and Mathieu Siol
|
|
3
|
|
4 This file is part of the EggLib library.
|
|
5
|
|
6 EggLib is free software: you can redistribute it and/or modify
|
|
7 it under the terms of the GNU General Public License as published by
|
|
8 the Free Software Foundation, either version 3 of the License, or
|
|
9 (at your option) any later version.
|
|
10
|
|
11 EggLib is distributed in the hope that it will be useful,
|
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 GNU General Public License for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
|
|
18 */
|
|
19
|
|
20 #ifndef EGGLIB_GMS_HPP
|
|
21 #define EGGLIB_GMS_HPP
|
|
22
|
|
23 #include "DataMatrix.hpp"
|
|
24 #include <string>
|
|
25 #include <istream>
|
|
26
|
|
27 namespace egglib {
|
|
28
|
|
29 /** \brief ms-like sequence format parser
|
|
30 *
|
|
31 * The class provides parsing (input) and formatting (output)
|
|
32 * operations in ms format, that is the format used by Richard
|
|
33 * Hudson's program ms for outputting genotypes and by the
|
|
34 * associated program samplestat for reading them. Both types of
|
|
35 * operations are available through static methods using either
|
|
36 * a string or a stream (which can be a stream to or from a file
|
|
37 * or a string). In either case, types from the STL are used.
|
|
38 * Although ms deals only with data coded with 0 and 1, the class Ms
|
|
39 * offers the possibility of both importing and exporting data coded
|
|
40 * with any integer. All methods have an option named "separated". If
|
|
41 * this option is true, the parser or formatter introduces a slight
|
|
42 * modification of the format: individual genotype data are
|
|
43 * separated by a white space ("1 0 1 1" instead of "1011", allowing
|
|
44 * genotype values larger than 9: "1 0 11 1").
|
|
45 *
|
|
46 * \ingroup core
|
|
47 *
|
|
48 */
|
|
49 class Ms {
|
|
50
|
|
51 public:
|
|
52
|
|
53 /** \brief Imports a sequence alignment
|
|
54 *
|
|
55 * Creates a istringstream from the string and calls the
|
|
56 * overloaded method.
|
|
57 *
|
|
58 * \param str the string to parse.
|
|
59 * \param ns the expected number of sequences.
|
|
60 * \param separated true if a white space separator is placed
|
|
61 * between genotype at each site.
|
|
62 *
|
|
63 * \return A sequence alignment as a data matrix.
|
|
64 */
|
|
65 static DataMatrix get(std::string, unsigned int ns, bool separated=false);
|
|
66
|
|
67
|
|
68 /** \brief Imports a sequence alignment
|
|
69 *
|
|
70 * Attemps to generate a DataMatrix object from the stream.
|
|
71 * Reads only one simulation and throws a SeqlibFormatError
|
|
72 * exception in case of format error.
|
|
73 *
|
|
74 * Allows any number of white lines before the //, but no other
|
|
75 * data. Supports \r at the end of lines (before the \n).
|
|
76 * Accepted symbols are all integers (0-9).
|
|
77 *
|
|
78 * \param stream the stream to parse.
|
|
79 * \param ns the expected number of sequences.
|
|
80 * \param separated true if a white space separator is placed
|
|
81 * between genotype at each site.
|
|
82 *
|
|
83 * \return A sequence alignment as a data matrix.
|
|
84 */
|
|
85 static DataMatrix get(std::istream& stream, unsigned int ns, bool separated=false);
|
|
86
|
|
87
|
|
88 /** \brief Exports a sequence alignment
|
|
89 *
|
|
90 * Internally creates a stringstream, calls the overloaded method
|
|
91 * and returns the outcome.
|
|
92 *
|
|
93 * \param dataMatrix the alignment object to write.
|
|
94 * \param separated true if a white space separator must be placed
|
|
95 * between the genotype at each site.
|
|
96 *
|
|
97 */
|
|
98 static std::string format(DataMatrix& dataMatrix, bool separated=false);
|
|
99
|
|
100
|
|
101 /** \brief Exports a sequence alignment
|
|
102 *
|
|
103 * Writes the formatted string to the stream 'on the fly'. The
|
|
104 * formatted string is guaranteed to starts with a // line and
|
|
105 * ends with an empty line. The client is expected to take care
|
|
106 * of writing any header and add an additional white line between
|
|
107 * simulations if needed. The method throws a SeqlibRuntimeError
|
|
108 * if the stream is not writable. The data matrix should contain
|
|
109 * only data within range 0-9 if separated is false (default) and
|
|
110 * any positive (>=0) integer if separated is true. Note that
|
|
111 * output generated with separated=true is never compatible with
|
|
112 * the original ms format, and that output generated with
|
|
113 * separator=false is compatible with the original ms format only
|
|
114 * if all alleles are 0 or 1 (which is not checked by this
|
|
115 * formatted).
|
|
116 *
|
|
117 * \param stream the stream (file or string stream) where to
|
|
118 * write the output.
|
|
119 * \param dataMatrix the alignment object to write.
|
|
120 * \param separated true if a white space separator must be placed
|
|
121 * between the genotype at each site.
|
|
122 *
|
|
123 */
|
|
124 static void format(std::ostream& stream, DataMatrix& dataMatrix, bool separated=false);
|
|
125
|
|
126
|
|
127 /** \brief Returns the last tMRCA read by any Ms instance
|
|
128 *
|
|
129 * If a tMRCA value was present in the last simulation read by
|
|
130 * any Ms instance, it will be returned by this method. A value
|
|
131 * of -1. is returned if no simulation was read, or if the last
|
|
132 * simulation didn't contain a tMRCA value or if the last
|
|
133 * simulation provoked an exception before reaching the tMRCA
|
|
134 * line.
|
|
135 *
|
|
136 */
|
|
137 static double tMRCA();
|
|
138
|
|
139
|
|
140 /** \brief Returns the last "prob" read by any Ms instance
|
|
141 *
|
|
142 * "prob" is returned by ms when a fixed number of segregating
|
|
143 * sites is used in conjunction with a theta value. If a "prob"
|
|
144 * value was present in the last simulation read by any Ms
|
|
145 * instance, it will be returned by this method. A value of -1
|
|
146 * is returned if no simulation was read, or if the last
|
|
147 * simulation didn't contain a "prob" value or if the last
|
|
148 * simulation provoked an exception before reaching the "prob"
|
|
149 * line.
|
|
150 *
|
|
151 */
|
|
152 static double prob();
|
|
153
|
|
154
|
|
155 /** \brief Returns the tree string found in the last simulation read by any Ms instance
|
|
156 *
|
|
157 * If one or more trees were present in the last simulation read
|
|
158 * by any Ms instance, they will be returned as a unique string
|
|
159 * by this method. An empty string is returned if no simulation
|
|
160 * was read, or if the last simulation, or if the last simulation
|
|
161 * didn't contain any tree value or if the last simulation
|
|
162 * provoked an exception before reaching the tree line.
|
|
163 *
|
|
164 * Note: the trees are returned as a single line.
|
|
165 *
|
|
166 */
|
|
167 static std::string trees();
|
|
168
|
|
169
|
|
170 private:
|
|
171 // Line parser (the last \n is extracted and discarded - no error upon EOF)
|
|
172 std::string next_line(std::istream& stream);
|
|
173
|
|
174 /// tMRCA (-1 if not found in ms output)
|
|
175 static double _tMRCA;
|
|
176
|
|
177 /// probability (-1 if not found in ms output)
|
|
178 static double _prob;
|
|
179
|
|
180 /// tree string (maybe contain several trees) (empty string if not found in ms output)
|
|
181 static std::string _trees;
|
|
182
|
|
183
|
|
184 /// No instantiation allowed
|
|
185 Ms() { }
|
|
186
|
|
187 /// A fortiori no destruction allowed
|
|
188 ~Ms() { }
|
|
189
|
|
190 /// No copy allowed
|
|
191 Ms(const Ms&) { }
|
|
192
|
|
193 /// No copy allowed
|
|
194 Ms& operator=(const Ms&) { return *this; }
|
|
195
|
|
196 };
|
|
197 }
|
|
198
|
|
199 #endif
|