comparison egglib/egglib-2.1.5/include/egglib-cpp/Ms.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
comparison
equal deleted inserted replaced
0:3e19d0dfcf3e 1:420b57c3c185
1 /*
2 Copyright 2008,2009,2011 Stéphane De Mita and Mathieu Siol
3
4 This file is part of the EggLib library.
5
6 EggLib is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 EggLib is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with EggLib. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef EGGLIB_GMS_HPP
21 #define EGGLIB_GMS_HPP
22
23 #include "DataMatrix.hpp"
24 #include <string>
25 #include <istream>
26
27 namespace egglib {
28
29 /** \brief ms-like sequence format parser
30 *
31 * The class provides parsing (input) and formatting (output)
32 * operations in ms format, that is the format used by Richard
33 * Hudson's program ms for outputting genotypes and by the
34 * associated program samplestat for reading them. Both types of
35 * operations are available through static methods using either
36 * a string or a stream (which can be a stream to or from a file
37 * or a string). In either case, types from the STL are used.
38 * Although ms deals only with data coded with 0 and 1, the class Ms
39 * offers the possibility of both importing and exporting data coded
40 * as integers. All methods have an option named "separated". If
41 * this option is true, the parser or formatter introduces a slight
42 * modification of the format: individual genotype values are
43 * separated by a white space ("1 0 1 1" instead of "1011", allowing
44 * genotype values larger than 9: "1 0 11 1").
45 *
46 * \ingroup core
47 *
48 */
49 class Ms {
50
51 public:
52
53 /** \brief Imports a sequence alignment
54 *
55 * Creates an istringstream from the string and calls the
56 * overloaded method.
57 *
58 * \param str the string to parse.
59 * \param ns the expected number of sequences.
60 * \param separated true if a white space separator is placed
61 * between genotypes at each site.
62 *
63 * \return A sequence alignment as a data matrix.
64 */
65 static DataMatrix get(std::string, unsigned int ns, bool separated=false);
66
67
68 /** \brief Imports a sequence alignment
69 *
70 * Attempts to generate a DataMatrix object from the stream.
71 * Reads only one simulation and throws a SeqlibFormatError
72 * exception in case of format error.
73 *
74 * Allows any number of blank lines before the //, but no other
75 * data. Supports \r at the end of lines (before the \n).
76 * Accepted symbols are all integers (0-9).
77 *
78 * \param stream the stream to parse.
79 * \param ns the expected number of sequences.
80 * \param separated true if a white space separator is placed
81 * between genotypes at each site.
82 *
83 * \return A sequence alignment as a data matrix.
84 */
85 static DataMatrix get(std::istream& stream, unsigned int ns, bool separated=false);
86
87
88 /** \brief Exports a sequence alignment
89 *
90 * Internally creates a stringstream, calls the overloaded method
91 * and returns the outcome.
92 *
93 * \param dataMatrix the alignment object to write.
94 * \param separated true if a white space separator must be placed
95 * between the genotypes at each site.
96 * \return The formatted output as a string.
97 */
98 static std::string format(DataMatrix& dataMatrix, bool separated=false);
99
100
101 /** \brief Exports a sequence alignment
102 *
103 * Writes the formatted string to the stream 'on the fly'. The
104 * formatted string is guaranteed to start with a // line and
105 * end with an empty line. The client is expected to take care
106 * of writing any header and adding an additional blank line between
107 * simulations if needed. The method throws a SeqlibRuntimeError
108 * if the stream is not writable. The data matrix should contain
109 * only data within range 0-9 if separated is false (default) and
110 * any non-negative (>=0) integer if separated is true. Note that
111 * output generated with separated=true is never compatible with
112 * the original ms format, and that output generated with
113 * separated=false is compatible with the original ms format only
114 * if all alleles are 0 or 1 (which is not checked by this
115 * formatter).
116 *
117 * \param stream the stream (file or string stream) where to
118 * write the output.
119 * \param dataMatrix the alignment object to write.
120 * \param separated true if a white space separator must be placed
121 * between the genotypes at each site.
122 *
123 */
124 static void format(std::ostream& stream, DataMatrix& dataMatrix, bool separated=false);
125
126
127 /** \brief Returns the last tMRCA read by any Ms instance
128 *
129 * If a tMRCA value was present in the last simulation read by
130 * any Ms instance, it will be returned by this method. A value
131 * of -1. is returned if no simulation was read, or if the last
132 * simulation didn't contain a tMRCA value or if the last
133 * simulation provoked an exception before reaching the tMRCA
134 * line.
135 *
136 */
137 static double tMRCA();
138
139
140 /** \brief Returns the last "prob" read by any Ms instance
141 *
142 * "prob" is returned by ms when a fixed number of segregating
143 * sites is used in conjunction with a theta value. If a "prob"
144 * value was present in the last simulation read by any Ms
145 * instance, it will be returned by this method. A value of -1
146 * is returned if no simulation was read, or if the last
147 * simulation didn't contain a "prob" value or if the last
148 * simulation provoked an exception before reaching the "prob"
149 * line.
150 *
151 */
152 static double prob();
153
154
155 /** \brief Returns the tree string found in the last simulation read by any Ms instance
156 *
157 * If one or more trees were present in the last simulation read
158 * by any Ms instance, they will be returned as a unique string
159 * by this method. An empty string is returned if no simulation
160 * was read, or if the last simulation
161 * didn't contain any tree value or if the last simulation
162 * provoked an exception before reaching the tree line.
163 *
164 * Note: the trees are returned as a single line.
165 *
166 */
167 static std::string trees();
168
169
170 private:
171 // Line parser (the last \n is extracted and discarded - no error upon EOF). NOTE(review): declared non-static although Ms cannot be instantiated from outside; confirm whether it should be static.
172 std::string next_line(std::istream& stream);
173
174 /// tMRCA (-1 if not found in ms output)
175 static double _tMRCA;
176
177 /// probability (-1 if not found in ms output)
178 static double _prob;
179
180 /// tree string (may contain several trees) (empty string if not found in ms output)
181 static std::string _trees;
182
183
184 /// No instantiation allowed
185 Ms() { }
186
187 /// A fortiori no destruction allowed
188 ~Ms() { }
189
190 /// No copy allowed
191 Ms(const Ms&) { }
192
193 /// No copy assignment allowed
194 Ms& operator=(const Ms&) { return *this; }
195
196 };
197 }
198
199 #endif