diff egglib/egglib-2.1.5/include/egglib-cpp/Ms.hpp @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/egglib/egglib-2.1.5/include/egglib-cpp/Ms.hpp	Fri Jul 10 04:39:30 2015 -0400
@@ -0,0 +1,199 @@
+/*
+    Copyright 2008,2009,2011 Stéphane De Mita and Mathieu Siol
+
+    This file is part of the EggLib library.
+
+    EggLib is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    EggLib is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with EggLib.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef EGGLIB_GMS_HPP
+#define EGGLIB_GMS_HPP
+
+#include "DataMatrix.hpp"
+#include <string>
+#include <istream>
+
+namespace egglib {
+
+    /** \brief ms-like sequence format parser
+     *
+     * The class provides parsing (input) and formatting (output)
+     * operations in ms format, that is the format used by Richard
+     * Hudson's program ms for outputting genotypes and by the
+     * associated program samplestat for reading them. Both types of
+     * operations are available through static methods using either
+     * a string or a stream (which can be a stream to or from a file
+     * or a string). In either case, types from the STL are used.
+     * Although ms deals only with data coded with 0 and 1, the class Ms
+     * offers the possibility of both importing and exporting data coded
+     * with integers. The get() and format() methods have an option
+     * named "separated". If this option is true, the parser or formatter
+     * introduces a slight modification of the format: individual
+     * genotype data are separated by a white space ("1 0 1 1" instead of
+     * "1011"), allowing genotype values larger than 9: "1 0 11 1".
+     *
+     * \ingroup core
+     *
+     */
+     class Ms {
+
+      public:
+
+       /** \brief Imports a sequence alignment
+        *
+        * Creates an istringstream from the string and calls the
+        * overloaded method.
+        *
+        * \param str the string to parse.
+        * \param ns the expected number of sequences.
+        * \param separated true if a white space separator is placed
+        * between the genotypes at each site.
+        *
+        * \return A sequence alignment as a data matrix.
+        */
+        static DataMatrix get(std::string str, unsigned int ns, bool separated=false);
+
+
+       /** \brief Imports a sequence alignment
+        *
+        * Attempts to generate a DataMatrix object from the stream.
+        * Reads only one simulation and throws a SeqlibFormatError
+        * exception in case of format error.
+        *
+        * Allows any number of blank lines before the //, but no other
+        * data. Supports a carriage return at the end of lines (before the
+        * line feed). Accepted symbols are all integers (0-9).
+        *
+        * \param stream the stream to parse.
+        * \param ns the expected number of sequences.
+        * \param separated true if a white space separator is placed
+        * between the genotypes at each site.
+        *
+        * \return A sequence alignment as a data matrix.
+        */
+        static DataMatrix get(std::istream& stream, unsigned int ns, bool separated=false);
+
+
+       /** \brief Exports a sequence alignment
+        *
+        * Internally creates a stringstream, calls the overloaded method
+        * and returns the outcome.
+        *
+        * \param dataMatrix the alignment object to write.
+        * \param separated true if a white space separator must be placed
+        * between the genotypes at each site.
+        * \return The formatted alignment as a string.
+        */
+        static std::string format(DataMatrix& dataMatrix, bool separated=false);
+
+
+       /** \brief Exports a sequence alignment
+        *
+        * Writes the formatted string to the stream 'on the fly'. The
+        * formatted string is guaranteed to start with a // line and
+        * end with an empty line. The client is expected to take care
+        * of writing any header and adding an additional blank line between
+        * simulations if needed. The method throws a SeqlibRuntimeError
+        * if the stream is not writable. The data matrix should contain
+        * only data within range 0-9 if separated is false (default) and
+        * any non-negative (>=0) integer if separated is true. Note that
+        * output generated with separated=true is never compatible with
+        * the original ms format, and that output generated with
+        * separated=false is compatible with the original ms format only
+        * if all alleles are 0 or 1 (which is not checked by this
+        * formatter).
+        *
+        * \param stream the stream (file or string stream) where to
+        * write the output.
+        * \param dataMatrix the alignment object to write.
+        * \param separated true if a white space separator must be placed
+        * between the genotypes at each site.
+        *
+        */
+        static void format(std::ostream& stream, DataMatrix& dataMatrix, bool separated=false);
+
+
+       /** \brief Returns the last tMRCA read by any Ms instance
+        *
+        * If a tMRCA value was present in the last simulation read by
+        * any Ms instance, it will be returned by this method. A value
+        * of -1. is returned if no simulation was read, or if the last
+        * simulation didn't contain a tMRCA value or if the last
+        * simulation provoked an exception before reaching the tMRCA
+        * line.
+        *
+        */
+        static double tMRCA();
+
+
+       /** \brief Returns the last "prob" read by any Ms instance
+        *
+        * "prob" is returned by ms when a fixed number of segregating
+        * sites is used in conjunction with a theta value. If a "prob"
+        * value was present in the last simulation read by any Ms
+        * instance, it will be returned by this method. A value of -1
+        * is returned if no simulation was read, or if the last
+        * simulation didn't contain a "prob" value or if the last
+        * simulation provoked an exception before reaching the "prob"
+        * line.
+        *
+        */
+        static double prob();
+
+
+       /** \brief Returns the tree string found in the last simulation read by any Ms instance
+        *
+        * If one or more trees were present in the last simulation read
+        * by any Ms instance, they will be returned as a unique string
+        * by this method. An empty string is returned if no simulation
+        * was read, or if the last simulation didn't contain any tree
+        * value or if the last simulation provoked an exception before
+        * reaching the tree line.
+        *
+        * Note: the trees are returned as a single line.
+        *
+        */
+        static std::string trees();
+
+
+      private:
+        // Line parser (the last \n is extracted and discarded - no error upon EOF).
+        // NOTE(review): non-static in a non-instantiable class - should this be static?
+        std::string next_line(std::istream& stream);
+
+        /// tMRCA (-1 if not found in ms output)
+        static double _tMRCA;
+
+        /// probability (-1 if not found in ms output)
+        static double _prob;
+
+        /// tree string (may contain several trees) (empty string if not found in ms output)
+        static std::string _trees;
+
+        /// No instantiation allowed
+        Ms() { }
+
+        /// A fortiori no destruction allowed
+        ~Ms() { }
+
+        /// No copy allowed
+        Ms(const Ms&) { }
+
+        /// No copy allowed
+        Ms& operator=(const Ms&) { return *this; }
+
+    };
+}
+    
+#endif