| 1 | 1 /* | 
|  | 2     Copyright 2008-2009 Stéphane De Mita, Mathieu Siol | 
|  | 3 | 
|  | 4     This file is part of the EggLib library. | 
|  | 5 | 
|  | 6     EggLib is free software: you can redistribute it and/or modify | 
|  | 7     it under the terms of the GNU General Public License as published by | 
|  | 8     the Free Software Foundation, either version 3 of the License, or | 
|  | 9     (at your option) any later version. | 
|  | 10 | 
|  | 11     EggLib is distributed in the hope that it will be useful, | 
|  | 12     but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 14     GNU General Public License for more details. | 
|  | 15 | 
|  | 16     You should have received a copy of the GNU General Public License | 
|  | 17     along with EggLib.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 18 */ | 
|  | 19 | 
|  | 20 | 
|  | 21 #ifndef EGGLIB_ALIGN_HPP | 
|  | 22 #define EGGLIB_ALIGN_HPP | 
|  | 23 | 
|  | 24 #include "Container.hpp" | 
|  | 25 #include "CharMatrix.hpp" | 
|  | 26 #include <vector> | 
|  | 27 | 
|  | 28 /** \mainpage Summary | 
|  | 29  * | 
|  | 30  * This is the automatically-generated reference manual of the C++ | 
|  | 31  * egglib-cpp library. The library is presented as several modules, but | 
|  | 32  * note that they are only used to structure the documentation. | 
|  | 33  * | 
|  | 34  * There is a single namespace (egglib) in which all classes are | 
|  | 35  * defined. See an example of programming with egglib-cpp in the | 
|  | 36  * EggLib package main documentation. Use "Modules" or "Classes" above | 
|  | 37  * to navigate in the library reference manual. | 
|  | 38  * | 
|  | 39  */ | 
|  | 40 | 
|  | 41 | 
|  | 42 /** \defgroup core core | 
|  | 43  * | 
|  | 44  * \brief Central core of the C++ library of Egglib | 
|  | 45  * | 
|  | 46  * Data storage classes, parsers/formatters and tools, plus exception | 
|  | 47  * types. | 
|  | 48  * | 
|  | 49  */ | 
|  | 50 | 
|  | 51 namespace egglib { | 
|  | 52 | 
|  | 53 | 
|  | 54    /** \brief Handles a sequence alignment | 
|  | 55     * | 
|  | 56     * \ingroup core | 
|  | 57     * | 
|  | 58     * Creation from a file or string stream should be performed using | 
|  | 59     * the class Fasta. Align objects can be created by deep copy from | 
|  | 60     * both Align and Container type. In the latter case, the length are | 
|  | 61     * artificially equalized by "?" characters. Align objects can be | 
|  | 62     * created from a DataMatrix object (and all the way arround) using | 
|  | 63     * the specific class DMAConverter. | 
|  | 64     * | 
|  | 65     * Sequences are represented by two strings (name and sequence) and | 
|  | 66     * an integer (group) that can be accessed or modified by index.The | 
|  | 67     * order of sequences is guaranteed to be conserved, as if Align was | 
|  | 68     * a list of triplets (name, sequence, group). | 
|  | 69     * | 
|  | 70     * The data matrix is implemented as continuous array (char**) and | 
|  | 71     * allows efficient access and modification of data. For very large | 
|  | 72     * data matrices you might claim immediately the required memory | 
|  | 73     * using the constructor Align(unsigned int, char**). | 
|  | 74     * | 
|  | 75     */ | 
|  | 76     class Align : public Container, public CharMatrix { | 
|  | 77         public: | 
|  | 78 | 
|  | 79            /** \brief Creates an empty alignment | 
|  | 80             * | 
|  | 81             */ | 
|  | 82             Align(); | 
|  | 83 | 
|  | 84 | 
|  | 85            /** \brief Creates an alignment from a data matrix. | 
|  | 86             * | 
|  | 87             * Allows you to create an object from data stored in a char* | 
|  | 88             * array. The array's dimensions must be passed to the | 
|  | 89             * constructor, and as a result there is not need to | 
|  | 90             * terminate each sequence by a NULL character. | 
|  | 91             * | 
|  | 92             * \param number_of_sequences the number of sequences (the | 
|  | 93             * length of the first dimension of the array). | 
|  | 94             * | 
|  | 95             * \param alignment_length the length of sequences (the | 
|  | 96             * length of all lines of the array). | 
|  | 97             * | 
|  | 98             * \param cstring_array the pointer to the data matrix. | 
|  | 99             * | 
|  | 100             */ | 
|  | 101             Align(unsigned int number_of_sequences, unsigned int alignment_length, char const * const * const cstring_array); | 
|  | 102 | 
|  | 103 | 
|  | 104             /** \brief Creates an alignment with given dimensions | 
|  | 105              * | 
|  | 106              * Allows you to allocate directly a data matrix of a given | 
|  | 107              * size. Names are empty strings, groups 0, and all | 
|  | 108              * characters are ?. | 
|  | 109              * | 
|  | 110             * \param number_of_sequences the number of sequences (the | 
|  | 111             * length of the first dimension of the array). | 
|  | 112             * | 
|  | 113             * \param alignment_length the length of sequences (the | 
|  | 114             * length of all lines of the array). | 
|  | 115             * | 
|  | 116             */ | 
|  | 117             Align(unsigned int number_of_sequences, unsigned int alignment_length); | 
|  | 118 | 
|  | 119 | 
|  | 120            /** \brief Copy constructor | 
|  | 121             * | 
|  | 122             */ | 
|  | 123             Align(const Align& align); | 
|  | 124 | 
|  | 125 | 
|  | 126            /** \brief Copy constructor accepting a Container object | 
|  | 127             * | 
|  | 128             * All but the longest sequences are padded with ? to match | 
|  | 129             * the longest sequence's length. | 
|  | 130             * | 
|  | 131             */ | 
|  | 132             Align(const Container& container); | 
|  | 133 | 
|  | 134 | 
|  | 135            /** \brief Copy operator | 
|  | 136             * | 
|  | 137             */ | 
|  | 138             Align& operator=(const Align& align); | 
|  | 139 | 
|  | 140 | 
|  | 141            /** \brief Copy operator accepting a Container object | 
|  | 142             * | 
|  | 143             * All but the longest sequences are padded with ? to match | 
|  | 144             * the longest sequence's length. | 
|  | 145             * | 
|  | 146             */ | 
|  | 147             Align& operator=(const Container& container); | 
|  | 148 | 
|  | 149 | 
|  | 150            /** \brief Destructor | 
|  | 151             * | 
|  | 152             */ | 
|  | 153             virtual ~Align(); | 
|  | 154 | 
|  | 155 | 
|  | 156            /** \brief Adds a sequence | 
|  | 157             * | 
|  | 158             * If the object already contains at least one sequence, the | 
|  | 159             * new sequence must have the same length. Otherwise, a | 
|  | 160             * EggUnalignedError is raised. | 
|  | 161             * | 
|  | 162             * \param name the name of the sequence. | 
|  | 163             * \param sequence the sequence string. | 
|  | 164             * \param group the group index of the sequence. | 
|  | 165             * \return The new number of sequences. | 
|  | 166             * | 
|  | 167             */ | 
|  | 168             virtual unsigned int append(const char* name, const char* sequence, unsigned int group=0); | 
|  | 169 | 
|  | 170 | 
|  | 171            /** \brief Removes a position (column) of the alignment | 
|  | 172             * | 
|  | 173             * \param pos the position to remove in the alignment. | 
|  | 174             * \return The new length of the alignment. | 
|  | 175             * | 
|  | 176             */ | 
|  | 177             virtual unsigned int removePosition(unsigned int pos); | 
|  | 178 | 
|  | 179 | 
|  | 180            /** \brief Removes a sequence from the alignment | 
|  | 181             * | 
|  | 182             * \param pos the index of the sequence to remove. | 
|  | 183             * \return The new number of sequences. | 
|  | 184             * | 
|  | 185             */ | 
|  | 186             virtual unsigned int remove(unsigned int pos); | 
|  | 187 | 
|  | 188 | 
|  | 189            /** \brief Replace a sequence string | 
|  | 190             * | 
|  | 191             * The new sequence must have the same length than the | 
|  | 192             * alignment. Otherwise, a EggUnalignedError is raised. | 
|  | 193             * | 
|  | 194             * \param seq the index of the sequence to change. | 
|  | 195             * \param sequence the new sequence. | 
|  | 196             * | 
|  | 197             */ | 
|  | 198             virtual void sequence(unsigned int seq, const char* sequence); | 
|  | 199 | 
|  | 200 | 
|  | 201            /** \brief Gets the name of a given sequence | 
|  | 202             * | 
|  | 203             * \param pos the index of the sequence. | 
|  | 204             * | 
|  | 205             * \return The sequence string for that particular sequence. | 
|  | 206             * | 
|  | 207             */ | 
|  | 208             virtual inline const char* sequence(unsigned int pos) const { return Container::sequence(pos); } | 
|  | 209 | 
|  | 210 | 
|  | 211            /** \brief Alignment length | 
|  | 212             * | 
|  | 213             * Returns 0 if the alignment is empty. | 
|  | 214             * | 
|  | 215             */ | 
|  | 216             virtual  unsigned int ls() const; | 
|  | 217 | 
|  | 218 | 
|  | 219            /** \brief Length of a given sequence | 
|  | 220             * | 
|  | 221             * Calling this function is exactly the same as calling ls() | 
|  | 222             * (without arguments), regardless of the index provided, | 
|  | 223             * except that an exception is thrown if the index is out of | 
|  | 224             * bounds. Provided for compatibility with Container. | 
|  | 225             * | 
|  | 226             * \param pos the index of the sequence. | 
|  | 227             * \return the length of the alignment. | 
|  | 228             * | 
|  | 229             */ | 
|  | 230             virtual unsigned int ls(unsigned int pos) const; | 
|  | 231 | 
|  | 232 | 
|  | 233            /** \brief Fast and unsecure accessor | 
|  | 234             * | 
|  | 235             * This accessor doesn't perform out-of-bound checking! | 
|  | 236             * | 
|  | 237             * \param s the index of the sequence (line). | 
|  | 238             * \param p the position in the alignment (column). | 
|  | 239             * \return The character at the given position. | 
|  | 240             * | 
|  | 241             */ | 
|  | 242             inline char character(unsigned int s, unsigned int p) const { return sequences[s][p]; } | 
|  | 243 | 
|  | 244 | 
|  | 245            /** \brief Gets a nucleotide | 
|  | 246             * | 
|  | 247             * This modifier does perform out-of-bound checking. | 
|  | 248             * The specified position must exist. | 
|  | 249             * | 
|  | 250             * \param sequence the index of the sequence (line). | 
|  | 251             * \param position the position in the alignment (column). | 
|  | 252             * \return the character at the given position. | 
|  | 253             * | 
|  | 254             */ | 
|  | 255             virtual char get(unsigned int sequence, unsigned int position) const; | 
|  | 256 | 
|  | 257 | 
|  | 258            /** \brief Sets a matrix position to a new character | 
|  | 259             * | 
|  | 260             * This modifier does perform out-of-bound checking. | 
|  | 261             * The specified position must exist. | 
|  | 262             * | 
|  | 263             * \param sequence the index of the sequence (line). | 
|  | 264             * \param position the position in the alignment (column). | 
|  | 265             * \param ch the new character value. | 
|  | 266             */ | 
|  | 267             virtual void set(unsigned int sequence, unsigned position, char ch); | 
|  | 268 | 
|  | 269 | 
|  | 270            /** \brief Reverse a given column in binary data | 
|  | 271             * | 
|  | 272             * The specified column must contain only "0" ans "1" characters. | 
|  | 273             * "0" is replaced by "1" and all the way around | 
|  | 274             * | 
|  | 275             */ | 
|  | 276             void binSwitch(unsigned int pos); | 
|  | 277 | 
|  | 278 | 
|  | 279            /** \brief Extracts specified positions (columns) of the alignment | 
|  | 280             * | 
|  | 281             * All the specified sites are extracted in the specified | 
|  | 282             * order. This function is suitable for bootstrap (resample | 
|  | 283             * allowing redrawing the same site) and permutations. | 
|  | 284             * | 
|  | 285             * This function doesn't perform out-of-bound checking. | 
|  | 286             * | 
|  | 287             * \param list_of_sites a vector containing alignment | 
|  | 288             * positions. | 
|  | 289             * | 
|  | 290             * \return A copy of the object containing the specified | 
|  | 291             * set of positions. | 
|  | 292             * | 
|  | 293             */ | 
|  | 294             Align vslice(std::vector<unsigned int> list_of_sites); | 
|  | 295 | 
|  | 296 | 
|  | 297            /** \brief Extracts a range of positions (columns) | 
|  | 298             * | 
|  | 299             * \param a the first position. | 
|  | 300             * | 
|  | 301             * \param b the index immediately passed the last sequence to | 
|  | 302             * extract. | 
|  | 303             * | 
|  | 304             * \return A copy of the object containing the specified | 
|  | 305             * range of sequences. | 
|  | 306             * | 
|  | 307             * Positions a to b-1 are extracted, provided that the | 
|  | 308             * indices fit in the current length of sequences. To extract | 
|  | 309             * all sequences, use align.vslice(0, align.ls()). | 
|  | 310             * | 
|  | 311             * Note: invalid ranges will be silently supported. If | 
|  | 312             * a>=ls or b<=a, an empty object is returned. If b>ns, | 
|  | 313             * ls will be substituted to a. | 
|  | 314             */ | 
|  | 315             Align vslice(unsigned int a, unsigned int b); | 
|  | 316 | 
|  | 317 | 
|  | 318            /** \brief Deletes all the content of the object | 
|  | 319             * | 
|  | 320             */ | 
|  | 321             virtual void clear(); | 
|  | 322 | 
|  | 323 | 
|  | 324            /** \brief Same as ns() | 
|  | 325             * | 
|  | 326             */ | 
|  | 327             inline unsigned int numberOfSequences() const { | 
|  | 328                 return _ns; | 
|  | 329             } | 
|  | 330 | 
|  | 331 | 
|  | 332            /** \brief Same as ls() | 
|  | 333             * | 
|  | 334             */ | 
|  | 335             inline unsigned int numberOfSites() const { | 
|  | 336                 return _ls; | 
|  | 337             } | 
|  | 338 | 
|  | 339 | 
|  | 340            /** \brief Gets a group label (insecure) | 
|  | 341             * | 
|  | 342             */ | 
|  | 343             inline unsigned int populationLabel(unsigned int sequenceIndex) const { | 
|  | 344                 return groups[sequenceIndex]; | 
|  | 345             } | 
|  | 346 | 
|  | 347 | 
|  | 348            /** \brief Just return the passed value | 
|  | 349             * | 
|  | 350             */ | 
|  | 351             inline double sitePosition(unsigned int position) const { | 
|  | 352                 return (double) position; | 
|  | 353             } | 
|  | 354 | 
|  | 355 | 
|  | 356         protected: | 
|  | 357 | 
|  | 358             /// This function is not available for alignments | 
|  | 359             virtual void appendSequence(unsigned int pos, const char* sequence) {} | 
|  | 360 | 
|  | 361             // Initializer (creates a valid empty alignment) | 
|  | 362             virtual void init(); | 
|  | 363 | 
|  | 364             // Makes a deep copy of the specified data matrix - if cstring_array is NULL, then ignores it and pads with ?'s | 
|  | 365             virtual void setFromSource(unsigned int number_of_sequences, unsigned int alignment_length, const char* const * const cstring_array); | 
|  | 366 | 
|  | 367             // Copies from a Container | 
|  | 368             virtual void copyObject(const Container&); | 
|  | 369 | 
|  | 370             // Copies from an Align | 
|  | 371             virtual void copyObject(const Align&); | 
|  | 372 | 
|  | 373             // Alignment length | 
|  | 374             unsigned int _ls; | 
|  | 375     }; | 
|  | 376 } | 
|  | 377 | 
|  | 378 #endif |