Mercurial > repos > padge > trimal
diff trimal_repo/source/rwAlignment.cpp @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trimal_repo/source/rwAlignment.cpp Fri Mar 25 17:10:43 2022 +0000 @@ -0,0 +1,2253 @@ +/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** + ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** + + trimAl v1.4: a tool for automated alignment trimming in large-scale + phylogenetics analyses. + + readAl v1.4: a tool for automated alignment conversion among different + formats. + + 2009-2015 Capella-Gutierrez S. and Gabaldon, T. + [scapella, tgabaldon]@crg.es + + This file is part of trimAl/readAl. + + trimAl/readAl are free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, the last available version. + + trimAl/readAl are distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>. + +***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** +***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ + +#include "alignment.h" +#include "defines.h" +#include "utils.h" + +extern int errno; +#include <errno.h> +#include <ctype.h> +#include <string> + +using namespace std; + +bool alignment::fillMatrices(bool aligned) { + /* Function to determine if a set of sequences, that can be aligned or not, + * have been correctly load and are free of errors. */ + int i, j; + + /* Initialize some variables */ + residuesNumber = new int[sequenNumber]; + for(i = 0; i < sequenNumber; i++) { + residuesNumber[i] = sequences[i].size(); + } + + /* Check whether there are any unknow/no allowed character in the sequences */ + for(i = 0; i < sequenNumber; i++) + for(j = 0; j < residuesNumber[i]; j++) + if((!isalpha(sequences[i][j])) && (!ispunct(sequences[i][j]))) { + cerr << endl << "ERROR: The sequence \"" << seqsName[i] << "\" has an " + << "unknown (" << sequences[i][j] << ") character." << endl; + return false; + } + + /* Check whether all sequences have same size or not */ + for(i = 1; i < sequenNumber; i++) + if(residuesNumber[i] != residuesNumber[i-1]) + break; + /* Set an appropriate flag for indicating if sequences are aligned or not */ + isAligned = (i != sequenNumber) ? false : true; + + /* Warm about those cases where sequences should be aligned + * and there are not */ + if (aligned and !isAligned) { + cerr << endl << "ERROR: Sequences should be aligned (all with same length) " + << "and there are not. Check your input alignment" << endl; + return false; + } + + /* Full-fill some information about input alignment */ + if(residNumber == 0) + residNumber = residuesNumber[0]; + + /* Check whether aligned sequences have the length fixed for the input alig */ + for(i = 0; (i < sequenNumber) and (aligned); i++) { + if(residuesNumber[i] != residNumber) { + cerr << endl << "ERROR: The sequence \"" << seqsName[i] << "\" (" + << residuesNumber[i] << ") does not have the same number of residues " + << "fixed by the alignment (" << residNumber << ")." << endl; + return false; + } + } + + /* If the sequences are aligned, initialize some additional variables. + * These variables will be useful for posterior analysis */ + if((aligned) || (isAligned)) { + + /* Asign its position to each column. That will be used to determine which + * columns should be kept in output alignment after applying any method + * and which columns should not */ + saveResidues = new int[residNumber]; + for(i = 0; i < residNumber; i++) + saveResidues[i] = i; + + /* Asign its position to each sequence. Similar to the columns numbering + * process, assign to each sequence its position is useful to know which + * sequences will be in the output alignment */ + saveSequences = new int[sequenNumber]; + for(i = 0; i < sequenNumber; i++) + saveSequences[i] = i; + } + + /* Return an flag indicating that everything is fine */ + return true; +} + +void alignment::getSequences(ostream &file) { + /* Get only residues sequences from input alignment */ + + int i, j; + string *tmpMatrix; + + /* Allocate local memory for generating output alignment */ + tmpMatrix = new string[sequenNumber]; + + /* Depending on alignment orientation: forward or reverse. Copy directly + * sequence information or get firstly the reversed sequences and then copy + * it into local memory. Once orientation has been determined, remove gaps + * from resuting sequences */ + for(i = 0; i < sequenNumber; i++) + tmpMatrix[i] = (!reverse) ? utils::removeCharacter('-', sequences[i]) : \ + utils::removeCharacter('-', utils::getReverse(sequences[i])); + + /* Print output set of sequences in FASTA format*/ + for(i = 0; i < sequenNumber; i++) { + file << ">" << seqsName[i] << endl; + for(j = 0; j < (int) tmpMatrix[i].size(); j += 60) + file << tmpMatrix[i].substr(j, 60) << endl; + file << endl; + } + + /* Deallocate local memory */ + delete [] tmpMatrix; +} + +int alignment::formatInputAlignment(char *alignmentFile) { + /* Guess input alignment format */ + + char c, *firstWord = NULL, *line = NULL; + int format = 0, blocks = 0; + ifstream file; + string nline; + + /* Check the file and its content */ + file.open(alignmentFile, ifstream::in); + if(!utils::checkFile(file)) + return false; + + /* Read first valid line in a safer way */ + do { + line = utils::readLine(file); + } while ((line == NULL) && (!file.eof())); + + /* If the file end is reached without a valid line, warn about it */ + if (file.eof()) + return false; + + /* Otherwise, split line */ + firstWord = strtok(line, OTHDELIMITERS); + + /* Clustal Format */ + if((!strcmp(firstWord, "CLUSTAL")) || (!strcmp(firstWord, "clustal"))) + format = 1; + + /* NBRF/PIR Format */ + else if(firstWord[0] == '>' && firstWord[3] == ';') + format = 3; + + /* Fasta Format */ + else if(firstWord[0] == '>') + format = 8; + + /* Nexus Format */ + else if((!strcmp(firstWord, "#NEXUS")) || (!strcmp(firstWord, "#nexus"))) + format = 17; + + /* Mega Format */ + else if((!strcmp(firstWord, "#MEGA")) || (!strcmp(firstWord, "#mega"))) { + + /* Determine specific mega format: sequential or interleaved. + * Counting the number of blocks (set of lines starting by "#") in + * the input file. */ + blocks = 0; + do { + file.read(&c, 1); + } while((c != '#') && (!file.eof())); + + do { + while((c != '\n') && (!file.eof())) + file.read(&c, 1); + file.read(&c, 1); + if(c == '#') + blocks++; + } while((c != '\n') && (!file.eof())); + + /* MEGA Sequential (22) or Interleaved (21) */ + format = (!blocks) ? 22 : 21; + } + + /* Phylip Format */ + else { + /* Determine specific phylip format: sequential or interleaved. */ + + /* Get number of sequences and residues */ + sequenNumber = atoi(firstWord); + firstWord = strtok(NULL, DELIMITERS); + if(firstWord != NULL) + residNumber = atoi(firstWord); + + /* If there is only one sequence, use by default sequential format since + * it is impossible to determine exactly which phylip format is */ + if((sequenNumber == 1) && (residNumber != 0)) + format = 12; + + /* If there are more than one sequence, analyze sequences distribution to + * determine its format. */ + else if((sequenNumber != 0) && (residNumber != 0)) { + blocks = 0; + + /* Read line in a safer way */ + do { + if (line != NULL) + delete [] line; + line = utils::readLine(file); + } while ((line == NULL) && (!file.eof())); + + /* If the file end is reached without a valid line, warn about it */ + if (file.eof()) + return false; + + firstWord = strtok(line, DELIMITERS); + while(firstWord != NULL) { + blocks++; + firstWord = strtok(NULL, DELIMITERS); + } + + /* Read line in a safer way */ + do { + if (line != NULL) + delete [] line; + line = utils::readLine(file); + } while ((line == NULL) && (!file.eof())); + + firstWord = strtok(line, DELIMITERS); + while(firstWord != NULL) { + blocks--; + firstWord = strtok(NULL, DELIMITERS); + } + + /* If the file end is reached without a valid line, warn about it */ + if (file.eof()) + return false; + + /* Phylip Interleaved (12) or Sequential (11) */ + format = (!blocks) ? 12 : 11; + } + } + + /* Close the input file and delete dinamic memory */ + file.close(); + if (line != NULL) + delete [] line; + + /* Return the input alignment format */ + return format; +} + +bool alignment::loadPhylipAlignment(char *alignmentFile) { + /* PHYLIP/PHYLIP 4 (Sequential) file format parser */ + + char *str, *line = NULL; + ifstream file; + int i; + + /* Check the file and its content */ + file.open(alignmentFile, ifstream::in); + if(!utils::checkFile(file)) + return false; + + /* Store some data about filename for possible uses in other formats */ + filename.append("!Title "); + filename.append(alignmentFile); + filename.append(";"); + + /* Read first valid line in a safer way */ + do { + line = utils::readLine(file); + } while ((line == NULL) && (!file.eof())); + + /* If the file end is reached without a valid line, warn about it */ + if (file.eof()) + return false; + + /* Read the input sequences and residues for each sequence numbers */ + str = strtok(line, DELIMITERS); + sequenNumber = 0; + if(str != NULL) + sequenNumber = atoi(str); + + str = strtok(NULL, DELIMITERS); + residNumber = 0; + if(str != NULL) + residNumber = atoi(str); + + /* If something is wrong about the sequences or/and residues number, + * return an error to warn about that */ + if((sequenNumber == 0) || (residNumber == 0)) + return false; + + /* Allocate memory for the input data */ + sequences = new string[sequenNumber]; + seqsName = new string[sequenNumber]; + + /* Read the lines block containing the sequences name + first fragment */ + i = 0; + while((i < sequenNumber) && (!file.eof())){ + + /* Read lines in a safer way. Destroy previous stored information */ + if (line != NULL) + delete [] line; + line = utils::readLine(file); + + /* It the input line/s are blank lines, skip the loop iteration */ + if(line == NULL) + continue; + + /* First token: Sequence name */ + str = strtok(line, DELIMITERS); + seqsName[i].append(str, strlen(str)); + + /* Trim the rest of the line from blank spaces, tabs, etc and store it */ + str = strtok(NULL, DELIMITERS); + while(str != NULL) { + sequences[i].append(str, strlen(str)); + str = strtok(NULL, DELIMITERS); + } + i++; + } + + /* Read the rest of the input file */ + while(!file.eof()) { + + /* Try to get for each sequences its corresponding residues */ + i = 0; + while((i < sequenNumber) && (!file.eof())) { + /* Read lines in a safer way. Destroy previous stored information */ + if (line != NULL) + delete [] line; + + line = utils::readLine(file); + /* It the input line/s are blank lines, skip the loop iteration */ + if(line == NULL) + continue; + + /* Remove from the current line non-printable characters and add fragments + * to previous stored sequence */ + str = strtok(line, DELIMITERS); + while(str != NULL) { + sequences[i].append(str, strlen(str)); + str = strtok(NULL, DELIMITERS); + } + i++; + } + } + + /* Close the input file and delete dinamic memory */ + file.close(); + if (line != NULL) + delete [] line; + + /* Check the matrix's content */ + return fillMatrices(true); +} + +bool alignment::loadPhylip3_2Alignment(char *alignmentFile) { + /* PHYLIP 3.2 (Interleaved) file format parser */ + + int i, blocksFirstLine, firstLine = true; + char *str, *line = NULL; + ifstream file; + + /* Check the file and its content */ + file.open(alignmentFile, ifstream::in); + if(!utils::checkFile(file)) + return false; + + /* Store the file name for futher format conversion*/ + filename.append("!Title "); + filename.append(alignmentFile); + filename.append(";"); + + /* Read first valid line in a safer way */ + do { + line = utils::readLine(file); + } while ((line == NULL) && (!file.eof())); + + /* If the file end is reached without a valid line, warn about it */ + if (file.eof()) + return false; + + /* Get the sequences and residues numbers. If there is any mistake, + * return a FALSE value to warn about the possible error */ + str = strtok(line, DELIMITERS); + sequenNumber = 0; + if(str != NULL) + sequenNumber = atoi(str); + + str = strtok(NULL, DELIMITERS); + residNumber = 0; + if(str != NULL) + residNumber = atoi(str); + + if((sequenNumber == 0) || (residNumber == 0)) + return false; + + /* Reserve memory according to the input parameters */ + sequences = new string[sequenNumber]; + seqsName = new string[sequenNumber]; + + /* Point to the first sequence in the alignment. Since the alignment could not + * have blank lines to separate the different sequences. Store the blocks size + * for the first line including a sequence identifier */ + i = 0; + blocksFirstLine = 0; + + do { + /* Read lines in a safer way. Destroy previous stored information */ + if (line != NULL) + delete [] line; + + line = utils::readLine(file); + /* If there is nothing in the input line, skip the loop instructions */ + if(line == NULL) + continue; + + str = strtok(line, DELIMITERS); + /* First block: Sequence Name + Sequence fragment. Count how many blocks + * the first sequence line is divided. It could help to identify the + * different sequences from the input file */ + if(firstLine) { + seqsName[i].append(str, strlen(str)); + str = strtok(NULL, OTHDELIMITERS); + firstLine = 1; + } + + /* Sequence fragment */ + while(str != NULL) { + sequences[i].append(str, strlen(str)); + str = strtok(NULL, OTHDELIMITERS); + /* Count the blocks number for the sequences first line */ + if (firstLine) + firstLine += 1; + } + + /* Store the blocks number for the first sequence including the name */ + if ((blocksFirstLine == 0) and firstLine) + blocksFirstLine = firstLine; + + /* If a false positive new sequence was detected, add stored information for + * the current sequence to the previous one and clear the data structure + * for the current sequence. Finally, move the sequence pointer to the + * previous one. */ + if ((firstLine != false) and (firstLine != blocksFirstLine)) { + sequences[i-1].append(seqsName[i]); + seqsName[i].clear(); + sequences[i-1].append(sequences[i]); + sequences[i].clear(); + i --; + } + + firstLine = false; + /* There are many ways to detect a new sequence. */ + /* One of them -experimental- is just to detect if the residues number for + * the current entry is equal to the residues number for the whole align */ + if ((int) sequences[i].size() == residNumber) { + firstLine = true; + i++; + } + } while(!file.eof()); + + /* Close the input file and delete dinamic memory */ + file.close(); + if (line != NULL) + delete [] line; + + /* Check the matrix's content */ + return fillMatrices(true); +} + +bool alignment::loadClustalAlignment(char *alignmentFile) { + /* CLUSTAL file format parser */ + + int i, seqLength, pos, firstBlock; + char *str, *line = NULL; + ifstream file; + + /* Check input file and its content */ + file.open(alignmentFile, ifstream::in); + if(!utils::checkFile(file)) + return false; + + /* Store some details about input file to be used in posterior format + * conversions */ + filename.append("!Title "); + filename.append(alignmentFile); + filename.append(";"); + + /* The first valid line corresponding to CLUSTAL label is ignored */ + do { + line = utils::readLine(file); + } while ((line == NULL) && (!file.eof())); + + /* If the file end is reached without a valid line, warn about it */ + if (file.eof()) + return false; + + /* Ignore blank lines before first sequence block starts */ + while(!file.eof()) { + + /* Deallocate previously used dynamic memory */ + if (line != NULL) + delete [] line; + + /* Read lines in safe way */ + line = utils::readLine(file); + + if (line != NULL) + break; + } + + /* The program in only interested in the first blocks of sequences since + * it wants to know how many sequences are in the input file */ + sequenNumber = 0; + while(!file.eof()) { + + /* If a new line without any valid character is detected + * means the first block is over */ + if (line == NULL) + break; + + /* Count how many times standard characters as well as + * gap symbol "-" is detected in current line. */ + seqLength = (int) strlen(line); + for(pos = 0; pos < seqLength; pos++) + if((isalpha(line[pos])) || (line[pos] == '-')) + break; + + /* If not standard characters are detected in current line means that + * the program has found typical line in clustal alignment files to mark + * some scores for some columns. In that case, the first block is over */ + if(pos == seqLength) + break; + sequenNumber++; + + /* Deallocate previously used dynamic memory */ + if (line != NULL) + delete [] line; + + /* Read lines in safe way */ + line = utils::readLine(file); + } + + /* Finish to preprocess the input file. */ + file.clear(); + file.seekg(0); + + /* Allocate memory for the input alignmet */ + seqsName = new string[sequenNumber]; + sequences = new string[sequenNumber]; + + /* Read the title line and store it */ + line = utils::readLine(file); + if (line == NULL) + return false; + aligInfo.append(line, strlen(line)); + + /* Ignore blank lines before first sequence block starts */ + while(!file.eof()) { + + /* Deallocate previously used dynamic memory */ + if (line != NULL) + delete [] line; + + /* Read lines in safe way */ + line = utils::readLine(file); + + if (line != NULL) + break; + } + + /* Set-up sequences pointer to the first one and the flag to indicate + * the first blocks. That flag implies that sequences names have to be + * stored */ + i = 0; + firstBlock = true; + + while(!file.eof()) { + + if (line == NULL) { + /* Sometimes, clustalw files does not have any marker after first block + * to indicate conservation between its columns residues. In that cases, + * mark the end of first block */ + if (i == 0) + firstBlock = false; + /* Read current line and analyze it*/ + line = utils::readLine(file); + continue; + } + + /* Check whteher current line is a standard line or it is a line to mark + * quality scores for that alignment column */ + seqLength = (int) strlen(line); + for(pos = 0; pos < seqLength; pos++) + if((isalpha(line[pos])) || (line[pos] == '-')) + break; + + /* Start a new block in the input alignment */ + if (pos == seqLength) { + firstBlock = false; + + /* Deallocate dinamic memory if it has been used before */ + if (line != NULL) + delete [] line; + + /* Read current line and analyze it*/ + line = utils::readLine(file); + + continue; + } + + /* If it is a standard line, split it into two parts. The first one contains + * sequence name and the second one the residues. If the "firstBlock" flag + * is active then store the sequence name */ + str = strtok(line, OTHDELIMITERS); + if(str != NULL) { + if(firstBlock) + seqsName[i].append(str, strlen(str)); + str = strtok(NULL, OTHDELIMITERS); + if(str != NULL) + sequences[i].append(str, strlen(str)); + + /* Move sequences pointer in a circular way */ + i = (i + 1) % sequenNumber; + } + + /* Deallocate dinamic memory if it has been used before */ + if (line != NULL) + delete [] line; + + /* Read current line and analyze it*/ + line = utils::readLine(file); + } + + /* Close the input file */ + file.close(); + + /* Deallocate dinamic memory */ + if (line != NULL) + delete [] line; + + /* Check the matrix's content */ + return fillMatrices(true); +} + +bool alignment::loadFastaAlignment(char *alignmentFile) { + /* FASTA file format parser */ + + char *str, *line = NULL; + ifstream file; + int i; + + /* Check the file and its content */ + file.open(alignmentFile, ifstream::in); + if(!utils::checkFile(file)) + return false; + + /* Store input file name for posterior uses in other formats */ + filename.append("!Title "); + filename.append(alignmentFile); + filename.append(";"); + + /* Compute how many sequences are in the input alignment */ + sequenNumber = 0; + while(!file.eof()) { + + /* Deallocate previously used dinamic memory */ + if (line != NULL) + delete [] line; + + /* Read lines in a safe way */ + line = utils::readLine(file); + if (line == NULL) + continue; + + /* It the line starts by ">" means that a new sequence has been found */ + str = strtok(line, DELIMITERS); + if (str == NULL) + continue; + + /* If a sequence name flag is detected, increase sequences counter */ + if(str[0] == '>') + sequenNumber++; + } + + /* Finish to preprocess the input file. */ + file.clear(); + file.seekg(0); + + /* Allocate memory for the input alignmet */ + seqsName = new string[sequenNumber]; + sequences = new string[sequenNumber]; + seqsInfo = new string[sequenNumber]; + + for(i = -1; (i < sequenNumber) && (!file.eof()); ) { + + /* Deallocate previously used dinamic memory */ + if (line != NULL) + delete [] line; + + /* Read lines in a safe way */ + line = utils::readLine(file); + if (line == NULL) + continue; + + /* Store original header fom input sequences including non-standard + * characters */ + if (line[0] == '>') + seqsInfo[i+1].append(&line[1], strlen(line) - 1); + + /* Cut the current line and check whether there are valid characters */ + str = strtok(line, OTHDELIMITERS); + if (str == NULL) + continue; + + /* Check whether current line belongs to the current sequence + * or it is a new one. In that case, store the sequence name */ + if(str[0] == '>') { + /* Move sequence name pointer until a valid string name is obtained */ + do { + str = str + 1; + } while(strlen(str) == 0); + seqsName[++i].append(str, strlen(str)); + continue; + } + + /* Sequence */ + while(str != NULL) { + sequences[i].append(str, strlen(str)); + str = strtok(NULL, DELIMITERS); + } + } + + /* Close the input file */ + file.close(); + + /* Deallocate previously used dinamic memory */ + if (line != NULL) + delete [] line; + + /* Check the matrix's content */ + return fillMatrices(false); +} + +bool alignment::loadNexusAlignment(char *alignmentFile) { + + /* NEXUS file format parser */ + char *frag = NULL, *str = NULL, *line = NULL; + int i, pos, state, firstBlock; + ifstream file; + + /* Check the file and its content */ + file.open(alignmentFile, ifstream::in); + if(!utils::checkFile(file)) + return false; + + /* Store input file name for posterior uses in other formats */ + /* We store the file name */ + filename.append("!Title "); + filename.append(alignmentFile); + filename.append(";"); + + state = false; + do { + + /* Destroy previous assigned memory */ + if (line != NULL) + delete [] line; + + /* Read line in a safer way */ + line = utils::readLine(file); + if (line == NULL) + continue; + + /* Discard line where there is not information */ + str = strtok(line, DELIMITERS); + if(str == NULL) + continue; + + /* If the line has any kind of information, try to catch it */ + /* Firstly, convert to capital letters the input line */ + for(i = 0; i < (int) strlen(str); i++) + str[i] = toupper(str[i]); + + /* ... and then compare it again specific tags */ + if(!strcmp(str, "BEGIN")) + state = true; + + else if(!strcmp(str, "MATRIX")) + break; + + /* Store information about input format file */ + else if(!strcmp(str, "FORMAT")) { + str = strtok(NULL, DELIMITERS); + while(str != NULL) { + aligInfo.append(str, strlen(str)); + aligInfo.append(" ", strlen(" ")); + str = strtok(NULL, DELIMITERS); + } + } + + /* In this case, try to get matrix dimensions */ + else if((!strcmp(str, "DIMENSIONS")) && state) { + str = strtok(NULL, DELIMITERS); + frag = strtok(NULL, DELIMITERS); + str = strtok(str, "=;"); + sequenNumber = atoi(strtok(NULL, "=;")); + frag = strtok(frag, "=;"); + residNumber = atoi(strtok(NULL, "=;")); + } + } while(!file.eof()); + + /* Check all parameters */ + if(strcmp(str, "MATRIX") || (sequenNumber == 0) || (residNumber == 0)) + return false; + + /* Allocate memory for the input alignmet */ + seqsName = new string[sequenNumber]; + sequences = new string[sequenNumber]; + + pos = 0; + state = false; + firstBlock = true; + + while(!file.eof()) { + /* Destroy previous assigned memory */ + if (line != NULL) + delete [] line; + + /* Read line in a safer way */ + line = utils::readLine(file); + if (line == NULL) + continue; + + /* Discard any comments from input file */ + for(i = 0; i < (int) strlen(line); i++) { + if (line[i] == '[') + state = true; + else if (line[i] == ']' && state) { + state = false; + break; + } + } + + /* If there is a multi-line comments, skip it as well */ + if ((state) || (not state && i != (int) strlen(line))) + continue; + + /* Check for a specific tag indicating matrix end */ + if((!strncmp(line, "end;", 4)) || (!strncmp(line, "END;", 4))) + break; + + /* Split input line and check it if it is valid */ + str = strtok(line, OTH2DELIMITERS); + if (str == NULL) + continue; + + /* Store the sequence name, only from the first block */ + if(firstBlock) + seqsName[pos].append(str, strlen(str)); + + /* Store rest of line as part of sequence */ + str = strtok(NULL, OTH2DELIMITERS); + while(str != NULL) { + sequences[pos].append(str, strlen(str)); + str = strtok(NULL, OTH2DELIMITERS); + } + + /* Move sequences pointer to next one. It if it is last one, move it to + * the beginning and set the first block to false for avoiding to rewrite + * sequences name */ + pos = (pos + 1) % sequenNumber; + if (not pos) + firstBlock = false; + } + + /* Deallocate memory */ + if (line != NULL) + delete [] line; + + /* Close the input file */ + file.close(); + + /* Check the matrix's content */ + return fillMatrices(true); +} + +bool alignment::loadMegaNonInterleavedAlignment(char *alignmentFile) { + /* MEGA sequential file format parser */ + + char *frag = NULL, *str = NULL, *line = NULL; + ifstream file; + int i; + + /* Check the file and its content */ + file.open(alignmentFile, ifstream::in); + if(!utils::checkFile(file)) + return false; + + /* Filename is stored as a title for MEGA input alignment. + * If it is detected later a label "TITLE" in input file, this information + * will be replaced for that one */ + filename.append("!Title "); + filename.append(alignmentFile); + filename.append(";"); + + /* Skip first valid line */ + do { + line = utils::readLine(file); + } while ((line == NULL) && (!file.eof())); + + /* If the file end is reached without a valid line, warn about it */ + if (file.eof()) + return false; + + /* Try to get input alignment information */ + while(!file.eof()) { + + /* Destroy previously allocated memory */ + if (line != NULL) + delete [] line; + + /* Read a new line in a safe way */ + line = utils::readLine(file); + if (line == NULL) + continue; + + /* If a sequence name flag is found, go out from getting information loop */ + if(!strncmp(line, "#", 1)) + break; + + /* Destroy previously allocated memory */ + if (frag != NULL) + delete [] frag; + + /* Create a local copy from input line */ + frag = new char[strlen(line) + 1]; + strcpy(frag, line); + + /* Split input line copy into pieces and analize it + * looking for specific labels */ + str = strtok(frag, "!: "); + for(i = 0; i < (int) strlen(str); i++) + str[i] = toupper(str[i]); + + /* If TITLE label is found, replace previously stored information with + * this info */ + if(!strcmp(str, "TITLE")) { + filename.clear(); + if(strncmp(line, "!", 1)) + filename += "!"; + filename += line; + } + + /* If FORMAT label is found, try to get some details from input file */ + else if(!strcmp(str, "FORMAT")) + aligInfo.append(line, strlen(line)); + } + + /* Deallocate local memory */ + if (frag != NULL) + delete [] frag; + + /* Count how many sequences are in input alignment file */ + do { + + /* Check whether input line is valid or not */ + if (line == NULL) { + line = utils::readLine(file); + continue; + } + + /* If current line starts by a # means that it is a sequence name */ + if (!strncmp(line, "#", 1)) + sequenNumber++; + + /* Destroy previously allocated memory */ + if (line != NULL) + delete [] line; + + /* Read a new line in a safe way */ + line = utils::readLine(file); + + } while(!file.eof()); + + /* Move file pointer to the beginner */ + file.clear(); + file.seekg(0); + + /* Allocate memory */ + seqsName = new string[sequenNumber]; + sequences = new string[sequenNumber]; + + /* Skip first line */ + line = utils::readLine(file); + + /* Skip lines until first sequence name is found */ + while(!file.eof()) { + + /* Destroy previously allocated memory */ + if (line != NULL) + delete [] line; + + /* Read a new line in a safe way */ + line = utils::readLine(file); + if (line == NULL) + continue; + + /* If sequence name label is found, go out from loop */ + if (!strncmp(line, "#", 1)) + break; + } + + /* First sequence is already detected so its name should be stored */ + i = -1; + + /* This loop is a bit tricky because first sequence name has been already + * detected, so it is necessary to process it before moving to next line. + * That implies that lines are read at loop ends */ + while(!file.eof()) { + + /* Skip blank lines */ + if (line == NULL) { + line = utils::readLine(file); + continue; + } + + /* Skip lines with comments */ + if (!strncmp(line, "!", 1)) { + /* Deallocate memory and read a new line */ + delete [] line; + line = utils::readLine(file); + continue; + } + + /* Remove comments inside of sequences and split input line */ + frag = utils::trimLine(line); + + /* Skip lines with only comments */ + if (frag == NULL) { + + /* Deallocate memory and read a new line */ + if (line != NULL) + delete [] line; + line = utils::readLine(file); + continue; + } + + /* Otherwise, split it into fragments */ + str = strtok(frag, " #\n"); + + /* Sequence Name */ + if (!strncmp(line, "#", 1)) { + i += 1; + seqsName[i].append(str, strlen(str)); + str = strtok(NULL, " #\n"); + } + + /* Sequence itself */ + while(str != NULL) { + sequences[i].append(str, strlen(str)); + str = strtok(NULL, " \n"); + } + + /* Deallocate dynamic memory */ + if (frag != NULL) + delete [] frag; + + if (line != NULL) + delete [] line; + + /* Read a new line in a safe way */ + line = utils::readLine(file); + } + + /* Close input file */ + file.close(); + + /* Deallocate dynamic memory */ + if (line != NULL) + delete [] line; + + /* Check the matrix's content */ + return fillMatrices(true); +} + +bool alignment::loadMegaInterleavedAlignment(char *alignmentFile) { + /* MEGA interleaved file format parser */ + + char *frag = NULL, *str = NULL, *line = NULL; + int i, firstBlock = true; + ifstream file; + + /* Check the file and its content */ + file.open(alignmentFile, ifstream::in); + if(!utils::checkFile(file)) + return false; + + /* Filename is stored as a title for MEGA input alignment. + * If it is detected later a label "TITLE" in input file, this information + * will be replaced for that one */ + filename.append("!Title "); + filename.append(alignmentFile); + filename.append(";"); + + /* Skip first valid line */ + do { + line = utils::readLine(file); + } while ((line == NULL) && (!file.eof())); + + /* If the file end is reached without a valid line, warn about it */ + if (file.eof()) + return false; + + /* Try to get input alignment information */ + while(!file.eof()) { + + /* Destroy previously allocated memory */ + if (line != NULL) + delete [] line; + + /* Read a new line in a safe way */ + line = utils::readLine(file); + if (line == NULL) + continue; + + /* If a sequence name flag is found, go out from getting information loop */ + if(!strncmp(line, "#", 1)) + break; + + /* Create a local copy from input line */ + frag = new char[strlen(line) + 1]; + strcpy(frag, line); + + /* Split input line copy into pieces and analize it + * looking for specific labels */ + str = strtok(frag, "!: "); + for(i = 0; i < (int) strlen(str); i++) + str[i] = toupper(str[i]); + + /* If TITLE label is found, replace previously stored information with + * this info */ + if(!strcmp(str, "TITLE")) { + filename.clear(); + if(strncmp(line, "!", 1)) + filename += "!"; + filename += line; + } + + /* If FORMAT label is found, try to get some details from input file */ + else if(!strcmp(str, "FORMAT")) + aligInfo.append(line, strlen(line)); + + /* Destroy previously allocated memory */ + if (frag != NULL) + delete [] frag; + } + + /* Count how many sequences are in input file */ + while(!file.eof()) { + + /* If a sequence name flag has been detected, increase counter */ + if(!strncmp(line, "#", 1)) + sequenNumber++; + + /* Deallocate dynamic memory */ + if(line != NULL) + delete [] line; + + /* Read lines in a safe way */ + line = utils::readLine(file); + + /* If a blank line is detected means first block of sequences is over */ + /* Then, break counting sequences loop */ + if (line == NULL) + break; + } + + /* Finish to preprocess the input file. */ + file.clear(); + file.seekg(0); + + /* Allocate memory */ + seqsName = new string[sequenNumber]; + sequences = new string[sequenNumber]; + + /* Skip first line */ + line = utils::readLine(file); + + /* Skip lines until first # flag is reached */ + while(!file.eof()) { + + /* Deallocate previously used dynamic memory */ + if (line != NULL) + delete [] line; + + /* Read line in a safer way */ + line = utils::readLine(file); + + /* Determine whether a # flag has been found in current string */ + if (line != NULL) + if(!strncmp(line, "#", 1)) + break; + } + + /* Read sequences and get from first block, the sequences names */ + i = 0; + firstBlock = true; + + while(!file.eof()) { + + if (line == NULL) { + /* Read line in a safer way */ + line = utils::readLine(file); + continue; + } + + if (!strncmp(line, "!", 1)) { + /* Deallocate memory and read a new line */ + delete [] line; + line = utils::readLine(file); + continue; + } + + /* Trim lines from any kind of comments and split it */ + frag = utils::trimLine(line); + str = strtok(frag, " #\n"); + + /* Check whether a line fragment is valid or not */ + if (str == NULL) + continue; + + /* Store sequences names if firstBlock flag is TRUE */ + if(firstBlock) + seqsName[i].append(str, strlen(str)); + + /* Store sequence */ + str = strtok(NULL, " \n"); + while(str != NULL) { + sequences[i].append(str, strlen(str)); + str = strtok(NULL, " \n"); + } + + /* Deallocate previously used dynamic memory */ + if (frag != NULL) + delete [] frag; + + if (line != NULL) + delete [] line; + + /* Read line in a safer way */ + line = utils::readLine(file); + + i = (i + 1) % sequenNumber; + if (i == 0) + firstBlock = false; + } + + /* Close input file */ + file.close(); + + /* Deallocate local memory */ + if (line != NULL) + delete [] line; + + /* Check the matrix's content */ + return fillMatrices(true); +} + +bool alignment::loadNBRF_PirAlignment(char *alignmentFile) { + /* NBRF/PIR file format parser */ + + bool seqIdLine, seqLines; + char *str, *line = NULL; + ifstream file; + int i; + + /* Check the file and its content */ + file.open(alignmentFile, ifstream::in); + if(!utils::checkFile(file)) + return false; + + /* Store input file name for posterior uses in other formats */ + filename.append("!Title "); + filename.append(alignmentFile); + filename.append(";"); + + /* Compute how many sequences are in the input alignment */ + sequenNumber = 0; + while(!file.eof()) { + + /* Deallocate previously used dinamic memory */ + if (line != NULL) + delete [] line; + + /* Read lines in a safe way */ + line = utils::readLine(file); + if (line == NULL) + continue; + + /* It the line starts by ">" means that a new sequence has been found */ + str = strtok(line, DELIMITERS); + if (str == NULL) + continue; + + /* If a sequence name flag is detected, increase sequences counter */ + if(str[0] == '>') + sequenNumber++; + } + + /* Finish to preprocess the input file. */ + file.clear(); + file.seekg(0); + + /* Allocate memory for the input alignmet */ + sequences = new string[sequenNumber]; + seqsName = new string[sequenNumber]; + seqsInfo = new string[sequenNumber]; + + /* Initialize some local variables */ + seqIdLine = true; + seqLines = false; + i = -1; + + /* Read the entire input file */ + while(!file.eof()) { + + /* Deallocate local memory */ + if (line != NULL) + delete [] line; + + /* Read lines in a safe way */ + line = utils::readLine(file); + if (line == NULL) + continue; + + /* Sequence ID line. + * Identification of these kind of lines is based on presence of ">" and ";" + * symbols at positions 0 and 3 respectively */ + if((line[0] == '>') && (line[3] == ';') && (seqIdLine)) { + seqIdLine = false; + i += 1; + + /* Store information about sequence datatype */ + str = strtok(line, ">;"); + seqsInfo[i].append(str, strlen(str)); + + /* and the sequence identifier itself */ + str = strtok(NULL, ">;"); + seqsName[i].append(str, strlen(str)); + } + + /* Line just after sequence Id line contains a textual description of + * the sequence. */ + else if((!seqIdLine) && (!seqLines)) { + seqLines = true; + seqsInfo[i].append(line, strlen(line)); + } + + /* Sequence lines itself */ + else if (seqLines) { + + /* Check whether a sequence end symbol '*' exists in current line. + * In that case, set appropriate flags to read a new sequence */ + if (line[strlen(line) - 1] == '*') { + seqLines = false; + seqIdLine = true; + } + + /* Process line */ + str = strtok(line, OTHDELIMITERS); + while (str != NULL) { + sequences[i].append(str, strlen(str)); + str = strtok(NULL, OTHDELIMITERS); + } + + /* In case the end symbol '*' has been detected, remove it */ + if(sequences[i][sequences[i].size() - 1] == '*') + sequences[i].erase(sequences[i].size()-1); + + } + } + /* Close the input file */ + file.close(); + + /* Deallocate dinamic memory */ + if (line != NULL) + delete [] line; + + /* Check the matrix's content */ + return fillMatrices(true); +} + +void alignment::alignmentPhylipToFile(ostream &file) { + /* Generate output alignment in PHYLIP/PHYLIP 4 format (sequential) */ + + int i, j, maxLongName; + string *tmpMatrix; + + /* Check whether sequences in the alignment are aligned or not. + * Warn about it if there are not aligned. */ + if (!isAligned) { + cerr << endl << "ERROR: Sequences are not aligned. Format (PHYLIP) " + << "not compatible with unaligned sequences." << endl << endl; + return ; + } + + /* Allocate local memory for generating output alignment */ + tmpMatrix = new string[sequenNumber]; + + /* Depending on alignment orientation: forward or reverse. Copy directly + * sequence information or get firstly the reversed sequences and then + * copy it into local memory */ + for(i = 0; i < sequenNumber; i++) + tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); + + /* Depending on if short name flag is activated (limits sequence name up to + * 10 characters) or not, get maximum sequence name length */ + maxLongName = PHYLIPDISTANCE; + for(i = 0; (i < sequenNumber) && (!shortNames); i++) + maxLongName = utils::max(maxLongName, seqsName[i].size()); + + /* Generating output alignment */ + /* First Line: Sequences Number & Residued Number */ + file << " " << sequenNumber << " " << residNumber << endl; + + /* First Block: Sequences Names & First 60 residues */ + for(i = 0; i < sequenNumber; i++) + file << setw(maxLongName + 3) << left << seqsName[i].substr(0, maxLongName) + << tmpMatrix[i].substr(0, 60) << endl; + file << endl; + + /* Rest of blocks: Print 60 residues per each blocks of sequences */ + for(i = 60; i < residNumber; i += 60) { + for(j = 0; j < sequenNumber; j++) + file << tmpMatrix[j].substr(i, 60) << endl; + file << endl; + } + file << endl; + + /* Deallocate local memory */ + delete [] tmpMatrix; +} + +void alignment::alignmentPhylip3_2ToFile(ostream &file) { + /* Generate output alignment in PHYLIP 3.2 format (interleaved) */ + + int i, j, k, maxLongName; + string *tmpMatrix; + + /* Check whether sequences in the alignment are aligned or not. + * Warn about it if there are not aligned. */ + if (!isAligned) { + cerr << endl << "ERROR: Sequences are not aligned. Format (PHYLIP) " + << "not compatible with unaligned sequences." << endl << endl; + return ; + } + + /* Allocate local memory for generating output alignment */ + tmpMatrix = new string[sequenNumber]; + + /* Depending on alignment orientation: forward or reverse. Copy directly + * sequence information or get firstly the reversed sequences and then + * copy it into local memory */ + for(i = 0; i < sequenNumber; i++) + tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); + + /* Depending on if short name flag is activated (limits sequence name up to + * 10 characters) or not, get maximum sequence name length */ + maxLongName = PHYLIPDISTANCE; + for(i = 0; (i < sequenNumber) && (!shortNames); i++) + maxLongName = utils::max(maxLongName, seqsName[i].size()); + + /* Generating output alignment */ + /* First Line: Sequences Number & Residued Number */ + file << " " << sequenNumber << " " << residNumber << endl; + + /* Alignment */ + /* For each sequence, print its identifier and then the sequence itself in + * blocks of 50 residues */ + for(i = 0; i < sequenNumber; i++) { + /* Sequence Name */ + file << setw(maxLongName + 3) << left << seqsName[i].substr(0, maxLongName); + /* Sequence. Each line contains a block of 5 times 10 residues. */ + for(j = 0; j < residNumber; j += 50) { + for(k = j; (k < residNumber) && (k < (j + 50)); k += 10) + file << sequences[i].substr(k, 10) << " "; + file << endl; + /* If the sequences end has not been reached, print black spaces + * to follow format specifications */ + if((j + 50) < residNumber) + file << setw(maxLongName + 3) << " "; + } + /* Print a blank line to mark sequences separation */ + file << endl; + } + + /* Deallocate local memory */ + delete [] tmpMatrix; +} + +void alignment::alignmentPhylip_PamlToFile(ostream &file) { + /* Generate output alignment in PHYLIP format compatible with PAML program */ + + int i, maxLongName; + string *tmpMatrix; + + /* Check whether sequences in the alignment are aligned or not. + * Warn about it if there are not aligned. */ + if (!isAligned) { + cerr << endl << "ERROR: Sequences are not aligned. Format (PHYLIP) " + << "not compatible with unaligned sequences." << endl << endl; + return ; + } + + /* Allocate local memory for generating output alignment */ + tmpMatrix = new string[sequenNumber]; + + /* Depending on alignment orientation: forward or reverse. Copy directly + * sequence information or get firstly the reversed sequences and then + * copy it into local memory */ + for(i = 0; i < sequenNumber; i++) + tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); + + /* Depending on if short name flag is activated (limits sequence name up to + * 10 characters) or not, get maximum sequence name length */ + maxLongName = PHYLIPDISTANCE; + for(i = 0; (i < sequenNumber) && (!shortNames); i++) + maxLongName = utils::max(maxLongName, seqsName[i].size()); + + /* Generating output alignment */ + /* First Line: Sequences Number & Residued Number */ + file << " " << sequenNumber << " " << residNumber << endl; + + /* Print alignment */ + /* Print sequences name follow by the sequence itself in the same line */ + for(i = 0; i < sequenNumber; i++) + file << setw(maxLongName + 3) << left << seqsName[i].substr(0, maxLongName) + << sequences[i] << endl; + file << endl; + + /* Deallocate local memory */ + delete [] tmpMatrix; +} + +void alignment::alignmentClustalToFile(ostream &file) { + /* Generate output alignment in CLUSTAL format */ + + int i, j, maxLongName = 0; + string *tmpMatrix; + + /* Check whether sequences in the alignment are aligned or not. + * Warn about it if there are not aligned. */ + if (!isAligned) { + cerr << endl << "ERROR: Sequences are not aligned. Format (CLUSTAL) " + << "not compatible with unaligned sequences." << endl << endl; + return ; + } + + /* Allocate local memory for generating output alignment */ + tmpMatrix = new string[sequenNumber]; + + /* Depending on alignment orientation: forward or reverse. Copy directly + * sequence information or get firstly the reversed sequences and then + * copy it into local memory */ + for(i = 0; i < sequenNumber; i++) + tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); + + /* Compute maximum sequences name length */ + for(i = 0; (i < sequenNumber) && (!shortNames); i++) + maxLongName = utils::max(maxLongName, seqsName[i].size()); + + /* Print alignment header */ + if((aligInfo.size() != 0) && (iformat == oformat)) + file << aligInfo << endl << endl; + else + file << "CLUSTAL multiple sequence alignment" << endl << endl; + + /* Print alignment itself */ + /* Print as many blocks as it is needed of lines composed + * by sequences name and 60 residues */ + for(j = 0; j < residNumber; j += 60) { + for(i = 0; i < sequenNumber; i++) + file << setw(maxLongName + 5) << left << seqsName[i] + << tmpMatrix[i].substr(j, 60) << endl; + file << endl << endl; + } + + /* Deallocate local memory */ + delete [] tmpMatrix; +} + +void alignment::alignmentFastaToFile(ostream &file) { + /* Generate output alignment in FASTA format. Sequences can be unaligned. */ + + int i, j, maxLongName; + string *tmpMatrix; + + /* Allocate local memory for generating output alignment */ + tmpMatrix = new string[sequenNumber]; + + /* Depending on alignment orientation: forward or reverse. Copy directly + * sequence information or get firstly the reversed sequences and then + * copy it into local memory */ + for(i = 0; i < sequenNumber; i++) + tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); + + /* Depending on if short name flag is activated (limits sequence name up to + * 10 characters) or not, get maximum sequence name length. Consider those + * cases when the user has asked to keep original sequence header */ + maxLongName = 0; + for(i = 0; i < sequenNumber; i++) + if (!keepHeader) + maxLongName = utils::max(maxLongName, seqsName[i].size()); + else if (seqsInfo != NULL) + maxLongName = utils::max(maxLongName, seqsInfo[i].size()); + + if (shortNames && maxLongName > PHYLIPDISTANCE) { + maxLongName = PHYLIPDISTANCE; + if (keepHeader) + cerr << endl << "WARNING: Original sequence header will be cut by charac" + << "ter 10" << endl; + } + /* Print alignment. First, sequences name id and then the sequences itself */ + for(i = 0; i < sequenNumber; i++) { + if (!keepHeader) + file << ">" << seqsName[i].substr(0, maxLongName) << endl; + else if (seqsInfo != NULL) + file << ">" << seqsInfo[i].substr(0, maxLongName) << endl; + for(j = 0; j < residuesNumber[i]; j+= 60) + file << tmpMatrix[i].substr(j, 60) << endl; + } + + /* Deallocate local memory */ + delete [] tmpMatrix; +} + +void alignment::alignmentNexusToFile(ostream &file) { + /* Generate output alignment in NEXUS format setting only alignment block */ + + int i, j, k, maxLongName = 0; + string *tmpMatrix; + + /* Check whether sequences in the alignment are aligned or not. + * Warn about it if there are not aligned. */ + if (!isAligned) { + cerr << endl << "ERROR: Sequences are not aligned. Format (NEXUS) " + << "not compatible with unaligned sequences." << endl << endl; + return ; + } + + /* Allocate local memory for generating output alignment */ + tmpMatrix = new string[sequenNumber]; + + /* Depending on alignment orientation: forward or reverse. Copy directly + * sequence information or get firstly the reversed sequences and then + * copy it into local memory */ + for(i = 0; i < sequenNumber; i++) + tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); + + /* Compute maximum sequences name length */ + for(i = 0; (i < sequenNumber) && (!shortNames); i++) + maxLongName = utils::max(maxLongName, seqsName[i].size()); + + /* Compute output file datatype */ + getTypeAlignment(); + + /* Remove characters like ";" from input alignment information line */ + while((int) aligInfo.find(";") != (int) string::npos) + aligInfo.erase(aligInfo.find(";"), 1); + + /* Print Alignment header */ + file << "#NEXUS" << endl << "BEGIN DATA;" << endl << " DIMENSIONS NTAX=" + << sequenNumber << " NCHAR=" << residNumber <<";" << endl; + + /* Print alignment datatype */ + if ((dataType == DNAType) || (dataType == DNADeg)) + file << "FORMAT DATATYPE=DNA INTERLEAVE=yes GAP=-"; + else if ((dataType == RNAType) || (dataType == RNADeg)) + file << "FORMAT DATATYPE=RNA INTERLEAVE=yes GAP=-"; + else if (dataType == AAType) + file << "FORMAT DATATYPE=PROTEIN INTERLEAVE=yes GAP=-"; + + i = 0; + /* Using information from input alignment. Use only some tags. */ + while((j = aligInfo.find(" ", i)) != (int) string::npos) { + + if((aligInfo.substr(i, j - i)).compare(0, 7, "MISSING") == 0 || + (aligInfo.substr(i, j)).compare(0, 7, "missing") == 0) + file << " " << (aligInfo.substr(i, j - i)); + + else if((aligInfo.substr(i, j)).compare(0, 9, "MATCHCHAR") == 0 || + (aligInfo.substr(i, j)).compare(0, 9, "matchchar") == 0) + file << " " << (aligInfo.substr(i, j - i)); + + i = j + 1; + } + file << ";" << endl; + + /* Print sequence name and sequence length */ + for(i = 0; i < sequenNumber; i++) + file << "[Name: " << setw(maxLongName + 4) << left << seqsName[i] << "Len: " + << residNumber << "]" << endl; + file << endl << "MATRIX" << endl; + + /* Print alignment itself. Sequence name and 50 residues blocks */ + for(j = 0; j < residNumber; j += 50) { + for(i = 0; i < sequenNumber; i++) { + file << setw(maxLongName + 4) << left << seqsName[i]; + for(k = j; k < (j + 50) && k < residNumber; k += 10) + file << " " << sequences[i].substr(k, 10); + file << endl; + } + file << endl; + } + file << ";" << endl << "END;" << endl; + + /* Deallocate local memory */ + delete [] tmpMatrix; +} + +void alignment::alignmentMegaToFile(ostream &file) { + /* Generate output alignment in MEGA format */ + + int i, j, k; + string *tmpMatrix; + + /* Check whether sequences in the alignment are aligned or not. + * Warn about it if there are not aligned. */ + if (!isAligned) { + cerr << endl << "ERROR: Sequences are not aligned. Format (MEGA) " + << "not compatible with unaligned sequences." << endl << endl; + return ; + } + + /* Allocate local memory for generating output alignment */ + tmpMatrix = new string[sequenNumber]; + + /* Depending on alignment orientation: forward or reverse. Copy directly + * sequence information or get firstly the reversed sequences and then + * copy it into local memory */ + for(i = 0; i < sequenNumber; i++) + tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); + + /* Compute output file datatype */ + getTypeAlignment(); + + /* Print output alignment header */ + file << "#MEGA" << endl << filename << endl; + + /* Print alignment datatype */ + if ((dataType == DNAType) || (dataType == DNADeg)) + file << "!Format DataType=DNA "; + else if ((dataType == RNAType) || (dataType == RNADeg)) + file << "!Format DataType=RNA "; + else if (dataType == AAType) + file << "!Format DataType=protein "; + + /* Print number of sequences and alignment length */ + file << "NSeqs=" << sequenNumber << " Nsites=" << residNumber + << " indel=- CodeTable=Standard;" << endl << endl; + + /* Print sequences name and sequences divided into blocks of 50 residues */ + for(i = 0; i < sequenNumber; i++) { + file << "#" << seqsName[i] << endl; + for(j = 0; j < residNumber; j += 50) { + for(k = j; ((k < residNumber) && (k < j + 50)); k += 10) + file << tmpMatrix[i].substr(k, 10) << " "; + file << endl; + } + file << endl; + } + + /* Deallocate local memory */ + delete [] tmpMatrix; +} + +void alignment::alignmentNBRF_PirToFile(ostream &file) { + /* Generate output alignment in NBRF/PIR format. Sequences can be unaligned */ + + int i, j, k; + string alg_datatype, *tmpMatrix; + + /* Allocate local memory for generating output alignment */ + tmpMatrix = new string[sequenNumber]; + + /* Depending on alignment orientation: forward or reverse. Copy directly + * sequence information or get firstly the reversed sequences and then + * copy it into local memory */ + for(i = 0; i < sequenNumber; i++) + tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); + + /* Compute output file datatype */ + getTypeAlignment(); + if ((dataType == DNAType) || (dataType == DNADeg)) + alg_datatype = "DL"; + else if ((dataType == RNAType) || (dataType == RNADeg)) + alg_datatype = "RL"; + else if (dataType == AAType) + alg_datatype = "P1"; + + /* Print alignment */ + for(i = 0; i < sequenNumber; i++) { + + /* Print sequence datatype and its name */ + if((seqsInfo != NULL) && (iformat == oformat)) + file << ">" << seqsInfo[i].substr(0, 2) << ";" << seqsName[i] + << endl << seqsInfo[i].substr(2) << endl; + else + file << ">" << alg_datatype << ";" << seqsName[i] << endl + << seqsName[i] << " " << residuesNumber[i] << " bases" << endl; + + /* Write the sequence */ + for(j = 0; j < residuesNumber[i]; j += 50) { + for(k = j; (k < residuesNumber[i]) && (k < (j + 50)); k += 10) + file << " " << tmpMatrix[i].substr(k, 10); + + if(k >= residuesNumber[i]) { + if((residuesNumber[i] % 50) == 0) + file << endl << " "; + else if((residuesNumber[i] % 10) == 0) + file << " "; + file << "*"; + } + file << endl; + } + file << endl; + } + + /* Deallocate local memory */ + delete [] tmpMatrix; +} + +bool alignment::alignmentSummaryHTML(char *destFile, int residues, int seqs, \ + int *selectedRes, int *selectedSeq, float *consValues) { + + /* Generate an HTML file with a visual summary about which sequences/columns + * have been selected and which have not */ + + int i, j, k, kj, upper, minHTML, maxLongName, *gapsValues; + string tmpColumn; + float *simValues; + bool *res, *seq; + ofstream file; + char type; + + /* Allocate some local memory */ + tmpColumn.reserve(sequenNumber); + + /* Check whether sequences in the alignment are aligned or not. + * Warn about it if there are not aligned. */ + if (!isAligned) { + cerr << endl << "ERROR: Sequences are not aligned." << endl << endl; + return false; + } + + /* Open output file and check that file pointer is valid */ + file.open(destFile); + if(!file) + return false; + + /* Compute maximum sequences name length. */ + maxLongName = 0; + for(i = 0; i < sequenNumber; i++) + maxLongName = utils::max(maxLongName, seqsName[i].size()); + + /* Compute HTML blank spaces */ + minHTML = utils::max(25, maxLongName + 10); + + /* Initialize local variables to control which columns/sequences + * will be kept in the output alignment */ + res = new bool[residNumber]; + for(i = 0; i < residNumber; i++) + res[i] = false; + + seq = new bool[sequenNumber]; + for(i = 0; i < sequenNumber; i++) + seq[i] = false; + + /* Record which columns/sequences from original alignment + * have been kept in the final one */ + for(i = 0; i < residues; i++) + res[selectedRes[i]] = true; + for(i = 0; i < seqs; i++) + seq[selectedSeq[i]] = true; + + /* Recover some stats about different scores from current alignment */ + gapsValues = NULL; + if (sgaps != NULL) + gapsValues = sgaps -> getGapsWindow(); + simValues = NULL; + if (scons != NULL) + simValues = scons -> getMdkwVector(); + + /* Print HTML header into output file */ + file << "<!DOCTYPE html>" << endl << "<html><head>" << endl << " <meta " + << "http-equiv=\"Content-Type\" content=\"text/html;charset=ISO-8859-1\" />" + << endl << " <title>trimAl v1.4 Summary</title>" << endl + << " <style type=\"text/css\" media=\"all\">" << endl + + << " #b { background-color: #3366ff; }\n" + << " #r { background-color: #cc0000; }\n" + << " #g { background-color: #33cc00; }\n" + << " #p { background-color: #ff6666; }\n" + << " #m { background-color: #cc33cc; }\n" + << " #o { background-color: #ff9900; }\n" + << " #c { background-color: #46C7C7; }\n" + << " #y { background-color: #FFFF00; }\n" + + << " .sel { background-color: #B9B9B9; }\n" + << " .nsel { background-color: #E9E9E9; }\n" + + /* Sets of colors for high-lighting scores intervals */ + << " .c1 { background-color: #FFFBF2; }\n" + << " .c2 { background-color: #FFF8CC; }\n" + << " .c3 { background-color: #FAF0BE; }\n" + << " .c4 { background-color: #F0EAD6; }\n" + << " .c5 { background-color: #F3E5AB; }\n" + << " .c6 { background-color: #F4C430; }\n" + << " .c7 { background-color: #C2B280; color: white; }\n" + << " .c8 { background-color: #DAA520; color: white; }\n" + << " .c9 { background-color: #B8860B; color: white; }\n" + << " .c10 { background-color: #918151; color: white; }\n" + << " .c11 { background-color: #967117; color: white; }\n" + << " .c12 { background-color: #6E5411; color: white; }\n" + + /* Other HTML elements */ + << " </style>\n </head>\n\n" << " <body>\n" << " <pre>" << endl; + + /* Show information about how many sequences/residues have been selected */ + file << " <span class=sel>Selected Sequences: " << setw(5) << right << seqs + <<" /Selected Residues: " << setw(7) << right << residues << "</span>" + << endl << " <span class=nsel>Deleted Sequences: " << setw(5) << right + << sequenNumber - seqs << " /Deleted Residues: " << setw(7) << right + << residNumber - residues << "</span>" << endl; + + /* Print headers for different scores derived from input alignment/s */ + if (gapsValues != NULL) + file << endl << setw(minHTML) << left << " Gaps Scores: " + << "<span class=c1> =0= </span><span class=c2> <.001 </span>" + << "<span class=c3> <.050 </span><span class=c4> <.100 </span>" + << "<span class=c5> <.150 </span><span class=c6> <.200 </span>" + << "<span class=c7> <.250 </span><span class=c8> <.350 </span>" + << "<span class=c9> <.500 </span><span class=c10> <.750 </span>" + << "<span class=c11> <1.00 </span><span class=c12> =1= </span>"; + + if (simValues != NULL) + file << endl << setw(minHTML) << left << " Similarity Scores: " + << "<span class=c1> =0= </span><span class=c2> <1e-6 </span>" + << "<span class=c3> <1e-5 </span><span class=c4> <1e-4 </span>" + << "<span class=c5> <.001 </span><span class=c6> <.010 </span>" + << "<span class=c7> <.100 </span><span class=c8> <.250 </span>" + << "<span class=c9> <.500 </span><span class=c10> <.750 </span>" + << "<span class=c11> <1.00 </span><span class=c12> =1= </span>"; + + if (consValues != NULL) + file << endl << setw(minHTML) << left << " Consistency Scores: " + << "<span class=c1> =0= </span><span class=c2> <.001 </span>" + << "<span class=c3> <.050 </span><span class=c4> <.100 </span>" + << "<span class=c5> <.150 </span><span class=c6> <.200 </span>" + << "<span class=c7> <.250 </span><span class=c8> <.350 </span>" + << "<span class=c9> <.500 </span><span class=c10> <.750 </span>" + << "<span class=c11> <1.00 </span><span class=c12> =1= </span>"; + + if ((gapsValues != NULL) or (simValues == NULL) or (consValues == NULL)) + file << endl; + + /* Print Sequences in block of BLOCK_SIZE */ + for(j = 0, upper = HTMLBLOCKS; j < residNumber; j += HTMLBLOCKS, upper += \ + HTMLBLOCKS) { + + /* Print main columns number */ + file << endl << setw(minHTML + 10) << right << (j + 10); + for(i = j + 20; ((i <= residNumber) && (i <= upper)); i += 10) + file << setw(10) << right << (i); + + /* Print special characters to delimit sequences blocks */ + file << endl << setw(minHTML + 1) << right; + for(i = j + 1; ((i <= residNumber) && (i <= upper)); i++) + file << (!(i % 10) ? "+" : "="); + file << endl; + + /* Print sequences name */ + for(i = 0; i < sequenNumber; i++) { + file << " <span class=" << ((seq[i]) ? "sel>" : "nsel>") << seqsName[i] + << "</span>" << setw(minHTML - 4 - seqsName[i].size()) << right << ""; + + /* Print residues corresponding to current sequences block */ + for(k = j; ((k < residNumber) && (k < upper)); k++) { + for(kj = 0, tmpColumn.clear(); kj < sequenNumber; kj++) + tmpColumn += sequences[kj][k]; + /* Determine residue color based on residues across the alig column */ + type = utils::determineColor(sequences[i][k], tmpColumn); + if (type == 'w') + file << sequences[i][k]; + else + file << "<span id=" << type << ">" << sequences[i][k] << "</span>"; + } + file << endl; + } + + file << endl << setw(minHTML) << left << " Selected Cols: "; + for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++) + file << "<span class=" << (res[k] ? "sel" : "nsel") << "> </span>"; + file << endl; + + /* If there is not any score to print, skip this part of the function */ + if ((gapsValues == NULL) and (simValues == NULL) and (consValues == NULL)) + continue; + + /* Print score colors according to certain predefined thresholds */ + if (gapsValues != NULL) { + file << endl << setw(minHTML) << left << " Gaps Scores: "; + for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++) + if(gapsValues[k] == 0) + file << "<span class=c12> </span>"; + else if(gapsValues[k] == sequenNumber) + file << "<span class=c1> </span>"; + else if(1 - (float(gapsValues[k])/sequenNumber) >= .750) + file << "<span class=c11> </span>"; + else if(1 - (float(gapsValues[k])/sequenNumber) >= .500) + file << "<span class=c10> </span>"; + else if(1 - (float(gapsValues[k])/sequenNumber) >= .350) + file << "<span class=c9> </span>"; + else if(1 - (float(gapsValues[k])/sequenNumber) >= .250) + file << "<span class=c8> </span>"; + else if(1 - (float(gapsValues[k])/sequenNumber) >= .200) + file << "<span class=c7> </span>"; + else if(1 - (float(gapsValues[k])/sequenNumber) >= .150) + file << "<span class=c6> </span>"; + else if(1 - (float(gapsValues[k])/sequenNumber) >= .100) + file << "<span class=c5> </span>"; + else if(1 - (float(gapsValues[k])/sequenNumber) >= .050) + file << "<span class=c4> </span>"; + else if(1 - (float(gapsValues[k])/sequenNumber) >= .001) + file << "<span class=c3> </span>"; + else + file << "<span class=c2> </span>"; + } + if (simValues != NULL) { + file << endl << setw(minHTML) << left << " Similarity Scores: "; + for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++) + if(simValues[k] == 1) + file << "<span class=c12> </span>"; + else if(simValues[k] == 0) + file << "<span class=c1> </span>"; + else if(simValues[k] >= .750) + file << "<span class=c11> </span>"; + else if(simValues[k] >= .500) + file << "<span class=c10> </span>"; + else if(simValues[k] >= .250) + file << "<span class=c9> </span>"; + else if(simValues[k] >= .100) + file << "<span class=c8> </span>"; + else if(simValues[k] >= .010) + file << "<span class=c7> </span>"; + else if(simValues[k] >= .001) + file << "<span class=c6> </span>"; + else if(simValues[k] >= 1e-4) + file << "<span class=c5> </span>"; + else if(simValues[k] >= 1e-5) + file << "<span class=c4> </span>"; + else if(simValues[k] >= 1e-6) + file << "<span class=c3> </span>"; + else + file << "<span class=c2> </span>"; + } + if (consValues != NULL) { + file << endl << setw(minHTML) << left << " Consistency Scores: "; + for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++) + if(consValues[k] == 1) + file << "<span class=c12> </span>"; + else if(consValues[k] == 0) + file << "<span class=c1> </span>"; + else if(consValues[k] >= .750) + file << "<span class=c11> </span>"; + else if(consValues[k] >= .500) + file << "<span class=c10> </span>"; + else if(consValues[k] >= .350) + file << "<span class=c9> </span>"; + else if(consValues[k] >= .250) + file << "<span class=c8> </span>"; + else if(consValues[k] >= .200) + file << "<span class=c7> </span>"; + else if(consValues[k] >= .150) + file << "<span class=c6> </span>"; + else if(consValues[k] >= .100) + file << "<span class=c5> </span>"; + else if(consValues[k] >= .050) + file << "<span class=c4> </span>"; + else if(consValues[k] >= .001) + file << "<span class=c3> </span>"; + else + file << "<span class=c2> </span>"; + } + file << endl; + } + + /* Print HTML footer into output file */ + file << " </pre>" << endl << " </body>" << endl << "</html>" << endl; + + /* Close output file and deallocate local memory */ + file.close(); + delete [] seq; + delete [] res; + + return true; +} + +bool alignment::alignmentColourHTML(ostream &file) { + + int i, j, kj, upper, k = 0, maxLongName = 0; + string tmpColumn; + char type; + + /* Allocate some local memory */ + tmpColumn.reserve(sequenNumber); + + /* Check whether sequences in the alignment are aligned or not. + * Warn about it if there are not aligned. */ + if (!isAligned) { + cerr << endl << "ERROR: Sequences are not aligned." << endl << endl; + return false; + } + + /* Compute maximum sequences name length */ + maxLongName = 0; + for(i = 0; i < sequenNumber; i++) + maxLongName = utils::max(maxLongName, seqsName[i].size()); + + + /* Print HTML header into output file */ + file << "<!DOCTYPE html>" << endl << "<html><head>" << endl << " <meta " + << "http-equiv=\"Content-Type\" content=\"text/html;charset=ISO-8859-1\" />" + << endl << " <title>readAl v1.4</title>" << endl + << " <style type=\"text/css\">" << endl + << " #b { background-color: #3366ff; }\n" + << " #r { background-color: #cc0000; }\n" + << " #g { background-color: #33cc00; }\n" + << " #p { background-color: #ff6666; }\n" + << " #m { background-color: #cc33cc; }\n" + << " #o { background-color: #ff9900; }\n" + << " #c { background-color: #46C7C7; }\n" + << " #y { background-color: #FFFF00; }\n" + << " </style>\n </head>\n\n" << " <body>\n <pre>" << endl; + + /* Print sequences colored according to CLUSTAL scheme based on + * physical-chemical properties */ + for(j = 0, upper = HTMLBLOCKS; j < residNumber; j += HTMLBLOCKS, upper += \ + HTMLBLOCKS) { + + file << endl; + /* Print main columns number */ + file << setw(maxLongName + 19) << right << (j + 10); + for(i = j + 20; ((i <= residNumber) && (i <= upper)); i += 10) + file << setw(10) << right << i; + + /* Print special characters to delimit sequences blocks */ + file << endl << setw(maxLongName + 10); + for(i = j + 1; ((i <= residNumber) && (i <= upper)); i++) + file << (!(i % 10) ? "+" : "="); + + /* Print sequences themselves */ + for(i = 0; i < sequenNumber; i++) { + + /* Print sequences name */ + file << endl << setw(maxLongName + 9) << left << seqsName[i]; + + /* Print residues corresponding to current sequences block */ + for(k = j; ((k < residNumber) && (k < upper)); k++) { + for(kj = 0, tmpColumn.clear(); kj < sequenNumber; kj++) + tmpColumn += sequences[kj][k]; + /* Determine residue color based on residues across the alig column */ + type = utils::determineColor(sequences[i][k], tmpColumn); + if (type == 'w') + file << sequences[i][k]; + else + file << "<span id=" << type << ">" << sequences[i][k] << "</span>"; + } + } + file << endl; + } + + /* Print HTML footer into output file */ + file << " </pre>" << endl << " </body>" << endl << "</html>" << endl; + + return true; +} + +void alignment::printAlignmentInfo(ostream &file) { + /* Print information about sequences number, average sequence length, maximum + * and minimum sequences length, etc */ + + int i, j, valid_res, max, min, max_pos, min_pos, total_res; + + /* Storage which sequences are the longest and shortest ones */ + max = 0; + max_pos = 0; + min_pos = 0; + min = residuesNumber[0]; + + for(i = 0, total_res = 0; i < sequenNumber; i++) { + + /* Discard gaps from current sequence and then compute real length */ + for(j = 0, valid_res = 0; j < residuesNumber[i]; j++) + valid_res += (sequences[i][j] != '-' ? 1 : 0); + + /* Compute the total residues in the alignment to calculate avg. sequence + * length */ + total_res += valid_res; + + /* Get values for the longest sequence */ + max_pos = (max > valid_res) ? max_pos : i; + max = (max > valid_res) ? max : valid_res; + /* Similarily, get values for the shortest sequence */ + min_pos = (min < valid_res) ? min_pos : i; + min = (min < valid_res) ? min : valid_res; + } + + file << "## Total sequences\t" << sequenNumber << endl; + if (isFileAligned()) + file << "## Alignment length\t" << residNumber << endl; + file << "## Avg. sequence length\t" << (float) total_res / sequenNumber << endl + << "## Longest seq. name\t'" << seqsName[max_pos] << "'" << endl + << "## Longest seq. length\t" << max << endl + << "## Shortest seq. name\t'" << seqsName[min_pos] << "'" << endl + << "## Shortest seq. length\t" << min << endl; +}