Mercurial > repos > padge > trimal
view trimal_repo/source/rwAlignment.cpp @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
line wrap: on
line source
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** trimAl v1.4: a tool for automated alignment trimming in large-scale phylogenetics analyses. readAl v1.4: a tool for automated alignment conversion among different formats. 2009-2015 Capella-Gutierrez S. and Gabaldon, T. [scapella, tgabaldon]@crg.es This file is part of trimAl/readAl. trimAl/readAl are free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, the last available version. trimAl/readAl are distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>. ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ #include "alignment.h" #include "defines.h" #include "utils.h" extern int errno; #include <errno.h> #include <ctype.h> #include <string> using namespace std; bool alignment::fillMatrices(bool aligned) { /* Function to determine if a set of sequences, that can be aligned or not, * have been correctly load and are free of errors. */ int i, j; /* Initialize some variables */ residuesNumber = new int[sequenNumber]; for(i = 0; i < sequenNumber; i++) { residuesNumber[i] = sequences[i].size(); } /* Check whether there are any unknow/no allowed character in the sequences */ for(i = 0; i < sequenNumber; i++) for(j = 0; j < residuesNumber[i]; j++) if((!isalpha(sequences[i][j])) && (!ispunct(sequences[i][j]))) { cerr << endl << "ERROR: The sequence \"" << seqsName[i] << "\" has an " << "unknown (" << sequences[i][j] << ") character." << endl; return false; } /* Check whether all sequences have same size or not */ for(i = 1; i < sequenNumber; i++) if(residuesNumber[i] != residuesNumber[i-1]) break; /* Set an appropriate flag for indicating if sequences are aligned or not */ isAligned = (i != sequenNumber) ? false : true; /* Warm about those cases where sequences should be aligned * and there are not */ if (aligned and !isAligned) { cerr << endl << "ERROR: Sequences should be aligned (all with same length) " << "and there are not. Check your input alignment" << endl; return false; } /* Full-fill some information about input alignment */ if(residNumber == 0) residNumber = residuesNumber[0]; /* Check whether aligned sequences have the length fixed for the input alig */ for(i = 0; (i < sequenNumber) and (aligned); i++) { if(residuesNumber[i] != residNumber) { cerr << endl << "ERROR: The sequence \"" << seqsName[i] << "\" (" << residuesNumber[i] << ") does not have the same number of residues " << "fixed by the alignment (" << residNumber << ")." << endl; return false; } } /* If the sequences are aligned, initialize some additional variables. * These variables will be useful for posterior analysis */ if((aligned) || (isAligned)) { /* Asign its position to each column. That will be used to determine which * columns should be kept in output alignment after applying any method * and which columns should not */ saveResidues = new int[residNumber]; for(i = 0; i < residNumber; i++) saveResidues[i] = i; /* Asign its position to each sequence. Similar to the columns numbering * process, assign to each sequence its position is useful to know which * sequences will be in the output alignment */ saveSequences = new int[sequenNumber]; for(i = 0; i < sequenNumber; i++) saveSequences[i] = i; } /* Return an flag indicating that everything is fine */ return true; } void alignment::getSequences(ostream &file) { /* Get only residues sequences from input alignment */ int i, j; string *tmpMatrix; /* Allocate local memory for generating output alignment */ tmpMatrix = new string[sequenNumber]; /* Depending on alignment orientation: forward or reverse. Copy directly * sequence information or get firstly the reversed sequences and then copy * it into local memory. Once orientation has been determined, remove gaps * from resuting sequences */ for(i = 0; i < sequenNumber; i++) tmpMatrix[i] = (!reverse) ? utils::removeCharacter('-', sequences[i]) : \ utils::removeCharacter('-', utils::getReverse(sequences[i])); /* Print output set of sequences in FASTA format*/ for(i = 0; i < sequenNumber; i++) { file << ">" << seqsName[i] << endl; for(j = 0; j < (int) tmpMatrix[i].size(); j += 60) file << tmpMatrix[i].substr(j, 60) << endl; file << endl; } /* Deallocate local memory */ delete [] tmpMatrix; } int alignment::formatInputAlignment(char *alignmentFile) { /* Guess input alignment format */ char c, *firstWord = NULL, *line = NULL; int format = 0, blocks = 0; ifstream file; string nline; /* Check the file and its content */ file.open(alignmentFile, ifstream::in); if(!utils::checkFile(file)) return false; /* Read first valid line in a safer way */ do { line = utils::readLine(file); } while ((line == NULL) && (!file.eof())); /* If the file end is reached without a valid line, warn about it */ if (file.eof()) return false; /* Otherwise, split line */ firstWord = strtok(line, OTHDELIMITERS); /* Clustal Format */ if((!strcmp(firstWord, "CLUSTAL")) || (!strcmp(firstWord, "clustal"))) format = 1; /* NBRF/PIR Format */ else if(firstWord[0] == '>' && firstWord[3] == ';') format = 3; /* Fasta Format */ else if(firstWord[0] == '>') format = 8; /* Nexus Format */ else if((!strcmp(firstWord, "#NEXUS")) || (!strcmp(firstWord, "#nexus"))) format = 17; /* Mega Format */ else if((!strcmp(firstWord, "#MEGA")) || (!strcmp(firstWord, "#mega"))) { /* Determine specific mega format: sequential or interleaved. * Counting the number of blocks (set of lines starting by "#") in * the input file. */ blocks = 0; do { file.read(&c, 1); } while((c != '#') && (!file.eof())); do { while((c != '\n') && (!file.eof())) file.read(&c, 1); file.read(&c, 1); if(c == '#') blocks++; } while((c != '\n') && (!file.eof())); /* MEGA Sequential (22) or Interleaved (21) */ format = (!blocks) ? 22 : 21; } /* Phylip Format */ else { /* Determine specific phylip format: sequential or interleaved. */ /* Get number of sequences and residues */ sequenNumber = atoi(firstWord); firstWord = strtok(NULL, DELIMITERS); if(firstWord != NULL) residNumber = atoi(firstWord); /* If there is only one sequence, use by default sequential format since * it is impossible to determine exactly which phylip format is */ if((sequenNumber == 1) && (residNumber != 0)) format = 12; /* If there are more than one sequence, analyze sequences distribution to * determine its format. */ else if((sequenNumber != 0) && (residNumber != 0)) { blocks = 0; /* Read line in a safer way */ do { if (line != NULL) delete [] line; line = utils::readLine(file); } while ((line == NULL) && (!file.eof())); /* If the file end is reached without a valid line, warn about it */ if (file.eof()) return false; firstWord = strtok(line, DELIMITERS); while(firstWord != NULL) { blocks++; firstWord = strtok(NULL, DELIMITERS); } /* Read line in a safer way */ do { if (line != NULL) delete [] line; line = utils::readLine(file); } while ((line == NULL) && (!file.eof())); firstWord = strtok(line, DELIMITERS); while(firstWord != NULL) { blocks--; firstWord = strtok(NULL, DELIMITERS); } /* If the file end is reached without a valid line, warn about it */ if (file.eof()) return false; /* Phylip Interleaved (12) or Sequential (11) */ format = (!blocks) ? 12 : 11; } } /* Close the input file and delete dinamic memory */ file.close(); if (line != NULL) delete [] line; /* Return the input alignment format */ return format; } bool alignment::loadPhylipAlignment(char *alignmentFile) { /* PHYLIP/PHYLIP 4 (Sequential) file format parser */ char *str, *line = NULL; ifstream file; int i; /* Check the file and its content */ file.open(alignmentFile, ifstream::in); if(!utils::checkFile(file)) return false; /* Store some data about filename for possible uses in other formats */ filename.append("!Title "); filename.append(alignmentFile); filename.append(";"); /* Read first valid line in a safer way */ do { line = utils::readLine(file); } while ((line == NULL) && (!file.eof())); /* If the file end is reached without a valid line, warn about it */ if (file.eof()) return false; /* Read the input sequences and residues for each sequence numbers */ str = strtok(line, DELIMITERS); sequenNumber = 0; if(str != NULL) sequenNumber = atoi(str); str = strtok(NULL, DELIMITERS); residNumber = 0; if(str != NULL) residNumber = atoi(str); /* If something is wrong about the sequences or/and residues number, * return an error to warn about that */ if((sequenNumber == 0) || (residNumber == 0)) return false; /* Allocate memory for the input data */ sequences = new string[sequenNumber]; seqsName = new string[sequenNumber]; /* Read the lines block containing the sequences name + first fragment */ i = 0; while((i < sequenNumber) && (!file.eof())){ /* Read lines in a safer way. Destroy previous stored information */ if (line != NULL) delete [] line; line = utils::readLine(file); /* It the input line/s are blank lines, skip the loop iteration */ if(line == NULL) continue; /* First token: Sequence name */ str = strtok(line, DELIMITERS); seqsName[i].append(str, strlen(str)); /* Trim the rest of the line from blank spaces, tabs, etc and store it */ str = strtok(NULL, DELIMITERS); while(str != NULL) { sequences[i].append(str, strlen(str)); str = strtok(NULL, DELIMITERS); } i++; } /* Read the rest of the input file */ while(!file.eof()) { /* Try to get for each sequences its corresponding residues */ i = 0; while((i < sequenNumber) && (!file.eof())) { /* Read lines in a safer way. Destroy previous stored information */ if (line != NULL) delete [] line; line = utils::readLine(file); /* It the input line/s are blank lines, skip the loop iteration */ if(line == NULL) continue; /* Remove from the current line non-printable characters and add fragments * to previous stored sequence */ str = strtok(line, DELIMITERS); while(str != NULL) { sequences[i].append(str, strlen(str)); str = strtok(NULL, DELIMITERS); } i++; } } /* Close the input file and delete dinamic memory */ file.close(); if (line != NULL) delete [] line; /* Check the matrix's content */ return fillMatrices(true); } bool alignment::loadPhylip3_2Alignment(char *alignmentFile) { /* PHYLIP 3.2 (Interleaved) file format parser */ int i, blocksFirstLine, firstLine = true; char *str, *line = NULL; ifstream file; /* Check the file and its content */ file.open(alignmentFile, ifstream::in); if(!utils::checkFile(file)) return false; /* Store the file name for futher format conversion*/ filename.append("!Title "); filename.append(alignmentFile); filename.append(";"); /* Read first valid line in a safer way */ do { line = utils::readLine(file); } while ((line == NULL) && (!file.eof())); /* If the file end is reached without a valid line, warn about it */ if (file.eof()) return false; /* Get the sequences and residues numbers. If there is any mistake, * return a FALSE value to warn about the possible error */ str = strtok(line, DELIMITERS); sequenNumber = 0; if(str != NULL) sequenNumber = atoi(str); str = strtok(NULL, DELIMITERS); residNumber = 0; if(str != NULL) residNumber = atoi(str); if((sequenNumber == 0) || (residNumber == 0)) return false; /* Reserve memory according to the input parameters */ sequences = new string[sequenNumber]; seqsName = new string[sequenNumber]; /* Point to the first sequence in the alignment. Since the alignment could not * have blank lines to separate the different sequences. Store the blocks size * for the first line including a sequence identifier */ i = 0; blocksFirstLine = 0; do { /* Read lines in a safer way. Destroy previous stored information */ if (line != NULL) delete [] line; line = utils::readLine(file); /* If there is nothing in the input line, skip the loop instructions */ if(line == NULL) continue; str = strtok(line, DELIMITERS); /* First block: Sequence Name + Sequence fragment. Count how many blocks * the first sequence line is divided. It could help to identify the * different sequences from the input file */ if(firstLine) { seqsName[i].append(str, strlen(str)); str = strtok(NULL, OTHDELIMITERS); firstLine = 1; } /* Sequence fragment */ while(str != NULL) { sequences[i].append(str, strlen(str)); str = strtok(NULL, OTHDELIMITERS); /* Count the blocks number for the sequences first line */ if (firstLine) firstLine += 1; } /* Store the blocks number for the first sequence including the name */ if ((blocksFirstLine == 0) and firstLine) blocksFirstLine = firstLine; /* If a false positive new sequence was detected, add stored information for * the current sequence to the previous one and clear the data structure * for the current sequence. Finally, move the sequence pointer to the * previous one. */ if ((firstLine != false) and (firstLine != blocksFirstLine)) { sequences[i-1].append(seqsName[i]); seqsName[i].clear(); sequences[i-1].append(sequences[i]); sequences[i].clear(); i --; } firstLine = false; /* There are many ways to detect a new sequence. */ /* One of them -experimental- is just to detect if the residues number for * the current entry is equal to the residues number for the whole align */ if ((int) sequences[i].size() == residNumber) { firstLine = true; i++; } } while(!file.eof()); /* Close the input file and delete dinamic memory */ file.close(); if (line != NULL) delete [] line; /* Check the matrix's content */ return fillMatrices(true); } bool alignment::loadClustalAlignment(char *alignmentFile) { /* CLUSTAL file format parser */ int i, seqLength, pos, firstBlock; char *str, *line = NULL; ifstream file; /* Check input file and its content */ file.open(alignmentFile, ifstream::in); if(!utils::checkFile(file)) return false; /* Store some details about input file to be used in posterior format * conversions */ filename.append("!Title "); filename.append(alignmentFile); filename.append(";"); /* The first valid line corresponding to CLUSTAL label is ignored */ do { line = utils::readLine(file); } while ((line == NULL) && (!file.eof())); /* If the file end is reached without a valid line, warn about it */ if (file.eof()) return false; /* Ignore blank lines before first sequence block starts */ while(!file.eof()) { /* Deallocate previously used dynamic memory */ if (line != NULL) delete [] line; /* Read lines in safe way */ line = utils::readLine(file); if (line != NULL) break; } /* The program in only interested in the first blocks of sequences since * it wants to know how many sequences are in the input file */ sequenNumber = 0; while(!file.eof()) { /* If a new line without any valid character is detected * means the first block is over */ if (line == NULL) break; /* Count how many times standard characters as well as * gap symbol "-" is detected in current line. */ seqLength = (int) strlen(line); for(pos = 0; pos < seqLength; pos++) if((isalpha(line[pos])) || (line[pos] == '-')) break; /* If not standard characters are detected in current line means that * the program has found typical line in clustal alignment files to mark * some scores for some columns. In that case, the first block is over */ if(pos == seqLength) break; sequenNumber++; /* Deallocate previously used dynamic memory */ if (line != NULL) delete [] line; /* Read lines in safe way */ line = utils::readLine(file); } /* Finish to preprocess the input file. */ file.clear(); file.seekg(0); /* Allocate memory for the input alignmet */ seqsName = new string[sequenNumber]; sequences = new string[sequenNumber]; /* Read the title line and store it */ line = utils::readLine(file); if (line == NULL) return false; aligInfo.append(line, strlen(line)); /* Ignore blank lines before first sequence block starts */ while(!file.eof()) { /* Deallocate previously used dynamic memory */ if (line != NULL) delete [] line; /* Read lines in safe way */ line = utils::readLine(file); if (line != NULL) break; } /* Set-up sequences pointer to the first one and the flag to indicate * the first blocks. That flag implies that sequences names have to be * stored */ i = 0; firstBlock = true; while(!file.eof()) { if (line == NULL) { /* Sometimes, clustalw files does not have any marker after first block * to indicate conservation between its columns residues. In that cases, * mark the end of first block */ if (i == 0) firstBlock = false; /* Read current line and analyze it*/ line = utils::readLine(file); continue; } /* Check whteher current line is a standard line or it is a line to mark * quality scores for that alignment column */ seqLength = (int) strlen(line); for(pos = 0; pos < seqLength; pos++) if((isalpha(line[pos])) || (line[pos] == '-')) break; /* Start a new block in the input alignment */ if (pos == seqLength) { firstBlock = false; /* Deallocate dinamic memory if it has been used before */ if (line != NULL) delete [] line; /* Read current line and analyze it*/ line = utils::readLine(file); continue; } /* If it is a standard line, split it into two parts. The first one contains * sequence name and the second one the residues. If the "firstBlock" flag * is active then store the sequence name */ str = strtok(line, OTHDELIMITERS); if(str != NULL) { if(firstBlock) seqsName[i].append(str, strlen(str)); str = strtok(NULL, OTHDELIMITERS); if(str != NULL) sequences[i].append(str, strlen(str)); /* Move sequences pointer in a circular way */ i = (i + 1) % sequenNumber; } /* Deallocate dinamic memory if it has been used before */ if (line != NULL) delete [] line; /* Read current line and analyze it*/ line = utils::readLine(file); } /* Close the input file */ file.close(); /* Deallocate dinamic memory */ if (line != NULL) delete [] line; /* Check the matrix's content */ return fillMatrices(true); } bool alignment::loadFastaAlignment(char *alignmentFile) { /* FASTA file format parser */ char *str, *line = NULL; ifstream file; int i; /* Check the file and its content */ file.open(alignmentFile, ifstream::in); if(!utils::checkFile(file)) return false; /* Store input file name for posterior uses in other formats */ filename.append("!Title "); filename.append(alignmentFile); filename.append(";"); /* Compute how many sequences are in the input alignment */ sequenNumber = 0; while(!file.eof()) { /* Deallocate previously used dinamic memory */ if (line != NULL) delete [] line; /* Read lines in a safe way */ line = utils::readLine(file); if (line == NULL) continue; /* It the line starts by ">" means that a new sequence has been found */ str = strtok(line, DELIMITERS); if (str == NULL) continue; /* If a sequence name flag is detected, increase sequences counter */ if(str[0] == '>') sequenNumber++; } /* Finish to preprocess the input file. */ file.clear(); file.seekg(0); /* Allocate memory for the input alignmet */ seqsName = new string[sequenNumber]; sequences = new string[sequenNumber]; seqsInfo = new string[sequenNumber]; for(i = -1; (i < sequenNumber) && (!file.eof()); ) { /* Deallocate previously used dinamic memory */ if (line != NULL) delete [] line; /* Read lines in a safe way */ line = utils::readLine(file); if (line == NULL) continue; /* Store original header fom input sequences including non-standard * characters */ if (line[0] == '>') seqsInfo[i+1].append(&line[1], strlen(line) - 1); /* Cut the current line and check whether there are valid characters */ str = strtok(line, OTHDELIMITERS); if (str == NULL) continue; /* Check whether current line belongs to the current sequence * or it is a new one. In that case, store the sequence name */ if(str[0] == '>') { /* Move sequence name pointer until a valid string name is obtained */ do { str = str + 1; } while(strlen(str) == 0); seqsName[++i].append(str, strlen(str)); continue; } /* Sequence */ while(str != NULL) { sequences[i].append(str, strlen(str)); str = strtok(NULL, DELIMITERS); } } /* Close the input file */ file.close(); /* Deallocate previously used dinamic memory */ if (line != NULL) delete [] line; /* Check the matrix's content */ return fillMatrices(false); } bool alignment::loadNexusAlignment(char *alignmentFile) { /* NEXUS file format parser */ char *frag = NULL, *str = NULL, *line = NULL; int i, pos, state, firstBlock; ifstream file; /* Check the file and its content */ file.open(alignmentFile, ifstream::in); if(!utils::checkFile(file)) return false; /* Store input file name for posterior uses in other formats */ /* We store the file name */ filename.append("!Title "); filename.append(alignmentFile); filename.append(";"); state = false; do { /* Destroy previous assigned memory */ if (line != NULL) delete [] line; /* Read line in a safer way */ line = utils::readLine(file); if (line == NULL) continue; /* Discard line where there is not information */ str = strtok(line, DELIMITERS); if(str == NULL) continue; /* If the line has any kind of information, try to catch it */ /* Firstly, convert to capital letters the input line */ for(i = 0; i < (int) strlen(str); i++) str[i] = toupper(str[i]); /* ... and then compare it again specific tags */ if(!strcmp(str, "BEGIN")) state = true; else if(!strcmp(str, "MATRIX")) break; /* Store information about input format file */ else if(!strcmp(str, "FORMAT")) { str = strtok(NULL, DELIMITERS); while(str != NULL) { aligInfo.append(str, strlen(str)); aligInfo.append(" ", strlen(" ")); str = strtok(NULL, DELIMITERS); } } /* In this case, try to get matrix dimensions */ else if((!strcmp(str, "DIMENSIONS")) && state) { str = strtok(NULL, DELIMITERS); frag = strtok(NULL, DELIMITERS); str = strtok(str, "=;"); sequenNumber = atoi(strtok(NULL, "=;")); frag = strtok(frag, "=;"); residNumber = atoi(strtok(NULL, "=;")); } } while(!file.eof()); /* Check all parameters */ if(strcmp(str, "MATRIX") || (sequenNumber == 0) || (residNumber == 0)) return false; /* Allocate memory for the input alignmet */ seqsName = new string[sequenNumber]; sequences = new string[sequenNumber]; pos = 0; state = false; firstBlock = true; while(!file.eof()) { /* Destroy previous assigned memory */ if (line != NULL) delete [] line; /* Read line in a safer way */ line = utils::readLine(file); if (line == NULL) continue; /* Discard any comments from input file */ for(i = 0; i < (int) strlen(line); i++) { if (line[i] == '[') state = true; else if (line[i] == ']' && state) { state = false; break; } } /* If there is a multi-line comments, skip it as well */ if ((state) || (not state && i != (int) strlen(line))) continue; /* Check for a specific tag indicating matrix end */ if((!strncmp(line, "end;", 4)) || (!strncmp(line, "END;", 4))) break; /* Split input line and check it if it is valid */ str = strtok(line, OTH2DELIMITERS); if (str == NULL) continue; /* Store the sequence name, only from the first block */ if(firstBlock) seqsName[pos].append(str, strlen(str)); /* Store rest of line as part of sequence */ str = strtok(NULL, OTH2DELIMITERS); while(str != NULL) { sequences[pos].append(str, strlen(str)); str = strtok(NULL, OTH2DELIMITERS); } /* Move sequences pointer to next one. It if it is last one, move it to * the beginning and set the first block to false for avoiding to rewrite * sequences name */ pos = (pos + 1) % sequenNumber; if (not pos) firstBlock = false; } /* Deallocate memory */ if (line != NULL) delete [] line; /* Close the input file */ file.close(); /* Check the matrix's content */ return fillMatrices(true); } bool alignment::loadMegaNonInterleavedAlignment(char *alignmentFile) { /* MEGA sequential file format parser */ char *frag = NULL, *str = NULL, *line = NULL; ifstream file; int i; /* Check the file and its content */ file.open(alignmentFile, ifstream::in); if(!utils::checkFile(file)) return false; /* Filename is stored as a title for MEGA input alignment. * If it is detected later a label "TITLE" in input file, this information * will be replaced for that one */ filename.append("!Title "); filename.append(alignmentFile); filename.append(";"); /* Skip first valid line */ do { line = utils::readLine(file); } while ((line == NULL) && (!file.eof())); /* If the file end is reached without a valid line, warn about it */ if (file.eof()) return false; /* Try to get input alignment information */ while(!file.eof()) { /* Destroy previously allocated memory */ if (line != NULL) delete [] line; /* Read a new line in a safe way */ line = utils::readLine(file); if (line == NULL) continue; /* If a sequence name flag is found, go out from getting information loop */ if(!strncmp(line, "#", 1)) break; /* Destroy previously allocated memory */ if (frag != NULL) delete [] frag; /* Create a local copy from input line */ frag = new char[strlen(line) + 1]; strcpy(frag, line); /* Split input line copy into pieces and analize it * looking for specific labels */ str = strtok(frag, "!: "); for(i = 0; i < (int) strlen(str); i++) str[i] = toupper(str[i]); /* If TITLE label is found, replace previously stored information with * this info */ if(!strcmp(str, "TITLE")) { filename.clear(); if(strncmp(line, "!", 1)) filename += "!"; filename += line; } /* If FORMAT label is found, try to get some details from input file */ else if(!strcmp(str, "FORMAT")) aligInfo.append(line, strlen(line)); } /* Deallocate local memory */ if (frag != NULL) delete [] frag; /* Count how many sequences are in input alignment file */ do { /* Check whether input line is valid or not */ if (line == NULL) { line = utils::readLine(file); continue; } /* If current line starts by a # means that it is a sequence name */ if (!strncmp(line, "#", 1)) sequenNumber++; /* Destroy previously allocated memory */ if (line != NULL) delete [] line; /* Read a new line in a safe way */ line = utils::readLine(file); } while(!file.eof()); /* Move file pointer to the beginner */ file.clear(); file.seekg(0); /* Allocate memory */ seqsName = new string[sequenNumber]; sequences = new string[sequenNumber]; /* Skip first line */ line = utils::readLine(file); /* Skip lines until first sequence name is found */ while(!file.eof()) { /* Destroy previously allocated memory */ if (line != NULL) delete [] line; /* Read a new line in a safe way */ line = utils::readLine(file); if (line == NULL) continue; /* If sequence name label is found, go out from loop */ if (!strncmp(line, "#", 1)) break; } /* First sequence is already detected so its name should be stored */ i = -1; /* This loop is a bit tricky because first sequence name has been already * detected, so it is necessary to process it before moving to next line. * That implies that lines are read at loop ends */ while(!file.eof()) { /* Skip blank lines */ if (line == NULL) { line = utils::readLine(file); continue; } /* Skip lines with comments */ if (!strncmp(line, "!", 1)) { /* Deallocate memory and read a new line */ delete [] line; line = utils::readLine(file); continue; } /* Remove comments inside of sequences and split input line */ frag = utils::trimLine(line); /* Skip lines with only comments */ if (frag == NULL) { /* Deallocate memory and read a new line */ if (line != NULL) delete [] line; line = utils::readLine(file); continue; } /* Otherwise, split it into fragments */ str = strtok(frag, " #\n"); /* Sequence Name */ if (!strncmp(line, "#", 1)) { i += 1; seqsName[i].append(str, strlen(str)); str = strtok(NULL, " #\n"); } /* Sequence itself */ while(str != NULL) { sequences[i].append(str, strlen(str)); str = strtok(NULL, " \n"); } /* Deallocate dynamic memory */ if (frag != NULL) delete [] frag; if (line != NULL) delete [] line; /* Read a new line in a safe way */ line = utils::readLine(file); } /* Close input file */ file.close(); /* Deallocate dynamic memory */ if (line != NULL) delete [] line; /* Check the matrix's content */ return fillMatrices(true); } bool alignment::loadMegaInterleavedAlignment(char *alignmentFile) { /* MEGA interleaved file format parser */ char *frag = NULL, *str = NULL, *line = NULL; int i, firstBlock = true; ifstream file; /* Check the file and its content */ file.open(alignmentFile, ifstream::in); if(!utils::checkFile(file)) return false; /* Filename is stored as a title for MEGA input alignment. * If it is detected later a label "TITLE" in input file, this information * will be replaced for that one */ filename.append("!Title "); filename.append(alignmentFile); filename.append(";"); /* Skip first valid line */ do { line = utils::readLine(file); } while ((line == NULL) && (!file.eof())); /* If the file end is reached without a valid line, warn about it */ if (file.eof()) return false; /* Try to get input alignment information */ while(!file.eof()) { /* Destroy previously allocated memory */ if (line != NULL) delete [] line; /* Read a new line in a safe way */ line = utils::readLine(file); if (line == NULL) continue; /* If a sequence name flag is found, go out from getting information loop */ if(!strncmp(line, "#", 1)) break; /* Create a local copy from input line */ frag = new char[strlen(line) + 1]; strcpy(frag, line); /* Split input line copy into pieces and analize it * looking for specific labels */ str = strtok(frag, "!: "); for(i = 0; i < (int) strlen(str); i++) str[i] = toupper(str[i]); /* If TITLE label is found, replace previously stored information with * this info */ if(!strcmp(str, "TITLE")) { filename.clear(); if(strncmp(line, "!", 1)) filename += "!"; filename += line; } /* If FORMAT label is found, try to get some details from input file */ else if(!strcmp(str, "FORMAT")) aligInfo.append(line, strlen(line)); /* Destroy previously allocated memory */ if (frag != NULL) delete [] frag; } /* Count how many sequences are in input file */ while(!file.eof()) { /* If a sequence name flag has been detected, increase counter */ if(!strncmp(line, "#", 1)) sequenNumber++; /* Deallocate dynamic memory */ if(line != NULL) delete [] line; /* Read lines in a safe way */ line = utils::readLine(file); /* If a blank line is detected means first block of sequences is over */ /* Then, break counting sequences loop */ if (line == NULL) break; } /* Finish to preprocess the input file. */ file.clear(); file.seekg(0); /* Allocate memory */ seqsName = new string[sequenNumber]; sequences = new string[sequenNumber]; /* Skip first line */ line = utils::readLine(file); /* Skip lines until first # flag is reached */ while(!file.eof()) { /* Deallocate previously used dynamic memory */ if (line != NULL) delete [] line; /* Read line in a safer way */ line = utils::readLine(file); /* Determine whether a # flag has been found in current string */ if (line != NULL) if(!strncmp(line, "#", 1)) break; } /* Read sequences and get from first block, the sequences names */ i = 0; firstBlock = true; while(!file.eof()) { if (line == NULL) { /* Read line in a safer way */ line = utils::readLine(file); continue; } if (!strncmp(line, "!", 1)) { /* Deallocate memory and read a new line */ delete [] line; line = utils::readLine(file); continue; } /* Trim lines from any kind of comments and split it */ frag = utils::trimLine(line); str = strtok(frag, " #\n"); /* Check whether a line fragment is valid or not */ if (str == NULL) continue; /* Store sequences names if firstBlock flag is TRUE */ if(firstBlock) seqsName[i].append(str, strlen(str)); /* Store sequence */ str = strtok(NULL, " \n"); while(str != NULL) { sequences[i].append(str, strlen(str)); str = strtok(NULL, " \n"); } /* Deallocate previously used dynamic memory */ if (frag != NULL) delete [] frag; if (line != NULL) delete [] line; /* Read line in a safer way */ line = utils::readLine(file); i = (i + 1) % sequenNumber; if (i == 0) firstBlock = false; } /* Close input file */ file.close(); /* Deallocate local memory */ if (line != NULL) delete [] line; /* Check the matrix's content */ return fillMatrices(true); } bool alignment::loadNBRF_PirAlignment(char *alignmentFile) { /* NBRF/PIR file format parser */ bool seqIdLine, seqLines; char *str, *line = NULL; ifstream file; int i; /* Check the file and its content */ file.open(alignmentFile, ifstream::in); if(!utils::checkFile(file)) return false; /* Store input file name for posterior uses in other formats */ filename.append("!Title "); filename.append(alignmentFile); filename.append(";"); /* Compute how many sequences are in the input alignment */ sequenNumber = 0; while(!file.eof()) { /* Deallocate previously used dinamic memory */ if (line != NULL) delete [] line; /* Read lines in a safe way */ line = utils::readLine(file); if (line == NULL) continue; /* It the line starts by ">" means that a new sequence has been found */ str = strtok(line, DELIMITERS); if (str == NULL) continue; /* If a sequence name flag is detected, increase sequences counter */ if(str[0] == '>') sequenNumber++; } /* Finish to preprocess the input file. */ file.clear(); file.seekg(0); /* Allocate memory for the input alignmet */ sequences = new string[sequenNumber]; seqsName = new string[sequenNumber]; seqsInfo = new string[sequenNumber]; /* Initialize some local variables */ seqIdLine = true; seqLines = false; i = -1; /* Read the entire input file */ while(!file.eof()) { /* Deallocate local memory */ if (line != NULL) delete [] line; /* Read lines in a safe way */ line = utils::readLine(file); if (line == NULL) continue; /* Sequence ID line. * Identification of these kind of lines is based on presence of ">" and ";" * symbols at positions 0 and 3 respectively */ if((line[0] == '>') && (line[3] == ';') && (seqIdLine)) { seqIdLine = false; i += 1; /* Store information about sequence datatype */ str = strtok(line, ">;"); seqsInfo[i].append(str, strlen(str)); /* and the sequence identifier itself */ str = strtok(NULL, ">;"); seqsName[i].append(str, strlen(str)); } /* Line just after sequence Id line contains a textual description of * the sequence. */ else if((!seqIdLine) && (!seqLines)) { seqLines = true; seqsInfo[i].append(line, strlen(line)); } /* Sequence lines itself */ else if (seqLines) { /* Check whether a sequence end symbol '*' exists in current line. * In that case, set appropriate flags to read a new sequence */ if (line[strlen(line) - 1] == '*') { seqLines = false; seqIdLine = true; } /* Process line */ str = strtok(line, OTHDELIMITERS); while (str != NULL) { sequences[i].append(str, strlen(str)); str = strtok(NULL, OTHDELIMITERS); } /* In case the end symbol '*' has been detected, remove it */ if(sequences[i][sequences[i].size() - 1] == '*') sequences[i].erase(sequences[i].size()-1); } } /* Close the input file */ file.close(); /* Deallocate dinamic memory */ if (line != NULL) delete [] line; /* Check the matrix's content */ return fillMatrices(true); } void alignment::alignmentPhylipToFile(ostream &file) { /* Generate output alignment in PHYLIP/PHYLIP 4 format (sequential) */ int i, j, maxLongName; string *tmpMatrix; /* Check whether sequences in the alignment are aligned or not. * Warn about it if there are not aligned. */ if (!isAligned) { cerr << endl << "ERROR: Sequences are not aligned. Format (PHYLIP) " << "not compatible with unaligned sequences." << endl << endl; return ; } /* Allocate local memory for generating output alignment */ tmpMatrix = new string[sequenNumber]; /* Depending on alignment orientation: forward or reverse. Copy directly * sequence information or get firstly the reversed sequences and then * copy it into local memory */ for(i = 0; i < sequenNumber; i++) tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); /* Depending on if short name flag is activated (limits sequence name up to * 10 characters) or not, get maximum sequence name length */ maxLongName = PHYLIPDISTANCE; for(i = 0; (i < sequenNumber) && (!shortNames); i++) maxLongName = utils::max(maxLongName, seqsName[i].size()); /* Generating output alignment */ /* First Line: Sequences Number & Residued Number */ file << " " << sequenNumber << " " << residNumber << endl; /* First Block: Sequences Names & First 60 residues */ for(i = 0; i < sequenNumber; i++) file << setw(maxLongName + 3) << left << seqsName[i].substr(0, maxLongName) << tmpMatrix[i].substr(0, 60) << endl; file << endl; /* Rest of blocks: Print 60 residues per each blocks of sequences */ for(i = 60; i < residNumber; i += 60) { for(j = 0; j < sequenNumber; j++) file << tmpMatrix[j].substr(i, 60) << endl; file << endl; } file << endl; /* Deallocate local memory */ delete [] tmpMatrix; } void alignment::alignmentPhylip3_2ToFile(ostream &file) { /* Generate output alignment in PHYLIP 3.2 format (interleaved) */ int i, j, k, maxLongName; string *tmpMatrix; /* Check whether sequences in the alignment are aligned or not. * Warn about it if there are not aligned. */ if (!isAligned) { cerr << endl << "ERROR: Sequences are not aligned. Format (PHYLIP) " << "not compatible with unaligned sequences." << endl << endl; return ; } /* Allocate local memory for generating output alignment */ tmpMatrix = new string[sequenNumber]; /* Depending on alignment orientation: forward or reverse. Copy directly * sequence information or get firstly the reversed sequences and then * copy it into local memory */ for(i = 0; i < sequenNumber; i++) tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); /* Depending on if short name flag is activated (limits sequence name up to * 10 characters) or not, get maximum sequence name length */ maxLongName = PHYLIPDISTANCE; for(i = 0; (i < sequenNumber) && (!shortNames); i++) maxLongName = utils::max(maxLongName, seqsName[i].size()); /* Generating output alignment */ /* First Line: Sequences Number & Residued Number */ file << " " << sequenNumber << " " << residNumber << endl; /* Alignment */ /* For each sequence, print its identifier and then the sequence itself in * blocks of 50 residues */ for(i = 0; i < sequenNumber; i++) { /* Sequence Name */ file << setw(maxLongName + 3) << left << seqsName[i].substr(0, maxLongName); /* Sequence. Each line contains a block of 5 times 10 residues. */ for(j = 0; j < residNumber; j += 50) { for(k = j; (k < residNumber) && (k < (j + 50)); k += 10) file << sequences[i].substr(k, 10) << " "; file << endl; /* If the sequences end has not been reached, print black spaces * to follow format specifications */ if((j + 50) < residNumber) file << setw(maxLongName + 3) << " "; } /* Print a blank line to mark sequences separation */ file << endl; } /* Deallocate local memory */ delete [] tmpMatrix; } void alignment::alignmentPhylip_PamlToFile(ostream &file) { /* Generate output alignment in PHYLIP format compatible with PAML program */ int i, maxLongName; string *tmpMatrix; /* Check whether sequences in the alignment are aligned or not. * Warn about it if there are not aligned. */ if (!isAligned) { cerr << endl << "ERROR: Sequences are not aligned. Format (PHYLIP) " << "not compatible with unaligned sequences." << endl << endl; return ; } /* Allocate local memory for generating output alignment */ tmpMatrix = new string[sequenNumber]; /* Depending on alignment orientation: forward or reverse. Copy directly * sequence information or get firstly the reversed sequences and then * copy it into local memory */ for(i = 0; i < sequenNumber; i++) tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); /* Depending on if short name flag is activated (limits sequence name up to * 10 characters) or not, get maximum sequence name length */ maxLongName = PHYLIPDISTANCE; for(i = 0; (i < sequenNumber) && (!shortNames); i++) maxLongName = utils::max(maxLongName, seqsName[i].size()); /* Generating output alignment */ /* First Line: Sequences Number & Residued Number */ file << " " << sequenNumber << " " << residNumber << endl; /* Print alignment */ /* Print sequences name follow by the sequence itself in the same line */ for(i = 0; i < sequenNumber; i++) file << setw(maxLongName + 3) << left << seqsName[i].substr(0, maxLongName) << sequences[i] << endl; file << endl; /* Deallocate local memory */ delete [] tmpMatrix; } void alignment::alignmentClustalToFile(ostream &file) { /* Generate output alignment in CLUSTAL format */ int i, j, maxLongName = 0; string *tmpMatrix; /* Check whether sequences in the alignment are aligned or not. * Warn about it if there are not aligned. */ if (!isAligned) { cerr << endl << "ERROR: Sequences are not aligned. Format (CLUSTAL) " << "not compatible with unaligned sequences." << endl << endl; return ; } /* Allocate local memory for generating output alignment */ tmpMatrix = new string[sequenNumber]; /* Depending on alignment orientation: forward or reverse. Copy directly * sequence information or get firstly the reversed sequences and then * copy it into local memory */ for(i = 0; i < sequenNumber; i++) tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); /* Compute maximum sequences name length */ for(i = 0; (i < sequenNumber) && (!shortNames); i++) maxLongName = utils::max(maxLongName, seqsName[i].size()); /* Print alignment header */ if((aligInfo.size() != 0) && (iformat == oformat)) file << aligInfo << endl << endl; else file << "CLUSTAL multiple sequence alignment" << endl << endl; /* Print alignment itself */ /* Print as many blocks as it is needed of lines composed * by sequences name and 60 residues */ for(j = 0; j < residNumber; j += 60) { for(i = 0; i < sequenNumber; i++) file << setw(maxLongName + 5) << left << seqsName[i] << tmpMatrix[i].substr(j, 60) << endl; file << endl << endl; } /* Deallocate local memory */ delete [] tmpMatrix; } void alignment::alignmentFastaToFile(ostream &file) { /* Generate output alignment in FASTA format. Sequences can be unaligned. */ int i, j, maxLongName; string *tmpMatrix; /* Allocate local memory for generating output alignment */ tmpMatrix = new string[sequenNumber]; /* Depending on alignment orientation: forward or reverse. Copy directly * sequence information or get firstly the reversed sequences and then * copy it into local memory */ for(i = 0; i < sequenNumber; i++) tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); /* Depending on if short name flag is activated (limits sequence name up to * 10 characters) or not, get maximum sequence name length. Consider those * cases when the user has asked to keep original sequence header */ maxLongName = 0; for(i = 0; i < sequenNumber; i++) if (!keepHeader) maxLongName = utils::max(maxLongName, seqsName[i].size()); else if (seqsInfo != NULL) maxLongName = utils::max(maxLongName, seqsInfo[i].size()); if (shortNames && maxLongName > PHYLIPDISTANCE) { maxLongName = PHYLIPDISTANCE; if (keepHeader) cerr << endl << "WARNING: Original sequence header will be cut by charac" << "ter 10" << endl; } /* Print alignment. First, sequences name id and then the sequences itself */ for(i = 0; i < sequenNumber; i++) { if (!keepHeader) file << ">" << seqsName[i].substr(0, maxLongName) << endl; else if (seqsInfo != NULL) file << ">" << seqsInfo[i].substr(0, maxLongName) << endl; for(j = 0; j < residuesNumber[i]; j+= 60) file << tmpMatrix[i].substr(j, 60) << endl; } /* Deallocate local memory */ delete [] tmpMatrix; } void alignment::alignmentNexusToFile(ostream &file) { /* Generate output alignment in NEXUS format setting only alignment block */ int i, j, k, maxLongName = 0; string *tmpMatrix; /* Check whether sequences in the alignment are aligned or not. * Warn about it if there are not aligned. */ if (!isAligned) { cerr << endl << "ERROR: Sequences are not aligned. Format (NEXUS) " << "not compatible with unaligned sequences." << endl << endl; return ; } /* Allocate local memory for generating output alignment */ tmpMatrix = new string[sequenNumber]; /* Depending on alignment orientation: forward or reverse. Copy directly * sequence information or get firstly the reversed sequences and then * copy it into local memory */ for(i = 0; i < sequenNumber; i++) tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); /* Compute maximum sequences name length */ for(i = 0; (i < sequenNumber) && (!shortNames); i++) maxLongName = utils::max(maxLongName, seqsName[i].size()); /* Compute output file datatype */ getTypeAlignment(); /* Remove characters like ";" from input alignment information line */ while((int) aligInfo.find(";") != (int) string::npos) aligInfo.erase(aligInfo.find(";"), 1); /* Print Alignment header */ file << "#NEXUS" << endl << "BEGIN DATA;" << endl << " DIMENSIONS NTAX=" << sequenNumber << " NCHAR=" << residNumber <<";" << endl; /* Print alignment datatype */ if ((dataType == DNAType) || (dataType == DNADeg)) file << "FORMAT DATATYPE=DNA INTERLEAVE=yes GAP=-"; else if ((dataType == RNAType) || (dataType == RNADeg)) file << "FORMAT DATATYPE=RNA INTERLEAVE=yes GAP=-"; else if (dataType == AAType) file << "FORMAT DATATYPE=PROTEIN INTERLEAVE=yes GAP=-"; i = 0; /* Using information from input alignment. Use only some tags. */ while((j = aligInfo.find(" ", i)) != (int) string::npos) { if((aligInfo.substr(i, j - i)).compare(0, 7, "MISSING") == 0 || (aligInfo.substr(i, j)).compare(0, 7, "missing") == 0) file << " " << (aligInfo.substr(i, j - i)); else if((aligInfo.substr(i, j)).compare(0, 9, "MATCHCHAR") == 0 || (aligInfo.substr(i, j)).compare(0, 9, "matchchar") == 0) file << " " << (aligInfo.substr(i, j - i)); i = j + 1; } file << ";" << endl; /* Print sequence name and sequence length */ for(i = 0; i < sequenNumber; i++) file << "[Name: " << setw(maxLongName + 4) << left << seqsName[i] << "Len: " << residNumber << "]" << endl; file << endl << "MATRIX" << endl; /* Print alignment itself. Sequence name and 50 residues blocks */ for(j = 0; j < residNumber; j += 50) { for(i = 0; i < sequenNumber; i++) { file << setw(maxLongName + 4) << left << seqsName[i]; for(k = j; k < (j + 50) && k < residNumber; k += 10) file << " " << sequences[i].substr(k, 10); file << endl; } file << endl; } file << ";" << endl << "END;" << endl; /* Deallocate local memory */ delete [] tmpMatrix; } void alignment::alignmentMegaToFile(ostream &file) { /* Generate output alignment in MEGA format */ int i, j, k; string *tmpMatrix; /* Check whether sequences in the alignment are aligned or not. * Warn about it if there are not aligned. */ if (!isAligned) { cerr << endl << "ERROR: Sequences are not aligned. Format (MEGA) " << "not compatible with unaligned sequences." << endl << endl; return ; } /* Allocate local memory for generating output alignment */ tmpMatrix = new string[sequenNumber]; /* Depending on alignment orientation: forward or reverse. Copy directly * sequence information or get firstly the reversed sequences and then * copy it into local memory */ for(i = 0; i < sequenNumber; i++) tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); /* Compute output file datatype */ getTypeAlignment(); /* Print output alignment header */ file << "#MEGA" << endl << filename << endl; /* Print alignment datatype */ if ((dataType == DNAType) || (dataType == DNADeg)) file << "!Format DataType=DNA "; else if ((dataType == RNAType) || (dataType == RNADeg)) file << "!Format DataType=RNA "; else if (dataType == AAType) file << "!Format DataType=protein "; /* Print number of sequences and alignment length */ file << "NSeqs=" << sequenNumber << " Nsites=" << residNumber << " indel=- CodeTable=Standard;" << endl << endl; /* Print sequences name and sequences divided into blocks of 50 residues */ for(i = 0; i < sequenNumber; i++) { file << "#" << seqsName[i] << endl; for(j = 0; j < residNumber; j += 50) { for(k = j; ((k < residNumber) && (k < j + 50)); k += 10) file << tmpMatrix[i].substr(k, 10) << " "; file << endl; } file << endl; } /* Deallocate local memory */ delete [] tmpMatrix; } void alignment::alignmentNBRF_PirToFile(ostream &file) { /* Generate output alignment in NBRF/PIR format. Sequences can be unaligned */ int i, j, k; string alg_datatype, *tmpMatrix; /* Allocate local memory for generating output alignment */ tmpMatrix = new string[sequenNumber]; /* Depending on alignment orientation: forward or reverse. Copy directly * sequence information or get firstly the reversed sequences and then * copy it into local memory */ for(i = 0; i < sequenNumber; i++) tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]); /* Compute output file datatype */ getTypeAlignment(); if ((dataType == DNAType) || (dataType == DNADeg)) alg_datatype = "DL"; else if ((dataType == RNAType) || (dataType == RNADeg)) alg_datatype = "RL"; else if (dataType == AAType) alg_datatype = "P1"; /* Print alignment */ for(i = 0; i < sequenNumber; i++) { /* Print sequence datatype and its name */ if((seqsInfo != NULL) && (iformat == oformat)) file << ">" << seqsInfo[i].substr(0, 2) << ";" << seqsName[i] << endl << seqsInfo[i].substr(2) << endl; else file << ">" << alg_datatype << ";" << seqsName[i] << endl << seqsName[i] << " " << residuesNumber[i] << " bases" << endl; /* Write the sequence */ for(j = 0; j < residuesNumber[i]; j += 50) { for(k = j; (k < residuesNumber[i]) && (k < (j + 50)); k += 10) file << " " << tmpMatrix[i].substr(k, 10); if(k >= residuesNumber[i]) { if((residuesNumber[i] % 50) == 0) file << endl << " "; else if((residuesNumber[i] % 10) == 0) file << " "; file << "*"; } file << endl; } file << endl; } /* Deallocate local memory */ delete [] tmpMatrix; } bool alignment::alignmentSummaryHTML(char *destFile, int residues, int seqs, \ int *selectedRes, int *selectedSeq, float *consValues) { /* Generate an HTML file with a visual summary about which sequences/columns * have been selected and which have not */ int i, j, k, kj, upper, minHTML, maxLongName, *gapsValues; string tmpColumn; float *simValues; bool *res, *seq; ofstream file; char type; /* Allocate some local memory */ tmpColumn.reserve(sequenNumber); /* Check whether sequences in the alignment are aligned or not. * Warn about it if there are not aligned. */ if (!isAligned) { cerr << endl << "ERROR: Sequences are not aligned." << endl << endl; return false; } /* Open output file and check that file pointer is valid */ file.open(destFile); if(!file) return false; /* Compute maximum sequences name length. */ maxLongName = 0; for(i = 0; i < sequenNumber; i++) maxLongName = utils::max(maxLongName, seqsName[i].size()); /* Compute HTML blank spaces */ minHTML = utils::max(25, maxLongName + 10); /* Initialize local variables to control which columns/sequences * will be kept in the output alignment */ res = new bool[residNumber]; for(i = 0; i < residNumber; i++) res[i] = false; seq = new bool[sequenNumber]; for(i = 0; i < sequenNumber; i++) seq[i] = false; /* Record which columns/sequences from original alignment * have been kept in the final one */ for(i = 0; i < residues; i++) res[selectedRes[i]] = true; for(i = 0; i < seqs; i++) seq[selectedSeq[i]] = true; /* Recover some stats about different scores from current alignment */ gapsValues = NULL; if (sgaps != NULL) gapsValues = sgaps -> getGapsWindow(); simValues = NULL; if (scons != NULL) simValues = scons -> getMdkwVector(); /* Print HTML header into output file */ file << "<!DOCTYPE html>" << endl << "<html><head>" << endl << " <meta " << "http-equiv=\"Content-Type\" content=\"text/html;charset=ISO-8859-1\" />" << endl << " <title>trimAl v1.4 Summary</title>" << endl << " <style type=\"text/css\" media=\"all\">" << endl << " #b { background-color: #3366ff; }\n" << " #r { background-color: #cc0000; }\n" << " #g { background-color: #33cc00; }\n" << " #p { background-color: #ff6666; }\n" << " #m { background-color: #cc33cc; }\n" << " #o { background-color: #ff9900; }\n" << " #c { background-color: #46C7C7; }\n" << " #y { background-color: #FFFF00; }\n" << " .sel { background-color: #B9B9B9; }\n" << " .nsel { background-color: #E9E9E9; }\n" /* Sets of colors for high-lighting scores intervals */ << " .c1 { background-color: #FFFBF2; }\n" << " .c2 { background-color: #FFF8CC; }\n" << " .c3 { background-color: #FAF0BE; }\n" << " .c4 { background-color: #F0EAD6; }\n" << " .c5 { background-color: #F3E5AB; }\n" << " .c6 { background-color: #F4C430; }\n" << " .c7 { background-color: #C2B280; color: white; }\n" << " .c8 { background-color: #DAA520; color: white; }\n" << " .c9 { background-color: #B8860B; color: white; }\n" << " .c10 { background-color: #918151; color: white; }\n" << " .c11 { background-color: #967117; color: white; }\n" << " .c12 { background-color: #6E5411; color: white; }\n" /* Other HTML elements */ << " </style>\n </head>\n\n" << " <body>\n" << " <pre>" << endl; /* Show information about how many sequences/residues have been selected */ file << " <span class=sel>Selected Sequences: " << setw(5) << right << seqs <<" /Selected Residues: " << setw(7) << right << residues << "</span>" << endl << " <span class=nsel>Deleted Sequences: " << setw(5) << right << sequenNumber - seqs << " /Deleted Residues: " << setw(7) << right << residNumber - residues << "</span>" << endl; /* Print headers for different scores derived from input alignment/s */ if (gapsValues != NULL) file << endl << setw(minHTML) << left << " Gaps Scores: " << "<span class=c1> =0= </span><span class=c2> <.001 </span>" << "<span class=c3> <.050 </span><span class=c4> <.100 </span>" << "<span class=c5> <.150 </span><span class=c6> <.200 </span>" << "<span class=c7> <.250 </span><span class=c8> <.350 </span>" << "<span class=c9> <.500 </span><span class=c10> <.750 </span>" << "<span class=c11> <1.00 </span><span class=c12> =1= </span>"; if (simValues != NULL) file << endl << setw(minHTML) << left << " Similarity Scores: " << "<span class=c1> =0= </span><span class=c2> <1e-6 </span>" << "<span class=c3> <1e-5 </span><span class=c4> <1e-4 </span>" << "<span class=c5> <.001 </span><span class=c6> <.010 </span>" << "<span class=c7> <.100 </span><span class=c8> <.250 </span>" << "<span class=c9> <.500 </span><span class=c10> <.750 </span>" << "<span class=c11> <1.00 </span><span class=c12> =1= </span>"; if (consValues != NULL) file << endl << setw(minHTML) << left << " Consistency Scores: " << "<span class=c1> =0= </span><span class=c2> <.001 </span>" << "<span class=c3> <.050 </span><span class=c4> <.100 </span>" << "<span class=c5> <.150 </span><span class=c6> <.200 </span>" << "<span class=c7> <.250 </span><span class=c8> <.350 </span>" << "<span class=c9> <.500 </span><span class=c10> <.750 </span>" << "<span class=c11> <1.00 </span><span class=c12> =1= </span>"; if ((gapsValues != NULL) or (simValues == NULL) or (consValues == NULL)) file << endl; /* Print Sequences in block of BLOCK_SIZE */ for(j = 0, upper = HTMLBLOCKS; j < residNumber; j += HTMLBLOCKS, upper += \ HTMLBLOCKS) { /* Print main columns number */ file << endl << setw(minHTML + 10) << right << (j + 10); for(i = j + 20; ((i <= residNumber) && (i <= upper)); i += 10) file << setw(10) << right << (i); /* Print special characters to delimit sequences blocks */ file << endl << setw(minHTML + 1) << right; for(i = j + 1; ((i <= residNumber) && (i <= upper)); i++) file << (!(i % 10) ? "+" : "="); file << endl; /* Print sequences name */ for(i = 0; i < sequenNumber; i++) { file << " <span class=" << ((seq[i]) ? "sel>" : "nsel>") << seqsName[i] << "</span>" << setw(minHTML - 4 - seqsName[i].size()) << right << ""; /* Print residues corresponding to current sequences block */ for(k = j; ((k < residNumber) && (k < upper)); k++) { for(kj = 0, tmpColumn.clear(); kj < sequenNumber; kj++) tmpColumn += sequences[kj][k]; /* Determine residue color based on residues across the alig column */ type = utils::determineColor(sequences[i][k], tmpColumn); if (type == 'w') file << sequences[i][k]; else file << "<span id=" << type << ">" << sequences[i][k] << "</span>"; } file << endl; } file << endl << setw(minHTML) << left << " Selected Cols: "; for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++) file << "<span class=" << (res[k] ? "sel" : "nsel") << "> </span>"; file << endl; /* If there is not any score to print, skip this part of the function */ if ((gapsValues == NULL) and (simValues == NULL) and (consValues == NULL)) continue; /* Print score colors according to certain predefined thresholds */ if (gapsValues != NULL) { file << endl << setw(minHTML) << left << " Gaps Scores: "; for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++) if(gapsValues[k] == 0) file << "<span class=c12> </span>"; else if(gapsValues[k] == sequenNumber) file << "<span class=c1> </span>"; else if(1 - (float(gapsValues[k])/sequenNumber) >= .750) file << "<span class=c11> </span>"; else if(1 - (float(gapsValues[k])/sequenNumber) >= .500) file << "<span class=c10> </span>"; else if(1 - (float(gapsValues[k])/sequenNumber) >= .350) file << "<span class=c9> </span>"; else if(1 - (float(gapsValues[k])/sequenNumber) >= .250) file << "<span class=c8> </span>"; else if(1 - (float(gapsValues[k])/sequenNumber) >= .200) file << "<span class=c7> </span>"; else if(1 - (float(gapsValues[k])/sequenNumber) >= .150) file << "<span class=c6> </span>"; else if(1 - (float(gapsValues[k])/sequenNumber) >= .100) file << "<span class=c5> </span>"; else if(1 - (float(gapsValues[k])/sequenNumber) >= .050) file << "<span class=c4> </span>"; else if(1 - (float(gapsValues[k])/sequenNumber) >= .001) file << "<span class=c3> </span>"; else file << "<span class=c2> </span>"; } if (simValues != NULL) { file << endl << setw(minHTML) << left << " Similarity Scores: "; for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++) if(simValues[k] == 1) file << "<span class=c12> </span>"; else if(simValues[k] == 0) file << "<span class=c1> </span>"; else if(simValues[k] >= .750) file << "<span class=c11> </span>"; else if(simValues[k] >= .500) file << "<span class=c10> </span>"; else if(simValues[k] >= .250) file << "<span class=c9> </span>"; else if(simValues[k] >= .100) file << "<span class=c8> </span>"; else if(simValues[k] >= .010) file << "<span class=c7> </span>"; else if(simValues[k] >= .001) file << "<span class=c6> </span>"; else if(simValues[k] >= 1e-4) file << "<span class=c5> </span>"; else if(simValues[k] >= 1e-5) file << "<span class=c4> </span>"; else if(simValues[k] >= 1e-6) file << "<span class=c3> </span>"; else file << "<span class=c2> </span>"; } if (consValues != NULL) { file << endl << setw(minHTML) << left << " Consistency Scores: "; for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++) if(consValues[k] == 1) file << "<span class=c12> </span>"; else if(consValues[k] == 0) file << "<span class=c1> </span>"; else if(consValues[k] >= .750) file << "<span class=c11> </span>"; else if(consValues[k] >= .500) file << "<span class=c10> </span>"; else if(consValues[k] >= .350) file << "<span class=c9> </span>"; else if(consValues[k] >= .250) file << "<span class=c8> </span>"; else if(consValues[k] >= .200) file << "<span class=c7> </span>"; else if(consValues[k] >= .150) file << "<span class=c6> </span>"; else if(consValues[k] >= .100) file << "<span class=c5> </span>"; else if(consValues[k] >= .050) file << "<span class=c4> </span>"; else if(consValues[k] >= .001) file << "<span class=c3> </span>"; else file << "<span class=c2> </span>"; } file << endl; } /* Print HTML footer into output file */ file << " </pre>" << endl << " </body>" << endl << "</html>" << endl; /* Close output file and deallocate local memory */ file.close(); delete [] seq; delete [] res; return true; } bool alignment::alignmentColourHTML(ostream &file) { int i, j, kj, upper, k = 0, maxLongName = 0; string tmpColumn; char type; /* Allocate some local memory */ tmpColumn.reserve(sequenNumber); /* Check whether sequences in the alignment are aligned or not. * Warn about it if there are not aligned. */ if (!isAligned) { cerr << endl << "ERROR: Sequences are not aligned." << endl << endl; return false; } /* Compute maximum sequences name length */ maxLongName = 0; for(i = 0; i < sequenNumber; i++) maxLongName = utils::max(maxLongName, seqsName[i].size()); /* Print HTML header into output file */ file << "<!DOCTYPE html>" << endl << "<html><head>" << endl << " <meta " << "http-equiv=\"Content-Type\" content=\"text/html;charset=ISO-8859-1\" />" << endl << " <title>readAl v1.4</title>" << endl << " <style type=\"text/css\">" << endl << " #b { background-color: #3366ff; }\n" << " #r { background-color: #cc0000; }\n" << " #g { background-color: #33cc00; }\n" << " #p { background-color: #ff6666; }\n" << " #m { background-color: #cc33cc; }\n" << " #o { background-color: #ff9900; }\n" << " #c { background-color: #46C7C7; }\n" << " #y { background-color: #FFFF00; }\n" << " </style>\n </head>\n\n" << " <body>\n <pre>" << endl; /* Print sequences colored according to CLUSTAL scheme based on * physical-chemical properties */ for(j = 0, upper = HTMLBLOCKS; j < residNumber; j += HTMLBLOCKS, upper += \ HTMLBLOCKS) { file << endl; /* Print main columns number */ file << setw(maxLongName + 19) << right << (j + 10); for(i = j + 20; ((i <= residNumber) && (i <= upper)); i += 10) file << setw(10) << right << i; /* Print special characters to delimit sequences blocks */ file << endl << setw(maxLongName + 10); for(i = j + 1; ((i <= residNumber) && (i <= upper)); i++) file << (!(i % 10) ? "+" : "="); /* Print sequences themselves */ for(i = 0; i < sequenNumber; i++) { /* Print sequences name */ file << endl << setw(maxLongName + 9) << left << seqsName[i]; /* Print residues corresponding to current sequences block */ for(k = j; ((k < residNumber) && (k < upper)); k++) { for(kj = 0, tmpColumn.clear(); kj < sequenNumber; kj++) tmpColumn += sequences[kj][k]; /* Determine residue color based on residues across the alig column */ type = utils::determineColor(sequences[i][k], tmpColumn); if (type == 'w') file << sequences[i][k]; else file << "<span id=" << type << ">" << sequences[i][k] << "</span>"; } } file << endl; } /* Print HTML footer into output file */ file << " </pre>" << endl << " </body>" << endl << "</html>" << endl; return true; } void alignment::printAlignmentInfo(ostream &file) { /* Print information about sequences number, average sequence length, maximum * and minimum sequences length, etc */ int i, j, valid_res, max, min, max_pos, min_pos, total_res; /* Storage which sequences are the longest and shortest ones */ max = 0; max_pos = 0; min_pos = 0; min = residuesNumber[0]; for(i = 0, total_res = 0; i < sequenNumber; i++) { /* Discard gaps from current sequence and then compute real length */ for(j = 0, valid_res = 0; j < residuesNumber[i]; j++) valid_res += (sequences[i][j] != '-' ? 1 : 0); /* Compute the total residues in the alignment to calculate avg. sequence * length */ total_res += valid_res; /* Get values for the longest sequence */ max_pos = (max > valid_res) ? max_pos : i; max = (max > valid_res) ? max : valid_res; /* Similarily, get values for the shortest sequence */ min_pos = (min < valid_res) ? min_pos : i; min = (min < valid_res) ? min : valid_res; } file << "## Total sequences\t" << sequenNumber << endl; if (isFileAligned()) file << "## Alignment length\t" << residNumber << endl; file << "## Avg. sequence length\t" << (float) total_res / sequenNumber << endl << "## Longest seq. name\t'" << seqsName[max_pos] << "'" << endl << "## Longest seq. length\t" << max << endl << "## Shortest seq. name\t'" << seqsName[min_pos] << "'" << endl << "## Shortest seq. length\t" << min << endl; }