Mercurial > repos > padge > trimal
view trimal_repo/source/alignment.cpp @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
line wrap: on
line source
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** trimAl v1.4: a tool for automated alignment trimming in large-scale phylogenetics analyses. readAl v1.4: a tool for automated alignment conversion among different formats. statAl v1.4: a tool for getting stats about multiple sequence alignments. 2009-2015 Capella-Gutierrez S. and Gabaldon, T. [scapella, tgabaldon]@crg.es This file is part of trimAl/readAl. trimAl/readAl are free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, the last available version. trimAl/readAl are distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>. ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ using namespace std; #include <float.h> #include "alignment.h" #include "rwAlignment.cpp" #include "autAlignment.cpp" #include <deque> /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Class constructor */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment::alignment(void) { /* Alignment parameter */ sequenNumber = 0; residNumber = 0; /* Are the input sequences aligned? */ isAligned = false; /* Should the output file be reversed? */ reverse = false; /* Should be trimmed only terminal gaps? - set automated and manual boundaries * values */ terminalGapOnly = false; left_boundary = -1; right_boundary = -1; /* Input and output formats */ iformat = 0; oformat = 0; shortNames = false; forceCaps = false; upperCase = false; lowerCase = false; /* Indicate whether sequences composed only by gaps should be kept or not */ keepSequences = false; /* Indicate whether original header, they may include non-alphanumerical * characters, should be dumped into output stream without any preprocessing * step */ keepHeader = false; gapSymbol = "-"; /* Sequence datatype: DNA, RNA or Protein */ dataType = 0; /* Window sizes to trim the input alignment */ ghWindow = 0; shWindow = 0; /* Minimum block size in the new alignment */ blockSize = 0; /* Is this alignmnet new? */ oldAlignment = false; /* Sequence residues number */ residuesNumber = NULL; /* Columns and sequences that have been previously selected */ saveResidues = NULL; saveSequences = NULL; /* Input sequences as well other information such as sequences name, etc */ sequences = NULL; seqsName = NULL; seqsInfo = NULL; /* Information about input alignment */ filename = ""; aligInfo = ""; /* Information computed from alignment */ sgaps = NULL; scons = NULL; seqMatrix = NULL; identities = NULL; overlaps = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Class constructor */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment::alignment(string o_filename, string o_aligInfo, string *o_sequences, string *o_seqsName, string *o_seqsInfo, int o_sequenNumber, int o_residNumber, int o_iformat, int o_oformat, bool o_shortNames, int o_dataType, int o_isAligned, bool o_reverse, bool o_terminalGapOnly, int o_left_boundary, int o_right_boundary, bool o_keepSeqs, bool o_keepHeader, int OldSequences, int OldResidues, int *o_residuesNumber, int *o_saveResidues, int *o_saveSequences, int o_ghWindow, int o_shWindow, int o_blockSize, float **o_identities, float **o_overlaps) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ int i, j, k, ll; /* ***** ***** ***** ***** ***** ***** ***** ***** */ oldAlignment = true; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Assign the parameter values to the variables */ sequenNumber = o_sequenNumber; residNumber = o_residNumber; iformat = o_iformat; oformat = o_oformat; shortNames = o_shortNames; dataType = o_dataType; ghWindow = o_ghWindow; shWindow = o_shWindow; blockSize = o_blockSize; isAligned = o_isAligned; reverse = o_reverse; terminalGapOnly = o_terminalGapOnly; right_boundary = o_right_boundary; left_boundary = o_left_boundary; filename = o_filename; aligInfo = o_aligInfo; /* ***** ***** ***** ***** ***** ***** ***** ***** */ keepSequences = o_keepSeqs; keepHeader = o_keepHeader; forceCaps = false; upperCase = false; lowerCase = false; gapSymbol = "-"; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Basic information for the new alignment */ sequences = new string[sequenNumber]; for(i = 0; i < sequenNumber; i++) sequences[i] = o_sequences[i]; seqsName = new string[sequenNumber]; for(i = 0; i < sequenNumber; i++) seqsName[i] = o_seqsName[i]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ residuesNumber = new int[sequenNumber]; if((isAligned) || (o_residuesNumber != NULL)) { for(i = 0; i < sequenNumber; i++) residuesNumber[i] = residNumber; } else { for(i = 0; i < sequenNumber; i++) residuesNumber[i] = o_residuesNumber[i]; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(o_seqsInfo != NULL) { seqsInfo = new string[sequenNumber]; for(i = 0; i < sequenNumber; i++) seqsInfo[i] = o_seqsInfo[i]; } else seqsInfo = NULL; saveResidues = NULL; if(o_saveResidues != NULL) { saveResidues = new int[residNumber]; for(i = 0, j = 0; i < OldResidues; i++) if(o_saveResidues[i] != -1) { saveResidues[j] = o_saveResidues[i]; j++; } } saveSequences = NULL; if(o_saveSequences != NULL) { saveSequences = new int[sequenNumber]; for(i = 0, j = 0; i < OldSequences; i++) if(o_saveSequences[i] != -1) { saveSequences[j] = o_saveSequences[i]; j++; } } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ identities = NULL; if(o_identities != NULL) { identities = new float*[sequenNumber]; for(i = 0, j = 0; i < OldSequences; i++) { if(o_saveSequences[i] != -1) { identities[j] = new float[sequenNumber]; for(k = 0, ll = 0; k < OldSequences; k++) { if(o_saveSequences[k] != -1) { identities[j][ll] = o_identities[i][k]; ll++; } } j++; } } } overlaps = NULL; if(o_overlaps != NULL) { overlaps = new float*[sequenNumber]; for(i = 0, j = 0; i < OldSequences; i++) { if(o_saveSequences[i] != -1) { overlaps[j] = new float[sequenNumber]; for(k = 0, ll = 0; k < OldSequences; k++) { if(o_saveSequences[k] != -1) { overlaps[j][ll] = o_overlaps[i][k]; ll++; } } j++; } } } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Any structure associated to the new alignment is * initialize to NULL. In this way, these structure, * if it will be necessary, has to be computed */ sgaps = NULL; scons = NULL; seqMatrix = NULL; identities = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Overlapping operator = to use it as a kind of class constructor */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment &alignment::operator=(const alignment &old) { int i, j; if(this != &old) { oldAlignment = true; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Assign the parameter values to the variables */ sequenNumber = old.sequenNumber; residNumber = old.residNumber; isAligned = old.isAligned; reverse = old.reverse; terminalGapOnly = old.terminalGapOnly; right_boundary = old.right_boundary; left_boundary = old.left_boundary; iformat = old.iformat; oformat = old.oformat; shortNames = old.shortNames; dataType = old.dataType; ghWindow = old.ghWindow; shWindow = old.shWindow; blockSize = old.blockSize; filename = old.filename; aligInfo = old.aligInfo; /* ***** ***** ***** ***** ***** ***** ***** ***** */ keepSequences = old.keepSequences; keepHeader = old.keepHeader; forceCaps = old.forceCaps; upperCase = old.upperCase; lowerCase = old.lowerCase; gapSymbol = old.gapSymbol; /* ***** ***** ***** ***** ***** ***** ***** ***** */ sequences = new string[sequenNumber]; for(i = 0; i < sequenNumber; i++) sequences[i] = old.sequences[i]; seqsName = new string[sequenNumber]; for(i = 0; i < sequenNumber; i++) seqsName[i] = old.seqsName[i]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ delete [] residuesNumber; if(old.residuesNumber) { residuesNumber = new int[sequenNumber]; for(i = 0; i < sequenNumber; i++) residuesNumber[i] = old.residuesNumber[i]; } residuesNumber = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ delete [] seqsInfo; if(old.seqsInfo) { seqsInfo = new string[sequenNumber]; for(i = 0; i < sequenNumber; i++) seqsInfo[i] = old.seqsInfo[i]; } else seqsInfo = NULL; delete [] saveResidues; if(old.saveResidues) { saveResidues = new int[residNumber]; for(i = 0; i < residNumber; i++) saveResidues[i] = old.saveResidues[i]; } else saveResidues = NULL; delete [] saveSequences; if(old.saveSequences) { saveSequences = new int[sequenNumber]; for(i = 0; i < sequenNumber; i++) saveSequences[i] = old.saveSequences[i]; } else saveSequences = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ delete [] identities; if(old.identities) { identities = new float*[sequenNumber]; for(i = 0; i < sequenNumber; i++) { identities[i] = new float[sequenNumber]; for(j = 0; j < sequenNumber; j++) identities[i][j] = old.identities[i][j]; } } else identities = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ delete [] overlaps; if(old.overlaps) { overlaps = new float*[sequenNumber]; for(i = 0; i < sequenNumber; i++) { overlaps[i] = new float[sequenNumber]; for(j = 0; j < sequenNumber; j++) overlaps[i][j] = old.overlaps[i][j]; } } else overlaps = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ delete sgaps; sgaps = NULL; delete scons; scons = NULL; delete seqMatrix; seqMatrix = old.seqMatrix; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ return *this; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Class destructor */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment::~alignment(void) { int i; /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(sequences != NULL) delete [] sequences; sequences = NULL; if(seqsName != NULL) delete [] seqsName; seqsName = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(residuesNumber != NULL) delete [] residuesNumber; residuesNumber = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(seqsInfo != NULL) delete [] seqsInfo; seqsInfo = NULL; if(saveResidues != NULL) delete[] saveResidues; saveResidues = NULL; if(saveSequences != NULL) delete[] saveSequences; saveSequences = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(identities != NULL) { for(i = 0; i < sequenNumber; i++) delete [] identities[i]; delete [] identities; } identities = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(overlaps != NULL) { for(i = 0; i < sequenNumber; i++) delete [] overlaps[i]; delete [] overlaps; } overlaps = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(sgaps != NULL) delete sgaps; sgaps = NULL; if(scons != NULL) delete scons; scons = NULL; if(seqMatrix != NULL) delete seqMatrix; seqMatrix = NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ oldAlignment = false; sequenNumber = 0; residNumber = 0; isAligned = false; reverse = false; terminalGapOnly = false; right_boundary = -1; left_boundary = -1; iformat = 0; oformat = 0; shortNames = false; dataType = 0; ghWindow = 0; shWindow = 0; blockSize = 0; filename.clear(); aligInfo.clear(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ } // Try to load current alignment and inform otherwise bool alignment::loadAlignment(char *alignmentFile) { // Detect input alignment format - it is an strict detection procedure iformat = formatInputAlignment(alignmentFile); // Unless it is indicated somewhere else, output alignment format will be // the same as the input one oformat = iformat; // Use the appropiate function to read input alignment switch(iformat) { case 1: return loadClustalAlignment(alignmentFile); case 3: return loadNBRF_PirAlignment(alignmentFile); case 8: return loadFastaAlignment(alignmentFile); case 11: return loadPhylip3_2Alignment(alignmentFile); case 12: return loadPhylipAlignment(alignmentFile); case 17: return loadNexusAlignment(alignmentFile); case 21: return loadMegaInterleavedAlignment(alignmentFile); case 22: return loadMegaNonInterleavedAlignment(alignmentFile); // Return a FALSE value - meaning the input alignment was not loaded default: return false; } return false; } // Return input alignment format - it is useful to set-up output alignment one int alignment::getInputFormat(void) { return iformat; } // Return output alignment format int alignment::getOutputFormat(void) { return oformat; } // Return whether there is one or more alignments at the input file. // Currently trimAl family only support single alignments per file. int alignment::typeInputFile(void) { return SINGLE; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function prints the alignmnet to the standard output using an * appropiate function depending on the output format */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ bool alignment::printAlignment(void){ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(sequences == NULL) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ if((residNumber == 0) || (sequenNumber == 0)) { cerr << endl << "WARNING: Output alignment has not been generated. " << "It is empty" << endl << endl; return true; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ switch(oformat) { case 1: alignmentClustalToFile(cout); break; case 3: alignmentNBRF_PirToFile(cout); break; case 8: alignmentFastaToFile(cout); break; case 11: alignmentPhylip3_2ToFile(cout); break; case 12: alignmentPhylipToFile(cout); break; case 13: alignmentPhylip_PamlToFile(cout); break; case 17: alignmentNexusToFile(cout); break; case 21: case 22: alignmentMegaToFile(cout); break; case 99: getSequences(cout); break; case 100: alignmentColourHTML(cout); break; default: return false; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ return true; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function puts the alignment in a given file */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ bool alignment::saveAlignment(char *destFile) { ofstream file; /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(sequences == NULL) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ if((residNumber == 0) || (sequenNumber == 0)) { cerr << endl << "WARNING: Output alignment has not been generated. " << "It is empty." << endl << endl; return true; } /* Check whether the input sequences file is aligned or not before creating * a given output format. */ switch(oformat) { case 1: case 11: case 12: case 13: case 17: case 21: case 22: /* Check whether sequences in the alignment are aligned or not. * Warn about it if there are not aligned. */ if (!isAligned) { cerr << endl << "ERROR: Sequences are not aligned. Output format is " << "not compatible with unaligned sequences."; return false; } } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* File open and correct open check */ file.open(destFile); if(!file) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Depending on the output format, we call to the * appropiate function */ switch(oformat) { case 1: alignmentClustalToFile(file); break; case 3: alignmentNBRF_PirToFile(file); break; case 8: alignmentFastaToFile(file); break; case 11: alignmentPhylip3_2ToFile(file); break; case 12: alignmentPhylipToFile(file); break; case 13: alignmentPhylip_PamlToFile(file); break; case 17: alignmentNexusToFile(file); break; case 21: case 22: alignmentMegaToFile(file); break; case 99: getSequences(file); break; case 100: alignmentColourHTML(file); break; default: return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Close the output file */ file.close(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* All is OK, return true */ return true; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function trimms a given alignment based on the gap distribution * values. To trim the alignment, this function uses two parameters: * * baseLine: Minimum percentage of columns that should * be conserved in the new alignment. * gapsPct: Maximum percentage of gaps per column that * should be in the new alignment. * * The function selects that combination of parameters that maximize the * final number of columns in the new alignment. There is an extra parameter * to ask for the complementary alignment. A complementary alignment consits * of those columns that would delete applying the standard approach. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::cleanGaps(float baseLine, float gapsPct, bool complementary) { alignment *ret; double cut; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If gaps statistics are not calculated, we * calculate them */ if(calculateGapStats() != true) return NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Obtain the cut point using the given parameters */ cut = sgaps -> calcCutPoint(baseLine, gapsPct); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Once we have the cut value proposed, we call the * appropiate method to clean the alignment and, then, * generate the new alignment. */ ret = cleanByCutValue(cut, baseLine, sgaps -> getGapsWindow(), complementary); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return a reference of the new alignment */ return ret; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function trimms a given alignment based on the similarity distribution * values. To trim the alignment, this function uses two parameters: * * baseLine: Minimum percentage of columns that should * be conserved in the new alignment. * conservat: Minimum value of similarity per column that * should be in the new alignment. * * The function selects that combination of parameters that maximize the * final number of columns in the new alignment. There is an extra parameter * to ask for the complementary alignment. A complementary alignment consits * of those columns that would delete applying the standard approach. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::cleanConservation(float baseLine, float conservationPct, bool complementary) { alignment *ret; float cut; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If conservation's statistics are not calculated, * we calculate them */ if(calculateConservationStats() != true) return NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Calculate the cut point using the given parameters */ cut = (float) scons -> calcCutPoint(baseLine, conservationPct); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Once we have the cut value, we call the appropiate * method to clean the alignment and, then, generate the new alignment */ ret = cleanByCutValue(cut, baseLine, scons -> getMdkwVector(), complementary); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return a reference of the new alignment */ return ret; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function trimms a given alignment based on the similarity and gaps * distribution values. To trim the alignment, this function uses three * parameters: * * baseLine: Minimum percentage of columns that should * be conserved in the new alignment. * gapsPct: Maximum percentage of gaps per column that * should be in the new alignment. * conservat: Minimum value of similarity per column that * should be in the new alignment. * * The function selects that combination of parameters that maximize the * final number of columns in the new alignment. If the baseLine parameter * is the most strict, the other two ones would be relaxed to keep columns * that would not be selected. There is an extra parameter to ask for the * complementary alignment. A complementary alignment consits of those * columns that would delete applying the standard approach. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::clean(float baseLine, float GapsPct, float conservationPct, bool complementary) { alignment *ret; float cutCons; double cutGaps; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If gaps statistics are not calculated, we calculate * them */ if(calculateGapStats() != true) return NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If conservation's statistics are not calculated, * we calculate them */ if(calculateConservationStats() != true) return NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Calculate the two cut points using the parameters */ cutGaps = sgaps->calcCutPoint(baseLine, GapsPct); cutCons = scons->calcCutPoint(baseLine, conservationPct); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Clean the alingment using the two cut values, the * gapsWindow and MDK_Windows vectors and the baseline * value */ ret = cleanByCutValue(cutGaps, sgaps -> getGapsWindow(), baseLine, cutCons, scons -> getMdkwVector(), complementary); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return a reference of the clean alignment object */ return ret; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function cleans a given alignment based on the consistency values * asses from a dataset of alignments. The method takes three parameters: * * cutpoint: Lower limit (0-1) of comparefile value admits in * the new alignment. * baseline: Minimum percentage of columns to have in the new alignment * vectValues: A vector with alignment's consistency values. * * The function computes the optimal parameter combination value to trim * an alignment based on the consistency value from the comparison among * a dataset of alignments with the same sequences. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::cleanCompareFile(float cutpoint, float baseLine, float *vectValues, bool complementary) { alignment *ret; float cut, *vectAux; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Allocate memory */ vectAux = new float[residNumber]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Sort a copy of the vectValues vector, and take the * value at 100% - baseline position. */ utils::copyVect((float *) vectValues, vectAux, residNumber); utils::quicksort(vectAux, 0, residNumber-1); cut = vectAux[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* We have to decide which is the smallest value * between the cutpoint value and the value from * the minimum percentage threshold */ cut = cutpoint < cut ? cutpoint : cut; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Deallocate memory */ delete [] vectAux; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Clean the selected alignment using the input parameters. */ ret = cleanByCutValue(cut, baseLine, vectValues, complementary); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return a refernce of the new alignment */ return ret; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function removes those sequences that are misaligned with the rest * of sequences in the alignment at given sequence and residue overlaps. * * overlapColumn: Set the minimum similarity fraction value that has * to have a position from a given sequence to be * considered as an hit. * minimumOverlap: Set the minimum proportion of hits that has to be * a sequence to keep it in the new alignment. * * At the same time, it's possible to get an alignment with those columns * that this method should remove setting for that purpose the complementary * parameter to True */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::cleanSpuriousSeq(float overlapColumn, float minimumOverlap, bool complementary) { float *overlapVector; alignment *newAlig; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Allocate local memory */ overlapVector = new float[sequenNumber]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Compute the overlap's vector using the overlap * column's value */ if(!calculateSpuriousVector(overlapColumn, overlapVector)) return NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Select and remove the sequences with a overlap less * than threshold's overlap and create a new alignemnt */ newAlig = cleanOverlapSeq(minimumOverlap, overlapVector, complementary); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Deallocate local memory */ delete [] overlapVector; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return a reference of the clean alignment object */ return newAlig; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function implements the gappyout approach. To trim the alignment, this * method computes the slope from the gaps distribution from the input alig. * Using this information, the method asses the most abrupt change between * three consecutive points from that distribution and selects the first of * this point as aan cut-off point. * * The method also can return the complementary alignment that consists of * those columns that would be delete applying this algorithm. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::clean2ndSlope(bool complementarity) { alignment *ret; int cut; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If gaps statistics are not calculated, we calculate * them */ if(calculateGapStats() != true) return NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* We get the cut point using a automatic method for * this purpose. */ cut = sgaps -> calcCutPoint2ndSlope(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Using the cut point calculates in last steps, we * clean the alignment and generate a new alignment */ ret = cleanByCutValue(cut, 0, sgaps->getGapsWindow(), complementarity); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Returns the new alignment. */ return ret; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method implements the strict and strictplus approaches, to apply * these algorithms, the program computes the gaps and similarity distribution * values to decide which ones are the optimal cutoff points. To compute these * cutoff points, the method applies those steps described in the manual. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::cleanCombMethods(bool complementarity, bool variable) { float simCut, first20Point, last80Point, *simil, *vectAux; int i, j, acm, gapCut, *positions, *gaps; double inic, fin, vlr; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If gaps statistics are not calculated, we calculate * them */ if(calculateGapStats() != true) return NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Computes the gap cut point using a automatic method * and at the same time, we get the gaps values from * the alignment. */ gapCut = sgaps -> calcCutPoint2ndSlope(); gaps = sgaps -> getGapsWindow(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If conservation's statistics are not calculated, * we calculate them */ if(calculateConservationStats() != true) return NULL; // Once the conservation stats have been calculated, generate the vector // containing the similarity value for each column considering parameters // such as windows size simil = scons -> getMdkwVector(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Allocate local memory and initializate it to -1 */ positions = new int[residNumber]; utils::initlVect(positions, residNumber, -1); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* The method only selects columns with gaps number * less or equal than the gap's cut point. Counts the * number of columns that have been selected */ for(i = 0, acm = 0; i < residNumber; i++) { if(gaps[i] <= gapCut) { positions[i] = i; acm++; } } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Allocate local memory and save the similaritys * values for the columns that have been selected */ vectAux = new float[acm]; for(i = 0, j = 0; i < residNumber; i++) if(positions[i] != -1) vectAux[j++] = simil[i]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Sort the conservation's value vector. */ utils::quicksort(vectAux, 0, acm-1); /* ...and search for the vector points at the 20 and * 80% of length. */ first20Point = 0; last80Point = 0; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ for(i = acm - 1, j = 1; i >= 0; i--, j++) { if((((float) j/acm) * 100.0) <= 20.0) first20Point = vectAux[i]; if((((float) j/acm) * 100.0) <= 80.0) last80Point = vectAux[i]; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Computes the logaritmic's values for those points. * Finally the method computes the similarity cut * point using these values. */ inic = log10(first20Point); fin = log10(last80Point); vlr = ((inic - fin) / 10) + fin; simCut = (float) pow(10, vlr); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Clean the alignment and generate a new alignment * object using the gaps cut and the similaritys cut * values */ alignment *ret = cleanStrict(gapCut, sgaps -> getGapsWindow(), simCut, scons -> getMdkwVector(), complementarity, variable); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Deallocate local memory */ delete [] vectAux; delete [] positions; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return a reference of the new alignment */ return ret; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function allows to delete those columns consistents of only gaps. * This function is specially useful when we remove misaligned sequences from * a given alignmnet. As the rest of functions, this function can return the * complementary alignment but this alignment would have only columns with * all gaps. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::cleanNoAllGaps(bool complementarity) { alignment *ret; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If gaps statistics are not calculated, we calculate * them */ if(calculateGapStats() != true) return NULL; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* We want to conserve the columns with gaps' number * less or equal than sequences' number - 1 */ ret = cleanByCutValue((sequenNumber - 1), 0, sgaps->getGapsWindow(), complementarity); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Returns the new alignment. */ return ret; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method computes the overlap for each sequence given a overlap value. * This overlap set the minimum fraction that has to be a position for the * selected sequence to count as an hit. This proportion measures how much * is similar, (in terms of residues, indetermination and gaps), a given * position respect to the element on its column. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ bool alignment::calculateSpuriousVector(float overlap, float *spuriousVector) { int i, j, k, seqValue, ovrlap, hit; float floatOverlap; char indet; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Compute the overlap */ floatOverlap = overlap * float(sequenNumber-1); ovrlap = int(overlap * (sequenNumber-1)); if(floatOverlap > float(ovrlap)) ovrlap++; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If the spurious vectos is NULL, returns false. */ if(spuriousVector == NULL) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Depending on the kind of alignment, we have * different indetermination symbol */ if(getTypeAlignment() == AAType) indet = 'X'; else indet = 'N'; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* For each alignment's sequence, computes its overlap */ for(i = 0, seqValue = 0; i < sequenNumber; i++, seqValue = 0) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* For each alignment's column, computes the overlap * between the selected sequence and the other ones */ for(j = 0; j < residNumber; j++) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* For sequences are before the sequence selected */ for(k = 0, hit = 0; k < i; k++) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If the element of sequence selected is the same * that the element of sequence considered, computes * a hit */ if(sequences[i][j] == sequences[k][j]) hit++; /* If the element of sequence selected isn't a 'X' nor * 'N' (indetermination) or a '-' (gap) and the element * of sequence considered isn't a a 'X' nor 'N' * (indetermination) or a '-' (gap), computes a hit */ else if((sequences[i][j] != indet) && (sequences[i][j] != '-') && (sequences[k][j] != indet) && (sequences[k][j] != '-')) hit++; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* For sequences are after the sequence selected */ for(k = (i + 1); k < sequenNumber; k++) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If the element of sequence selected is the same * that the element of sequence considered, computes * a hit */ if(sequences[i][j] == sequences[k][j]) hit++; /* If the element of sequence selected isn't a 'X' nor * 'N' (indetermination) or a '-' (gap) and the element * of sequence considered isn't a a 'X' nor 'N' * (indetermination) or a '-' (gap), computes a hit */ else if((sequences[i][j] != indet) && (sequences[i][j] != '-') && (sequences[k][j] != indet) && (sequences[k][j] != '-')) hit++; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Finally, if the hit's number divided by number of * sequences minus one is greater or equal than * overlap's value, computes a column's hit. */ if(hit >= ovrlap) seqValue++; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* For each alignment's sequence, computes its spurious's * or overlap's value as the column's hits -for that sequence- divided by column's number. */ spuriousVector[i] = ((float) seqValue / residNumber); /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If there is not problem in the method, return true */ return true; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method compute the cluster belongs to a given alignment at certain * identity threshold. To know the clusters from the alignment, those * clusters are calculated as follow: 1) Select the longest sequences. 2) * Using the previous computed identity values, compare if the second * longest sequence belongs, because the identity value with the cluster * representative is equal or greater than a given threshold, to this cluster * or not. 3) If the sequence belongs to this cluster, we add it, if not * belongs, we create a new cluster and fix this sequence as its representative * 4) Continue with the rest of sequences. In the case that a given sequence * can belong to more than one cluster, we choose the cluster which one * maximize the identity value respect to its representative sequence */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ int *alignment::calculateRepresentativeSeq(float maximumIdent) { int i, j, pos, clusterNum, **seqs; int *cluster; static int *repres; float max; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Ask for the sequence identities assesment */ if(identities == NULL) calculateSeqIdentity(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ seqs = new int*[sequenNumber]; for(i = 0; i < sequenNumber; i++) { seqs[i] = new int[2]; seqs[i][0] = utils::removeCharacter('-', sequences[i]).size(); seqs[i][1] = i; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ utils::quicksort(seqs, 0, sequenNumber-1); /* ***** ***** ***** ***** ***** ***** ***** ***** */ cluster = new int[sequenNumber]; cluster[0] = seqs[sequenNumber - 1][1]; clusterNum = 1; /* ***** ***** ***** ***** ***** ***** ***** ***** */ for(i = sequenNumber - 2; i >= 0; i--) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ for(j = 0, max = 0, pos = -1; j < clusterNum; j++) { if(identities[seqs[i][1]][cluster[j]] > maximumIdent) { if(identities[seqs[i][1]][cluster[j]] > max) { max = identities[seqs[i][1]][cluster[j]]; pos = j; } } } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(pos == -1) { cluster[j] = seqs[i][1]; clusterNum++; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ repres = new int[clusterNum + 1]; repres[0] = clusterNum; for(i = 0; i < clusterNum; i++) repres[i+1] = cluster[i]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Deallocate dinamic memory */ for(i = 0; i < sequenNumber; i++) delete [] seqs[i]; delete [] cluster; delete [] seqs; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ return repres; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method looks for the optimal cut point for a given clusters number. * The idea is to find a identity cut-off point that can be used to get a * number of representative sequences similar to the input parameter */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ float alignment::getCutPointClusters(int clusterNumber) { float max, min, avg, gMax, gMin, startingPoint, prevValue = 0, iter = 0; int i, j, clusterNum, *cluster, **seqs; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If the user wants only one cluster means that all * of sequences have to be in the same clauster. * Otherwise, if the users wants the maximum number of * clusters means that each sequence have to be in their * own cluster */ if(clusterNumber == sequenNumber) return 1; else if(clusterNumber == 1) return 0; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Ask for the sequence identities assesment */ if(identities == NULL) calculateSeqIdentity(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Compute the maximum, the minimum and the average * identity values from the sequences */ for(i = 0,gMax = 0, gMin = 1, startingPoint = 0; i < sequenNumber; i++) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ for(j = 0, max = 0, avg = 0, min = 1; j < i; j++) { if(max < identities[i][j]) max = identities[i][j]; if(min > identities[i][j]) min = identities[i][j]; avg += identities[i][j]; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ for(j = i + 1; j < sequenNumber; j++) { if(max < identities[i][j]) max = identities[i][j]; if(min > identities[i][j]) min = identities[i][j]; avg += identities[i][j]; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ startingPoint += avg / (sequenNumber - 1); if(max > gMax) gMax = max; if(min < gMin) gMin = min; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Take the starting point as the average value */ startingPoint /= sequenNumber; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Compute and sort the sequence length */ seqs = new int*[sequenNumber]; for(i = 0; i < sequenNumber; i++) { seqs[i] = new int[2]; seqs[i][0] = utils::removeCharacter('-', sequences[i]).size(); seqs[i][1] = i; } utils::quicksort(seqs, 0, sequenNumber-1); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Create the data structure to store the different * clusters for a given thresholds */ cluster = new int[sequenNumber]; cluster[0] = seqs[sequenNumber - 1][1]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /**** ***** ***** ***** ***** ***** ***** ***** */ /* Look for the optimal identity value to get the * number of cluster set by the user. To do that, the * method starts in the average identity value and moves * to the right or lefe side of this value depending on * if this value is so strict or not. We set an flag to * avoid enter in an infinite loop, if we get the same * value for more than 10 consecutive point without get * the given cluster number means that it's impossible * to get this cut-off and we need to stop the search */ do { clusterNum = 1; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Start the search */ for(i = sequenNumber - 2; i >= 0; i--) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ for(j = 0; j < clusterNum; j++) if(identities[seqs[i][1]][cluster[j]] > startingPoint) break; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ if(j == clusterNum) { cluster[j] = seqs[i][1]; clusterNum++; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Given the cutoff point, if we get the same cluster * number or we have been getting for more than 10 * consecutive times the same cutoff point, we stop */ if((clusterNum == clusterNumber) || (iter > 10)) break; else { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* We have to move to the left side from the average * to decrease the cutoff value and get a smaller number * of clusters */ if(clusterNum > clusterNumber) { gMax = startingPoint; startingPoint = (gMax + gMin)/2; } else { /* In the opposite side, we have to move to the right * side from the cutting point to be more strict and get * a bigger number of clusters */ gMin = startingPoint; startingPoint = (gMax + gMin)/2; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If the clusters number from the previous iteration * is different from the current number, we put the * iteration number to 0 and store this new value */ if(prevValue != clusterNum) { iter = 0; prevValue = clusterNum; /* Otherwise, we increase the iteration number to * avoid enter in an infinitve loop */ } else iter++; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ } while(true); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Deallocate dinamic memory */ for(i = 0; i < sequenNumber; i++) delete [] seqs[i]; delete [] seqs; delete [] cluster; /* ***** ***** ***** ***** ***** ***** ***** ***** */ return startingPoint; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method select one representative sequence (the longest one) per each * cluster from the input alignment and generate a new alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::getClustering(float identityThreshold) { string *matrixAux, *newSeqsName; int i, j, *clustering; alignment *newAlig; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Get the representative member for each cluster * given a maximum identity threshold */ clustering = calculateRepresentativeSeq(identityThreshold); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Put all sequences to be deleted and get back those * sequences that are representative for each cluster * */ for(i = 0; i < sequenNumber; i ++) saveSequences[i] = -1; for(i = 1; i <= clustering[0]; i ++) saveSequences[clustering[i]] = clustering[i]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* We allocate memory to save the sequences selected */ matrixAux = new string[clustering[0]]; newSeqsName = new string[clustering[0]]; /* Copy to new structures the information that have * been selected previously. */ for(i = 0, j = 0; i < sequenNumber; i++) if(saveSequences[i] != -1) { newSeqsName[j] = seqsName[i]; matrixAux[j] = sequences[i]; j++; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* When we have all parameters, we create the new * alignment */ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, clustering[0], residNumber, iformat, oformat, shortNames, dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, overlaps); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Deallocated auxiliar memory */ delete [] matrixAux; delete [] newSeqsName; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return the new alignment reference */ return newAlig; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function returns the backtranslation for a given protein processed * alignment into its CDS alignment. To do this convertion, the function needs * the Coding sequences as well the original composition of each protein * sequence. Also, the function needs to know which columns/sequences will be * in the final alignment to carray out the conversion. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ alignment *alignment::getTranslationCDS(int newResidues, int newSequences, int *ColumnsToKeep, string *oldSeqsName, sequencesMatrix *seqMatrix, alignment *ProtAlig) { string *matrixAux; alignment *newAlig; int i, j, k, l, oldResidues; int *mappedSeqs, *tmpSequence, *selectedRes; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Map the selected protein sequences to the input * coding sequences */ mappedSeqs = new int[newSequences]; for(i = 0; i < sequenNumber; i++) for(j = 0; j < newSequences; j++) if(!seqsName[i].compare(oldSeqsName[j])) { mappedSeqs[j] = i; break; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Get the original alignment size as well the original * selected/non-selected columns */ oldResidues = seqMatrix -> getResidNumber(); selectedRes = new int[oldResidues]; for(i = 0; i < oldResidues; i++) selectedRes[i] = i; for(j = 0; j < ColumnsToKeep[0]; j++) selectedRes[j] = -1; for(i = 0; i < newResidues - 1; i++) for(j = ColumnsToKeep[i] + 1; j < ColumnsToKeep[i+1]; j++) selectedRes[j] = -1; for(j = ColumnsToKeep[newResidues - 1] + 1; j < oldResidues; j++) selectedRes[j] = -1; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* We allocate memory to save the columns selected */ matrixAux = new string[newSequences]; tmpSequence = new int[oldResidues]; /* Using the information about which residues for each * sequence was selected by others function, we process * these residues to recover the corresponding codons */ for(i = 0; i < newSequences; i++) if(seqMatrix -> getSequence(oldSeqsName[i], tmpSequence)) { for(j = 0; j < oldResidues; j++) { if((selectedRes[j] != -1) && (tmpSequence[j] != 0)) { for(k = 3 * (tmpSequence[j] - 1), l = 0; l < 3; k++, l++) { /* Check whether the nucleotide sequences end has been reached or not. * If it has been reached, complete backtranslation using indetermination * symbols 'N' */ if((int) sequences[mappedSeqs[i]].length() > k) matrixAux[i].resize(matrixAux[i].size() + 1, sequences[mappedSeqs[i]][k]); else matrixAux[i].resize(matrixAux[i].size() + 1, 'N'); } } else if(selectedRes[j] != -1) { matrixAux[i].resize(matrixAux[i].size() + 3, '-'); } } /* If there is any problems with a sequence then * the function returns an error */ } else { return NULL; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Remove columns/sequences composed only by gaps before making the retrotranslation */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* When we have all parameters, we create the new * alignment */ newAlig = new alignment(filename, "", matrixAux, oldSeqsName, NULL, newSequences, newResidues * 3, ProtAlig -> getInputFormat(), ProtAlig -> getOutputFormat(), ProtAlig -> getShortNames(), DNAType, true, ProtAlig -> getReverse(), terminalGapOnly, left_boundary, right_boundary, keepSequences, keepHeader, sequenNumber, oldResidues * 3, NULL, NULL, NULL, 0, 0, ProtAlig -> getBlockSize(), NULL, NULL); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Deallocated auxiliar memory */ delete [] matrixAux; delete mappedSeqs; delete tmpSequence; delete selectedRes; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return the new alignment reference */ return newAlig; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function computes the gaps statistics for the input alignment. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ bool alignment::calculateGapStats(void) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If alignment matrix is not created, return false */ if(sequences == NULL) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If sgaps object is not created, we create them and calculate the statistics */ if(sgaps == NULL) { sgaps = new statisticsGaps(sequences, sequenNumber, residNumber, dataType); sgaps -> applyWindow(ghWindow); } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ return true; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Print the gaps value for each column in the alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::printStatisticsGapsColumns(void) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Check if there is computed the gaps statistics */ if(calculateGapStats()) /* then prints the information */ sgaps -> printGapsColumns(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Print the acumulative gaps distribution value from the input alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::printStatisticsGapsTotal(void) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Check it there is computed the gaps statistics */ if(calculateGapStats()) /* then prints the information */ sgaps -> printGapsAcl(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Set the similarity matrix. This matrix is necessary for some methods in * the program */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ bool alignment::setSimilarityMatrix(similarityMatrix *sm) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If scons object is not created, we create them */ if(scons == NULL) scons = new statisticsConservation(sequences, sequenNumber, residNumber, dataType); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Associate the matrix to the similarity statistics * object */ if(!scons -> setSimilarityMatrix(sm)) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If it's OK, we return true */ return true; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method compute the similarity values from the input alignment if it * has not been computed before */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ bool alignment::calculateConservationStats(void) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* It the gaps statistics object has not been created * we create it */ if(calculateGapStats() != true) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* It the similarity statistics object has not been * created we create it */ if(scons == NULL) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Ask for the similarity matrix */ if(scons -> isSimMatrixDef() != true) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Compute the similarity statistics from the input * alignment */ if(!scons -> calculateVectors(sequences, sgaps->getGapsWindow())) return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Ask to know if it is necessary to apply any window * method. If it's necessary, we apply it */ if(scons->isDefinedWindow()) return true; else return scons->applyWindow(shWindow); /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Print the similarity value for each column from the alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::printStatisticsConservationColumns(void) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Check if the similarity statistics object has been * created */ if(calculateConservationStats()) /* then prints the information */ scons -> printConservationColumns(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Print the accumulative similarity distribution values from the alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::printStatisticsConservationTotal(void) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Check if the similarity statistics object has been * created */ if(calculateConservationStats()) /* then prints the information */ scons -> printConservationAcl(); /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method prints the correspondece between the columns in the original * and in the trimmed alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::printCorrespondence(void) { int i; cout << "#ColumnsMap\t"; /* Print the saveResidues relathionship */ for(i = 0; i < residNumber - 1; i++) cout << saveResidues[i] << ", "; cout << saveResidues[i] << endl; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return the number of the sequences in the alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ int alignment::getNumSpecies(void) { return sequenNumber; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return the number of residues in the alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ int alignment::getNumAminos(void) { return residNumber; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Sets the condition to remove only terminal gaps after applying any * trimming method or not. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::trimTerminalGaps(bool terminalOnly_, int *boundaries_) { terminalGapOnly = terminalOnly_; /* We are only interested in the first and last number of the vector - this * vector is generated by other function where it is important to store all * intervals */ if (boundaries_ != NULL) { left_boundary = boundaries_[0]; right_boundary = boundaries_[1]; } } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method stores the diferent windows values in the alignment object */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::setWindowsSize(int ghWindow_, int shWindow_) { ghWindow = ghWindow_; shWindow = shWindow_; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method lets to change the output format */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::setOutputFormat(int format_, bool shortNames_) { oformat = format_; shortNames = shortNames_; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method sets a new block size value */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::setBlockSize(int blockSize_) { blockSize = blockSize_; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return true if the shortNames flag has been setted */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ int alignment::getShortNames(void) { return shortNames; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return true if the reverse flag has been setted. */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ int alignment::getReverse(void) { return reverse; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method lets to change the output alignment orientation */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::setReverse(void) { reverse = true; } /* Set appropiate flag to decide whether sequences composed only by gaps should * be kept or not */ void alignment::setKeepSequencesFlag(bool flag) { keepSequences = flag; } /* Set appropiate flag to decide whether original sequences header should be * dumped into the output stream with or without any preprocessing step */ void alignment::setKeepSeqsHeaderFlag(bool flag) { keepHeader = flag; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Return the block size value */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ int alignment::getBlockSize(void) { return blockSize; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method returns the sequences name */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::getSequences(string *Names) { for(int i = 0; i < sequenNumber; i++) Names[i] = seqsName[i]; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::getSequences(string *Names, int *lengths) { for(int i = 0; i < sequenNumber; i++) { lengths[i] = (int) utils::removeCharacter('-', sequences[i]).length(); Names[i] = seqsName[i]; } } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::getSequences(string *Names, string *Sequences, int *Lengths) { for(int i = 0; i < sequenNumber; i++) { Names[i] = seqsName[i]; Sequences[i] = utils::removeCharacter('-', sequences[i]); Lengths[i] = (int) Sequences[i].length(); } } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* For a given set of sequences name, check for its correspondence with * the current sequences name. If there is not a complete correspondence * between both sets, the method return false, otherwise, return true */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ bool alignment::getSeqNameOrder(string *names, int *orderVector) { int i, j, numNames; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* For each name in the current alignment, we look * for its correspondence in the input set */ for(i = 0, numNames = 0; i < sequenNumber; i++) { for(j = 0; j < sequenNumber; j++) { if(seqsName[i] == names[j]) { orderVector[i] = j; numNames++; break; } } } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Depending on if we get the complete correspondence * between both sets of names, we return true or not */ if(numNames == sequenNumber) return true; else return false; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This method lets to build a sequence matrix. A sequence matrix contains * the residue position for each sequence without taking into account the * gaps in the sequence. This means that at position 10 we have the residue * 1 for that sequence */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::sequenMatrix(void) { if(seqMatrix == NULL) seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Use this method to destroy a given sequence matrix */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::destroySequenMatrix(void) { if(seqMatrix != NULL) delete seqMatrix; seqMatrix = NULL; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Print the alignment's sequence matrix */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::printSequenMatrix(void) { if(seqMatrix == NULL) seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); seqMatrix -> printMatrix(); } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Given an index, this method returns the sequence matrix column for that * index */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::getColumnSeqMatrix(int column, int *columnSeqMatrix) { if(seqMatrix == NULL) seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); seqMatrix -> getColumn(column, columnSeqMatrix); } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function looks if a given value belongs a given row. In the affirmative * case, returns the columns value for that row and value combination */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::getColumnSeqMatrix(int value, int row, int *columnSeqMatrix) { if(seqMatrix == NULL) seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); seqMatrix -> getColumn(value, row, columnSeqMatrix); } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function change the rows order in the sequence matrix */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ void alignment::setSeqMatrixOrder(int *order) { if(seqMatrix == NULL) seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); seqMatrix -> setOrder(order); } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* This function change the rows order in the sequence matrix */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ sequencesMatrix *alignment::getSeqMatrix(void) { if(seqMatrix == NULL) seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); return seqMatrix; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Computes, if it's necessary, and return the alignment's type */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ int alignment::getTypeAlignment(void) { if(dataType == 0) dataType = utils::checkTypeAlignment(sequenNumber, residNumber, sequences); return dataType; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Returns the correspondence between the columns in the original and in the * trimmed alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ int *alignment::getCorrespResidues(void) { return saveResidues; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Returns the correspondence between the sequences in the original and in * the trimmed alignment */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ int *alignment::getCorrespSequences(void) { return saveSequences; } /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ /* Returns if the alignment is aligned or not */ /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ bool alignment::isFileAligned(void) { return isAligned; } /* ***************************************************************************** * ***************************************************************************** * ***************************************************************************** * ************************************************************************** */ /* This method removes those columns that exceed a given threshold. If the * number of columns in the trimmed alignment is lower than a given percentage * of the original alignment. The program relaxes the threshold until to add * enough columns to achieve this percentage, to add those columns the program * start in the middle of the original alignment and make, alternatively, * movements to the left and right side from that point. This method also can * return the complementary alignment. */ alignment *alignment::cleanByCutValue(double cut, float baseLine, const int *gInCol, bool complementary) { int i, j, k, jn, oth, pos, block, *vectAux; string *matrixAux, *newSeqsName; alignment *newAlig; newValues counter; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Select the columns with a gaps value less or equal * than the cut point. */ for(i = 0, counter.residues = 0; i < residNumber; i++) if(gInCol[i] <= cut) counter.residues++; else saveResidues[i] = -1; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Compute, if it's necessary, the number of columns * necessary to achieve the minimum number of columns * fixed by coverage parameter. */ oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* if it's necessary to recover some columns, we * applied this instructions to recover it */ if(oth > 0) { counter.residues += oth; /* Allocate memory */ vectAux = new int[residNumber]; /* Sort a copy of the gInCol vector, and take the value of the column that marks the % baseline */ utils::copyVect((int *) gInCol, vectAux, residNumber); utils::quicksort(vectAux, 0, residNumber-1); cut = vectAux[(int) ((float)(residNumber - 1) * (baseLine)/100.0)]; /* Deallocate memory */ delete [] vectAux; } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Fixed the initial size of blocks as 0.5% of * alignment's length */ for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* We start in the alignment middle then we move on * right and left side at the same time. */ for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Left side. Here, we compute the block's size. */ for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* if block's size is greater or equal than the fixed * size then we save all columns that have not been * saved previously. */ if((i - jn) >= k) { for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) { if(gInCol[jn] <= cut) { saveResidues[jn] = jn; oth--; } else break; } } i = jn; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Right side. Here, we compute the block's size. */ for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* if block's size is greater or equal than the fixed * size then we save all columns that have not been * saved previously. */ if((jn - j) >= k) { for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) { if(gInCol[jn] <= cut) { saveResidues[jn] = jn; oth--; } else break; } } j = jn; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* Keep only columns blocks bigger than an input columns block size */ if(blockSize != 0) { /* Traverse all alignment looking for columns blocks greater than LONGBLOCK, * everytime than a column is not selected by the trimming method, check * whether the current block size until that point is big enough to be kept * or it should be removed from the final alignment */ for(i = 0, pos = 0, block = 0; i < residNumber; i++) { if(saveResidues[i] != -1) block++; else { /* Remove columns from blocks smaller than input blocks size */ if(block < blockSize) for(j = pos; j <= i; j++) saveResidues[j] = -1; pos = i + 1; block = 0; } } /* Check final block separately since it could happen than last block is not * big enough but because the loop end could provoke to ignore it */ if(block < blockSize) for(j = pos; j < i; j++) saveResidues[j] = -1; } /* If the flag -terminalony is set, apply a method to look for internal * boundaries and get back columns inbetween them, if they exist */ if(terminalGapOnly == true) if(!removeOnlyTerminal()) return NULL; /* Once the columns/sequences selection is done, turn it around * if complementary flag is active */ if(complementary == true) computeComplementaryAlig(true, false); /* Check for any additional column/sequence to be removed */ /* Compute new sequences and columns numbers */ counter = removeCols_SeqsAllGaps(); /* Allocate memory for selected sequences/columns */ matrixAux = new string[counter.sequences]; newSeqsName = new string[counter.sequences]; /* Fill local allocated memory with previously selected data */ fillNewDataStructure(matrixAux, newSeqsName); /* When we have all parameters, we create the new alignment */ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, overlaps); /* Deallocate local memory */ delete[] matrixAux; delete[] newSeqsName; /* Return the new alignment reference */ return newAlig; } /* This method removes those columns that not achieve a given threshold. If the * number of columns in the trimmed alignment is lower than a given percentage * of the original alignment. The program relaxes the threshold until to add * enough columns to achieve this percentage, to add those columns the program * start in the middle of the original alignment and make, alternatively, * movements to the left and right side from that point. This method also can * return the complementary alignment. */ alignment *alignment::cleanByCutValue(float cut, float baseLine, const float *ValueVect, bool complementary) { int i, j, k, jn, oth, pos, block; string *matrixAux, *newSeqsName; alignment *newAlig; newValues counter; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Select the columns with a conservation's value * greater than the cut point. */ for(i = 0, counter.residues = 0; i < residNumber; i++) if(ValueVect[i] > cut) counter.residues++; else saveResidues[i] = -1; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Compute, if it's necessary, the number of columns * necessary to achieve the minimum number of columns * fixed by coverage value. */ oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber); if(oth > 0) counter.residues += oth; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Fixed the initial size of blocks as 0.5% of * alignment's length */ for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* We start in the alignment middle then we move on * right and left side at the same time. */ for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Left side. Here, we compute the block's size. */ for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* if block's size is greater or equal than the fixed * size then we save all columns that have not been * saved previously. */ /* Here, we only accept column with a conservation's * value equal to conservation cut point. */ if((i - jn) >= k) { for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) { if(ValueVect[jn] == cut) { saveResidues[jn] = jn; oth--; } else break; } } i = jn; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Right side. Here, we compute the block's size. */ for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* if block's size is greater or equal than the fixed * size then we select the column and save the block's * size for the next iteraction. it's obvius that we * decrease the column's number needed to finish. */ /* Here, we only accept column with a conservation's * value equal to conservation cut point. */ if((jn - j) >= k) { for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) { if(ValueVect[jn] == cut) { saveResidues[jn] = jn; oth--; } else break; } } j = jn; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* Keep only columns blocks bigger than an input columns block size */ if(blockSize != 0) { /* Traverse all alignment looking for columns blocks greater than LONGBLOCK, * everytime than a column is not selected by the trimming method, check * whether the current block size until that point is big enough to be kept * or it should be removed from the final alignment */ for(i = 0, pos = 0, block = 0; i < residNumber; i++) { if(saveResidues[i] != -1) block++; else { /* Remove columns from blocks smaller than input blocks size */ if(block < blockSize) for(j = pos; j <= i; j++) saveResidues[j] = -1; pos = i + 1; block = 0; } } /* Check final block separately since it could happen than last block is not * big enough but because the loop end could provoke to ignore it */ if(block < blockSize) for(j = pos; j < i; j++) saveResidues[j] = -1; } /* If the flag -terminalony is set, apply a method to look for internal * boundaries and get back columns inbetween them, if they exist */ if(terminalGapOnly == true) if(!removeOnlyTerminal()) return NULL; /* Once the columns/sequences selection is done, turn it around * if complementary flag is active */ if(complementary == true) computeComplementaryAlig(true, false); /* Check for any additional column/sequence to be removed */ /* Compute new sequences and columns numbers */ counter = removeCols_SeqsAllGaps(); /* Allocate memory for selected sequences/columns */ matrixAux = new string[counter.sequences]; newSeqsName = new string[counter.sequences]; /* Fill local allocated memory with previously selected data */ fillNewDataStructure(matrixAux, newSeqsName); /* When we have all parameters, we create the new alignment */ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, overlaps); /* Deallocate local memory */ delete[] matrixAux; delete[] newSeqsName; /* Return the new alignment reference */ return newAlig; } /* This method removes those columns that not achieve the similarity threshond, * by one hand, and exceed the gaps threshold. If the number of columns that * have been selected is lower that a given coverage, the program relaxes both * thresholds in order to get back some columns and achieve this minimum * number of columns in the new alignment. For that purpose, the program * starts at the middle of the original alignment and makes, alternatively, * movememts to the right and left sides looking for those columns necessary * to achieve the minimum number of columns set by the coverage parameter. * This method also can return the complementary alignmnet consists of those * columns that will be deleted from the original alignment applying the * standard method. */ alignment *alignment::cleanByCutValue(double cutGaps, const int *gInCol, float baseLine, float cutCons, const float *MDK_Win, bool complementary) { int i, j, k, oth, pos, block, jn, blGaps, *vectAuxGaps; string *matrixAux, *newSeqsName; float blCons, *vectAuxCons; alignment *newAlig; newValues counter; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Select the columns with a conservation's value * greater than the conservation cut point AND * less or equal than the gap cut point. */ for(i = 0, counter.residues = 0; i < residNumber; i++) if((MDK_Win[i] > cutCons) && (gInCol[i] <= cutGaps)) counter.residues++; else saveResidues[i] = -1; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Compute, if it's necessary, the number of columns * necessary to achieve the minimum number of it fixed * by the coverage parameter. */ oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber); /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* If it's needed to add new columns, we compute the * news thresholds */ if(oth > 0) { counter.residues += oth; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Allocate memory */ vectAuxCons = new float[residNumber]; vectAuxGaps = new int[residNumber]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Sort a copy of the MDK_Win vector and of the gInCol * vector, and take the value of the column that marks * the % baseline */ utils::copyVect((float *) MDK_Win, vectAuxCons, residNumber); utils::copyVect((int *) gInCol, vectAuxGaps, residNumber); utils::quicksort(vectAuxCons, 0, residNumber-1); utils::quicksort(vectAuxGaps, 0, residNumber-1); blCons = vectAuxCons[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)]; blGaps = vectAuxGaps[(int) ((float)(residNumber - 1) * (baseLine)/100.0)]; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Deallocate memory */ delete [] vectAuxCons; delete [] vectAuxGaps; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Fixed the initial size of blocks as 0.5% of * alignment's length */ for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* We start in the alignment middle then we move on * right and left side at the same time. */ for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) { /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Left side. Here, we compute the block's size. */ for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* if block's size is greater or equal than the fixed * size then we select the column and save the block's * size for the next iteraction. it's obvius that we * decrease the column's number needed to finish. */ /* Here, we accept column with a conservation's value * greater or equal than the conservation cut point OR * less or equal than the gap cut point. */ if((i - jn) >= k) { for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) { if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) { saveResidues[jn] = jn; oth--; } else break; } } i = jn; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* Right side. Here, we compute the block's size. */ for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ; /* ***** ***** ***** ***** ***** ***** ***** ***** */ /* if block's size is greater or equal than the fixed * size then we select the column and save the block's * size for the next iteraction. it's obvius that we * decrease the column's number needed to finish. */ /* Here, we accept column with a conservation's value * greater or equal than the conservation cut point OR * less or equal than the gap cut point. */ if((jn - j) >= k) { for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) { if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) { saveResidues[jn] = jn; oth--; } else break; } } j = jn; /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* ***** ***** ***** ***** ***** ***** ***** ***** */ } /* Keep only columns blocks bigger than an input columns block size */ if(blockSize != 0) { /* Traverse all alignment looking for columns blocks greater than LONGBLOCK, * everytime than a column is not selected by the trimming method, check * whether the current block size until that point is big enough to be kept * or it should be removed from the final alignment */ for(i = 0, pos = 0, block = 0; i < residNumber; i++) { if(saveResidues[i] != -1) block++; else { /* Remove columns from blocks smaller than input blocks size */ if(block < blockSize) for(j = pos; j <= i; j++) saveResidues[j] = -1; pos = i + 1; block = 0; } } /* Check final block separately since it could happen than last block is not * big enough but because the loop end could provoke to ignore it */ if(block < blockSize) for(j = pos; j < i; j++) saveResidues[j] = -1; } /* If the flag -terminalony is set, apply a method to look for internal * boundaries and get back columns inbetween them, if they exist */ if(terminalGapOnly == true) if(!removeOnlyTerminal()) return NULL; /* Once the columns/sequences selection is done, turn it around * if complementary flag is active */ if(complementary == true) computeComplementaryAlig(true, false); /* Check for any additional column/sequence to be removed */ /* Compute new sequences and columns numbers */ counter = removeCols_SeqsAllGaps(); /* Allocate memory for selected sequences/columns */ matrixAux = new string[counter.sequences]; newSeqsName = new string[counter.sequences]; /* Fill local allocated memory with previously selected data */ fillNewDataStructure(matrixAux, newSeqsName); /* When we have all parameters, we create the new alignment */ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, overlaps); /* Deallocate local memory */ delete[] matrixAux; delete[] newSeqsName; /* Return the new alignment reference */ return newAlig; } /* This method carries out the strict and strict plus method. To trim the * alignment in an automated way, the method uses as an input a given gaps * thresholds over a gaps vector values, a given similarity threshold over a * similarity vector values. With a flag, we can decide which method the * program shall apply. With another one, the user can ask for the * complementary alignment. In this method, those columns that has been marked * to be deleted but has, at least, 3 of 4 surronding neighbour selected are * get back to the new alignmnet. At other part of the method, those column * blocks that do not have a minimum size fix by the method will be deleted * from the trimmed alignment. */ alignment *alignment::cleanStrict(int gapCut, const int *gInCol, float simCut, const float *MDK_W, bool complementary, bool variable) { int i, num, lenBlock; alignment *newAlig; deque<int> neighboursBlock; /* Reject columns with gaps number greater than the gap threshold. */ for(i = 0; i < residNumber; i++) if(gInCol[i] > gapCut) saveResidues[i] = -1; /* Reject columns with similarity score under the threshold. */ for(i = 0; i < residNumber; i++) if(MDK_W[i] < simCut) saveResidues[i] = -1; /* Search for those columns that have been removed but have, * at least, 3 out of 4 adjacent columns selected. */ /* Load the first 5 columns into the list */ for(i = 0; i < (residNumber) && i < 5; i++) neighboursBlock.push_back(saveResidues[i]); /* Special case #1 - we analyze it before processing the rest positions * Column: 1 - Save this column considering its neighbours */ if((saveResidues[0] == -1) && (neighboursBlock.size() > 2) && (neighboursBlock[1] != -1) && (neighboursBlock[2] != -1)) saveResidues[0] = 0; /* Special case #2A - we analyze it before processing the rest positions * Column: 2 - Save this column considering its neighbours */ if((saveResidues[1] == -1) && (neighboursBlock.size() > 3) && ((neighboursBlock[0] != -1) && (neighboursBlock[2] != -1) && (neighboursBlock[3] != -1))) saveResidues[1] = 1; /* Special case #2B - when alignment is 3 columns long - we analyze it before * processing the rest positions. * Column: 2 - Save this column considering its neighbours */ if((saveResidues[1] == -1) && (neighboursBlock.size() > 2) && ((neighboursBlock[0] != -1) && (neighboursBlock[2] != -1))) saveResidues[1] = 1; /* Normal cases */ for(i = 2, num = 0; i < (residNumber - 2); i++, num = 0) { if(saveResidues[i] == -1) { if(neighboursBlock[0] != -1) num++; if(neighboursBlock[1] != -1) num++; if(neighboursBlock[3] != -1) num++; if(neighboursBlock[4] != -1) num++; saveResidues[i] = (num >= 3) ? i : -1; } /* We slide the window one position to the right */ if((i+3) < residNumber) { neighboursBlock.pop_front(); neighboursBlock.push_back(saveResidues[i+3]); } } /* Special case #3 - * Column: 2nd last one - We consider the list rather than the saveResidues * vector in order to consider only unmodified values before the previous * loop */ if((saveResidues[(residNumber - 2)] == -1) && (neighboursBlock.size() > 3) && ((neighboursBlock[neighboursBlock.size() - 4] != -1) && (neighboursBlock[neighboursBlock.size() - 3] != -1) && (neighboursBlock[neighboursBlock.size() - 1] != -1))) saveResidues[(residNumber - 2)] = (residNumber - 2); /* Special case #4 - * Column: last one - We consider the list rather than the saveResidues * vector in order to consider only unmodified values before the previous * loop */ if((saveResidues[(residNumber - 1)] == -1) && (neighboursBlock.size() > 2) && ((neighboursBlock[neighboursBlock.size() - 3] != -1) && (neighboursBlock[neighboursBlock.size() - 2] != -1))) saveResidues[(residNumber - 1)] = (residNumber - 1); /* Select blocks size based on user input. It can be set either to 5 or to a * variable number between 3 and 12 depending on alignment length (1% alig) */ if(!variable) lenBlock = 5; else { lenBlock = utils::roundInt(residNumber * 0.01); lenBlock = lenBlock > 3 ? (lenBlock < 12 ? lenBlock : 12) : 3; } /* Allow to change minimal block size */ blockSize = blockSize > 0 ? blockSize : lenBlock; /* Keep only columns blocks bigger than either a computed dinamically or * set by the user block size */ removeSmallerBlocks(blockSize); /* If the flag -terminalony is set, apply a method to look for internal * boundaries and get back columns inbetween them, if they exist */ if(terminalGapOnly == true) if(!removeOnlyTerminal()) return NULL; /* Once the columns/sequences selection is done, turn it around * if complementary flag is active */ if(complementary == true) computeComplementaryAlig(true, false); /* Check for any additional column/sequence to be removed */ /* Compute new sequences and columns numbers */ newValues counter; counter = removeCols_SeqsAllGaps(); /* Allocate memory for selected sequences/columns */ //~ matrixAux = new string[counter.sequences]; //~ newSeqsName = new string[counter.sequences]; /* Fill local allocated memory with previously selected data */ //~ fillNewDataStructure(matrixAux, newSeqsName); fillNewDataStructure(&counter); //~ cerr << counter -> seqsName[counter -> sequences - 1] << endl; /* When we have all parameters, we create the new alignment */ //~ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues, newAlig = new alignment(filename, aligInfo, counter.matrix, counter.seqsName, seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, overlaps); /* Deallocate local memory */ //~ delete[] matrixAux; //~ delete[] newSeqsName; /* Return the new alignment reference */ return newAlig; } /* Remove those sequences with an overlap less than a given threshold. It can * return the complementary alignment if appropiate flag is set. */ alignment *alignment::cleanOverlapSeq(float minimumOverlap, float *overlapSeq, bool complementary) { string *matrixAux, *newSeqsName; alignment *newAlig; newValues counter; int i; /* Keep only those sequences with an overlap value equal or greater than * the minimum overlap value set by the user. */ for(i = 0; i < sequenNumber; i++) if(overlapSeq[i] < minimumOverlap) saveSequences[i] = -1; /* Once the columns/sequences selection is done, turn it around * if complementary flag is active */ if(complementary == true) computeComplementaryAlig(false, true); /* Check for any additional column/sequence to be removed */ /* Compute new sequences and columns numbers */ counter = removeCols_SeqsAllGaps(); /* Allocate memory for selected sequences/columns */ matrixAux = new string[counter.sequences]; newSeqsName = new string[counter.sequences]; /* Fill local allocated memory with previously selected data */ fillNewDataStructure(matrixAux, newSeqsName); /* When we have all parameters, we create the new alignment */ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, overlaps); /* Deallocate local memory */ delete [] matrixAux; delete [] newSeqsName; /* Return the new alignment reference */ return newAlig; } /* Remove those columns, expressed as range, set by the user. It can return * the complementary alignmnet if appropiate flags is set. */ alignment *alignment::removeColumns(int *columns, int init, int size, bool complementary) { string *matrixAux, *newSeqsName; alignment *newAlig; newValues counter; int i, j; /* Delete those range columns defines in the columns vector */ for(i = init; i < size + init; i += 2) for(j = columns[i]; j <= columns[i+1]; j++) saveResidues[j] = -1; /* Once the columns/sequences selection is done, turn it around * if complementary flag is active */ if(complementary == true) computeComplementaryAlig(true, false); /* Check for any additional column/sequence to be removed */ /* Compute new sequences and columns numbers */ counter = removeCols_SeqsAllGaps(); /* Allocate memory for selected sequences/columns */ matrixAux = new string[counter.sequences]; newSeqsName = new string[counter.sequences]; /* Fill local allocated memory with previously selected data */ fillNewDataStructure(matrixAux, newSeqsName); /* When we have all parameters, we create the new alignment */ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, overlaps); /* Deallocate local memory */ delete[] matrixAux; delete[] newSeqsName; /* Return the new alignment reference */ return newAlig; } /* This method removes those sequences, expressed as range of sequences, set by * the user. The method also can return the complementary alignment, it means, * those sequences that originally should be removed from the input alignment */ alignment *alignment::removeSequences(int *seqs, int init, int size, bool complementary) { string *matrixAux, *newSeqsName; alignment *newAlig; newValues counter; int i, j; /* Delete those range of sequences defines by the seqs vector */ for(i = init; i < size + init; i += 2) for(j = seqs[i]; j <= seqs[i+1]; j++) saveSequences[j] = -1; /* Once the columns/sequences selection is done, turn it around * if complementary flag is active */ if(complementary == true) computeComplementaryAlig(false, true); /* Check for any additional column/sequence to be removed */ /* Compute new sequences and columns numbers */ counter = removeCols_SeqsAllGaps(); /* Allocate memory for selected sequences/columns */ matrixAux = new string[counter.sequences]; newSeqsName = new string[counter.sequences]; /* Fill local allocated memory with previously selected data */ fillNewDataStructure(matrixAux, newSeqsName); /* When we have all parameters, we create the new alignment */ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, overlaps); /* Free local memory */ delete [] matrixAux; delete [] newSeqsName; /* Return the new alignment reference */ return newAlig; } /* Function for computing the complementary alignment. It just turn around the * current columns/sequences selection */ void alignment::computeComplementaryAlig(bool residues, bool sequences) { int i; for(i = 0; i < residNumber && residues; i++) saveResidues[i] = (saveResidues[i] == -1) ? i : -1; for(i = 0; i < sequenNumber && sequences; i++) saveSequences[i] = (saveSequences[i] == -1) ? i : -1; } /* Function designed for identifying right and left borders between central * and terminal regions in the alignment. The borders are those columns, first * and last, composed by only residues. Everything inbetween left and right * borders are keept independently of the applied methods */ bool alignment::removeOnlyTerminal(void) { int i; const int *gInCol; if((left_boundary == -1) and (right_boundary == -1)) { /* Get alignments gaps stats and copy it */ if(calculateGapStats() != true) { cerr << endl << "WARNING: Impossible to apply 'terminal-only' method" << endl << endl; return false; } gInCol = sgaps -> getGapsWindow(); /* Identify left and right borders. First and last columns with no gaps */ for(i = 0; i < residNumber && gInCol[i] != 0; i++) ; left_boundary = i; for(i = residNumber - 1; i > -1 && gInCol[i] != 0; i--) ; right_boundary = i; } else if(left_boundary >= right_boundary) { cerr << endl << "ERROR: Check your manually set left '"<< left_boundary << "' and right '" << right_boundary << "' boundaries'" << endl << endl; return false; } /* We increase the right boundary in one position because we use lower strict * condition to get back columns inbetween boundaries removed by any method */ right_boundary += 1; /* Once the interal boundaries have been established, if these limits exist * then retrieved all columns inbetween both boundaries. Columns out of these * limits will remain selected or not depending on the algorithm applied */ for(i = left_boundary; i < right_boundary; i++) saveResidues[i] = i; return true; } /* Function designed to identify and remove those columns blocks smaller than * a given size */ void alignment::removeSmallerBlocks(int blockSize) { int i, j, pos, block; if(blockSize == 0) return ; /* Traverse the alignment looking for blocks greater than BLOCKSIZE, everytime * than a column hasn't been selected, check whether the current block is big * enough to be kept or it should be removed from the final alignment */ for(i = 0, pos = 0, block = 0; i < residNumber; i++) { if(saveResidues[i] != -1) block++; else { /* Remove columns from blocks smaller than input blocks size */ if(block < blockSize) for(j = pos; j <= i; j++) saveResidues[j] = -1; pos = i + 1; block = 0; } } /* Check final block separately since it could happen than last block is not * big enough but because the loop end could provoke to ignore it */ if(block < blockSize) for(j = pos; j < i; j++) saveResidues[j] = -1; return ; } /* Function designed to identify columns and sequences composed only by gaps. * Once these columns/sequences have been identified, they are removed from * final alignment. */ newValues alignment::removeCols_SeqsAllGaps(void) { int i, j, valid, gaps; bool warnings = false; newValues counter; /* Check all valid columns looking for those composed by only gaps */ for(i = 0, counter.residues = 0; i < residNumber; i++) { if(saveResidues[i] == -1) continue; for(j = 0, valid = 0, gaps = 0; j < sequenNumber; j++) { if (saveSequences[j] == -1) continue; if (sequences[j][i] == '-') gaps ++; valid ++; } /* Once a column has been identified, warm about it and remove it */ if(gaps == valid) { if(!warnings) cerr << endl; warnings = true; cerr << "WARNING: Removing column '" << i << "' composed only by gaps" << endl; saveResidues[i] = -1; } else { counter.residues ++; } } /* Check for those selected sequences to see whether there is anyone with * only gaps */ for(i = 0, counter.sequences = 0; i < sequenNumber; i++) { if(saveSequences[i] == -1) continue; for(j = 0, valid = 0, gaps = 0; j < residNumber; j++) { if(saveResidues[j] == -1) continue; if (sequences[i][j] == '-') gaps ++; valid ++; } /* Warm about it and remove each sequence composed only by gaps */ if(gaps == valid) { if(!warnings) cerr << endl; warnings = true; if(keepSequences) { cerr << "WARNING: Keeping sequence '" << seqsName[i] << "' composed only by gaps" << endl; counter.sequences ++; } else { cerr << "WARNING: Removing sequence '" << seqsName[i] << "' composed only by gaps" << endl; saveSequences[i] = -1; } } else { counter.sequences ++; } } if(warnings) cerr << endl; counter.matrix = new string[counter.sequences]; counter.seqsName = new string[counter.sequences]; return counter; } /* Function for copying to previously allocated memory those data selected * for being in the final alignment */ void alignment::fillNewDataStructure(string *newMatrix, string *newNames) { int i, j, k; /* Copy only those sequences/columns selected */ for(i = 0, j = 0; i < sequenNumber; i++) { if(saveSequences[i] == -1) continue; newNames[j] = seqsName[i]; for(k = 0; k < residNumber; k++) { if(saveResidues[k] == -1) continue; newMatrix[j].resize(newMatrix[j].size() + 1, sequences[i][k]); } j++; } } /* Function for copying to previously allocated memory those data selected * for being in the final alignment */ void alignment::fillNewDataStructure(newValues *data) { int i, j, k; /* Copy only those sequences/columns selected */ for(i = 0, j = 0; i < sequenNumber; i++) { if(saveSequences[i] == -1) continue; data -> seqsName[j] = seqsName[i]; for(k = 0; k < residNumber; k++) { if(saveResidues[k] == -1) continue; data -> matrix[j].resize(data -> matrix[j].size() + 1, sequences[i][k]); } j++; } // cerr << data -> seqsName[j-1] << endl; } /* Check if CDS file is correct based on: Residues are DNA/RNA (at most). There * is not gaps in the whole dataset. Each sequence is multiple of 3. At the same * time, the function will remove stop codons if appropiate flags are used */ bool alignment::prepareCodingSequence(bool splitByStopCodon, bool ignStopCodon,\ alignment *proteinAlig) { bool warning = false; size_t found; int i; /* New code: We now care about the presence of wildcards/indeterminations * characters such as 'X' or 'B' into protein sequences as well as about the * presence of Selenocysteines ('U') or Pyrrolysines ('O'). It only works, by * now, with the universal genetic code */ int *protSeqsLengths, numbProtSeqs, current_prot; string *protSeqsNames, *protSequences; char aminoAcid; numbProtSeqs = proteinAlig -> getNumSpecies(); protSeqsNames = new string[numbProtSeqs]; protSequences = new string[numbProtSeqs]; protSeqsLengths = new int[numbProtSeqs]; proteinAlig -> getSequences(protSeqsNames, protSequences, protSeqsLengths); /* Check read sequences are real DNA/RNA */ if (getTypeAlignment() == AAType) { cerr << endl << "ERROR: Check input CDS file. It seems to content protein " << "residues." << endl << endl; return false; } for(i = 0; i < sequenNumber; i++) { /* Get protein sequence to compare against any potential stop codon in the * coding sequence. If there is not protein sequence for current coding * sequence, skip its analysis */ for(current_prot = 0; current_prot < numbProtSeqs; current_prot++) if(protSeqsNames[current_prot] == seqsName[i]) break; if(current_prot == numbProtSeqs) continue; if(sequences[i].find("-") != string::npos) { if (!warning) cerr << endl; cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has, at least, one gap" << endl << endl; return false; } if((sequences[i].length() % 3) != 0) { if (!warning) cerr << endl; warning = true; cerr << "WARNING: Sequence length \"" << seqsName[i] << "\" is not " << "multiple of 3 (length: " << sequences[i].length() << ")" << endl; } /* Ignore stop codons from the CDS if set by the user */ if (ignStopCodon) continue; /* Detect universal stop codons in the CDS. Then, compare those stop codons * against the protein sequence to see whether they are real stop codons or * are representing rare amino-acids such as Selenocysteines or Pyrrolysines * It also allows stop-codons when there are wildcards/indet characters in * the protein sequence. CDS sequences could be splitted using stop codons * from the sequence itself */ /* Initialize first appearence of a given stop codon to -1. * That means that it has not been found yet */ found = -1; do { found = sequences[i].find("TGA", found + 1); /* If a stop codon has been found and its position is multiple of 3. * Analize it */ if((found != string::npos) && (((int) found % 3) == 0)) { aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]); /* It may be a Selenocysteine ('TGA') which should be represented as 'U' * or wildcard/indet characters such as 'X' or 'B' */ //~ if ((aminoAcid == 'U') || (aminoAcid == 'X') || (aminoAcid == 'B')) /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character * such as 'B'/'X' is present, skip current stop codon */ if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \ (aminoAcid == 'B')) continue; /* If split_by_stop_codon flag is activated then cut input CDS sequence * up to first appearance of a stop codon */ else if(splitByStopCodon) { if (!warning) cerr << endl; warning = true; cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first" << " appearance of stop codon \"TGA\" (residue \"" << aminoAcid << "\") at position " << (int) found + 1 << " (length: " << sequences[i].length() << ")" << endl; sequences[i].resize((int) found); } /* Otherwise, warn about it and return an error */ else { if (!warning) cerr << endl; cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \"" << "TGA\" (residue \"" << aminoAcid << "\") at position " << (int) found + 1 << " (length: " << sequences[i].length() << ")" << endl << endl; return false; } } /* Iterate over the CDS until not stop codon is found */ } while(found != string::npos); /* Initialize first appearence of a given stop codon to -1. * That means that it has not been found yet */ found = -1; do { found = sequences[i].find("TAA", found + 1); /* If a stop codon has been found and its position is multiple of 3. * Analize it */ if((found != string::npos) && (((int) found % 3) == 0)) { aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]); /* Check if there is any wildcard/indet characters such as 'X' or 'B' */ //~ if ((aminoAcid == 'X') || (aminoAcid == 'B')) /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character * such as 'B'/'X' is present, skip current stop codon */ if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \ (aminoAcid == 'B')) continue; /* If split_by_stop_codon flag is activated then cut input CDS sequence * up to first appearance of a stop codon */ else if(splitByStopCodon) { if (!warning) cerr << endl; warning = true; cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first" << " appearance of stop codon \"TAA\" (residue \"" << aminoAcid << "\") at position " << (int) found + 1 << " (length: " << sequences[i].length() << ")" << endl; sequences[i].resize((int) found); } /* Otherwise, warn about it and return an error */ else { if (!warning) cerr << endl; cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \"" << "TAA\" (residue \"" << aminoAcid << "\") at position " << (int) found + 1 << " (length: " << sequences[i].length() << ")" << endl << endl; return false; } } /* Iterate over the CDS until not stop codon is found */ } while(found != string::npos); /* Initialize first appearence of a given stop codon to -1. * That means that it has not been found yet */ found = -1; do { found = sequences[i].find("TAG", found + 1); /* If a stop codon has been found and its position is multiple of 3. * Analize it */ if((found != string::npos) && (((int) found % 3) == 0)) { aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]); /* It may be a Pyrrolysine ('TAG') which should be represented as 'O' * or wildcard/indet characters such as 'X' or 'B' */ //~ if ((aminoAcid == 'O') || (aminoAcid == 'X') || (aminoAcid == 'B')) /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character * such as 'B'/'X' is present, skip current stop codon */ if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \ (aminoAcid == 'B')) continue; /* If split_by_stop_codon flag is activated then cut input CDS sequence * up to first appearance of a stop codon */ else if(splitByStopCodon) { if (!warning) cerr << endl; warning = true; cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first" << " appearance of stop codon \"TAG\" (residue \"" << aminoAcid << "\") at position " << (int) found + 1 << " (length: " << sequences[i].length() << ")" << endl; sequences[i].resize((int) found); } /* Otherwise, warn about it and return an error */ else { if (!warning) cerr << endl; cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \"" << "TAG\" (residue \"" << aminoAcid << "\") at position " << (int) found + 1 << " (length: " << sequences[i].length() << ")" << endl << endl; return false; } } /* Iterate over the CDS until not stop codon is found */ } while(found != string::npos); } /* If everything was return an OK to informat about it. */ return true; } /* Function designed to check whether input CDS file is correct or not based on * some features: Sequences are in both files (it could be more on CDS file), * they have (more or less) same ength. Otherwise, some nucleotides could be * excluded or some 'N's added to fit protein length. */ bool alignment::checkCorrespondence(string *names, int *lengths, int \ totalInputSeqs, int multiple = 1) { int i, j, seqLength, indet; bool warnings = false; string tmp; /* For each sequence in the current protein alignment, look for its coding * DNA sequence checking that they have the same size. */ for(i = 0; i < sequenNumber; i++) { /* Get protein sequence length removing any possible gap. Get as well last * residue from current sequence */ tmp = utils::removeCharacter('-', sequences[i]); seqLength = tmp.length() * multiple; indet = ((int) tmp.length() - utils::min((int) tmp.find_last_not_of("X"), \ (int) tmp.find_last_not_of("x"))) - 1; /* Go through all available CDS looking for the one with the same ID */ for(j = 0; j < totalInputSeqs; j++) { /* Once both ID matchs, compare its lengths */ if(seqsName[i] == names[j]) { /* If both sequences have the same length, stop the search */ if(seqLength == lengths[j]) break; /* If nucleotide sequence is larger than protein sequence, warn about * it and continue the verification process. It will used the 'Nth' * first nucleotides for the conversion */ else if(seqLength < lengths[j]) { if (!warnings) cerr << endl; warnings = true; cerr << "WARNING: Sequence \"" << seqsName[i] << "\" will be cut at " << "position " << seqLength << " (length: "<< lengths[j] << ")" << endl; break; } /* It has been detected some indeterminations at the end of the protein * sequence. That issue could be cause by some incomplete codons in the * nucleotide sequences. This issue is solved adding as much 'N' symbols * as it is needed to preserve the backtranslated alignment */ else if((indet > 0) && (indet > (seqLength - lengths[j])/3)) { if (!warnings) cerr << endl; warnings = true; cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has some inde" << "termination symbols 'X' at the end of sequence. They will be" << " included in the final alignment." << endl; break; } /* If nucleotide sequence is shorter than protein sequence, return an * error since it is not feasible to cut the input protein aligment to * fit it into CDNA sequences size */ else { if (!warnings) cerr << endl; warnings = true; cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has less nucleo" << "tides (" << lengths[j] << ") than expected (" << seqLength << "). It will be added N's to complete the sequence" << endl; break; } } } /* Warn about a mismatch a sequences name level */ if(j == totalInputSeqs) { cerr << endl << "ERROR: Sequence \"" << seqsName[i] << "\" is not in " << "CDS file." << endl << endl; return false; } } /* If everything is OK, return an appropiate flag */ return true; }