Mercurial > repos > padge > trimal
diff trimal_repo/source/statisticsGaps.cpp @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trimal_repo/source/statisticsGaps.cpp Fri Mar 25 17:10:43 2022 +0000 @@ -0,0 +1,464 @@ +/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** + ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** + + trimAl v1.4: a tool for automated alignment trimming in large-scale + phylogenetics analyses. + + 2009-2015 Capella-Gutierrez S. and Gabaldon, T. + [scapella, tgabaldon]@crg.es + + This file is part of trimAl. + + trimAl is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, the last available version. + + trimAl is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with trimAl. If not, see <http://www.gnu.org/licenses/>. + +***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** +***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ + +#include "statisticsGaps.h" + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +| statisticsGaps::statisticsGaps(char **, int, int) | +| | +| Class constructor. This method uses the inputs parameters to put the information in the new object that | +| has been created. | +| | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +statisticsGaps::statisticsGaps(string *alignmentMatrix, int species, int aminos, int dataType_) { + + int i, j; + char indet; + + columnLength = species; + columns = aminos; + maxGaps = 0; + halfWindow = 0; + dataType = dataType_; + + if(dataType == AAType) + indet = 'X'; + else + indet = 'N'; + + /* Memory allocation for the vectors and its initialization */ + gapsInColumn = new int[columns]; + utils::initlVect(gapsInColumn, columns, 0); + + aminosXInColumn = new int[columns]; + utils::initlVect(aminosXInColumn, aminos, 0); + + gapsWindow = new int[columns]; + utils::initlVect(gapsWindow, columns, 0); + + numColumnsWithGaps = new int[species+1]; + utils::initlVect(numColumnsWithGaps, columnLength+1, 0); + + /* Count the gaps and indeterminations of each columns */ + for(i = 0; i < columns; i++) { + for(j = 0; j < columnLength; j++) { + if(alignmentMatrix[j][i] == '-') + gapsInColumn[i]++; + else if(alignmentMatrix[j][i] == indet) + aminosXInColumn[i]++; + } + + /* Increase the number of colums with the number of gaps of the last processed column */ + numColumnsWithGaps[gapsInColumn[i]]++; + gapsWindow[i] = gapsInColumn[i]; + if(gapsWindow[i] > maxGaps) maxGaps = gapsWindow[i]; + } +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +| statisticsGaps::statisticsGaps(void) | +| | +| Class constructor. | +| | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +statisticsGaps::statisticsGaps(void) { + + /* Initializate all values to NULL or 0 */ + gapsInColumn = NULL; + numColumnsWithGaps = NULL; + aminosXInColumn = NULL; + gapsWindow = NULL; + + columns = 0; + columnLength = 0; + maxGaps = 0; + halfWindow = 0; + dataType = 0; +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +| statisticsGaps::~statisticsGaps(void) | +| | +| Class destroyer. | +| | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +statisticsGaps::~statisticsGaps(void) { + + /* Only free memory if there is previous memory allocation */ + if(gapsInColumn != NULL){ + delete[] gapsInColumn; + delete[] numColumnsWithGaps; + delete[] aminosXInColumn; + delete[] gapsWindow; + } +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +| bool statisticsGaps::applyWindow(int) | +| | +| This method computes for each column's alignment its gapwindows' value. For this purpose, the method uses the | +| values that previously has been calculated and the window's size value. | +| | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +bool statisticsGaps::applyWindow(int _halfWindow) { + + int i, j, window; + + /* If one of this conditions is true, we return FALSE: */ + /* .- If already exists a previously calculated vector for this window size */ + /* .- If halfWinSize value is greater than 1/4 of alignment length */ + if((halfWindow == _halfWindow) || (_halfWindow > columns/4)) + return false; + + /* Initializate to 0 the vector that will store the number of gaps of each column */ + /* and the vector that will store window processing results */ + utils::initlVect(numColumnsWithGaps, columnLength+1, 0); + utils::initlVect(gapsWindow, columns, 0); + + /* Initializate maximum gaps' number per column value and store the mediumWinSize value in the object. */ + maxGaps = 0; + halfWindow = _halfWindow; + window = (2 * halfWindow + 1); + + /* We calculate some statistics for every column in the alignment,and the maximum gaps' number value */ + for(i = 0; i < columns; i++) { + /* Sum the total number of gaps for the considered window */ + for(j = i - halfWindow, gapsWindow[i] = 0; j <= i + halfWindow; j++) { + if(j < 0) + gapsWindow[i] += gapsInColumn[-j]; + else if(j >= columns) + gapsWindow[i] += gapsInColumn[((2 * columns - j) - 2)]; + else + gapsWindow[i] += gapsInColumn[j]; + } + + /* Calculate, and round to the nearest integer, the number of gaps for the i column */ + gapsWindow[i] = utils::roundInt(((double) gapsWindow[i]/window)); + /* Increase in 1 the number of colums with the same number of gaps than column i */ + numColumnsWithGaps[gapsWindow[i]]++; + + /* Update the max number of gaps in the alignment, if neccesary */ + if(gapsWindow[i] > maxGaps) + maxGaps = gapsWindow[i]; + } + return true; +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +| int *statisticsGaps::getGapsWindow(void) | +| | +| This method returns a pointer to gaps window's vector | +| | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +int *statisticsGaps::getGapsWindow(void) { + + return gapsWindow; +} + +double statisticsGaps::calcCutPoint(float minInputAlignment, float gapThreshold) + { + + /* Method to select the cutting point based on gaps values from the input + * alignment. The cutting point is selected as the maximum gaps number allowed + * in the output alignment given a minimum percentage of the input alignment + * to be kept and a maximum gaps number. In case of both values set different + * cutting points, the minimum percentage of the input alignment prevails. */ + + double cuttingPoint_MinimumConserv, cuttingPoint_gapThreshold; + int i, acum; + + /* Calculate the gap number represented by the gaps threshold. This gap number + * represents the maximum gap number in any column in the output alignment */ + cuttingPoint_gapThreshold = (double) columnLength * gapThreshold; + + /* Compute the minimum columns number to be kept from the input alignment */ + cuttingPoint_MinimumConserv = utils::roundInt(((double) + (columns * minInputAlignment) / 100.0)); + if(cuttingPoint_MinimumConserv > columns) + cuttingPoint_MinimumConserv = columns; + + /* We look at the number of gaps which allows us to keep the minimum columns + * number from the input alignment */ + for(i = 0, acum = 0; i < columnLength; i++) { + acum += numColumnsWithGaps[i]; + if(acum >= cuttingPoint_MinimumConserv) + break; + } + + /* If there is not an exact number for the gaps cutting point, compute such + * value as the inmediate superior gap number minus the proportion of columns + * necessary to respect the minimum percentage from the input alignment to be + * kept */ + if(numColumnsWithGaps[i]) + cuttingPoint_MinimumConserv = (double) (i - ((float) + (acum - cuttingPoint_MinimumConserv)/numColumnsWithGaps[i])); + else + cuttingPoint_MinimumConserv = 0; + + /* Return the maximum gap number of the two computed cutting points. */ + return (utils::max(cuttingPoint_MinimumConserv, cuttingPoint_gapThreshold)); +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +| int statisticsGaps::calcCutPointMixSlope(void) | +| | +| This method computes and selects the cut point based on the maximum rate between the first slope ratio between | +| gaps' percentage in the columns and alignments' length and the "second" slope (slope between three consecutive | +| points using only the first and third of them) ratio between gaps' percentage in the columns and alignments' | +| length. | +| | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +int statisticsGaps::calcCutPointMixSlope(void) { + + float delta = 0, maxSlope = -1, *firstSlopeVector, *secondSlopeVector; + int prev, pprev, maxIter, row = 1, act = 0, max = 0; + + /* We build two slope vectors, one vector for the first slope and another one for the second. */ + firstSlopeVector = new float[maxGaps+1]; + secondSlopeVector = new float[maxGaps+1]; + + /* We initialize them with -1.0 value and fix the maximum iteractions' number as maximun gaps' number plus 1. */ + utils::initlVect(firstSlopeVector, maxGaps, -1.0); + utils::initlVect(secondSlopeVector, maxGaps, -1.0); + maxIter = maxGaps + 1; + + /* Until to achieve the maximum iteractions' number. */ + while(act < maxIter) { + + /* We look for a first point to second slope. */ + while((numColumnsWithGaps[act]) == 0) act++; + pprev = act; if((act+1) >= maxIter) break; + + /* We look for a first point to first slope. */ + do { act++; } while((numColumnsWithGaps[act]) == 0); + prev = act; if((act+1) >= maxIter) break; + + /* We look for a second point to first and second slope. */ + do { act++; } while((numColumnsWithGaps[act]) == 0); + if(act >= maxIter) break; + + /* Calculate the first slope between the earlier previus and previus points. */ + firstSlopeVector[prev] = ((float) (prev - pprev) / columnLength); + firstSlopeVector[prev] /= ((float) numColumnsWithGaps[prev] / columns); + + /* Calculate the first slope between the previus and current points. */ + firstSlopeVector[act] = ((float) (act - prev) / columnLength); + firstSlopeVector[act] /= ((float) numColumnsWithGaps[act] / columns); + + /* Calculate the second slope between the earlier previus and current points. */ + secondSlopeVector[act] = ((float) (act - pprev) / columnLength); + secondSlopeVector[act] /= ((float) (numColumnsWithGaps[act] + numColumnsWithGaps[prev]) / columns); + + /* If the ratio between ... */ + if((secondSlopeVector[pprev] != -1.0) || (firstSlopeVector[pprev] != -1.0)) { + + /* .- first slope previus and first slope earlier previus points. */ + if(firstSlopeVector[pprev] != -1.0) { + delta = firstSlopeVector[prev]/firstSlopeVector[pprev]; + row = pprev; + } + + /* .- first slope current and first slope previus points. */ + if(delta < (firstSlopeVector[act]/firstSlopeVector[prev])) { + delta = firstSlopeVector[act]/firstSlopeVector[prev]; + row = prev; + } + + /* .- second slope current and second slope earlier previus points. */ + if(secondSlopeVector[pprev] != -1.0) { + if(delta < (secondSlopeVector[act]/secondSlopeVector[pprev])) { + delta = secondSlopeVector[act]/secondSlopeVector[pprev]; + row = pprev; + } + } + + /* ... is better that current maxSlope then we modify the maxSlope with the best ratio. */ + if(delta > maxSlope) { + maxSlope = delta; + max = row; + } + } + act = prev; + } + + /* We delete the local memory. */ + delete[] firstSlopeVector; + delete[] secondSlopeVector; + + /* and, finally, we return the cut point. */ + return max; +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +| int statisticsGaps::calcCutPoint2ndSlope(void) | +| | +| This method computes and selects the cut point based on the maximum "second" slope (slope between three | +| consecutive points using only the first and third of them) ratio between gaps' percentage in the columns and | +| alignments' length. | +| | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +int statisticsGaps::calcCutPoint2ndSlope(void) { + + float maxSlope = -1, *secondSlopeVector; + int prev, pprev, maxIter, act = 0, max = 0; + + /* We build one slope vector and fix the maximum iteractions' number as the gaps'number plus 1. */ + secondSlopeVector = new float[maxGaps+1]; + utils::initlVect(secondSlopeVector, maxGaps, -1.0); + maxIter = maxGaps + 1; + + /* Find the lowest number of gaps into the input alignment. If there are few + * points, it is possible that lowest number of gaps is returned as the thres + * hold. It could happen input alignment does not have columns with no-gaps */ + for(act = 0, max = 0; numColumnsWithGaps[act] == 0; act++) + max = act + 1; + + act = 0; + while(act < maxIter) { + + /* We look for a first point to second slope. */ + while((numColumnsWithGaps[act]) == 0) + act++; + pprev = act; + if((act+1) >= maxIter) + break; + + /* We look for a first point to first slope. */ + do { + act++; + } while((numColumnsWithGaps[act]) == 0); + prev = act; + if((act+1) >= maxIter) + break; + + /* We look for a second point to first and second slope. */ + do { + act++; + } while((numColumnsWithGaps[act]) == 0); + if(act >= maxIter) + break; + + /* Calculate the second slope between the earlier previous and current points. */ + secondSlopeVector[act] = ((float) (act - pprev) / columnLength); + secondSlopeVector[act] /= ((float) (numColumnsWithGaps[act] + numColumnsWithGaps[prev]) / columns); + + /* If the ratio between second slope current and second slope earlier previous points. */ + if(secondSlopeVector[pprev] != -1.0) { + if((secondSlopeVector[act]/secondSlopeVector[pprev]) > maxSlope) { + maxSlope = (secondSlopeVector[act]/secondSlopeVector[pprev]); + max = pprev; + } + } else if(secondSlopeVector[prev] != -1.0) { + if((secondSlopeVector[act]/secondSlopeVector[prev]) > maxSlope) { + maxSlope = (secondSlopeVector[act]/secondSlopeVector[prev]); + max = pprev; + } + } + act = prev; + } + + /* We deallocate local memory. */ + delete[] secondSlopeVector; + + /* Finally, we return the selected cut point. */ + return max; +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +| void statisticsGaps::printGapsColumns(void) | +| | +| This method shows the gaps' percentage per each column in the alignment. | +| | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +void statisticsGaps::printGapsColumns(void) { + + int *vectAux; + + /* We allocate a local vector to recovery information on it */ + vectAux = new int[columns]; + + /* We decide about the information's source then we get the information. */ + if(halfWindow == 0) + utils::copyVect(gapsInColumn, vectAux, columns); + else + utils::copyVect(gapsWindow, vectAux, columns); + + /* Fix the precision of output */ + /* We set the output precision and print the header. */ + cout << "| Residue\t % Gaps \t Gap Score |" << endl; + cout << "| Number \t \t |" << endl; + cout << "+----------------------------------------------+" << endl; + cout.precision(10); + + /* Show the information that have been requered */ + for(int i = 0; i < columns; i++) + cout << " " << setw(5) << i << "\t\t" << setw(10) << (vectAux[i] * 100.0)/columnLength + << "\t" << setw(7) << 1 -((vectAux[i] * 1.0)/columnLength) << endl; + + /* Finally, we deallocate the local memory */ + delete[] vectAux; +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +| void statisticsGaps::printGapsAcl(void) | +| | +| This method shows the gaps' statistics for the alignment. | +| | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ + +void statisticsGaps::printGapsAcl(void) { + + int acm, i; + + /* Fix the precision of output */ + cout << "| Number of\t \t|\t Cumulative \t% Cumulative\t|\tNumber of Gaps\t % Gaps \tGap Score |" << endl; + cout << "| Residues \t% Length\t|\tNumberResid.\t Length \t|\t per Column \tper Column\tper Column |" << endl; + cout << "+-------------------------------+-----------------------------" + << "----------+--------------------------------------------------+" << endl; + cout.precision(10); + + /* Count for each gaps' number the columns' number with that gaps' number. */ + for(i = 0, acm = 0; i <= maxGaps; i++) { + + /* If the columns' number with this gaps' number is not equal to zero, we will count the columns. */ + if(numColumnsWithGaps[i] != 0) { + + /* Compute and prints the accumulative values for the gaps in the alignment. */ + acm += numColumnsWithGaps[i]; + cout << " " << setiosflags(ios::left) << numColumnsWithGaps[i] << "\t\t" << setw(10) << (numColumnsWithGaps[i] * 100.0)/columns + << "\t\t" << acm << "\t\t" << setw(10) << (acm * 100.0)/columns + << "\t\t" << i << "\t\t" << setw(10) << (i * 1.0)/columnLength << "\t"<< setw(10) << 1 - ((i * 1.0)/columnLength) << endl; + } + } +}