view trimal_repo/source/statisticsGaps.cpp @ 0:b15a3147e604 draft

"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author padge
date Fri, 25 Mar 2022 17:10:43 +0000
parents
children
line wrap: on
line source

/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
   ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****

    trimAl v1.4: a tool for automated alignment trimming in large-scale
                 phylogenetics analyses.

    2009-2015 Capella-Gutierrez S. and Gabaldon, T.
              [scapella, tgabaldon]@crg.es

    This file is part of trimAl.

    trimAl is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, the last available version.

    trimAl is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with trimAl. If not, see <http://www.gnu.org/licenses/>.

***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */

#include "statisticsGaps.h"

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|  statisticsGaps::statisticsGaps(char **, int, int)                                                                   |
|                                                                                                                      |
|       Class constructor. This method uses the inputs parameters to put the information in the new object that        |
|       has been created.                                                                                              |
|                                                                                                                      |
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

statisticsGaps::statisticsGaps(string *alignmentMatrix, int species, int aminos, int dataType_) {

  int i, j;
  char indet;

  columnLength = species;
  columns =      aminos;
  maxGaps =      0;
  halfWindow =   0;
  dataType = dataType_;

  if(dataType == AAType)
    indet = 'X';
  else
    indet = 'N';

  /* Memory allocation for the vectors and its initialization */
  gapsInColumn =       new int[columns];
  utils::initlVect(gapsInColumn, columns, 0);

  aminosXInColumn =    new int[columns];
  utils::initlVect(aminosXInColumn, aminos, 0);

  gapsWindow =         new int[columns];
  utils::initlVect(gapsWindow, columns, 0);

  numColumnsWithGaps = new int[species+1];
  utils::initlVect(numColumnsWithGaps, columnLength+1, 0);

  /* Count the gaps and indeterminations of each columns */
  for(i = 0; i < columns; i++) {
    for(j = 0; j < columnLength; j++) {
      if(alignmentMatrix[j][i] == '-')
        gapsInColumn[i]++;
      else if(alignmentMatrix[j][i] == indet)
        aminosXInColumn[i]++;
    }

    /* Increase the number of colums with the number of gaps of the last processed column */
    numColumnsWithGaps[gapsInColumn[i]]++;
    gapsWindow[i] = gapsInColumn[i];
    if(gapsWindow[i] > maxGaps) maxGaps = gapsWindow[i];
  }
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|  statisticsGaps::statisticsGaps(void)                                                                                |
|                                                                                                                      |
|       Class constructor.                                                                                             |
|                                                                                                                      |
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

statisticsGaps::statisticsGaps(void) {

  /* Initializate all values to NULL or 0 */
  gapsInColumn = NULL;
  numColumnsWithGaps = NULL;
  aminosXInColumn = NULL;
  gapsWindow = NULL;

  columns = 	  0;
  columnLength  = 0;
  maxGaps = 	  0;
  halfWindow = 	  0;
  dataType =      0;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|  statisticsGaps::~statisticsGaps(void)                                                                               |
|                                                                                                                      |
|       Class destroyer.                                                                                               |
|                                                                                                                      |
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

statisticsGaps::~statisticsGaps(void) {

  /* Only free memory if there is previous memory allocation */
  if(gapsInColumn != NULL){
    delete[] gapsInColumn;
    delete[] numColumnsWithGaps;
    delete[] aminosXInColumn;
    delete[] gapsWindow;
  }
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|  bool statisticsGaps::applyWindow(int)                                                                               |
|                                                                                                                      |
|       This method computes for each column's alignment its gapwindows' value. For this purpose, the method uses the  |
|       values that previously has been calculated and the window's size value.                                        |
|                                                                                                                      |
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

bool statisticsGaps::applyWindow(int _halfWindow) {

  int i, j, window;

  /* If one of this conditions is true, we return FALSE:                         */
  /*    .- If already exists a previously calculated vector for this window size */
  /*    .- If halfWinSize value is greater than 1/4 of alignment length        */
  if((halfWindow == _halfWindow) || (_halfWindow > columns/4))
     return false;

  /* Initializate to 0 the vector that will store the number of gaps of each column */
  /* and the vector that will store window processing results                       */
  utils::initlVect(numColumnsWithGaps, columnLength+1, 0);
  utils::initlVect(gapsWindow, columns, 0);

  /* Initializate maximum gaps' number per column value and store the mediumWinSize value in the object. */
  maxGaps = 0;
  halfWindow = _halfWindow;
  window = (2 * halfWindow + 1);

  /* We calculate some statistics for every column in the alignment,and the maximum gaps' number value */
  for(i = 0; i < columns; i++) {
    /* Sum the total number of gaps for the considered window */
    for(j = i - halfWindow, gapsWindow[i] = 0; j <= i + halfWindow; j++) {
      if(j < 0)
        gapsWindow[i] += gapsInColumn[-j];
      else if(j >= columns)
        gapsWindow[i] += gapsInColumn[((2 * columns - j) - 2)];
      else
        gapsWindow[i] += gapsInColumn[j];
    }

    /* Calculate, and round to the nearest integer, the number of gaps for the i column */
    gapsWindow[i] = utils::roundInt(((double) gapsWindow[i]/window));
    /* Increase in 1 the number of colums with the same number of gaps than column i */
    numColumnsWithGaps[gapsWindow[i]]++;

    /* Update the max number of gaps in the alignment, if neccesary */
    if(gapsWindow[i] > maxGaps)
      maxGaps = gapsWindow[i];
  }
  return true;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|  int *statisticsGaps::getGapsWindow(void)                                                                            |
|                                                                                                                      |
|       This method returns a pointer to gaps window's vector                                                          |
|                                                                                                                      |
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

int *statisticsGaps::getGapsWindow(void) {

  return gapsWindow;
}

double statisticsGaps::calcCutPoint(float minInputAlignment, float gapThreshold)
  {

  /* Method to select the cutting point based on gaps values from the input
   * alignment. The cutting point is selected as the maximum gaps number allowed
   * in the output alignment given a minimum percentage of the input alignment
   * to be kept and a maximum gaps number. In case of both values set different
   * cutting points, the minimum percentage of the input alignment prevails. */

  double cuttingPoint_MinimumConserv, cuttingPoint_gapThreshold;
  int i, acum;

  /* Calculate the gap number represented by the gaps threshold. This gap number
   * represents the maximum gap number in any column in the output alignment */
  cuttingPoint_gapThreshold = (double) columnLength * gapThreshold;

  /* Compute the minimum columns number to be kept from the input alignment */
  cuttingPoint_MinimumConserv = utils::roundInt(((double)
    (columns * minInputAlignment) / 100.0));
  if(cuttingPoint_MinimumConserv > columns)
    cuttingPoint_MinimumConserv = columns;    

  /* We look at the number of gaps which allows us to keep the minimum columns
   * number from the input alignment */
  for(i = 0, acum = 0; i < columnLength; i++) {
    acum += numColumnsWithGaps[i];
    if(acum >= cuttingPoint_MinimumConserv)
      break;
  }

  /* If there is not an exact number for the gaps cutting point, compute such
   * value as the inmediate superior gap number minus the proportion of columns
   * necessary to respect the minimum percentage from the input alignment to be
   * kept */
  if(numColumnsWithGaps[i])
    cuttingPoint_MinimumConserv = (double) (i - ((float)
      (acum - cuttingPoint_MinimumConserv)/numColumnsWithGaps[i]));
  else
    cuttingPoint_MinimumConserv = 0;

  /* Return the maximum gap number of the two computed cutting points. */
  return (utils::max(cuttingPoint_MinimumConserv, cuttingPoint_gapThreshold));
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|  int statisticsGaps::calcCutPointMixSlope(void)                                                                      |
|                                                                                                                      |
|       This method computes and selects the cut point based on the maximum rate between the first slope ratio between |
|       gaps' percentage in the columns and alignments' length and the "second" slope (slope between three consecutive |
|       points using only the first and third of them) ratio between gaps' percentage in the columns and alignments'   |
|       length.                                                                                                        |
|                                                                                                                      |
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

int statisticsGaps::calcCutPointMixSlope(void) {

  float delta = 0, maxSlope = -1, *firstSlopeVector, *secondSlopeVector;
  int prev, pprev, maxIter, row = 1, act = 0, max = 0;

  /* We build two slope vectors, one vector for the first slope and another one for the second. */
  firstSlopeVector = new float[maxGaps+1];
  secondSlopeVector = new float[maxGaps+1];

  /* We initialize them with -1.0 value and fix the maximum iteractions' number as maximun gaps' number plus 1. */
  utils::initlVect(firstSlopeVector, maxGaps, -1.0);
  utils::initlVect(secondSlopeVector, maxGaps, -1.0);
  maxIter = maxGaps + 1;

  /* Until to achieve the maximum iteractions' number. */
  while(act < maxIter) {

    /* We look for a first point to second slope. */
    while((numColumnsWithGaps[act]) == 0) act++;
    pprev = act; if((act+1) >= maxIter) break;

    /* We look for a first point to first slope. */
    do { act++; } while((numColumnsWithGaps[act]) == 0);
    prev = act; if((act+1) >= maxIter) break;

    /* We look for a second point to first and second slope. */
    do { act++; } while((numColumnsWithGaps[act]) == 0);
    if(act >= maxIter) break;

    /* Calculate the first slope between the earlier previus and previus points. */
    firstSlopeVector[prev] =  ((float) (prev - pprev) / columnLength);
    firstSlopeVector[prev] /= ((float) numColumnsWithGaps[prev] / columns);

    /* Calculate the first slope between the previus and current points. */
    firstSlopeVector[act] =  ((float) (act - prev) / columnLength);
    firstSlopeVector[act] /= ((float) numColumnsWithGaps[act] / columns);

    /* Calculate the second slope between the earlier previus and current points. */
    secondSlopeVector[act] = ((float) (act - pprev) / columnLength);
    secondSlopeVector[act] /= ((float) (numColumnsWithGaps[act] + numColumnsWithGaps[prev]) / columns);

    /* If the ratio between ... */
    if((secondSlopeVector[pprev] != -1.0) || (firstSlopeVector[pprev] != -1.0)) {

      /* .- first slope previus and first slope earlier previus points. */
      if(firstSlopeVector[pprev] != -1.0) {
        delta = firstSlopeVector[prev]/firstSlopeVector[pprev];
        row = pprev;
      }

      /* .- first slope current and first slope previus points. */
      if(delta < (firstSlopeVector[act]/firstSlopeVector[prev])) {
        delta = firstSlopeVector[act]/firstSlopeVector[prev];
        row = prev;
      }

      /* .- second slope current and second slope earlier previus points. */
      if(secondSlopeVector[pprev] != -1.0) {
        if(delta < (secondSlopeVector[act]/secondSlopeVector[pprev])) {
          delta = secondSlopeVector[act]/secondSlopeVector[pprev];
          row = pprev;
        }
      }

      /* ... is better that current maxSlope then we modify the maxSlope with the best ratio. */
      if(delta > maxSlope) {
        maxSlope = delta;
        max = row;
      }
    }
    act = prev;
  }

  /* We delete the local memory. */
  delete[] firstSlopeVector;
  delete[] secondSlopeVector;

  /* and, finally, we return the cut point. */
  return max;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|  int statisticsGaps::calcCutPoint2ndSlope(void)                                                                      |
|                                                                                                                      |
|       This method computes and selects the cut point based on the maximum "second" slope (slope between three        |
|       consecutive points using only the first and third of them) ratio between gaps' percentage in the columns and   |
|       alignments' length.                                                                                            |
|                                                                                                                      |
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

int statisticsGaps::calcCutPoint2ndSlope(void) {

  float maxSlope = -1, *secondSlopeVector;
  int prev, pprev, maxIter, act = 0, max = 0;

  /* We build one slope vector and fix the maximum iteractions' number as the gaps'number plus 1. */
  secondSlopeVector = new float[maxGaps+1];
  utils::initlVect(secondSlopeVector, maxGaps, -1.0);
  maxIter = maxGaps + 1;

  /* Find the lowest number of gaps into the input alignment. If there are few
   * points, it is possible that lowest number of gaps is returned as the thres
   * hold. It could happen input alignment does not have columns with no-gaps */
  for(act = 0, max = 0; numColumnsWithGaps[act] == 0; act++)
    max = act + 1;

  act = 0;
  while(act < maxIter) {

    /* We look for a first point to second slope. */
    while((numColumnsWithGaps[act]) == 0)
      act++;
    pprev = act;
    if((act+1) >= maxIter)
      break;

    /* We look for a first point to first slope. */
    do {
      act++;
    } while((numColumnsWithGaps[act]) == 0);
    prev = act;
    if((act+1) >= maxIter)
      break;

    /* We look for a second point to first and second slope. */
    do {
      act++;
    } while((numColumnsWithGaps[act]) == 0);
    if(act >= maxIter)
      break;

    /* Calculate the second slope between the earlier previous and current points. */
    secondSlopeVector[act] = ((float) (act - pprev) / columnLength);
    secondSlopeVector[act] /= ((float) (numColumnsWithGaps[act] + numColumnsWithGaps[prev]) / columns);

    /* If the ratio between second slope current and second slope earlier previous points. */
    if(secondSlopeVector[pprev] != -1.0) {
      if((secondSlopeVector[act]/secondSlopeVector[pprev]) > maxSlope) {
        maxSlope = (secondSlopeVector[act]/secondSlopeVector[pprev]);
        max = pprev;
      }
    } else if(secondSlopeVector[prev] != -1.0) {
      if((secondSlopeVector[act]/secondSlopeVector[prev]) > maxSlope) {
        maxSlope = (secondSlopeVector[act]/secondSlopeVector[prev]);
        max = pprev;
      }
    }
    act = prev;
  }

  /* We deallocate local memory. */
  delete[] secondSlopeVector;

  /* Finally, we return the selected cut point. */
  return max;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|  void statisticsGaps::printGapsColumns(void)                                                                         |
|                                                                                                                      |
|       This method shows the gaps' percentage per each column in the alignment.                                       |
|                                                                                                                      |
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

void statisticsGaps::printGapsColumns(void) {

  int *vectAux;

  /* We allocate a local vector to recovery information on it */
  vectAux = new int[columns];

  /* We decide about the information's source then we get the information. */
  if(halfWindow == 0)
    utils::copyVect(gapsInColumn, vectAux, columns);
  else
   utils::copyVect(gapsWindow, vectAux, columns);

  /* Fix the precision of output */
  /* We set the output precision and print the header. */
  cout << "| Residue\t  % Gaps \t   Gap Score   |" << endl;
  cout << "| Number \t         \t               |" << endl;
  cout << "+----------------------------------------------+" << endl;
  cout.precision(10);

  /* Show the information that have been requered */
  for(int i = 0; i < columns; i++)
    cout << "  " << setw(5) << i << "\t\t" << setw(10) << (vectAux[i] * 100.0)/columnLength
         << "\t" << setw(7) << 1 -((vectAux[i] * 1.0)/columnLength) << endl;

  /* Finally, we deallocate the local memory */
  delete[] vectAux;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|  void statisticsGaps::printGapsAcl(void)                                                                             |
|                                                                                                                      |
|       This method shows the gaps' statistics for the alignment.                                                      |
|                                                                                                                      |
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/

void statisticsGaps::printGapsAcl(void) {

  int acm, i;

  /* Fix the precision of output */
  cout << "| Number of\t        \t|\t Cumulative \t% Cumulative\t|\tNumber of Gaps\t  % Gaps  \tGap Score  |"  << endl;
  cout << "| Residues \t% Length\t|\tNumberResid.\t   Length   \t|\t  per Column  \tper Column\tper Column |" << endl;
  cout << "+-------------------------------+-----------------------------"
       << "----------+--------------------------------------------------+" << endl;
  cout.precision(10);

  /* Count for each gaps' number the columns' number with that gaps' number. */
  for(i = 0, acm = 0; i <= maxGaps; i++) {

    /* If the columns' number with this gaps' number is not equal to zero, we will count the columns. */
    if(numColumnsWithGaps[i] != 0) {

      /* Compute and prints the accumulative values for the gaps in the alignment. */
      acm += numColumnsWithGaps[i];
      cout << "  " << setiosflags(ios::left) << numColumnsWithGaps[i] << "\t\t" << setw(10) << (numColumnsWithGaps[i] * 100.0)/columns
           << "\t\t" << acm << "\t\t" << setw(10) << (acm * 100.0)/columns
           << "\t\t" << i << "\t\t" << setw(10) << (i * 1.0)/columnLength << "\t"<< setw(10) << 1 - ((i * 1.0)/columnLength) << endl;
    }
  }
}