diff trimal_repo/source/autAlignment.cpp @ 0:b15a3147e604 draft

"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author padge
date Fri, 25 Mar 2022 17:10:43 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trimal_repo/source/autAlignment.cpp	Fri Mar 25 17:10:43 2022 +0000
@@ -0,0 +1,448 @@
+/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
+   ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
+
+    trimAl v1.4: a tool for automated alignment trimming in large-scale
+                 phylogenetics analyses.
+
+    2009-2015 Capella-Gutierrez S. and Gabaldon, T.
+              [scapella, tgabaldon]@crg.es
+
+    This file is part of trimAl.
+
+    trimAl is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, the last available version.
+
+    trimAl is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with trimAl. If not, see <http://www.gnu.org/licenses/>.
+
+***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
+***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
+#include "alignment.h"
+#include "defines.h"
+
+/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
+/* This function computes the identities values between the sequences from
+ * the alignment */
+/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
+void alignment::calculateSeqIdentity(void) {
+
+  int i, j, k, hit, dst;
+  char indet;
+
+  /* Depending on alignment type, indetermination symbol will be one or other */
+  indet = getTypeAlignment() == AAType ? 'X' : 'N';
+
+  /* Create identities matrix to store identities scores */
+  identities = new float*[sequenNumber];
+
+  /* For each seq, compute its identity score against the others in the MSA */
+  for(i = 0; i < sequenNumber; i++) {
+    identities[i] = new float[sequenNumber];
+
+    /* It's a symmetric matrix, copy values that have been already computed */
+    for(j = 0; j < i; j++)
+      identities[i][j] = identities[j][i];
+    identities[i][i] = 0;
+
+    /* Compute identity scores for the current sequence against the rest */
+    for(j = i + 1; j < sequenNumber; j++) {
+      for(k = 0, hit = 0, dst = 0; k < residNumber; k++) {
+      /* If one of the two positions is a valid residue,
+       * count it for the common length */
+        if(((sequences[i][k] != indet) && (sequences[i][k] != '-')) ||
+           ((sequences[j][k] != indet) && (sequences[j][k] != '-'))) {
+          dst++;
+          /* If both positions are the same, count a hit */
+          if(sequences[i][k] == sequences[j][k])
+            hit++;
+        }
+      }
+
+      /* Identity score between two sequences is the ratio of identical residues
+       * by the total length (common and no-common residues) among them */
+      identities[i][j] = (float) hit/dst;
+    }
+  }
+}
+
+void alignment::calculateSeqOverlap(void) {
+  /* Compute the overlap between sequences taken each of them as the reference
+   * to compute such scores. It will lead to a non-symmetric matrix. */
+
+  int i, j, k, shared, referenceLength;
+  char indet;
+
+  /* Depending on alignment type, indetermination symbol will be one or other */
+  indet = getTypeAlignment() == AAType ? 'X' : 'N';
+
+  /* Create overlap matrix to store overlap scores */
+  overlaps = new float*[sequenNumber];
+
+  /* For each seq, compute its overlap score against the others in the MSA */
+  for(i = 0; i < sequenNumber; i++) {
+    overlaps[i] = new float[sequenNumber];
+
+    for(j = 0; j < sequenNumber; j++) {
+      for(k = 0, shared = 0, referenceLength = 0; k < residNumber; k++) {
+        /* If there a valid residue for the reference sequence, then see if
+         * there is a valid residue for the other sequence. */
+        if((sequences[i][k] != indet) && (sequences[i][k] != '-')) {
+          referenceLength++;
+          if ((sequences[j][k] != indet) && (sequences[j][k] != '-'))
+            shared++;
+        }
+      }
+      /* Overlap score between two sequences is the ratio of shared valid
+       * residues divided by the sequence length taken as reference. The
+       * overlaps matrix, therefore, will be not symmetric. */
+      overlaps[i][j] = (float) shared/referenceLength;
+    }
+  }
+}
+
+void alignment::calculateRelaxedSeqIdentity(void) {
+  /* Raw approximation of sequence identity computation designed for reducing
+   * comparisons for huge alignemnts */
+
+  int i, j, k, hit;
+
+  /* Create identities matrix to store identities scores */
+  identities = new float*[sequenNumber];
+
+  /* For each seq, compute its identity score against the others in the MSA */
+  for(i = 0; i < sequenNumber; i++) {
+    identities[i] = new float[sequenNumber];
+
+    /* It's a symmetric matrix, copy values that have been already computed */
+    for(j = 0; j < i; j++)
+      identities[i][j] = identities[j][i];
+    identities[i][i] = 0;
+
+    /* Compute identity score between the selected sequence and the others */
+    for(j = i + 1; j < sequenNumber; j++) {
+      for(k = 0, hit = 0; k < residNumber; k++) {
+        /* If both positions are the same, count a hit */
+        if(sequences[i][k] == sequences[j][k])
+          hit++;
+      }
+    /* Raw identity score is computed as the ratio of identical residues between
+     * alignment length */
+      identities[i][j] = (float) hit/residNumber;
+    }
+  }
+}
+
+
+/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
+/* This function computes some parameters from the input alignment such as
+ * identity average, identity average from each sequence and its most similar
+ * one, etc, to select which one is the best automated method to trim this
+ * alignment */
+/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
+int alignment::selectMethod(void) {
+
+  float mx, avg, maxSeq = 0, avgSeq = 0;
+  int i, j;
+
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+  /* Ask for the sequence identities assesment */
+  if(identities == NULL)
+    calculateSeqIdentity();
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+  /* Once we have the identities among all possible
+   * combinations between each pair of sequence. We
+   * compute the average identity as well as the
+   * average identity for each sequence with its most
+   * similar one */
+  for(i = 0; i < sequenNumber; i++) {
+    for(j = 0, mx = 0, avg = 0; j < sequenNumber; j++) {
+      if(i != j) {
+        mx  = mx < identities[i][j] ? identities[i][j] : mx;
+        avg += identities[i][j];
+      }
+    }
+    avgSeq += avg/(sequenNumber - 1);
+    maxSeq += mx;
+  }
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+  avgSeq = avgSeq/sequenNumber;
+  maxSeq = maxSeq/sequenNumber;
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+  /* With the different parameters, we decide wich one
+   * is the best automated method, based on a previous
+   * simulated data benchmarks, to trim the alig */
+  if(avgSeq >= 0.55)      return GAPPYOUT;
+  else if(avgSeq <= 0.38) return STRICT;
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+  /* Sometimes we need to use more parameters to select
+   * the best automated method, always based on our
+   * benchmarks, to trim the input alignment */
+  else {
+    if(sequenNumber <= 20) return GAPPYOUT;
+    else {
+      if((maxSeq >= 0.5) && (maxSeq <= 0.65)) return GAPPYOUT;
+      else return STRICT;
+    }
+  }
+  /* ***** ***** ***** ***** ***** ***** ***** ***** */
+}
+
+/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
+/* This method prints different identity values computed from the alignment.
+ * In this method, we asses the identity values matrix as well as diferent
+ * average values. Moreover, the method computes which one is the most
+ * similar sequence, in term of identity values, for each one in this alig */
+/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
+void alignment::printSeqIdentity(void) {
+
+  int i, j, k, pos, maxLongName;
+  float mx, avg, maxAvgSeq = 0, maxSeq = 0, avgSeq = 0, **maxs;
+
+  /* Ask for the sequence identities assesment */
+  if(identities == NULL)
+    calculateSeqIdentity();
+
+  /* For each sequence, we look for its most similar one */
+  maxs = new float*[sequenNumber];
+
+  for(i = 0; i < sequenNumber; i++) {
+    maxs[i] = new float[2];
+
+    /* Get the most similar sequence to the current one in term of identity */
+    for(k = 0, mx = 0, avg = 0, pos = i; k < sequenNumber; k++) {
+      if(i != k) {
+        avg += identities[i][k];
+        if(mx < identities[i][k]) {
+          mx = identities[i][k];
+          pos = k;
+        }
+      }
+    }
+    /* Update global average variables*/
+    avgSeq += avg/(sequenNumber - 1);
+    maxAvgSeq += mx;
+
+    /* Save the maximum average identity value for each sequence */
+    maxs[i][0] = mx;
+    maxs[i][1] = pos;
+  }
+
+  /* Compute general averages */
+  avgSeq = avgSeq/sequenNumber;
+  maxAvgSeq = maxAvgSeq/sequenNumber;
+
+  /* Compute longest sequences name */
+  for(i = 0, maxLongName = 0; i < sequenNumber; i++)
+    maxLongName = utils::max(maxLongName, seqsName[i].size());
+
+  /* Once the method has computed all of different values, it prints it */
+  cout.precision(4);
+  cout << fixed;
+
+  for(i = 0, maxSeq = 0; i < sequenNumber; i++)
+    if(maxs[i][0] > maxSeq)
+      maxSeq = maxs[i][0];
+
+  cout << "## MaxIdentity\t" << maxSeq;
+  cout << endl << "#> MaxIdentity\tGet the maximum identity value for any pair "
+    << "of sequences in the alignment" << endl;
+
+  cout << endl << "## AverageIdentity\t" << avgSeq;
+  cout << endl << "#> AverageIdentity\tAverage identity between all sequences";
+
+  cout << endl << endl << "## Identity sequences matrix";
+  for(i = 0; i < sequenNumber; i++) {
+    cout << endl << setw(maxLongName + 2) << left << seqsName[i] << "\t";
+    for(j = 0; j < i; j++)
+      cout << setiosflags(ios::left) << setw(10) << identities[i][j] << "\t";
+    cout << setiosflags(ios::left) << setw(10) << 1.00 << "\t";
+    for(j = i + 1; j < sequenNumber; j++)
+      cout << setiosflags(ios::left) << setw(10) << identities[i][j] << "\t";
+  }
+  cout << endl;
+
+  cout << endl << "## AverageMostSimilarIdentity\t" << maxAvgSeq;
+  cout << endl << "#> AverageMostSimilarIdentity\t Average identity between "
+    << "most similar pair-wise sequences";
+
+  cout << endl << endl << "## Identity for most similar pair-wise sequences "
+    << "matrix" << endl;
+  for(i = 0; i < sequenNumber; i++)
+    cout << setw(maxLongName + 2) << left << seqsName[i]
+      << "\t" << setiosflags(ios::left) << setw(5)
+      << maxs[i][0] << "\t" << seqsName[(int) maxs[i][1]] << endl;
+}
+
+void alignment::printSeqOverlap(void) {
+
+  int i, j, k, pos, maxLongName;
+  float mx, avg, maxAvgSeq = 0, maxSeq = 0, avgSeq = 0, **maxs;
+
+  /* Ask for the sequence identities assesment */
+  if(overlaps == NULL)
+    calculateSeqOverlap();
+
+  /* For each sequence, we look for its most similar one */
+  maxs = new float*[sequenNumber];
+
+  for(i = 0; i < sequenNumber; i++) {
+    maxs[i] = new float[2];
+
+    /* Get the most similar sequence to the current one in term of overlap */
+    for(k = 0, mx = 0, avg = 0, pos = i; k < sequenNumber; k++) {
+      if(i != k) {
+        avg += overlaps[i][k];
+        if(mx < overlaps[i][k]) {
+          mx = overlaps[i][k];
+          pos = k;
+        }
+      }
+    }
+    /* Update global average variables*/
+    avgSeq += avg/(sequenNumber - 1);
+    maxAvgSeq += mx;
+
+    /* Save the maximum average overlap value for each sequence */
+    maxs[i][0] = mx;
+    maxs[i][1] = pos;
+  }
+
+  /* Compute general averages */
+  avgSeq = avgSeq/sequenNumber;
+  maxAvgSeq = maxAvgSeq/sequenNumber;
+
+  /* Compute longest sequences name */
+  for(i = 0, maxLongName = 0; i < sequenNumber; i++)
+    maxLongName = utils::max(maxLongName, seqsName[i].size());
+
+  /* Once the method has computed all of different values, it prints it */
+  cout.precision(4);
+  cout << fixed;
+
+  for(i = 0, maxSeq = 0; i < sequenNumber; i++)
+    if(maxs[i][0] > maxSeq)
+      maxSeq = maxs[i][0];
+
+  cout << "## MaxOverlap\t" << maxSeq;
+  cout << endl << "#> MaxOverlap\tGet the maximum overlap value for any pair "
+    << "of sequences in the alignment" << endl;
+
+  cout << endl << "## AverageOverlap\t" << avgSeq;
+  cout << endl << "#> AverageOverlap\tAverage overlap between all sequences";
+
+  cout << endl << endl << "## Overlap sequences matrix";
+  for(i = 0; i < sequenNumber; i++) {
+    cout << endl << setw(maxLongName + 2) << left << seqsName[i] << "\t";
+    for(j = 0; j < sequenNumber; j++)
+      cout << setiosflags(ios::left) << setw(10) << overlaps[i][j] << "\t";
+  }
+  cout << endl;
+}
+
+/* *** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *** */
+/*                                                                           */
+/*                             NEW CODE: feb/2012                            */
+/*                                                                           */
+/* *** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *** */
+void alignment::calculateColIdentity(float *ColumnIdentities) {
+
+  int i, j, counter, pos, max, columnLen;
+  char letter, indet, gapSymbol;
+  string column;
+
+  /* Initialize some data for make computation more precise */
+  indet = getTypeAlignment() == AAType ? 'X' : 'N';
+  gapSymbol = '-';
+
+  /* Compute identity score for the most frequent residue, it can be as well
+   * gaps and indeterminations, for each column */
+  for(i = 0, max = 0; i < residNumber; i++, max = 0, column.clear()) {
+
+    /* Get residues from each column in capital letters */
+    for(j = 0; j < sequenNumber; j++)
+      /* Discard gaps and indeterminations from calculations */
+      if((toupper(sequences[j][i]) != indet) && (sequences[j][i] != gapSymbol))
+        column += toupper(sequences[j][i]);
+    columnLen = column.size();
+
+    /* Count letter frequency. It only matter the frequency. Use some shorcuts
+     * to speed-up the process */
+    while (!column.empty()) {
+      letter = column[0];
+      counter = 0;
+      pos = 0;
+
+      do {
+        counter += 1;
+        column.erase(pos, 1);
+        pos = column.find(letter, pos);
+      } while(pos != (int) string::npos);
+
+      /* Keep only the most frequent residue */
+      if(counter > max)
+        max = counter;
+      /* If column size is smaller than the current max, stop the count */
+      if((int) column.size() < max)
+        break;
+    }
+
+    /* Store column identity values */
+    if(columnLen != 0)
+      ColumnIdentities[i] = float(max)/columnLen;
+  }
+}
+
+void alignment::printColumnsIdentity_DescriptiveStats(void) {
+
+  float *colIdentities, avg, std, max, min;
+  int i, positions;
+
+  /* Allocate local memory for the computation */
+  colIdentities = new float[residNumber];
+
+  utils::initlVect(colIdentities, residNumber, -1);
+  calculateColIdentity(colIdentities);
+
+  for(i = 0, max = 0, min = 1, avg = 0, positions = 0; i < residNumber; i++) {
+    if(colIdentities[i] != -1) {
+      /* Compute on-the-fly max and min scores. Store accumulative score */
+      avg += colIdentities[i];
+      max = (colIdentities[i] > max) ? colIdentities[i] : max;
+      min = (colIdentities[i] < min) ? colIdentities[i] : min;
+      /* Count how many columns have a value score */
+      positions += 1;
+    }
+  }
+  /* Compute average identity column score */
+  avg /= positions;
+
+  /* Compute standard desviation */
+  for(i = 0, std = 0; i < residNumber; i++)
+    if(colIdentities[i] != -1)
+      std += pow((colIdentities[i] - avg), 2);
+  std = sqrt(std/positions);
+
+  /* Print general descriptive stats */
+  cout << "#maxColIdentity\t" << max << endl;
+  cout << "#minColIdentity\t" << min << endl;
+  cout << "#avgColIdentity\t" << avg << endl;
+  cout << "#stdColIdentity\t" << std << endl;
+}
+
+
+
+