comparison trimal_repo/source/statisticsGaps.cpp @ 0:b15a3147e604 draft

"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author padge
date Fri, 25 Mar 2022 17:10:43 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b15a3147e604
1 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
2 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
3
4 trimAl v1.4: a tool for automated alignment trimming in large-scale
5 phylogenetics analyses.
6
7 2009-2015 Capella-Gutierrez S. and Gabaldon, T.
8 [scapella, tgabaldon]@crg.es
9
10 This file is part of trimAl.
11
12 trimAl is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, the last available version.
15
16 trimAl is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with trimAl. If not, see <http://www.gnu.org/licenses/>.
23
24 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
25 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
26
27 #include "statisticsGaps.h"
28
29 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
30 | statisticsGaps::statisticsGaps(char **, int, int) |
31 | |
32 | Class constructor. This method uses the inputs parameters to put the information in the new object that |
33 | has been created. |
34 | |
35 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
36
37 statisticsGaps::statisticsGaps(string *alignmentMatrix, int species, int aminos, int dataType_) {
38
39 int i, j;
40 char indet;
41
42 columnLength = species;
43 columns = aminos;
44 maxGaps = 0;
45 halfWindow = 0;
46 dataType = dataType_;
47
48 if(dataType == AAType)
49 indet = 'X';
50 else
51 indet = 'N';
52
53 /* Memory allocation for the vectors and its initialization */
54 gapsInColumn = new int[columns];
55 utils::initlVect(gapsInColumn, columns, 0);
56
57 aminosXInColumn = new int[columns];
58 utils::initlVect(aminosXInColumn, aminos, 0);
59
60 gapsWindow = new int[columns];
61 utils::initlVect(gapsWindow, columns, 0);
62
63 numColumnsWithGaps = new int[species+1];
64 utils::initlVect(numColumnsWithGaps, columnLength+1, 0);
65
66 /* Count the gaps and indeterminations of each columns */
67 for(i = 0; i < columns; i++) {
68 for(j = 0; j < columnLength; j++) {
69 if(alignmentMatrix[j][i] == '-')
70 gapsInColumn[i]++;
71 else if(alignmentMatrix[j][i] == indet)
72 aminosXInColumn[i]++;
73 }
74
75 /* Increase the number of colums with the number of gaps of the last processed column */
76 numColumnsWithGaps[gapsInColumn[i]]++;
77 gapsWindow[i] = gapsInColumn[i];
78 if(gapsWindow[i] > maxGaps) maxGaps = gapsWindow[i];
79 }
80 }
81
82 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
83 | statisticsGaps::statisticsGaps(void) |
84 | |
85 | Class constructor. |
86 | |
87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
88
89 statisticsGaps::statisticsGaps(void) {
90
91 /* Initializate all values to NULL or 0 */
92 gapsInColumn = NULL;
93 numColumnsWithGaps = NULL;
94 aminosXInColumn = NULL;
95 gapsWindow = NULL;
96
97 columns = 0;
98 columnLength = 0;
99 maxGaps = 0;
100 halfWindow = 0;
101 dataType = 0;
102 }
103
104 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
105 | statisticsGaps::~statisticsGaps(void) |
106 | |
107 | Class destroyer. |
108 | |
109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
110
111 statisticsGaps::~statisticsGaps(void) {
112
113 /* Only free memory if there is previous memory allocation */
114 if(gapsInColumn != NULL){
115 delete[] gapsInColumn;
116 delete[] numColumnsWithGaps;
117 delete[] aminosXInColumn;
118 delete[] gapsWindow;
119 }
120 }
121
122 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
123 | bool statisticsGaps::applyWindow(int) |
124 | |
125 | This method computes for each column's alignment its gapwindows' value. For this purpose, the method uses the |
126 | values that previously has been calculated and the window's size value. |
127 | |
128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
129
130 bool statisticsGaps::applyWindow(int _halfWindow) {
131
132 int i, j, window;
133
134 /* If one of this conditions is true, we return FALSE: */
135 /* .- If already exists a previously calculated vector for this window size */
136 /* .- If halfWinSize value is greater than 1/4 of alignment length */
137 if((halfWindow == _halfWindow) || (_halfWindow > columns/4))
138 return false;
139
140 /* Initializate to 0 the vector that will store the number of gaps of each column */
141 /* and the vector that will store window processing results */
142 utils::initlVect(numColumnsWithGaps, columnLength+1, 0);
143 utils::initlVect(gapsWindow, columns, 0);
144
145 /* Initializate maximum gaps' number per column value and store the mediumWinSize value in the object. */
146 maxGaps = 0;
147 halfWindow = _halfWindow;
148 window = (2 * halfWindow + 1);
149
150 /* We calculate some statistics for every column in the alignment,and the maximum gaps' number value */
151 for(i = 0; i < columns; i++) {
152 /* Sum the total number of gaps for the considered window */
153 for(j = i - halfWindow, gapsWindow[i] = 0; j <= i + halfWindow; j++) {
154 if(j < 0)
155 gapsWindow[i] += gapsInColumn[-j];
156 else if(j >= columns)
157 gapsWindow[i] += gapsInColumn[((2 * columns - j) - 2)];
158 else
159 gapsWindow[i] += gapsInColumn[j];
160 }
161
162 /* Calculate, and round to the nearest integer, the number of gaps for the i column */
163 gapsWindow[i] = utils::roundInt(((double) gapsWindow[i]/window));
164 /* Increase in 1 the number of colums with the same number of gaps than column i */
165 numColumnsWithGaps[gapsWindow[i]]++;
166
167 /* Update the max number of gaps in the alignment, if neccesary */
168 if(gapsWindow[i] > maxGaps)
169 maxGaps = gapsWindow[i];
170 }
171 return true;
172 }
173
174 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
175 | int *statisticsGaps::getGapsWindow(void) |
176 | |
177 | This method returns a pointer to gaps window's vector |
178 | |
179 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
180
181 int *statisticsGaps::getGapsWindow(void) {
182
183 return gapsWindow;
184 }
185
186 double statisticsGaps::calcCutPoint(float minInputAlignment, float gapThreshold)
187 {
188
189 /* Method to select the cutting point based on gaps values from the input
190 * alignment. The cutting point is selected as the maximum gaps number allowed
191 * in the output alignment given a minimum percentage of the input alignment
192 * to be kept and a maximum gaps number. In case of both values set different
193 * cutting points, the minimum percentage of the input alignment prevails. */
194
195 double cuttingPoint_MinimumConserv, cuttingPoint_gapThreshold;
196 int i, acum;
197
198 /* Calculate the gap number represented by the gaps threshold. This gap number
199 * represents the maximum gap number in any column in the output alignment */
200 cuttingPoint_gapThreshold = (double) columnLength * gapThreshold;
201
202 /* Compute the minimum columns number to be kept from the input alignment */
203 cuttingPoint_MinimumConserv = utils::roundInt(((double)
204 (columns * minInputAlignment) / 100.0));
205 if(cuttingPoint_MinimumConserv > columns)
206 cuttingPoint_MinimumConserv = columns;
207
208 /* We look at the number of gaps which allows us to keep the minimum columns
209 * number from the input alignment */
210 for(i = 0, acum = 0; i < columnLength; i++) {
211 acum += numColumnsWithGaps[i];
212 if(acum >= cuttingPoint_MinimumConserv)
213 break;
214 }
215
216 /* If there is not an exact number for the gaps cutting point, compute such
217 * value as the inmediate superior gap number minus the proportion of columns
218 * necessary to respect the minimum percentage from the input alignment to be
219 * kept */
220 if(numColumnsWithGaps[i])
221 cuttingPoint_MinimumConserv = (double) (i - ((float)
222 (acum - cuttingPoint_MinimumConserv)/numColumnsWithGaps[i]));
223 else
224 cuttingPoint_MinimumConserv = 0;
225
226 /* Return the maximum gap number of the two computed cutting points. */
227 return (utils::max(cuttingPoint_MinimumConserv, cuttingPoint_gapThreshold));
228 }
229
230 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
231 | int statisticsGaps::calcCutPointMixSlope(void) |
232 | |
233 | This method computes and selects the cut point based on the maximum rate between the first slope ratio between |
234 | gaps' percentage in the columns and alignments' length and the "second" slope (slope between three consecutive |
235 | points using only the first and third of them) ratio between gaps' percentage in the columns and alignments' |
236 | length. |
237 | |
238 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
239
240 int statisticsGaps::calcCutPointMixSlope(void) {
241
242 float delta = 0, maxSlope = -1, *firstSlopeVector, *secondSlopeVector;
243 int prev, pprev, maxIter, row = 1, act = 0, max = 0;
244
245 /* We build two slope vectors, one vector for the first slope and another one for the second. */
246 firstSlopeVector = new float[maxGaps+1];
247 secondSlopeVector = new float[maxGaps+1];
248
249 /* We initialize them with -1.0 value and fix the maximum iteractions' number as maximun gaps' number plus 1. */
250 utils::initlVect(firstSlopeVector, maxGaps, -1.0);
251 utils::initlVect(secondSlopeVector, maxGaps, -1.0);
252 maxIter = maxGaps + 1;
253
254 /* Until to achieve the maximum iteractions' number. */
255 while(act < maxIter) {
256
257 /* We look for a first point to second slope. */
258 while((numColumnsWithGaps[act]) == 0) act++;
259 pprev = act; if((act+1) >= maxIter) break;
260
261 /* We look for a first point to first slope. */
262 do { act++; } while((numColumnsWithGaps[act]) == 0);
263 prev = act; if((act+1) >= maxIter) break;
264
265 /* We look for a second point to first and second slope. */
266 do { act++; } while((numColumnsWithGaps[act]) == 0);
267 if(act >= maxIter) break;
268
269 /* Calculate the first slope between the earlier previus and previus points. */
270 firstSlopeVector[prev] = ((float) (prev - pprev) / columnLength);
271 firstSlopeVector[prev] /= ((float) numColumnsWithGaps[prev] / columns);
272
273 /* Calculate the first slope between the previus and current points. */
274 firstSlopeVector[act] = ((float) (act - prev) / columnLength);
275 firstSlopeVector[act] /= ((float) numColumnsWithGaps[act] / columns);
276
277 /* Calculate the second slope between the earlier previus and current points. */
278 secondSlopeVector[act] = ((float) (act - pprev) / columnLength);
279 secondSlopeVector[act] /= ((float) (numColumnsWithGaps[act] + numColumnsWithGaps[prev]) / columns);
280
281 /* If the ratio between ... */
282 if((secondSlopeVector[pprev] != -1.0) || (firstSlopeVector[pprev] != -1.0)) {
283
284 /* .- first slope previus and first slope earlier previus points. */
285 if(firstSlopeVector[pprev] != -1.0) {
286 delta = firstSlopeVector[prev]/firstSlopeVector[pprev];
287 row = pprev;
288 }
289
290 /* .- first slope current and first slope previus points. */
291 if(delta < (firstSlopeVector[act]/firstSlopeVector[prev])) {
292 delta = firstSlopeVector[act]/firstSlopeVector[prev];
293 row = prev;
294 }
295
296 /* .- second slope current and second slope earlier previus points. */
297 if(secondSlopeVector[pprev] != -1.0) {
298 if(delta < (secondSlopeVector[act]/secondSlopeVector[pprev])) {
299 delta = secondSlopeVector[act]/secondSlopeVector[pprev];
300 row = pprev;
301 }
302 }
303
304 /* ... is better that current maxSlope then we modify the maxSlope with the best ratio. */
305 if(delta > maxSlope) {
306 maxSlope = delta;
307 max = row;
308 }
309 }
310 act = prev;
311 }
312
313 /* We delete the local memory. */
314 delete[] firstSlopeVector;
315 delete[] secondSlopeVector;
316
317 /* and, finally, we return the cut point. */
318 return max;
319 }
320
321 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
322 | int statisticsGaps::calcCutPoint2ndSlope(void) |
323 | |
324 | This method computes and selects the cut point based on the maximum "second" slope (slope between three |
325 | consecutive points using only the first and third of them) ratio between gaps' percentage in the columns and |
326 | alignments' length. |
327 | |
328 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
329
330 int statisticsGaps::calcCutPoint2ndSlope(void) {
331
332 float maxSlope = -1, *secondSlopeVector;
333 int prev, pprev, maxIter, act = 0, max = 0;
334
335 /* We build one slope vector and fix the maximum iteractions' number as the gaps'number plus 1. */
336 secondSlopeVector = new float[maxGaps+1];
337 utils::initlVect(secondSlopeVector, maxGaps, -1.0);
338 maxIter = maxGaps + 1;
339
340 /* Find the lowest number of gaps into the input alignment. If there are few
341 * points, it is possible that lowest number of gaps is returned as the thres
342 * hold. It could happen input alignment does not have columns with no-gaps */
343 for(act = 0, max = 0; numColumnsWithGaps[act] == 0; act++)
344 max = act + 1;
345
346 act = 0;
347 while(act < maxIter) {
348
349 /* We look for a first point to second slope. */
350 while((numColumnsWithGaps[act]) == 0)
351 act++;
352 pprev = act;
353 if((act+1) >= maxIter)
354 break;
355
356 /* We look for a first point to first slope. */
357 do {
358 act++;
359 } while((numColumnsWithGaps[act]) == 0);
360 prev = act;
361 if((act+1) >= maxIter)
362 break;
363
364 /* We look for a second point to first and second slope. */
365 do {
366 act++;
367 } while((numColumnsWithGaps[act]) == 0);
368 if(act >= maxIter)
369 break;
370
371 /* Calculate the second slope between the earlier previous and current points. */
372 secondSlopeVector[act] = ((float) (act - pprev) / columnLength);
373 secondSlopeVector[act] /= ((float) (numColumnsWithGaps[act] + numColumnsWithGaps[prev]) / columns);
374
375 /* If the ratio between second slope current and second slope earlier previous points. */
376 if(secondSlopeVector[pprev] != -1.0) {
377 if((secondSlopeVector[act]/secondSlopeVector[pprev]) > maxSlope) {
378 maxSlope = (secondSlopeVector[act]/secondSlopeVector[pprev]);
379 max = pprev;
380 }
381 } else if(secondSlopeVector[prev] != -1.0) {
382 if((secondSlopeVector[act]/secondSlopeVector[prev]) > maxSlope) {
383 maxSlope = (secondSlopeVector[act]/secondSlopeVector[prev]);
384 max = pprev;
385 }
386 }
387 act = prev;
388 }
389
390 /* We deallocate local memory. */
391 delete[] secondSlopeVector;
392
393 /* Finally, we return the selected cut point. */
394 return max;
395 }
396
397 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
398 | void statisticsGaps::printGapsColumns(void) |
399 | |
400 | This method shows the gaps' percentage per each column in the alignment. |
401 | |
402 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
403
404 void statisticsGaps::printGapsColumns(void) {
405
406 int *vectAux;
407
408 /* We allocate a local vector to recovery information on it */
409 vectAux = new int[columns];
410
411 /* We decide about the information's source then we get the information. */
412 if(halfWindow == 0)
413 utils::copyVect(gapsInColumn, vectAux, columns);
414 else
415 utils::copyVect(gapsWindow, vectAux, columns);
416
417 /* Fix the precision of output */
418 /* We set the output precision and print the header. */
419 cout << "| Residue\t % Gaps \t Gap Score |" << endl;
420 cout << "| Number \t \t |" << endl;
421 cout << "+----------------------------------------------+" << endl;
422 cout.precision(10);
423
424 /* Show the information that have been requered */
425 for(int i = 0; i < columns; i++)
426 cout << " " << setw(5) << i << "\t\t" << setw(10) << (vectAux[i] * 100.0)/columnLength
427 << "\t" << setw(7) << 1 -((vectAux[i] * 1.0)/columnLength) << endl;
428
429 /* Finally, we deallocate the local memory */
430 delete[] vectAux;
431 }
432
433 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
434 | void statisticsGaps::printGapsAcl(void) |
435 | |
436 | This method shows the gaps' statistics for the alignment. |
437 | |
438 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
439
440 void statisticsGaps::printGapsAcl(void) {
441
442 int acm, i;
443
444 /* Fix the precision of output */
445 cout << "| Number of\t \t|\t Cumulative \t% Cumulative\t|\tNumber of Gaps\t % Gaps \tGap Score |" << endl;
446 cout << "| Residues \t% Length\t|\tNumberResid.\t Length \t|\t per Column \tper Column\tper Column |" << endl;
447 cout << "+-------------------------------+-----------------------------"
448 << "----------+--------------------------------------------------+" << endl;
449 cout.precision(10);
450
451 /* Count for each gaps' number the columns' number with that gaps' number. */
452 for(i = 0, acm = 0; i <= maxGaps; i++) {
453
454 /* If the columns' number with this gaps' number is not equal to zero, we will count the columns. */
455 if(numColumnsWithGaps[i] != 0) {
456
457 /* Compute and prints the accumulative values for the gaps in the alignment. */
458 acm += numColumnsWithGaps[i];
459 cout << " " << setiosflags(ios::left) << numColumnsWithGaps[i] << "\t\t" << setw(10) << (numColumnsWithGaps[i] * 100.0)/columns
460 << "\t\t" << acm << "\t\t" << setw(10) << (acm * 100.0)/columns
461 << "\t\t" << i << "\t\t" << setw(10) << (i * 1.0)/columnLength << "\t"<< setw(10) << 1 - ((i * 1.0)/columnLength) << endl;
462 }
463 }
464 }