Mercurial > repos > padge > trimal
comparison trimal_repo/source/statisticsGaps.cpp @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b15a3147e604 |
---|---|
1 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** | |
2 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** | |
3 | |
4 trimAl v1.4: a tool for automated alignment trimming in large-scale | |
5 phylogenetics analyses. | |
6 | |
7 2009-2015 Capella-Gutierrez S. and Gabaldon, T. | |
8 [scapella, tgabaldon]@crg.es | |
9 | |
10 This file is part of trimAl. | |
11 | |
12 trimAl is free software: you can redistribute it and/or modify | |
13 it under the terms of the GNU General Public License as published by | |
14 the Free Software Foundation, the last available version. | |
15 | |
16 trimAl is distributed in the hope that it will be useful, | |
17 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 GNU General Public License for more details. | |
20 | |
21 You should have received a copy of the GNU General Public License | |
22 along with trimAl. If not, see <http://www.gnu.org/licenses/>. | |
23 | |
24 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** | |
25 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
26 | |
27 #include "statisticsGaps.h" | |
28 | |
29 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
30 | statisticsGaps::statisticsGaps(char **, int, int) | | |
31 | | | |
32 | Class constructor. This method uses the inputs parameters to put the information in the new object that | | |
33 | has been created. | | |
34 | | | |
35 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
36 | |
37 statisticsGaps::statisticsGaps(string *alignmentMatrix, int species, int aminos, int dataType_) { | |
38 | |
39 int i, j; | |
40 char indet; | |
41 | |
42 columnLength = species; | |
43 columns = aminos; | |
44 maxGaps = 0; | |
45 halfWindow = 0; | |
46 dataType = dataType_; | |
47 | |
48 if(dataType == AAType) | |
49 indet = 'X'; | |
50 else | |
51 indet = 'N'; | |
52 | |
53 /* Memory allocation for the vectors and its initialization */ | |
54 gapsInColumn = new int[columns]; | |
55 utils::initlVect(gapsInColumn, columns, 0); | |
56 | |
57 aminosXInColumn = new int[columns]; | |
58 utils::initlVect(aminosXInColumn, aminos, 0); | |
59 | |
60 gapsWindow = new int[columns]; | |
61 utils::initlVect(gapsWindow, columns, 0); | |
62 | |
63 numColumnsWithGaps = new int[species+1]; | |
64 utils::initlVect(numColumnsWithGaps, columnLength+1, 0); | |
65 | |
66 /* Count the gaps and indeterminations of each columns */ | |
67 for(i = 0; i < columns; i++) { | |
68 for(j = 0; j < columnLength; j++) { | |
69 if(alignmentMatrix[j][i] == '-') | |
70 gapsInColumn[i]++; | |
71 else if(alignmentMatrix[j][i] == indet) | |
72 aminosXInColumn[i]++; | |
73 } | |
74 | |
75 /* Increase the number of colums with the number of gaps of the last processed column */ | |
76 numColumnsWithGaps[gapsInColumn[i]]++; | |
77 gapsWindow[i] = gapsInColumn[i]; | |
78 if(gapsWindow[i] > maxGaps) maxGaps = gapsWindow[i]; | |
79 } | |
80 } | |
81 | |
82 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
83 | statisticsGaps::statisticsGaps(void) | | |
84 | | | |
85 | Class constructor. | | |
86 | | | |
87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
88 | |
89 statisticsGaps::statisticsGaps(void) { | |
90 | |
91 /* Initializate all values to NULL or 0 */ | |
92 gapsInColumn = NULL; | |
93 numColumnsWithGaps = NULL; | |
94 aminosXInColumn = NULL; | |
95 gapsWindow = NULL; | |
96 | |
97 columns = 0; | |
98 columnLength = 0; | |
99 maxGaps = 0; | |
100 halfWindow = 0; | |
101 dataType = 0; | |
102 } | |
103 | |
104 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
105 | statisticsGaps::~statisticsGaps(void) | | |
106 | | | |
107 | Class destroyer. | | |
108 | | | |
109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
110 | |
111 statisticsGaps::~statisticsGaps(void) { | |
112 | |
113 /* Only free memory if there is previous memory allocation */ | |
114 if(gapsInColumn != NULL){ | |
115 delete[] gapsInColumn; | |
116 delete[] numColumnsWithGaps; | |
117 delete[] aminosXInColumn; | |
118 delete[] gapsWindow; | |
119 } | |
120 } | |
121 | |
122 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
123 | bool statisticsGaps::applyWindow(int) | | |
124 | | | |
125 | This method computes for each column's alignment its gapwindows' value. For this purpose, the method uses the | | |
126 | values that previously has been calculated and the window's size value. | | |
127 | | | |
128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
129 | |
130 bool statisticsGaps::applyWindow(int _halfWindow) { | |
131 | |
132 int i, j, window; | |
133 | |
134 /* If one of this conditions is true, we return FALSE: */ | |
135 /* .- If already exists a previously calculated vector for this window size */ | |
136 /* .- If halfWinSize value is greater than 1/4 of alignment length */ | |
137 if((halfWindow == _halfWindow) || (_halfWindow > columns/4)) | |
138 return false; | |
139 | |
140 /* Initializate to 0 the vector that will store the number of gaps of each column */ | |
141 /* and the vector that will store window processing results */ | |
142 utils::initlVect(numColumnsWithGaps, columnLength+1, 0); | |
143 utils::initlVect(gapsWindow, columns, 0); | |
144 | |
145 /* Initializate maximum gaps' number per column value and store the mediumWinSize value in the object. */ | |
146 maxGaps = 0; | |
147 halfWindow = _halfWindow; | |
148 window = (2 * halfWindow + 1); | |
149 | |
150 /* We calculate some statistics for every column in the alignment,and the maximum gaps' number value */ | |
151 for(i = 0; i < columns; i++) { | |
152 /* Sum the total number of gaps for the considered window */ | |
153 for(j = i - halfWindow, gapsWindow[i] = 0; j <= i + halfWindow; j++) { | |
154 if(j < 0) | |
155 gapsWindow[i] += gapsInColumn[-j]; | |
156 else if(j >= columns) | |
157 gapsWindow[i] += gapsInColumn[((2 * columns - j) - 2)]; | |
158 else | |
159 gapsWindow[i] += gapsInColumn[j]; | |
160 } | |
161 | |
162 /* Calculate, and round to the nearest integer, the number of gaps for the i column */ | |
163 gapsWindow[i] = utils::roundInt(((double) gapsWindow[i]/window)); | |
164 /* Increase in 1 the number of colums with the same number of gaps than column i */ | |
165 numColumnsWithGaps[gapsWindow[i]]++; | |
166 | |
167 /* Update the max number of gaps in the alignment, if neccesary */ | |
168 if(gapsWindow[i] > maxGaps) | |
169 maxGaps = gapsWindow[i]; | |
170 } | |
171 return true; | |
172 } | |
173 | |
174 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
175 | int *statisticsGaps::getGapsWindow(void) | | |
176 | | | |
177 | This method returns a pointer to gaps window's vector | | |
178 | | | |
179 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
180 | |
181 int *statisticsGaps::getGapsWindow(void) { | |
182 | |
183 return gapsWindow; | |
184 } | |
185 | |
186 double statisticsGaps::calcCutPoint(float minInputAlignment, float gapThreshold) | |
187 { | |
188 | |
189 /* Method to select the cutting point based on gaps values from the input | |
190 * alignment. The cutting point is selected as the maximum gaps number allowed | |
191 * in the output alignment given a minimum percentage of the input alignment | |
192 * to be kept and a maximum gaps number. In case of both values set different | |
193 * cutting points, the minimum percentage of the input alignment prevails. */ | |
194 | |
195 double cuttingPoint_MinimumConserv, cuttingPoint_gapThreshold; | |
196 int i, acum; | |
197 | |
198 /* Calculate the gap number represented by the gaps threshold. This gap number | |
199 * represents the maximum gap number in any column in the output alignment */ | |
200 cuttingPoint_gapThreshold = (double) columnLength * gapThreshold; | |
201 | |
202 /* Compute the minimum columns number to be kept from the input alignment */ | |
203 cuttingPoint_MinimumConserv = utils::roundInt(((double) | |
204 (columns * minInputAlignment) / 100.0)); | |
205 if(cuttingPoint_MinimumConserv > columns) | |
206 cuttingPoint_MinimumConserv = columns; | |
207 | |
208 /* We look at the number of gaps which allows us to keep the minimum columns | |
209 * number from the input alignment */ | |
210 for(i = 0, acum = 0; i < columnLength; i++) { | |
211 acum += numColumnsWithGaps[i]; | |
212 if(acum >= cuttingPoint_MinimumConserv) | |
213 break; | |
214 } | |
215 | |
216 /* If there is not an exact number for the gaps cutting point, compute such | |
217 * value as the inmediate superior gap number minus the proportion of columns | |
218 * necessary to respect the minimum percentage from the input alignment to be | |
219 * kept */ | |
220 if(numColumnsWithGaps[i]) | |
221 cuttingPoint_MinimumConserv = (double) (i - ((float) | |
222 (acum - cuttingPoint_MinimumConserv)/numColumnsWithGaps[i])); | |
223 else | |
224 cuttingPoint_MinimumConserv = 0; | |
225 | |
226 /* Return the maximum gap number of the two computed cutting points. */ | |
227 return (utils::max(cuttingPoint_MinimumConserv, cuttingPoint_gapThreshold)); | |
228 } | |
229 | |
230 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
231 | int statisticsGaps::calcCutPointMixSlope(void) | | |
232 | | | |
233 | This method computes and selects the cut point based on the maximum rate between the first slope ratio between | | |
234 | gaps' percentage in the columns and alignments' length and the "second" slope (slope between three consecutive | | |
235 | points using only the first and third of them) ratio between gaps' percentage in the columns and alignments' | | |
236 | length. | | |
237 | | | |
238 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
239 | |
240 int statisticsGaps::calcCutPointMixSlope(void) { | |
241 | |
242 float delta = 0, maxSlope = -1, *firstSlopeVector, *secondSlopeVector; | |
243 int prev, pprev, maxIter, row = 1, act = 0, max = 0; | |
244 | |
245 /* We build two slope vectors, one vector for the first slope and another one for the second. */ | |
246 firstSlopeVector = new float[maxGaps+1]; | |
247 secondSlopeVector = new float[maxGaps+1]; | |
248 | |
249 /* We initialize them with -1.0 value and fix the maximum iteractions' number as maximun gaps' number plus 1. */ | |
250 utils::initlVect(firstSlopeVector, maxGaps, -1.0); | |
251 utils::initlVect(secondSlopeVector, maxGaps, -1.0); | |
252 maxIter = maxGaps + 1; | |
253 | |
254 /* Until to achieve the maximum iteractions' number. */ | |
255 while(act < maxIter) { | |
256 | |
257 /* We look for a first point to second slope. */ | |
258 while((numColumnsWithGaps[act]) == 0) act++; | |
259 pprev = act; if((act+1) >= maxIter) break; | |
260 | |
261 /* We look for a first point to first slope. */ | |
262 do { act++; } while((numColumnsWithGaps[act]) == 0); | |
263 prev = act; if((act+1) >= maxIter) break; | |
264 | |
265 /* We look for a second point to first and second slope. */ | |
266 do { act++; } while((numColumnsWithGaps[act]) == 0); | |
267 if(act >= maxIter) break; | |
268 | |
269 /* Calculate the first slope between the earlier previus and previus points. */ | |
270 firstSlopeVector[prev] = ((float) (prev - pprev) / columnLength); | |
271 firstSlopeVector[prev] /= ((float) numColumnsWithGaps[prev] / columns); | |
272 | |
273 /* Calculate the first slope between the previus and current points. */ | |
274 firstSlopeVector[act] = ((float) (act - prev) / columnLength); | |
275 firstSlopeVector[act] /= ((float) numColumnsWithGaps[act] / columns); | |
276 | |
277 /* Calculate the second slope between the earlier previus and current points. */ | |
278 secondSlopeVector[act] = ((float) (act - pprev) / columnLength); | |
279 secondSlopeVector[act] /= ((float) (numColumnsWithGaps[act] + numColumnsWithGaps[prev]) / columns); | |
280 | |
281 /* If the ratio between ... */ | |
282 if((secondSlopeVector[pprev] != -1.0) || (firstSlopeVector[pprev] != -1.0)) { | |
283 | |
284 /* .- first slope previus and first slope earlier previus points. */ | |
285 if(firstSlopeVector[pprev] != -1.0) { | |
286 delta = firstSlopeVector[prev]/firstSlopeVector[pprev]; | |
287 row = pprev; | |
288 } | |
289 | |
290 /* .- first slope current and first slope previus points. */ | |
291 if(delta < (firstSlopeVector[act]/firstSlopeVector[prev])) { | |
292 delta = firstSlopeVector[act]/firstSlopeVector[prev]; | |
293 row = prev; | |
294 } | |
295 | |
296 /* .- second slope current and second slope earlier previus points. */ | |
297 if(secondSlopeVector[pprev] != -1.0) { | |
298 if(delta < (secondSlopeVector[act]/secondSlopeVector[pprev])) { | |
299 delta = secondSlopeVector[act]/secondSlopeVector[pprev]; | |
300 row = pprev; | |
301 } | |
302 } | |
303 | |
304 /* ... is better that current maxSlope then we modify the maxSlope with the best ratio. */ | |
305 if(delta > maxSlope) { | |
306 maxSlope = delta; | |
307 max = row; | |
308 } | |
309 } | |
310 act = prev; | |
311 } | |
312 | |
313 /* We delete the local memory. */ | |
314 delete[] firstSlopeVector; | |
315 delete[] secondSlopeVector; | |
316 | |
317 /* and, finally, we return the cut point. */ | |
318 return max; | |
319 } | |
320 | |
321 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
322 | int statisticsGaps::calcCutPoint2ndSlope(void) | | |
323 | | | |
324 | This method computes and selects the cut point based on the maximum "second" slope (slope between three | | |
325 | consecutive points using only the first and third of them) ratio between gaps' percentage in the columns and | | |
326 | alignments' length. | | |
327 | | | |
328 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
329 | |
330 int statisticsGaps::calcCutPoint2ndSlope(void) { | |
331 | |
332 float maxSlope = -1, *secondSlopeVector; | |
333 int prev, pprev, maxIter, act = 0, max = 0; | |
334 | |
335 /* We build one slope vector and fix the maximum iteractions' number as the gaps'number plus 1. */ | |
336 secondSlopeVector = new float[maxGaps+1]; | |
337 utils::initlVect(secondSlopeVector, maxGaps, -1.0); | |
338 maxIter = maxGaps + 1; | |
339 | |
340 /* Find the lowest number of gaps into the input alignment. If there are few | |
341 * points, it is possible that lowest number of gaps is returned as the thres | |
342 * hold. It could happen input alignment does not have columns with no-gaps */ | |
343 for(act = 0, max = 0; numColumnsWithGaps[act] == 0; act++) | |
344 max = act + 1; | |
345 | |
346 act = 0; | |
347 while(act < maxIter) { | |
348 | |
349 /* We look for a first point to second slope. */ | |
350 while((numColumnsWithGaps[act]) == 0) | |
351 act++; | |
352 pprev = act; | |
353 if((act+1) >= maxIter) | |
354 break; | |
355 | |
356 /* We look for a first point to first slope. */ | |
357 do { | |
358 act++; | |
359 } while((numColumnsWithGaps[act]) == 0); | |
360 prev = act; | |
361 if((act+1) >= maxIter) | |
362 break; | |
363 | |
364 /* We look for a second point to first and second slope. */ | |
365 do { | |
366 act++; | |
367 } while((numColumnsWithGaps[act]) == 0); | |
368 if(act >= maxIter) | |
369 break; | |
370 | |
371 /* Calculate the second slope between the earlier previous and current points. */ | |
372 secondSlopeVector[act] = ((float) (act - pprev) / columnLength); | |
373 secondSlopeVector[act] /= ((float) (numColumnsWithGaps[act] + numColumnsWithGaps[prev]) / columns); | |
374 | |
375 /* If the ratio between second slope current and second slope earlier previous points. */ | |
376 if(secondSlopeVector[pprev] != -1.0) { | |
377 if((secondSlopeVector[act]/secondSlopeVector[pprev]) > maxSlope) { | |
378 maxSlope = (secondSlopeVector[act]/secondSlopeVector[pprev]); | |
379 max = pprev; | |
380 } | |
381 } else if(secondSlopeVector[prev] != -1.0) { | |
382 if((secondSlopeVector[act]/secondSlopeVector[prev]) > maxSlope) { | |
383 maxSlope = (secondSlopeVector[act]/secondSlopeVector[prev]); | |
384 max = pprev; | |
385 } | |
386 } | |
387 act = prev; | |
388 } | |
389 | |
390 /* We deallocate local memory. */ | |
391 delete[] secondSlopeVector; | |
392 | |
393 /* Finally, we return the selected cut point. */ | |
394 return max; | |
395 } | |
396 | |
397 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
398 | void statisticsGaps::printGapsColumns(void) | | |
399 | | | |
400 | This method shows the gaps' percentage per each column in the alignment. | | |
401 | | | |
402 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
403 | |
404 void statisticsGaps::printGapsColumns(void) { | |
405 | |
406 int *vectAux; | |
407 | |
408 /* We allocate a local vector to recovery information on it */ | |
409 vectAux = new int[columns]; | |
410 | |
411 /* We decide about the information's source then we get the information. */ | |
412 if(halfWindow == 0) | |
413 utils::copyVect(gapsInColumn, vectAux, columns); | |
414 else | |
415 utils::copyVect(gapsWindow, vectAux, columns); | |
416 | |
417 /* Fix the precision of output */ | |
418 /* We set the output precision and print the header. */ | |
419 cout << "| Residue\t % Gaps \t Gap Score |" << endl; | |
420 cout << "| Number \t \t |" << endl; | |
421 cout << "+----------------------------------------------+" << endl; | |
422 cout.precision(10); | |
423 | |
424 /* Show the information that have been requered */ | |
425 for(int i = 0; i < columns; i++) | |
426 cout << " " << setw(5) << i << "\t\t" << setw(10) << (vectAux[i] * 100.0)/columnLength | |
427 << "\t" << setw(7) << 1 -((vectAux[i] * 1.0)/columnLength) << endl; | |
428 | |
429 /* Finally, we deallocate the local memory */ | |
430 delete[] vectAux; | |
431 } | |
432 | |
433 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
434 | void statisticsGaps::printGapsAcl(void) | | |
435 | | | |
436 | This method shows the gaps' statistics for the alignment. | | |
437 | | | |
438 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ | |
439 | |
440 void statisticsGaps::printGapsAcl(void) { | |
441 | |
442 int acm, i; | |
443 | |
444 /* Fix the precision of output */ | |
445 cout << "| Number of\t \t|\t Cumulative \t% Cumulative\t|\tNumber of Gaps\t % Gaps \tGap Score |" << endl; | |
446 cout << "| Residues \t% Length\t|\tNumberResid.\t Length \t|\t per Column \tper Column\tper Column |" << endl; | |
447 cout << "+-------------------------------+-----------------------------" | |
448 << "----------+--------------------------------------------------+" << endl; | |
449 cout.precision(10); | |
450 | |
451 /* Count for each gaps' number the columns' number with that gaps' number. */ | |
452 for(i = 0, acm = 0; i <= maxGaps; i++) { | |
453 | |
454 /* If the columns' number with this gaps' number is not equal to zero, we will count the columns. */ | |
455 if(numColumnsWithGaps[i] != 0) { | |
456 | |
457 /* Compute and prints the accumulative values for the gaps in the alignment. */ | |
458 acm += numColumnsWithGaps[i]; | |
459 cout << " " << setiosflags(ios::left) << numColumnsWithGaps[i] << "\t\t" << setw(10) << (numColumnsWithGaps[i] * 100.0)/columns | |
460 << "\t\t" << acm << "\t\t" << setw(10) << (acm * 100.0)/columns | |
461 << "\t\t" << i << "\t\t" << setw(10) << (i * 1.0)/columnLength << "\t"<< setw(10) << 1 - ((i * 1.0)/columnLength) << endl; | |
462 } | |
463 } | |
464 } |