comparison trimal_repo/source/compareFiles.cpp @ 0:b15a3147e604 draft

"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author padge
date Fri, 25 Mar 2022 17:10:43 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b15a3147e604
1 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
2 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
3
4 trimAl v1.4: a tool for automated alignment trimming in large-scale
5 phylogenetics analyses.
6
7 2009-2015 Capella-Gutierrez S. and Gabaldon, T.
8 [scapella, tgabaldon]@crg.es
9
10 This file is part of trimAl.
11
12 trimAl is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, the last available version.
15
16 trimAl is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with trimAl. If not, see <http://www.gnu.org/licenses/>.
23
24 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
25 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
26
27 #include "compareFiles.h"
28 #include "alignment.h"
29
30 #define LONG 80
31
32 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
33 /* This method compares a set of alignment in order to select the most
34 * consistent one respect of the other ones. To compute the consistency
35 * values we use the proportion of residue pairs per column in the aligs
36 * to compare */
37 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
38 int compareFiles::algorithm(alignment **vectAlignments, char **fileNames, float *columnsValue, int numAlignments, bool verbosity) {
39
40 int *numResiduesAlig, *correspNames, *columnSeqMatrix, *columnSeqMatrixAux;
41 int i, j, k, l, m, numSeqs, pairRes, hits, alig = 0;
42 float max = 0, value = 0, **vectHits;
43 bool appearErrors = false;
44 string *names;
45
46 /* ***** ***** ***** ***** ***** ***** ***** ***** */
47 /* Get some parameters from the alignment that has
48 * been selected */
49 numSeqs = vectAlignments[0] -> getNumSpecies();
50 /* ***** ***** ***** ***** ***** ***** ***** ***** */
51
52 /* ***** ***** ***** ***** ***** ***** ***** ***** */
53 /* Allocate dinamic local memory */
54 names = new string[numSeqs];
55 correspNames = new int[numSeqs];
56 numResiduesAlig = new int[numAlignments];
57 columnSeqMatrix = new int[numSeqs];
58 vectHits = new float*[numAlignments];
59 columnSeqMatrixAux = new int[numSeqs];
60 /* ***** ***** ***** ***** ***** ***** ***** ***** */
61
62 /* ***** ***** ***** ***** ***** ***** ***** ***** */
63 /* Check that all of alignment has the same number of
64 * sequence as well as there exists a correspondence
65 * between the names for each pars of aligs. */
66 for(i = 1; i < numAlignments; i++) {
67 /* ***** ***** ***** ***** ***** ***** ***** ***** */
68 if(numSeqs != vectAlignments[i] -> getNumSpecies()) {
69 cerr << endl << "ERROR: The files to compare do not have "
70 << "the same number of sequences" << endl << endl;
71 appearErrors = true;
72 break;
73 }
74 /* ***** ***** ***** ***** ***** ***** ***** ***** */
75
76 /* ***** ***** ***** ***** ***** ***** ***** ***** */
77 vectAlignments[i] -> getSequences(names);
78 if(!vectAlignments[0] -> getSeqNameOrder(names, correspNames)) {
79 cerr << endl << "ERROR: The files to compare do not"
80 << " have the sequence names" << endl << endl;
81 appearErrors = true;
82 break;
83 }
84 /* ***** ***** ***** ***** ***** ***** ***** ***** */
85 }
86
87 /* ***** ***** ***** ***** ***** ***** ***** ***** */
88 /* Changes the order in sequences number matrix
89 * according to the order in the selected alignment */
90 for(i = 1; ((i < numAlignments) && (!appearErrors)); i++) {
91 vectAlignments[i] -> getSequences(names);
92 vectAlignments[0] -> getSeqNameOrder(names, correspNames);
93 vectAlignments[i] -> setSeqMatrixOrder(correspNames);
94 }
95 /* ***** ***** ***** ***** ***** ***** ***** ***** */
96
97 /* ***** ***** ***** ***** ***** ***** ***** ***** */
98 /* Get back the residues number for each alignment */
99 for(i = 0; ((i < numAlignments) && (!appearErrors)); i++)
100 numResiduesAlig[i] = vectAlignments[i] -> getNumAminos();
101 /* ***** ***** ***** ***** ***** ***** ***** ***** */
102
103 /* ***** ***** ***** ***** ***** ***** ***** ***** */
104 /* Start the comparison among the alignments */
105 for(i = 0; ((i < numAlignments) && (!appearErrors)); i++, value = 0) {
106
107 /* ***** ***** ***** ***** ***** ***** ***** ***** */
108 /* If it's necessary, we print some information */
109 if(verbosity)
110 cout << endl;
111 /* ***** ***** ***** ***** ***** ***** ***** ***** */
112
113 /* ***** ***** ***** ***** ***** ***** ***** ***** */
114 /* Initialize the hits vector for each alignment */
115 vectHits[i] = new float[numResiduesAlig[i]];
116 utils::initlVect(vectHits[i], numResiduesAlig[i], 0);
117 /* ***** ***** ***** ***** ***** ***** ***** ***** */
118
119 /* ***** ***** ***** ***** ***** ***** ***** ***** */
120 for(j = 0, pairRes = 0, hits = 0; j < numResiduesAlig[i]; j++, pairRes = 0, hits = 0) {
121
122 /* ***** ***** ***** ***** ***** ***** ***** ***** */
123 /* Get back each column from the current selected
124 * alignment */
125 vectAlignments[i] -> getColumnSeqMatrix(j, columnSeqMatrix);
126 /* ***** ***** ***** ***** ***** ***** ***** ***** */
127
128 /* ***** ***** ***** ***** ***** ***** ***** ***** */
129 /* For each position from the previous recovered
130 * columns, we carry out the analysis to detect the
131 * same residues pair in the rest of the alignmetns */
132 for(k = 0; k < numSeqs; k++) {
133
134 /* ***** ***** ***** ***** ***** ***** ***** ***** */
135 /* If there is a valid residue, we go ahead with
136 * the analysis */
137 if(columnSeqMatrix[k] != 0) {
138 /* ***** ***** ***** ***** ***** ***** ***** ***** */
139 for(l = 0; l < i; l++) {
140 /* Recover the residue pairs from the others aligs */
141 vectAlignments[l] -> getColumnSeqMatrix(columnSeqMatrix[k], k, columnSeqMatrixAux);
142 /* ***** ***** ***** ***** ***** ***** ***** ***** */
143 /* and count the similar residue pairs */
144 for(m = k + 1; m < numSeqs; m++)
145 if(columnSeqMatrix[m] != 0) {
146 if(columnSeqMatrix[m] == columnSeqMatrixAux[m])
147 hits++;
148 pairRes++;
149 }
150 }
151 /* ***** ***** ***** ***** ***** ***** ***** ***** */
152
153 /* ***** ***** ***** ***** ***** ***** ***** ***** */
154 for(l = i + 1; l < numAlignments; l++) {
155 /* Recover the residue pairs from the others aligs */
156 vectAlignments[l] -> getColumnSeqMatrix(columnSeqMatrix[k], k, columnSeqMatrixAux);
157 /* ***** ***** ***** ***** ***** ***** ***** ***** */
158 /* and count the similar residue pairs */
159 for(m = k + 1; m < numSeqs; m++)
160 if(columnSeqMatrix[m] != 0) {
161 if(columnSeqMatrix[m] == columnSeqMatrixAux[m])
162 hits++;
163 pairRes++;
164 }
165 }
166 /* ***** ***** ***** ***** ***** ***** ***** ***** */
167 }
168 }
169 /* ***** ***** ***** ***** ***** ***** ***** ***** */
170
171 /* ***** ***** ***** ***** ***** ***** ***** ***** */
172 /* For each column, compute the hits proportion for
173 * every residue pair against the rest of alignments */
174 if(pairRes != 0) {
175 vectHits[i][j] += ((1.0 * hits)/pairRes);
176 value += vectHits[i][j];
177 }
178 /* ***** ***** ***** ***** ***** ***** ***** ***** */
179 }
180
181 /* ***** ***** ***** ***** ***** ***** ***** ***** */
182 /* The method can offer some information about the
183 * comparison progression */
184 if(verbosity)
185 cout << "File:\t\t" << fileNames[i] << endl << "Values:\t\tSequences: " << numSeqs
186 << "\tResidues: " << numResiduesAlig[i] << "\tPond. Hits: " << setw(8)
187 << value << "\t%Consistency: " << value/numResiduesAlig[i] << endl;
188 /* ***** ***** ***** ***** ***** ***** ***** ***** */
189
190 /* ***** ***** ***** ***** ***** ***** ***** ***** */
191 /* Keep the alignment with higher consistency value */
192 if((value/numResiduesAlig[i]) > max) {
193 alig = i;
194 max = value/numResiduesAlig[i];
195 }
196 /* ***** ***** ***** ***** ***** ***** ***** ***** */
197 }
198
199 /* ***** ***** ***** ***** ***** ***** ***** ***** */
200 /* Prints the alignment that have been selected */
201 if((verbosity) && (!appearErrors)) {
202 cout << "\t\t\t\t\t--------------" << endl;
203 cout << endl << "File Selected:\t" << fileNames[alig] << endl << "Value:\t\t" << max << endl << endl;
204 }
205 /* ***** ***** ***** ***** ***** ***** ***** ***** */
206
207 /* ***** ***** ***** ***** ***** ***** ***** ***** */
208 /* The method returns a vector with the consistency
209 * value for each column in the selected alignment */
210 if((columnsValue != NULL) && (!appearErrors)) {
211 utils::initlVect(columnsValue, numResiduesAlig[alig], -1);
212 for(i = 0; i < numResiduesAlig[alig]; i++)
213 columnsValue[i] = vectHits[alig][i];
214 }
215 /* ***** ***** ***** ***** ***** ***** ***** ***** */
216
217 /* ***** ***** ***** ***** ***** ***** ***** ***** */
218 /* Deallocate memmory */
219 for(i = 0; ((i < numAlignments) && (!appearErrors)); i++)
220 delete [] vectHits[i];
221 delete [] vectHits;
222
223 delete [] names;
224 delete [] correspNames;
225 delete [] numResiduesAlig;
226 delete [] columnSeqMatrix;
227 delete [] columnSeqMatrixAux;
228 /* ***** ***** ***** ***** ***** ***** ***** ***** */
229
230 /* ***** ***** ***** ***** ***** ***** ***** ***** */
231 /* Return the selected alignment index or an error
232 * flag otherwise */
233 if(appearErrors) return -1;
234 else return alig;
235 /* ***** ***** ***** ***** ***** ***** ***** ***** */
236 }
237
238 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
239 /* This method returns the consistency value vector for a given alignment
240 * against a set of alignments with the same sequences */
241 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
242 bool compareFiles::forceComparison(alignment **vectAlignments, int numAlignments, alignment *selected, float *columnsValue) {
243
244 int *correspNames, *columnSeqMatrix, *columnSeqMatrixAux;
245 int i, j, k, ll, numResidues, numSeqs, pairRes, hit;
246 bool appearErrors = false;
247 string *names;
248
249 /* ***** ***** ***** ***** ***** ***** ***** ***** */
250 /* Get some parameters from the alignment that has
251 * been selected */
252 numResidues = selected -> getNumAminos();
253 numSeqs = selected -> getNumSpecies();
254 /* ***** ***** ***** ***** ***** ***** ***** ***** */
255
256 /* ***** ***** ***** ***** ***** ***** ***** ***** */
257 /* Initialize the vector where we are going to store
258 * the proportion of hits for each column in the
259 * selected alignment */
260 utils::initlVect(columnsValue, numResidues, 0);
261 /* ***** ***** ***** ***** ***** ***** ***** ***** */
262
263 /* ***** ***** ***** ***** ***** ***** ***** ***** */
264 /* Allocate dinamic local memory */
265 names = new string[numSeqs];
266 correspNames = new int[numSeqs];
267 columnSeqMatrix = new int[numSeqs];
268 columnSeqMatrixAux = new int[numSeqs];
269 /* ***** ***** ***** ***** ***** ***** ***** ***** */
270
271 /* ***** ***** ***** ***** ***** ***** ***** ***** */
272 /* Check that all of alignment has the same number of
273 * sequence as well as there exists a correspondence
274 * between the names for each pars of aligs. */
275 for(i = 0; i < numAlignments; i++) {
276 /* ***** ***** ***** ***** ***** ***** ***** ***** */
277 if(numSeqs != vectAlignments[i] -> getNumSpecies()) {
278 cerr << endl << "ERROR: The files to compare do not have "
279 << "the same number of sequences" << endl << endl;
280 appearErrors = true;
281 break;
282 }
283 /* ***** ***** ***** ***** ***** ***** ***** ***** */
284
285 /* ***** ***** ***** ***** ***** ***** ***** ***** */
286 vectAlignments[i] -> getSequences(names);
287 if(!selected -> getSeqNameOrder(names, correspNames)) {
288 cerr << endl << "ERROR: The files to compare do not"
289 << " have the sequence names" << endl << endl;
290 appearErrors = true;
291 break;
292 }
293 /* ***** ***** ***** ***** ***** ***** ***** ***** */
294 }
295 /* ***** ***** ***** ***** ***** ***** ***** ***** */
296
297 /* ***** ***** ***** ***** ***** ***** ***** ***** */
298 /* Changes the order in sequences number matrix
299 * according to the order in the selected alignment */
300 for(i = 0; i < numAlignments; i++) {
301 vectAlignments[i] -> getSequences(names);
302 selected -> getSeqNameOrder(names, correspNames);
303 vectAlignments[i] -> setSeqMatrixOrder(correspNames);
304 }
305 /* ***** ***** ***** ***** ***** ***** ***** ***** */
306
307 /* ***** ***** ***** ***** ***** ***** ***** ***** */
308 /* Do the same analysis for each column */
309 for(i = 0, pairRes = 0, hit = 0; ((i < numResidues) && (!appearErrors)); i++, pairRes = 0, hit = 0) {
310
311 /* ***** ***** ***** ***** ***** ***** ***** ***** */
312 /* We get back the sequence position for each residue
313 * from every column in the selected alignment */
314 utils::initlVect(columnSeqMatrix, numSeqs, 0);
315 selected -> getColumnSeqMatrix(i, columnSeqMatrix);
316 /* ***** ***** ***** ***** ***** ***** ***** ***** */
317
318 /* ***** ***** ***** ***** ***** ***** ***** ***** */
319 /* For each residue pairs, we look for it in the rest
320 * of alignments */
321 for(j = 0; j < numSeqs; j++) {
322
323 /* ***** ***** ***** ***** ***** ***** ***** ***** */
324 /* If there is a residue, not a gap, we carry out the
325 * assesment */
326 if(columnSeqMatrix[j] != 0) {
327 for(k = 0; k < numAlignments; k++) {
328
329 /* ***** ***** ***** ***** ***** ***** ***** ***** */
330 /* We look for the same residue in the same row in
331 * the rest of alignments */
332 utils::initlVect(columnSeqMatrixAux, numSeqs, 0);
333 vectAlignments[k] -> getColumnSeqMatrix(columnSeqMatrix[j], j, columnSeqMatrixAux);
334 /* ***** ***** ***** ***** ***** ***** ***** ***** */
335
336 /* ***** ***** ***** ***** ***** ***** ***** ***** */
337 /* We count when we get the same residue pairs in the
338 * rest of alignments */
339 for(ll = j + 1; ll < numSeqs; ll++)
340 if(columnSeqMatrix[ll] != 0) {
341 if(columnSeqMatrix[ll] == columnSeqMatrixAux[ll])
342 hit++;
343 pairRes++;
344 }
345 /* ***** ***** ***** ***** ***** ***** ***** ***** */
346 }
347 /* ***** ***** ***** ***** ***** ***** ***** ***** */
348 }
349 /* ***** ***** ***** ***** ***** ***** ***** ***** */
350 }
351 /* ***** ***** ***** ***** ***** ***** ***** ***** */
352 /* Store the hits proportion for each column */
353 if(pairRes != 0) columnsValue[i] += ((1.0 * hit)/pairRes);
354 /* ***** ***** ***** ***** ***** ***** ***** ***** */
355 }
356 /* ***** ***** ***** ***** ***** ***** ***** ***** */
357
358 /* ***** ***** ***** ***** ***** ***** ***** ***** */
359 /* Deallocate dinamic memory */
360 delete [] names;
361 delete [] correspNames;
362 delete [] columnSeqMatrix;
363 delete [] columnSeqMatrixAux;
364 /* ***** ***** ***** ***** ***** ***** ***** ***** */
365
366 /* ***** ***** ***** ***** ***** ***** ***** ***** */
367 /* If it was fine, return true. Otherwise, return
368 * false */
369 if(appearErrors) return false;
370 else return true;
371 /* ***** ***** ***** ***** ***** ***** ***** ***** */
372 }
373
374 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
375 /* This method applies a specific windows size to a selected alignment */
376 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
377 bool compareFiles::applyWindow(int columns, int halfWindow, float *columnsValue) {
378
379 int i, j, window;
380 float *vectAux;
381
382 /* ***** ***** ***** ***** ***** ***** ***** ***** */
383 /* If windows size is greater than 1/4 of alignment
384 *length, trimAl rejects this windows size */
385 if(halfWindow > columns/4) return false;
386 else window = (2 * halfWindow + 1);
387 /* ***** ***** ***** ***** ***** ***** ***** ***** */
388
389 /* ***** ***** ***** ***** ***** ***** ***** ***** */
390 /* Allocate local memory. Copy the array values to
391 * auxiliar memory. */
392 vectAux = new float[columns];
393 utils::copyVect(columnsValue, vectAux, columns);
394 /* ***** ***** ***** ***** ***** ***** ***** ***** */
395
396 /* ***** ***** ***** ***** ***** ***** ***** ***** */
397 /* For each column from the selected alignment,
398 * compute the average for its consistency values */
399 for(i = 0; i < columns; i++) {
400 /* ***** ***** ***** ***** ***** ***** ***** ***** */
401 /* This average is computed from halfWindow positions
402 * before to halfWindow positions after */
403 for(j = i - halfWindow, columnsValue[i] = 0; j <= i + halfWindow; j++) {
404 if(j < 0) columnsValue[i] += vectAux[-j];
405 else if(j >= columns)
406 columnsValue[i] += vectAux[((2 * columns - j) - 2)];
407 else columnsValue[i] += vectAux[j];
408 }
409 /* ***** ***** ***** ***** ***** ***** ***** ***** */
410
411 /* ***** ***** ***** ***** ***** ***** ***** ***** */
412 /* Finally, the column value is divided by the window
413 * size in order to compute the average score. */
414 columnsValue[i] /= window;
415 }
416 /* ***** ***** ***** ***** ***** ***** ***** ***** */
417
418 /* ***** ***** ***** ***** ***** ***** ***** ***** */
419 /* Deallocate dinamic memory */
420 delete [] vectAux;
421 /* ***** ***** ***** ***** ***** ***** ***** ***** */
422
423 /* ***** ***** ***** ***** ***** ***** ***** ***** */
424 /* If everything is OK, return true */
425 return true;
426 /* ***** ***** ***** ***** ***** ***** ***** ***** */
427 }
428
429 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
430 /* Print the consistency value for each column from the selected alignment */
431 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
432 void compareFiles::printStatisticsFileColumns(int numAminos, float *compareVect) {
433
434 /* ***** ***** ***** ***** ***** ***** ***** ***** */
435 /* Prepare the header information */
436 cout << "| Residue\tConsistency |" << endl;
437 cout << "| Number \t Value |" << endl;
438 cout << "+---------------------------+" << endl;
439 cout.precision(10);
440 /* ***** ***** ***** ***** ***** ***** ***** ***** */
441
442 /* ***** ***** ***** ***** ***** ***** ***** ***** */
443 /* Print the consistency values for each column from
444 * the selected alignment */
445 for(int i = 0; i < numAminos; i++)
446 cout << " " << setw(5) << i + 1 << "\t"
447 << "\t" << compareVect[i] << endl;
448 /* ***** ***** ***** ***** ***** ***** ***** ***** */
449 }
450
451 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
452 /* Print the consistency values accumulative distribution for the selected
453 * alignment */
454 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
455 void compareFiles::printStatisticsFileAcl(int numAminos, float *compareVect) {
456
457 float refer, *vectAux;
458 int i, num;
459
460 /* ***** ***** ***** ***** ***** ***** ***** ***** */
461 /* Allocate dinamic memory to copy the input vector
462 * and sort it */
463 vectAux = new float[numAminos];
464 utils::copyVect(compareVect, vectAux, numAminos);
465 utils::quicksort(vectAux, 0, numAminos-1);
466 /* ***** ***** ***** ***** ***** ***** ***** ***** */
467
468 /* ***** ***** ***** ***** ***** ***** ***** ***** */
469 /* Set the output precision and print the header */
470 cout << "| Number of\t \t|\t Cumulative \t% "
471 << "Cumulative\t| Consistency |" << endl;
472 cout << "| Residues \t% Length\t|\tNumberResid.\t "
473 << "Length \t| Value |" << endl;
474 cout << "+-------------------------------+------------"
475 << "---------------------------+-----------------+"
476 << endl;
477 cout.precision(10);
478 /* ***** ***** ***** ***** ***** ***** ***** ***** */
479
480 /* ***** ***** ***** ***** ***** ***** ***** ***** */
481 /* Fix the initial values to count how many columns
482 * has the same consistency value */
483 refer = vectAux[0];
484 num = 1;
485 /* ***** ***** ***** ***** ***** ***** ***** ***** */
486
487 /* ***** ***** ***** ***** ***** ***** ***** ***** */
488 /* Print the accumulative distribution */
489 for(i = 1; i < numAminos; i++) {
490 /* ***** ***** ***** ***** ***** ***** ***** ***** */
491 /* When the method detects a new consistency value
492 * print the previous value as well as its frequency
493 * and starts to count how many columns are for this
494 * new value */
495 if(refer != vectAux[i]) {
496 cout << " " << num << "\t\t" << setw(10) << ((float) num/numAminos * 100.0)
497 << "\t\t" << i << "\t\t" << setw(10) << ((float) i/numAminos * 100.0)
498 << "\t" << setw(15) << refer << endl;
499 refer = vectAux[i];
500 num = 1;
501 }
502 else num++;
503 /* ***** ***** ***** ***** ***** ***** ***** ***** */
504 }
505
506 /* ***** ***** ***** ***** ***** ***** ***** ***** */
507 /* Print the last consistency value as well as its
508 * frequency */
509 cout << " " << num << "\t\t" << setw(10) << ((float) num/numAminos * 100.0)
510 << "\t\t" << i << "\t\t" << setw(10) << ((float) i/numAminos * 100.0)
511 << "\t" << setw(15) << refer << endl;
512 /* ***** ***** ***** ***** ***** ***** ***** ***** */
513
514 /* ***** ***** ***** ***** ***** ***** ***** ***** */
515 /* Deallocate dinamic memory */
516 delete [] vectAux;
517 /* ***** ***** ***** ***** ***** ***** ***** ***** */
518 }