Mercurial > repos > padge > trimal
comparison trimal_repo/source/alignment.cpp @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author | padge |
---|---|
date | Fri, 25 Mar 2022 17:10:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b15a3147e604 |
---|---|
1 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** | |
2 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** | |
3 | |
4 trimAl v1.4: a tool for automated alignment trimming in large-scale | |
5 phylogenetics analyses. | |
6 | |
7 readAl v1.4: a tool for automated alignment conversion among different | |
8 formats. | |
9 | |
10 statAl v1.4: a tool for getting stats about multiple sequence alignments. | |
11 | |
12 | |
13 2009-2015 Capella-Gutierrez S. and Gabaldon, T. | |
14 [scapella, tgabaldon]@crg.es | |
15 | |
16 This file is part of trimAl/readAl. | |
17 | |
18 trimAl/readAl are free software: you can redistribute it and/or modify | |
19 it under the terms of the GNU General Public License as published by | |
20 the Free Software Foundation, the last available version. | |
21 | |
22 trimAl/readAl are distributed in the hope that it will be useful, | |
23 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
25 GNU General Public License for more details. | |
26 | |
27 You should have received a copy of the GNU General Public License | |
28 along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>. | |
29 | |
30 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** | |
31 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
32 using namespace std; | |
33 | |
34 #include <float.h> | |
35 #include "alignment.h" | |
36 #include "rwAlignment.cpp" | |
37 #include "autAlignment.cpp" | |
38 | |
39 #include <deque> | |
40 | |
41 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
42 /* Class constructor */ | |
43 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
44 | |
45 alignment::alignment(void) { | |
46 | |
47 /* Alignment parameter */ | |
48 sequenNumber = 0; | |
49 residNumber = 0; | |
50 | |
51 /* Are the input sequences aligned? */ | |
52 isAligned = false; | |
53 | |
54 /* Should the output file be reversed? */ | |
55 reverse = false; | |
56 | |
57 /* Should be trimmed only terminal gaps? - set automated and manual boundaries | |
58 * values */ | |
59 terminalGapOnly = false; | |
60 left_boundary = -1; | |
61 right_boundary = -1; | |
62 | |
63 /* Input and output formats */ | |
64 iformat = 0; | |
65 oformat = 0; | |
66 shortNames = false; | |
67 | |
68 forceCaps = false; | |
69 upperCase = false; | |
70 lowerCase = false; | |
71 | |
72 /* Indicate whether sequences composed only by gaps should be kept or not */ | |
73 keepSequences = false; | |
74 | |
75 /* Indicate whether original header, they may include non-alphanumerical | |
76 * characters, should be dumped into output stream without any preprocessing | |
77 * step */ | |
78 keepHeader = false; | |
79 | |
80 gapSymbol = "-"; | |
81 | |
82 /* Sequence datatype: DNA, RNA or Protein */ | |
83 dataType = 0; | |
84 | |
85 /* Window sizes to trim the input alignment */ | |
86 ghWindow = 0; | |
87 shWindow = 0; | |
88 | |
89 /* Minimum block size in the new alignment */ | |
90 blockSize = 0; | |
91 | |
92 /* Is this alignmnet new? */ | |
93 oldAlignment = false; | |
94 | |
95 /* Sequence residues number */ | |
96 residuesNumber = NULL; | |
97 | |
98 /* Columns and sequences that have been previously selected */ | |
99 saveResidues = NULL; | |
100 saveSequences = NULL; | |
101 | |
102 /* Input sequences as well other information such as sequences name, etc */ | |
103 sequences = NULL; | |
104 seqsName = NULL; | |
105 seqsInfo = NULL; | |
106 | |
107 /* Information about input alignment */ | |
108 filename = ""; | |
109 aligInfo = ""; | |
110 | |
111 /* Information computed from alignment */ | |
112 sgaps = NULL; | |
113 scons = NULL; | |
114 seqMatrix = NULL; | |
115 | |
116 identities = NULL; | |
117 overlaps = NULL; | |
118 | |
119 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
120 } | |
121 | |
122 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
123 /* Class constructor */ | |
124 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
125 | |
126 alignment::alignment(string o_filename, string o_aligInfo, string *o_sequences, string *o_seqsName, | |
127 string *o_seqsInfo, int o_sequenNumber, int o_residNumber, int o_iformat, int o_oformat, | |
128 bool o_shortNames, int o_dataType, int o_isAligned, bool o_reverse, bool o_terminalGapOnly, | |
129 int o_left_boundary, int o_right_boundary, | |
130 bool o_keepSeqs, bool o_keepHeader, int OldSequences, int OldResidues, int *o_residuesNumber, | |
131 int *o_saveResidues, int *o_saveSequences, int o_ghWindow, int o_shWindow, int o_blockSize, | |
132 float **o_identities, float **o_overlaps) { | |
133 | |
134 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
135 int i, j, k, ll; | |
136 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
137 | |
138 oldAlignment = true; | |
139 | |
140 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
141 /* Assign the parameter values to the variables */ | |
142 sequenNumber = o_sequenNumber; | |
143 residNumber = o_residNumber; | |
144 | |
145 iformat = o_iformat; | |
146 oformat = o_oformat; | |
147 shortNames = o_shortNames; | |
148 | |
149 dataType = o_dataType; | |
150 | |
151 ghWindow = o_ghWindow; | |
152 shWindow = o_shWindow; | |
153 | |
154 blockSize = o_blockSize; | |
155 | |
156 isAligned = o_isAligned; | |
157 reverse = o_reverse; | |
158 | |
159 terminalGapOnly = o_terminalGapOnly; | |
160 right_boundary = o_right_boundary; | |
161 left_boundary = o_left_boundary; | |
162 | |
163 filename = o_filename; | |
164 aligInfo = o_aligInfo; | |
165 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
166 | |
167 keepSequences = o_keepSeqs; | |
168 keepHeader = o_keepHeader; | |
169 | |
170 forceCaps = false; | |
171 upperCase = false; | |
172 lowerCase = false; | |
173 | |
174 gapSymbol = "-"; | |
175 | |
176 | |
177 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
178 /* Basic information for the new alignment */ | |
179 sequences = new string[sequenNumber]; | |
180 for(i = 0; i < sequenNumber; i++) | |
181 sequences[i] = o_sequences[i]; | |
182 | |
183 seqsName = new string[sequenNumber]; | |
184 for(i = 0; i < sequenNumber; i++) | |
185 seqsName[i] = o_seqsName[i]; | |
186 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
187 | |
188 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
189 residuesNumber = new int[sequenNumber]; | |
190 if((isAligned) || (o_residuesNumber != NULL)) { | |
191 for(i = 0; i < sequenNumber; i++) | |
192 residuesNumber[i] = residNumber; | |
193 } else { | |
194 for(i = 0; i < sequenNumber; i++) | |
195 residuesNumber[i] = o_residuesNumber[i]; | |
196 } | |
197 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
198 | |
199 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
200 if(o_seqsInfo != NULL) { | |
201 seqsInfo = new string[sequenNumber]; | |
202 for(i = 0; i < sequenNumber; i++) | |
203 seqsInfo[i] = o_seqsInfo[i]; | |
204 } else seqsInfo = NULL; | |
205 | |
206 saveResidues = NULL; | |
207 if(o_saveResidues != NULL) { | |
208 saveResidues = new int[residNumber]; | |
209 for(i = 0, j = 0; i < OldResidues; i++) | |
210 if(o_saveResidues[i] != -1) { | |
211 saveResidues[j] = o_saveResidues[i]; | |
212 j++; | |
213 } | |
214 } | |
215 | |
216 saveSequences = NULL; | |
217 if(o_saveSequences != NULL) { | |
218 saveSequences = new int[sequenNumber]; | |
219 for(i = 0, j = 0; i < OldSequences; i++) | |
220 if(o_saveSequences[i] != -1) { | |
221 saveSequences[j] = o_saveSequences[i]; | |
222 j++; | |
223 } | |
224 } | |
225 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
226 | |
227 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
228 identities = NULL; | |
229 if(o_identities != NULL) { | |
230 identities = new float*[sequenNumber]; | |
231 for(i = 0, j = 0; i < OldSequences; i++) { | |
232 if(o_saveSequences[i] != -1) { | |
233 identities[j] = new float[sequenNumber]; | |
234 for(k = 0, ll = 0; k < OldSequences; k++) { | |
235 if(o_saveSequences[k] != -1) { | |
236 identities[j][ll] = o_identities[i][k]; | |
237 ll++; | |
238 } | |
239 } | |
240 j++; | |
241 } | |
242 } | |
243 } | |
244 | |
245 overlaps = NULL; | |
246 if(o_overlaps != NULL) { | |
247 overlaps = new float*[sequenNumber]; | |
248 for(i = 0, j = 0; i < OldSequences; i++) { | |
249 if(o_saveSequences[i] != -1) { | |
250 overlaps[j] = new float[sequenNumber]; | |
251 for(k = 0, ll = 0; k < OldSequences; k++) { | |
252 if(o_saveSequences[k] != -1) { | |
253 overlaps[j][ll] = o_overlaps[i][k]; | |
254 ll++; | |
255 } | |
256 } | |
257 j++; | |
258 } | |
259 } | |
260 } | |
261 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
262 | |
263 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
264 /* Any structure associated to the new alignment is | |
265 * initialize to NULL. In this way, these structure, | |
266 * if it will be necessary, has to be computed */ | |
267 sgaps = NULL; | |
268 scons = NULL; | |
269 seqMatrix = NULL; | |
270 identities = NULL; | |
271 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
272 } | |
273 | |
274 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
275 /* Overlapping operator = to use it as a kind of class constructor */ | |
276 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
277 | |
278 alignment &alignment::operator=(const alignment &old) { | |
279 int i, j; | |
280 | |
281 if(this != &old) { | |
282 | |
283 oldAlignment = true; | |
284 | |
285 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
286 /* Assign the parameter values to the variables */ | |
287 sequenNumber = old.sequenNumber; | |
288 residNumber = old.residNumber; | |
289 | |
290 isAligned = old.isAligned; | |
291 reverse = old.reverse; | |
292 | |
293 terminalGapOnly = old.terminalGapOnly; | |
294 right_boundary = old.right_boundary; | |
295 left_boundary = old.left_boundary; | |
296 | |
297 iformat = old.iformat; | |
298 oformat = old.oformat; | |
299 shortNames = old.shortNames; | |
300 | |
301 dataType = old.dataType; | |
302 | |
303 ghWindow = old.ghWindow; | |
304 shWindow = old.shWindow; | |
305 | |
306 blockSize = old.blockSize; | |
307 | |
308 filename = old.filename; | |
309 aligInfo = old.aligInfo; | |
310 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
311 | |
312 keepSequences = old.keepSequences; | |
313 keepHeader = old.keepHeader; | |
314 | |
315 forceCaps = old.forceCaps; | |
316 upperCase = old.upperCase; | |
317 lowerCase = old.lowerCase; | |
318 | |
319 gapSymbol = old.gapSymbol; | |
320 | |
321 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
322 sequences = new string[sequenNumber]; | |
323 for(i = 0; i < sequenNumber; i++) | |
324 sequences[i] = old.sequences[i]; | |
325 | |
326 seqsName = new string[sequenNumber]; | |
327 for(i = 0; i < sequenNumber; i++) | |
328 seqsName[i] = old.seqsName[i]; | |
329 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
330 | |
331 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
332 delete [] residuesNumber; | |
333 if(old.residuesNumber) { | |
334 residuesNumber = new int[sequenNumber]; | |
335 for(i = 0; i < sequenNumber; i++) | |
336 residuesNumber[i] = old.residuesNumber[i]; | |
337 } residuesNumber = NULL; | |
338 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
339 | |
340 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
341 delete [] seqsInfo; | |
342 if(old.seqsInfo) { | |
343 seqsInfo = new string[sequenNumber]; | |
344 for(i = 0; i < sequenNumber; i++) | |
345 seqsInfo[i] = old.seqsInfo[i]; | |
346 } else seqsInfo = NULL; | |
347 | |
348 delete [] saveResidues; | |
349 if(old.saveResidues) { | |
350 saveResidues = new int[residNumber]; | |
351 for(i = 0; i < residNumber; i++) | |
352 saveResidues[i] = old.saveResidues[i]; | |
353 } else saveResidues = NULL; | |
354 | |
355 delete [] saveSequences; | |
356 if(old.saveSequences) { | |
357 saveSequences = new int[sequenNumber]; | |
358 for(i = 0; i < sequenNumber; i++) | |
359 saveSequences[i] = old.saveSequences[i]; | |
360 } else saveSequences = NULL; | |
361 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
362 | |
363 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
364 delete [] identities; | |
365 if(old.identities) { | |
366 identities = new float*[sequenNumber]; | |
367 for(i = 0; i < sequenNumber; i++) { | |
368 identities[i] = new float[sequenNumber]; | |
369 for(j = 0; j < sequenNumber; j++) | |
370 identities[i][j] = old.identities[i][j]; | |
371 } | |
372 } else identities = NULL; | |
373 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
374 | |
375 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
376 delete [] overlaps; | |
377 if(old.overlaps) { | |
378 overlaps = new float*[sequenNumber]; | |
379 for(i = 0; i < sequenNumber; i++) { | |
380 overlaps[i] = new float[sequenNumber]; | |
381 for(j = 0; j < sequenNumber; j++) | |
382 overlaps[i][j] = old.overlaps[i][j]; | |
383 } | |
384 } else overlaps = NULL; | |
385 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
386 | |
387 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
388 delete sgaps; | |
389 sgaps = NULL; | |
390 | |
391 delete scons; | |
392 scons = NULL; | |
393 | |
394 delete seqMatrix; | |
395 seqMatrix = old.seqMatrix; | |
396 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
397 } | |
398 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
399 return *this; | |
400 } | |
401 | |
402 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
403 /* Class destructor */ | |
404 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
405 | |
406 alignment::~alignment(void) { | |
407 int i; | |
408 | |
409 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
410 if(sequences != NULL) | |
411 delete [] sequences; | |
412 sequences = NULL; | |
413 | |
414 if(seqsName != NULL) | |
415 delete [] seqsName; | |
416 seqsName = NULL; | |
417 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
418 | |
419 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
420 if(residuesNumber != NULL) | |
421 delete [] residuesNumber; | |
422 residuesNumber = NULL; | |
423 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
424 | |
425 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
426 if(seqsInfo != NULL) | |
427 delete [] seqsInfo; | |
428 seqsInfo = NULL; | |
429 | |
430 if(saveResidues != NULL) | |
431 delete[] saveResidues; | |
432 saveResidues = NULL; | |
433 | |
434 if(saveSequences != NULL) | |
435 delete[] saveSequences; | |
436 saveSequences = NULL; | |
437 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
438 | |
439 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
440 if(identities != NULL) { | |
441 for(i = 0; i < sequenNumber; i++) | |
442 delete [] identities[i]; | |
443 delete [] identities; | |
444 } | |
445 identities = NULL; | |
446 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
447 | |
448 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
449 if(overlaps != NULL) { | |
450 for(i = 0; i < sequenNumber; i++) | |
451 delete [] overlaps[i]; | |
452 delete [] overlaps; | |
453 } | |
454 overlaps = NULL; | |
455 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
456 | |
457 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
458 if(sgaps != NULL) | |
459 delete sgaps; | |
460 sgaps = NULL; | |
461 | |
462 if(scons != NULL) | |
463 delete scons; | |
464 scons = NULL; | |
465 | |
466 if(seqMatrix != NULL) | |
467 delete seqMatrix; | |
468 seqMatrix = NULL; | |
469 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
470 | |
471 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
472 oldAlignment = false; | |
473 | |
474 sequenNumber = 0; | |
475 residNumber = 0; | |
476 | |
477 isAligned = false; | |
478 reverse = false; | |
479 | |
480 terminalGapOnly = false; | |
481 right_boundary = -1; | |
482 left_boundary = -1; | |
483 | |
484 iformat = 0; | |
485 oformat = 0; | |
486 shortNames = false; | |
487 | |
488 dataType = 0; | |
489 | |
490 ghWindow = 0; | |
491 shWindow = 0; | |
492 | |
493 blockSize = 0; | |
494 | |
495 filename.clear(); | |
496 aligInfo.clear(); | |
497 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
498 } | |
499 | |
500 // Try to load current alignment and inform otherwise | |
501 bool alignment::loadAlignment(char *alignmentFile) { | |
502 | |
503 // Detect input alignment format - it is an strict detection procedure | |
504 iformat = formatInputAlignment(alignmentFile); | |
505 // Unless it is indicated somewhere else, output alignment format will be | |
506 // the same as the input one | |
507 oformat = iformat; | |
508 | |
509 // Use the appropiate function to read input alignment | |
510 switch(iformat) { | |
511 case 1: | |
512 return loadClustalAlignment(alignmentFile); | |
513 case 3: | |
514 return loadNBRF_PirAlignment(alignmentFile); | |
515 case 8: | |
516 return loadFastaAlignment(alignmentFile); | |
517 case 11: | |
518 return loadPhylip3_2Alignment(alignmentFile); | |
519 case 12: | |
520 return loadPhylipAlignment(alignmentFile); | |
521 case 17: | |
522 return loadNexusAlignment(alignmentFile); | |
523 case 21: | |
524 return loadMegaInterleavedAlignment(alignmentFile); | |
525 case 22: | |
526 return loadMegaNonInterleavedAlignment(alignmentFile); | |
527 // Return a FALSE value - meaning the input alignment was not loaded | |
528 default: | |
529 return false; | |
530 } | |
531 return false; | |
532 } | |
533 | |
534 // Return input alignment format - it is useful to set-up output alignment one | |
535 int alignment::getInputFormat(void) { | |
536 return iformat; | |
537 } | |
538 | |
539 // Return output alignment format | |
540 int alignment::getOutputFormat(void) { | |
541 return oformat; | |
542 } | |
543 | |
544 // Return whether there is one or more alignments at the input file. | |
545 // Currently trimAl family only support single alignments per file. | |
546 int alignment::typeInputFile(void) { | |
547 return SINGLE; | |
548 } | |
549 | |
550 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
551 /* This function prints the alignmnet to the standard output using an | |
552 * appropiate function depending on the output format */ | |
553 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
554 bool alignment::printAlignment(void){ | |
555 | |
556 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
557 if(sequences == NULL) | |
558 return false; | |
559 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
560 | |
561 if((residNumber == 0) || (sequenNumber == 0)) { | |
562 cerr << endl << "WARNING: Output alignment has not been generated. " | |
563 << "It is empty" << endl << endl; | |
564 return true; | |
565 } | |
566 | |
567 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
568 switch(oformat) { | |
569 case 1: | |
570 alignmentClustalToFile(cout); | |
571 break; | |
572 case 3: | |
573 alignmentNBRF_PirToFile(cout); | |
574 break; | |
575 case 8: | |
576 alignmentFastaToFile(cout); | |
577 break; | |
578 case 11: | |
579 alignmentPhylip3_2ToFile(cout); | |
580 break; | |
581 case 12: | |
582 alignmentPhylipToFile(cout); | |
583 break; | |
584 case 13: | |
585 alignmentPhylip_PamlToFile(cout); | |
586 break; | |
587 case 17: | |
588 alignmentNexusToFile(cout); | |
589 break; | |
590 case 21: case 22: | |
591 alignmentMegaToFile(cout); | |
592 break; | |
593 case 99: | |
594 getSequences(cout); | |
595 break; | |
596 case 100: | |
597 alignmentColourHTML(cout); | |
598 break; | |
599 default: | |
600 return false; | |
601 } | |
602 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
603 return true; | |
604 } | |
605 | |
606 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
607 /* This function puts the alignment in a given file */ | |
608 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
609 bool alignment::saveAlignment(char *destFile) { | |
610 | |
611 ofstream file; | |
612 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
613 if(sequences == NULL) | |
614 return false; | |
615 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
616 | |
617 if((residNumber == 0) || (sequenNumber == 0)) { | |
618 cerr << endl << "WARNING: Output alignment has not been generated. " | |
619 << "It is empty." << endl << endl; | |
620 return true; | |
621 } | |
622 | |
623 /* Check whether the input sequences file is aligned or not before creating | |
624 * a given output format. */ | |
625 switch(oformat) { | |
626 case 1: case 11: case 12: case 13: case 17: case 21: case 22: | |
627 /* Check whether sequences in the alignment are aligned or not. | |
628 * Warn about it if there are not aligned. */ | |
629 if (!isAligned) { | |
630 cerr << endl << "ERROR: Sequences are not aligned. Output format is " | |
631 << "not compatible with unaligned sequences."; | |
632 return false; | |
633 } | |
634 } | |
635 | |
636 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
637 /* File open and correct open check */ | |
638 file.open(destFile); | |
639 if(!file) return false; | |
640 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
641 | |
642 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
643 /* Depending on the output format, we call to the | |
644 * appropiate function */ | |
645 switch(oformat) { | |
646 case 1: | |
647 alignmentClustalToFile(file); | |
648 break; | |
649 case 3: | |
650 alignmentNBRF_PirToFile(file); | |
651 break; | |
652 case 8: | |
653 alignmentFastaToFile(file); | |
654 break; | |
655 case 11: | |
656 alignmentPhylip3_2ToFile(file); | |
657 break; | |
658 case 12: | |
659 alignmentPhylipToFile(file); | |
660 break; | |
661 case 13: | |
662 alignmentPhylip_PamlToFile(file); | |
663 break; | |
664 case 17: | |
665 alignmentNexusToFile(file); | |
666 break; | |
667 case 21: case 22: | |
668 alignmentMegaToFile(file); | |
669 break; | |
670 case 99: | |
671 getSequences(file); | |
672 break; | |
673 case 100: | |
674 alignmentColourHTML(file); | |
675 break; | |
676 default: | |
677 return false; | |
678 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
679 } | |
680 | |
681 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
682 /* Close the output file */ | |
683 file.close(); | |
684 | |
685 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
686 /* All is OK, return true */ | |
687 return true; | |
688 } | |
689 | |
690 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
691 /* This function trimms a given alignment based on the gap distribution | |
692 * values. To trim the alignment, this function uses two parameters: | |
693 * | |
694 * baseLine: Minimum percentage of columns that should | |
695 * be conserved in the new alignment. | |
696 * gapsPct: Maximum percentage of gaps per column that | |
697 * should be in the new alignment. | |
698 * | |
699 * The function selects that combination of parameters that maximize the | |
700 * final number of columns in the new alignment. There is an extra parameter | |
701 * to ask for the complementary alignment. A complementary alignment consits | |
702 * of those columns that would delete applying the standard approach. */ | |
703 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
704 alignment *alignment::cleanGaps(float baseLine, float gapsPct, bool complementary) { | |
705 | |
706 alignment *ret; | |
707 double cut; | |
708 | |
709 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
710 /* If gaps statistics are not calculated, we | |
711 * calculate them */ | |
712 if(calculateGapStats() != true) | |
713 return NULL; | |
714 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
715 | |
716 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
717 /* Obtain the cut point using the given parameters */ | |
718 cut = sgaps -> calcCutPoint(baseLine, gapsPct); | |
719 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
720 | |
721 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
722 /* Once we have the cut value proposed, we call the | |
723 * appropiate method to clean the alignment and, then, | |
724 * generate the new alignment. */ | |
725 ret = cleanByCutValue(cut, baseLine, sgaps -> getGapsWindow(), complementary); | |
726 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
727 | |
728 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
729 /* Return a reference of the new alignment */ | |
730 return ret; | |
731 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
732 } | |
733 | |
734 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
735 /* This function trimms a given alignment based on the similarity distribution | |
736 * values. To trim the alignment, this function uses two parameters: | |
737 * | |
738 * baseLine: Minimum percentage of columns that should | |
739 * be conserved in the new alignment. | |
740 * conservat: Minimum value of similarity per column that | |
741 * should be in the new alignment. | |
742 * | |
743 * The function selects that combination of parameters that maximize the | |
744 * final number of columns in the new alignment. There is an extra parameter | |
745 * to ask for the complementary alignment. A complementary alignment consits | |
746 * of those columns that would delete applying the standard approach. */ | |
747 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
748 alignment *alignment::cleanConservation(float baseLine, float conservationPct, bool complementary) { | |
749 | |
750 alignment *ret; | |
751 float cut; | |
752 | |
753 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
754 /* If conservation's statistics are not calculated, | |
755 * we calculate them */ | |
756 if(calculateConservationStats() != true) | |
757 return NULL; | |
758 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
759 | |
760 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
761 /* Calculate the cut point using the given parameters */ | |
762 cut = (float) scons -> calcCutPoint(baseLine, conservationPct); | |
763 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
764 | |
765 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
766 /* Once we have the cut value, we call the appropiate | |
767 * method to clean the alignment and, then, generate | |
768 the new alignment */ | |
769 ret = cleanByCutValue(cut, baseLine, scons -> getMdkwVector(), complementary); | |
770 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
771 | |
772 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
773 /* Return a reference of the new alignment */ | |
774 return ret; | |
775 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
776 } | |
777 | |
778 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
779 /* This function trimms a given alignment based on the similarity and gaps | |
780 * distribution values. To trim the alignment, this function uses three | |
781 * parameters: | |
782 * | |
783 * baseLine: Minimum percentage of columns that should | |
784 * be conserved in the new alignment. | |
785 * gapsPct: Maximum percentage of gaps per column that | |
786 * should be in the new alignment. | |
787 * conservat: Minimum value of similarity per column that | |
788 * should be in the new alignment. | |
789 * | |
790 * The function selects that combination of parameters that maximize the | |
791 * final number of columns in the new alignment. If the baseLine parameter | |
792 * is the most strict, the other two ones would be relaxed to keep columns | |
793 * that would not be selected. There is an extra parameter to ask for the | |
794 * complementary alignment. A complementary alignment consits of those | |
795 * columns that would delete applying the standard approach. */ | |
796 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
797 alignment *alignment::clean(float baseLine, float GapsPct, float conservationPct, bool complementary) { | |
798 | |
799 alignment *ret; | |
800 float cutCons; | |
801 double cutGaps; | |
802 | |
803 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
804 /* If gaps statistics are not calculated, we calculate | |
805 * them */ | |
806 if(calculateGapStats() != true) | |
807 return NULL; | |
808 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
809 | |
810 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
811 /* If conservation's statistics are not calculated, | |
812 * we calculate them */ | |
813 if(calculateConservationStats() != true) | |
814 return NULL; | |
815 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
816 | |
817 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
818 /* Calculate the two cut points using the parameters */ | |
819 cutGaps = sgaps->calcCutPoint(baseLine, GapsPct); | |
820 cutCons = scons->calcCutPoint(baseLine, conservationPct); | |
821 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
822 | |
823 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
824 /* Clean the alingment using the two cut values, the | |
825 * gapsWindow and MDK_Windows vectors and the baseline | |
826 * value */ | |
827 ret = cleanByCutValue(cutGaps, sgaps -> getGapsWindow(), baseLine, cutCons, scons -> getMdkwVector(), complementary); | |
828 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
829 | |
830 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
831 /* Return a reference of the clean alignment object */ | |
832 return ret; | |
833 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
834 } | |
835 | |
836 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
837 /* This function cleans a given alignment based on the consistency values | |
838 * asses from a dataset of alignments. The method takes three parameters: | |
839 * | |
840 * cutpoint: Lower limit (0-1) of comparefile value admits in | |
841 * the new alignment. | |
842 * baseline: Minimum percentage of columns to have in the new alignment | |
843 * vectValues: A vector with alignment's consistency values. | |
844 * | |
845 * The function computes the optimal parameter combination value to trim | |
846 * an alignment based on the consistency value from the comparison among | |
847 * a dataset of alignments with the same sequences. */ | |
848 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
849 alignment *alignment::cleanCompareFile(float cutpoint, float baseLine, float *vectValues, bool complementary) { | |
850 | |
851 alignment *ret; | |
852 float cut, *vectAux; | |
853 | |
854 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
855 /* Allocate memory */ | |
856 vectAux = new float[residNumber]; | |
857 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
858 | |
859 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
860 /* Sort a copy of the vectValues vector, and take the | |
861 * value at 100% - baseline position. */ | |
862 utils::copyVect((float *) vectValues, vectAux, residNumber); | |
863 utils::quicksort(vectAux, 0, residNumber-1); | |
864 cut = vectAux[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)]; | |
865 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
866 | |
867 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
868 /* We have to decide which is the smallest value | |
869 * between the cutpoint value and the value from | |
870 * the minimum percentage threshold */ | |
871 cut = cutpoint < cut ? cutpoint : cut; | |
872 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
873 | |
874 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
875 /* Deallocate memory */ | |
876 delete [] vectAux; | |
877 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
878 | |
879 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
880 /* Clean the selected alignment using the input parameters. */ | |
881 ret = cleanByCutValue(cut, baseLine, vectValues, complementary); | |
882 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
883 | |
884 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
885 /* Return a refernce of the new alignment */ | |
886 return ret; | |
887 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
888 } | |
889 | |
890 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
891 /* This function removes those sequences that are misaligned with the rest | |
892 * of sequences in the alignment at given sequence and residue overlaps. | |
893 * | |
894 * overlapColumn: Set the minimum similarity fraction value that has | |
895 * to have a position from a given sequence to be | |
896 * considered as an hit. | |
897 * minimumOverlap: Set the minimum proportion of hits that has to be | |
898 * a sequence to keep it in the new alignment. | |
899 * | |
900 * At the same time, it's possible to get an alignment with those columns | |
901 * that this method should remove setting for that purpose the complementary | |
902 * parameter to True */ | |
903 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
904 alignment *alignment::cleanSpuriousSeq(float overlapColumn, float minimumOverlap, bool complementary) { | |
905 | |
906 float *overlapVector; | |
907 alignment *newAlig; | |
908 | |
909 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
910 /* Allocate local memory */ | |
911 overlapVector = new float[sequenNumber]; | |
912 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
913 | |
914 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
915 /* Compute the overlap's vector using the overlap | |
916 * column's value */ | |
917 if(!calculateSpuriousVector(overlapColumn, overlapVector)) | |
918 return NULL; | |
919 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
920 | |
921 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
922 /* Select and remove the sequences with a overlap less | |
923 * than threshold's overlap and create a new alignemnt */ | |
924 newAlig = cleanOverlapSeq(minimumOverlap, overlapVector, complementary); | |
925 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
926 | |
927 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
928 /* Deallocate local memory */ | |
929 delete [] overlapVector; | |
930 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
931 | |
932 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
933 /* Return a reference of the clean alignment object */ | |
934 return newAlig; | |
935 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
936 } | |
937 | |
938 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
939 /* This function implements the gappyout approach. To trim the alignment, this | |
940 * method computes the slope from the gaps distribution from the input alig. | |
941 * Using this information, the method asses the most abrupt change between | |
942 * three consecutive points from that distribution and selects the first of | |
943 * this point as aan cut-off point. | |
944 * | |
945 * The method also can return the complementary alignment that consists of | |
946 * those columns that would be delete applying this algorithm. */ | |
947 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
948 alignment *alignment::clean2ndSlope(bool complementarity) { | |
949 | |
950 alignment *ret; | |
951 int cut; | |
952 | |
953 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
954 /* If gaps statistics are not calculated, we calculate | |
955 * them */ | |
956 if(calculateGapStats() != true) | |
957 return NULL; | |
958 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
959 | |
960 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
961 /* We get the cut point using a automatic method for | |
962 * this purpose. */ | |
963 cut = sgaps -> calcCutPoint2ndSlope(); | |
964 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
965 | |
966 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
967 /* Using the cut point calculates in last steps, we | |
968 * clean the alignment and generate a new alignment */ | |
969 ret = cleanByCutValue(cut, 0, sgaps->getGapsWindow(), complementarity); | |
970 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
971 | |
972 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
973 /* Returns the new alignment. */ | |
974 return ret; | |
975 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
976 } | |
977 | |
978 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
979 /* This method implements the strict and strictplus approaches, to apply | |
980 * these algorithms, the program computes the gaps and similarity distribution | |
981 * values to decide which ones are the optimal cutoff points. To compute these | |
982 * cutoff points, the method applies those steps described in the manual. */ | |
983 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
984 alignment *alignment::cleanCombMethods(bool complementarity, bool variable) { | |
985 | |
986 float simCut, first20Point, last80Point, *simil, *vectAux; | |
987 int i, j, acm, gapCut, *positions, *gaps; | |
988 double inic, fin, vlr; | |
989 | |
990 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
991 /* If gaps statistics are not calculated, we calculate | |
992 * them */ | |
993 if(calculateGapStats() != true) | |
994 return NULL; | |
995 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
996 | |
997 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
998 /* Computes the gap cut point using a automatic method | |
999 * and at the same time, we get the gaps values from | |
1000 * the alignment. */ | |
1001 gapCut = sgaps -> calcCutPoint2ndSlope(); | |
1002 gaps = sgaps -> getGapsWindow(); | |
1003 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1004 | |
1005 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1006 /* If conservation's statistics are not calculated, | |
1007 * we calculate them */ | |
1008 if(calculateConservationStats() != true) | |
1009 return NULL; | |
1010 | |
1011 // Once the conservation stats have been calculated, generate the vector | |
1012 // containing the similarity value for each column considering parameters | |
1013 // such as windows size | |
1014 simil = scons -> getMdkwVector(); | |
1015 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1016 | |
1017 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1018 /* Allocate local memory and initializate it to -1 */ | |
1019 positions = new int[residNumber]; | |
1020 utils::initlVect(positions, residNumber, -1); | |
1021 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1022 | |
1023 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1024 /* The method only selects columns with gaps number | |
1025 * less or equal than the gap's cut point. Counts the | |
1026 * number of columns that have been selected */ | |
1027 for(i = 0, acm = 0; i < residNumber; i++) { | |
1028 if(gaps[i] <= gapCut) { | |
1029 positions[i] = i; | |
1030 acm++; | |
1031 } | |
1032 } | |
1033 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1034 | |
1035 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1036 /* Allocate local memory and save the similaritys | |
1037 * values for the columns that have been selected */ | |
1038 vectAux = new float[acm]; | |
1039 for(i = 0, j = 0; i < residNumber; i++) | |
1040 if(positions[i] != -1) | |
1041 vectAux[j++] = simil[i]; | |
1042 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1043 | |
1044 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1045 /* Sort the conservation's value vector. */ | |
1046 utils::quicksort(vectAux, 0, acm-1); | |
1047 | |
1048 /* ...and search for the vector points at the 20 and | |
1049 * 80% of length. */ | |
1050 first20Point = 0; | |
1051 last80Point = 0; | |
1052 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1053 | |
1054 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1055 for(i = acm - 1, j = 1; i >= 0; i--, j++) { | |
1056 if((((float) j/acm) * 100.0) <= 20.0) | |
1057 first20Point = vectAux[i]; | |
1058 if((((float) j/acm) * 100.0) <= 80.0) | |
1059 last80Point = vectAux[i]; | |
1060 } | |
1061 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1062 | |
1063 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1064 /* Computes the logaritmic's values for those points. | |
1065 * Finally the method computes the similarity cut | |
1066 * point using these values. */ | |
1067 inic = log10(first20Point); | |
1068 fin = log10(last80Point); | |
1069 vlr = ((inic - fin) / 10) + fin; | |
1070 simCut = (float) pow(10, vlr); | |
1071 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1072 | |
1073 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1074 /* Clean the alignment and generate a new alignment | |
1075 * object using the gaps cut and the similaritys cut | |
1076 * values */ | |
1077 alignment *ret = cleanStrict(gapCut, sgaps -> getGapsWindow(), | |
1078 simCut, scons -> getMdkwVector(), complementarity, variable); | |
1079 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1080 | |
1081 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1082 /* Deallocate local memory */ | |
1083 delete [] vectAux; | |
1084 delete [] positions; | |
1085 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1086 | |
1087 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1088 /* Return a reference of the new alignment */ | |
1089 return ret; | |
1090 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1091 } | |
1092 | |
1093 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1094 /* This function allows to delete those columns consistents of only gaps. | |
1095 * This function is specially useful when we remove misaligned sequences from | |
1096 * a given alignmnet. As the rest of functions, this function can return the | |
1097 * complementary alignment but this alignment would have only columns with | |
1098 * all gaps. */ | |
1099 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1100 alignment *alignment::cleanNoAllGaps(bool complementarity) { | |
1101 | |
1102 alignment *ret; | |
1103 | |
1104 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1105 /* If gaps statistics are not calculated, we calculate | |
1106 * them */ | |
1107 if(calculateGapStats() != true) | |
1108 return NULL; | |
1109 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1110 | |
1111 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1112 /* We want to conserve the columns with gaps' number | |
1113 * less or equal than sequences' number - 1 */ | |
1114 ret = cleanByCutValue((sequenNumber - 1), 0, sgaps->getGapsWindow(), complementarity); | |
1115 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1116 | |
1117 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1118 /* Returns the new alignment. */ | |
1119 return ret; | |
1120 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1121 } | |
1122 | |
1123 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1124 /* This method computes the overlap for each sequence given a overlap value. | |
1125 * This overlap set the minimum fraction that has to be a position for the | |
1126 * selected sequence to count as an hit. This proportion measures how much | |
1127 * is similar, (in terms of residues, indetermination and gaps), a given | |
1128 * position respect to the element on its column. */ | |
1129 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1130 bool alignment::calculateSpuriousVector(float overlap, float *spuriousVector) { | |
1131 | |
1132 int i, j, k, seqValue, ovrlap, hit; | |
1133 float floatOverlap; | |
1134 char indet; | |
1135 | |
1136 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1137 /* Compute the overlap */ | |
1138 floatOverlap = overlap * float(sequenNumber-1); | |
1139 ovrlap = int(overlap * (sequenNumber-1)); | |
1140 | |
1141 if(floatOverlap > float(ovrlap)) | |
1142 ovrlap++; | |
1143 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1144 | |
1145 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1146 /* If the spurious vectos is NULL, returns false. */ | |
1147 if(spuriousVector == NULL) | |
1148 return false; | |
1149 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1150 | |
1151 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1152 /* Depending on the kind of alignment, we have | |
1153 * different indetermination symbol */ | |
1154 if(getTypeAlignment() == AAType) | |
1155 indet = 'X'; | |
1156 else | |
1157 indet = 'N'; | |
1158 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1159 | |
1160 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1161 /* For each alignment's sequence, computes its overlap */ | |
1162 for(i = 0, seqValue = 0; i < sequenNumber; i++, seqValue = 0) { | |
1163 | |
1164 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1165 /* For each alignment's column, computes the overlap | |
1166 * between the selected sequence and the other ones */ | |
1167 for(j = 0; j < residNumber; j++) { | |
1168 | |
1169 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1170 /* For sequences are before the sequence selected */ | |
1171 for(k = 0, hit = 0; k < i; k++) { | |
1172 | |
1173 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1174 /* If the element of sequence selected is the same | |
1175 * that the element of sequence considered, computes | |
1176 * a hit */ | |
1177 if(sequences[i][j] == sequences[k][j]) | |
1178 hit++; | |
1179 | |
1180 /* If the element of sequence selected isn't a 'X' nor | |
1181 * 'N' (indetermination) or a '-' (gap) and the element | |
1182 * of sequence considered isn't a a 'X' nor 'N' | |
1183 * (indetermination) or a '-' (gap), computes a hit */ | |
1184 else if((sequences[i][j] != indet) && (sequences[i][j] != '-') | |
1185 && (sequences[k][j] != indet) && (sequences[k][j] != '-')) | |
1186 hit++; | |
1187 } | |
1188 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1189 | |
1190 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1191 /* For sequences are after the sequence selected */ | |
1192 for(k = (i + 1); k < sequenNumber; k++) { | |
1193 | |
1194 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1195 /* If the element of sequence selected is the same | |
1196 * that the element of sequence considered, computes | |
1197 * a hit */ | |
1198 if(sequences[i][j] == sequences[k][j]) | |
1199 hit++; | |
1200 | |
1201 /* If the element of sequence selected isn't a 'X' nor | |
1202 * 'N' (indetermination) or a '-' (gap) and the element | |
1203 * of sequence considered isn't a a 'X' nor 'N' | |
1204 * (indetermination) or a '-' (gap), computes a hit */ | |
1205 else if((sequences[i][j] != indet) && (sequences[i][j] != '-') | |
1206 && (sequences[k][j] != indet) && (sequences[k][j] != '-')) | |
1207 hit++; | |
1208 } | |
1209 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1210 | |
1211 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1212 /* Finally, if the hit's number divided by number of | |
1213 * sequences minus one is greater or equal than | |
1214 * overlap's value, computes a column's hit. */ | |
1215 if(hit >= ovrlap) | |
1216 seqValue++; | |
1217 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1218 } | |
1219 | |
1220 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1221 /* For each alignment's sequence, computes its spurious's | |
1222 * or overlap's value as the column's hits -for that | |
1223 sequence- divided by column's number. */ | |
1224 spuriousVector[i] = ((float) seqValue / residNumber); | |
1225 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1226 } | |
1227 | |
1228 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1229 /* If there is not problem in the method, return true */ | |
1230 return true; | |
1231 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1232 } | |
1233 | |
1234 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1235 /* This method compute the cluster belongs to a given alignment at certain | |
1236 * identity threshold. To know the clusters from the alignment, those | |
1237 * clusters are calculated as follow: 1) Select the longest sequences. 2) | |
1238 * Using the previous computed identity values, compare if the second | |
1239 * longest sequence belongs, because the identity value with the cluster | |
1240 * representative is equal or greater than a given threshold, to this cluster | |
1241 * or not. 3) If the sequence belongs to this cluster, we add it, if not | |
1242 * belongs, we create a new cluster and fix this sequence as its representative | |
1243 * 4) Continue with the rest of sequences. In the case that a given sequence | |
1244 * can belong to more than one cluster, we choose the cluster which one | |
1245 * maximize the identity value respect to its representative sequence */ | |
1246 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1247 int *alignment::calculateRepresentativeSeq(float maximumIdent) { | |
1248 | |
1249 int i, j, pos, clusterNum, **seqs; | |
1250 int *cluster; | |
1251 static int *repres; | |
1252 float max; | |
1253 | |
1254 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1255 /* Ask for the sequence identities assesment */ | |
1256 if(identities == NULL) | |
1257 calculateSeqIdentity(); | |
1258 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1259 | |
1260 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1261 seqs = new int*[sequenNumber]; | |
1262 for(i = 0; i < sequenNumber; i++) { | |
1263 seqs[i] = new int[2]; | |
1264 seqs[i][0] = utils::removeCharacter('-', sequences[i]).size(); | |
1265 seqs[i][1] = i; | |
1266 } | |
1267 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1268 | |
1269 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1270 utils::quicksort(seqs, 0, sequenNumber-1); | |
1271 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1272 | |
1273 cluster = new int[sequenNumber]; | |
1274 cluster[0] = seqs[sequenNumber - 1][1]; | |
1275 clusterNum = 1; | |
1276 | |
1277 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1278 for(i = sequenNumber - 2; i >= 0; i--) { | |
1279 | |
1280 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1281 for(j = 0, max = 0, pos = -1; j < clusterNum; j++) { | |
1282 if(identities[seqs[i][1]][cluster[j]] > maximumIdent) { | |
1283 if(identities[seqs[i][1]][cluster[j]] > max) { | |
1284 max = identities[seqs[i][1]][cluster[j]]; | |
1285 pos = j; | |
1286 } | |
1287 } | |
1288 } | |
1289 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1290 | |
1291 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1292 if(pos == -1) { | |
1293 cluster[j] = seqs[i][1]; | |
1294 clusterNum++; | |
1295 } | |
1296 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1297 } | |
1298 | |
1299 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1300 repres = new int[clusterNum + 1]; | |
1301 repres[0] = clusterNum; | |
1302 for(i = 0; i < clusterNum; i++) | |
1303 repres[i+1] = cluster[i]; | |
1304 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1305 | |
1306 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1307 /* Deallocate dinamic memory */ | |
1308 for(i = 0; i < sequenNumber; i++) | |
1309 delete [] seqs[i]; | |
1310 | |
1311 delete [] cluster; | |
1312 delete [] seqs; | |
1313 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1314 | |
1315 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1316 return repres; | |
1317 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1318 } | |
1319 | |
1320 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1321 /* This method looks for the optimal cut point for a given clusters number. | |
1322 * The idea is to find a identity cut-off point that can be used to get a | |
1323 * number of representative sequences similar to the input parameter */ | |
1324 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1325 float alignment::getCutPointClusters(int clusterNumber) { | |
1326 | |
1327 float max, min, avg, gMax, gMin, startingPoint, prevValue = 0, iter = 0; | |
1328 int i, j, clusterNum, *cluster, **seqs; | |
1329 | |
1330 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1331 /* If the user wants only one cluster means that all | |
1332 * of sequences have to be in the same clauster. | |
1333 * Otherwise, if the users wants the maximum number of | |
1334 * clusters means that each sequence have to be in their | |
1335 * own cluster */ | |
1336 if(clusterNumber == sequenNumber) return 1; | |
1337 else if(clusterNumber == 1) return 0; | |
1338 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1339 | |
1340 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1341 /* Ask for the sequence identities assesment */ | |
1342 if(identities == NULL) | |
1343 calculateSeqIdentity(); | |
1344 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1345 | |
1346 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1347 /* Compute the maximum, the minimum and the average | |
1348 * identity values from the sequences */ | |
1349 for(i = 0,gMax = 0, gMin = 1, startingPoint = 0; i < sequenNumber; i++) { | |
1350 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1351 for(j = 0, max = 0, avg = 0, min = 1; j < i; j++) { | |
1352 if(max < identities[i][j]) | |
1353 max = identities[i][j]; | |
1354 if(min > identities[i][j]) | |
1355 min = identities[i][j]; | |
1356 avg += identities[i][j]; | |
1357 } | |
1358 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1359 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1360 for(j = i + 1; j < sequenNumber; j++) { | |
1361 if(max < identities[i][j]) | |
1362 max = identities[i][j]; | |
1363 if(min > identities[i][j]) | |
1364 min = identities[i][j]; | |
1365 avg += identities[i][j]; | |
1366 } | |
1367 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1368 startingPoint += avg / (sequenNumber - 1); | |
1369 if(max > gMax) gMax = max; | |
1370 if(min < gMin) gMin = min; | |
1371 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1372 } | |
1373 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1374 | |
1375 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1376 /* Take the starting point as the average value */ | |
1377 startingPoint /= sequenNumber; | |
1378 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1379 | |
1380 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1381 /* Compute and sort the sequence length */ | |
1382 seqs = new int*[sequenNumber]; | |
1383 for(i = 0; i < sequenNumber; i++) { | |
1384 seqs[i] = new int[2]; | |
1385 seqs[i][0] = utils::removeCharacter('-', sequences[i]).size(); | |
1386 seqs[i][1] = i; | |
1387 } | |
1388 utils::quicksort(seqs, 0, sequenNumber-1); | |
1389 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1390 | |
1391 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1392 /* Create the data structure to store the different | |
1393 * clusters for a given thresholds */ | |
1394 cluster = new int[sequenNumber]; | |
1395 cluster[0] = seqs[sequenNumber - 1][1]; | |
1396 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1397 | |
1398 /**** ***** ***** ***** ***** ***** ***** ***** */ | |
1399 /* Look for the optimal identity value to get the | |
1400 * number of cluster set by the user. To do that, the | |
1401 * method starts in the average identity value and moves | |
1402 * to the right or lefe side of this value depending on | |
1403 * if this value is so strict or not. We set an flag to | |
1404 * avoid enter in an infinite loop, if we get the same | |
1405 * value for more than 10 consecutive point without get | |
1406 * the given cluster number means that it's impossible | |
1407 * to get this cut-off and we need to stop the search */ | |
1408 do { | |
1409 clusterNum = 1; | |
1410 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1411 /* Start the search */ | |
1412 for(i = sequenNumber - 2; i >= 0; i--) { | |
1413 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1414 for(j = 0; j < clusterNum; j++) | |
1415 if(identities[seqs[i][1]][cluster[j]] > startingPoint) | |
1416 break; | |
1417 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1418 | |
1419 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1420 if(j == clusterNum) { | |
1421 cluster[j] = seqs[i][1]; | |
1422 clusterNum++; | |
1423 } | |
1424 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1425 } | |
1426 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1427 | |
1428 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1429 /* Given the cutoff point, if we get the same cluster | |
1430 * number or we have been getting for more than 10 | |
1431 * consecutive times the same cutoff point, we stop */ | |
1432 if((clusterNum == clusterNumber) || (iter > 10)) | |
1433 break; | |
1434 else { | |
1435 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1436 /* We have to move to the left side from the average | |
1437 * to decrease the cutoff value and get a smaller number | |
1438 * of clusters */ | |
1439 if(clusterNum > clusterNumber) { | |
1440 gMax = startingPoint; | |
1441 startingPoint = (gMax + gMin)/2; | |
1442 } else { | |
1443 /* In the opposite side, we have to move to the right | |
1444 * side from the cutting point to be more strict and get | |
1445 * a bigger number of clusters */ | |
1446 gMin = startingPoint; | |
1447 startingPoint = (gMax + gMin)/2; | |
1448 } | |
1449 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1450 | |
1451 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1452 /* If the clusters number from the previous iteration | |
1453 * is different from the current number, we put the | |
1454 * iteration number to 0 and store this new value */ | |
1455 if(prevValue != clusterNum) { | |
1456 iter = 0; | |
1457 prevValue = clusterNum; | |
1458 /* Otherwise, we increase the iteration number to | |
1459 * avoid enter in an infinitve loop */ | |
1460 } else iter++; | |
1461 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1462 } | |
1463 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1464 } while(true); | |
1465 | |
1466 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1467 /* Deallocate dinamic memory */ | |
1468 for(i = 0; i < sequenNumber; i++) delete [] seqs[i]; | |
1469 delete [] seqs; | |
1470 delete [] cluster; | |
1471 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1472 | |
1473 return startingPoint; | |
1474 } | |
1475 | |
1476 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1477 /* This method select one representative sequence (the longest one) per each | |
1478 * cluster from the input alignment and generate a new alignment */ | |
1479 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1480 alignment *alignment::getClustering(float identityThreshold) { | |
1481 | |
1482 string *matrixAux, *newSeqsName; | |
1483 int i, j, *clustering; | |
1484 alignment *newAlig; | |
1485 | |
1486 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1487 /* Get the representative member for each cluster | |
1488 * given a maximum identity threshold */ | |
1489 clustering = calculateRepresentativeSeq(identityThreshold); | |
1490 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1491 | |
1492 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1493 /* Put all sequences to be deleted and get back those | |
1494 * sequences that are representative for each cluster | |
1495 * */ | |
1496 for(i = 0; i < sequenNumber; i ++) | |
1497 saveSequences[i] = -1; | |
1498 for(i = 1; i <= clustering[0]; i ++) | |
1499 saveSequences[clustering[i]] = clustering[i]; | |
1500 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1501 | |
1502 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1503 /* We allocate memory to save the sequences selected */ | |
1504 matrixAux = new string[clustering[0]]; | |
1505 newSeqsName = new string[clustering[0]]; | |
1506 | |
1507 /* Copy to new structures the information that have | |
1508 * been selected previously. */ | |
1509 for(i = 0, j = 0; i < sequenNumber; i++) | |
1510 if(saveSequences[i] != -1) { | |
1511 newSeqsName[j] = seqsName[i]; | |
1512 matrixAux[j] = sequences[i]; | |
1513 j++; | |
1514 } | |
1515 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1516 | |
1517 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1518 /* When we have all parameters, we create the new | |
1519 * alignment */ | |
1520 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
1521 clustering[0], residNumber, iformat, oformat, shortNames, dataType, isAligned, | |
1522 reverse, terminalGapOnly, left_boundary, right_boundary, | |
1523 keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, | |
1524 saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, | |
1525 overlaps); | |
1526 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1527 | |
1528 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1529 /* Deallocated auxiliar memory */ | |
1530 delete [] matrixAux; | |
1531 delete [] newSeqsName; | |
1532 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1533 | |
1534 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1535 /* Return the new alignment reference */ | |
1536 return newAlig; | |
1537 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1538 } | |
1539 | |
1540 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1541 /* This function returns the backtranslation for a given protein processed | |
1542 * alignment into its CDS alignment. To do this convertion, the function needs | |
1543 * the Coding sequences as well the original composition of each protein | |
1544 * sequence. Also, the function needs to know which columns/sequences will be | |
1545 * in the final alignment to carray out the conversion. */ | |
1546 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1547 alignment *alignment::getTranslationCDS(int newResidues, int newSequences, int *ColumnsToKeep, string *oldSeqsName, sequencesMatrix *seqMatrix, alignment *ProtAlig) { | |
1548 | |
1549 string *matrixAux; | |
1550 alignment *newAlig; | |
1551 int i, j, k, l, oldResidues; | |
1552 int *mappedSeqs, *tmpSequence, *selectedRes; | |
1553 | |
1554 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1555 /* Map the selected protein sequences to the input | |
1556 * coding sequences */ | |
1557 mappedSeqs = new int[newSequences]; | |
1558 for(i = 0; i < sequenNumber; i++) | |
1559 for(j = 0; j < newSequences; j++) | |
1560 if(!seqsName[i].compare(oldSeqsName[j])) { | |
1561 mappedSeqs[j] = i; | |
1562 break; | |
1563 } | |
1564 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1565 | |
1566 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1567 /* Get the original alignment size as well the original | |
1568 * selected/non-selected columns */ | |
1569 oldResidues = seqMatrix -> getResidNumber(); | |
1570 selectedRes = new int[oldResidues]; | |
1571 | |
1572 for(i = 0; i < oldResidues; i++) | |
1573 selectedRes[i] = i; | |
1574 | |
1575 for(j = 0; j < ColumnsToKeep[0]; j++) | |
1576 selectedRes[j] = -1; | |
1577 | |
1578 for(i = 0; i < newResidues - 1; i++) | |
1579 for(j = ColumnsToKeep[i] + 1; j < ColumnsToKeep[i+1]; j++) | |
1580 selectedRes[j] = -1; | |
1581 | |
1582 for(j = ColumnsToKeep[newResidues - 1] + 1; j < oldResidues; j++) | |
1583 selectedRes[j] = -1; | |
1584 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1585 | |
1586 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1587 /* We allocate memory to save the columns selected */ | |
1588 matrixAux = new string[newSequences]; | |
1589 tmpSequence = new int[oldResidues]; | |
1590 | |
1591 /* Using the information about which residues for each | |
1592 * sequence was selected by others function, we process | |
1593 * these residues to recover the corresponding codons */ | |
1594 for(i = 0; i < newSequences; i++) | |
1595 if(seqMatrix -> getSequence(oldSeqsName[i], tmpSequence)) { | |
1596 for(j = 0; j < oldResidues; j++) { | |
1597 if((selectedRes[j] != -1) && (tmpSequence[j] != 0)) { | |
1598 | |
1599 for(k = 3 * (tmpSequence[j] - 1), l = 0; l < 3; k++, l++) { | |
1600 /* Check whether the nucleotide sequences end has been reached or not. | |
1601 * If it has been reached, complete backtranslation using indetermination | |
1602 * symbols 'N' */ | |
1603 if((int) sequences[mappedSeqs[i]].length() > k) | |
1604 matrixAux[i].resize(matrixAux[i].size() + 1, sequences[mappedSeqs[i]][k]); | |
1605 else | |
1606 matrixAux[i].resize(matrixAux[i].size() + 1, 'N'); | |
1607 } | |
1608 } else if(selectedRes[j] != -1) { | |
1609 matrixAux[i].resize(matrixAux[i].size() + 3, '-'); | |
1610 } | |
1611 } | |
1612 /* If there is any problems with a sequence then | |
1613 * the function returns an error */ | |
1614 } else { | |
1615 return NULL; | |
1616 } | |
1617 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1618 | |
1619 | |
1620 /* Remove columns/sequences composed only by gaps before making the retrotranslation */ | |
1621 | |
1622 | |
1623 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1624 /* When we have all parameters, we create the new | |
1625 * alignment */ | |
1626 newAlig = new alignment(filename, "", matrixAux, oldSeqsName, NULL, newSequences, | |
1627 newResidues * 3, ProtAlig -> getInputFormat(), ProtAlig -> getOutputFormat(), | |
1628 ProtAlig -> getShortNames(), DNAType, true, ProtAlig -> getReverse(), | |
1629 terminalGapOnly, left_boundary, right_boundary, | |
1630 keepSequences, keepHeader, sequenNumber, oldResidues * 3, NULL, NULL, | |
1631 NULL, 0, 0, ProtAlig -> getBlockSize(), NULL, NULL); | |
1632 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1633 | |
1634 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1635 /* Deallocated auxiliar memory */ | |
1636 delete [] matrixAux; | |
1637 delete mappedSeqs; | |
1638 delete tmpSequence; | |
1639 delete selectedRes; | |
1640 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1641 | |
1642 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1643 /* Return the new alignment reference */ | |
1644 return newAlig; | |
1645 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1646 } | |
1647 | |
1648 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1649 /* This function computes the gaps statistics for the input alignment. */ | |
1650 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1651 bool alignment::calculateGapStats(void) { | |
1652 | |
1653 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1654 /* If alignment matrix is not created, return false */ | |
1655 if(sequences == NULL) | |
1656 return false; | |
1657 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1658 | |
1659 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1660 /* If sgaps object is not created, we create them | |
1661 and calculate the statistics */ | |
1662 if(sgaps == NULL) { | |
1663 sgaps = new statisticsGaps(sequences, sequenNumber, residNumber, dataType); | |
1664 sgaps -> applyWindow(ghWindow); | |
1665 } | |
1666 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1667 | |
1668 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1669 return true; | |
1670 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1671 } | |
1672 | |
1673 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1674 /* Print the gaps value for each column in the alignment */ | |
1675 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1676 void alignment::printStatisticsGapsColumns(void) { | |
1677 | |
1678 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1679 /* Check if there is computed the gaps statistics */ | |
1680 if(calculateGapStats()) | |
1681 /* then prints the information */ | |
1682 sgaps -> printGapsColumns(); | |
1683 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1684 } | |
1685 | |
1686 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1687 /* Print the acumulative gaps distribution value from the input alignment */ | |
1688 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1689 void alignment::printStatisticsGapsTotal(void) { | |
1690 | |
1691 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1692 /* Check it there is computed the gaps statistics */ | |
1693 if(calculateGapStats()) | |
1694 /* then prints the information */ | |
1695 sgaps -> printGapsAcl(); | |
1696 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1697 } | |
1698 | |
1699 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1700 /* Set the similarity matrix. This matrix is necessary for some methods in | |
1701 * the program */ | |
1702 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1703 bool alignment::setSimilarityMatrix(similarityMatrix *sm) { | |
1704 | |
1705 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1706 /* If scons object is not created, we create them */ | |
1707 if(scons == NULL) | |
1708 scons = new statisticsConservation(sequences, sequenNumber, residNumber, dataType); | |
1709 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1710 | |
1711 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1712 /* Associate the matrix to the similarity statistics | |
1713 * object */ | |
1714 if(!scons -> setSimilarityMatrix(sm)) | |
1715 return false; | |
1716 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1717 | |
1718 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1719 /* If it's OK, we return true */ | |
1720 return true; | |
1721 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1722 } | |
1723 | |
1724 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1725 /* This method compute the similarity values from the input alignment if it | |
1726 * has not been computed before */ | |
1727 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1728 bool alignment::calculateConservationStats(void) { | |
1729 | |
1730 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1731 /* It the gaps statistics object has not been created | |
1732 * we create it */ | |
1733 if(calculateGapStats() != true) | |
1734 return false; | |
1735 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1736 | |
1737 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1738 /* It the similarity statistics object has not been | |
1739 * created we create it */ | |
1740 if(scons == NULL) | |
1741 return false; | |
1742 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1743 | |
1744 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1745 /* Ask for the similarity matrix */ | |
1746 if(scons -> isSimMatrixDef() != true) | |
1747 return false; | |
1748 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1749 | |
1750 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1751 /* Compute the similarity statistics from the input | |
1752 * alignment */ | |
1753 if(!scons -> calculateVectors(sequences, sgaps->getGapsWindow())) | |
1754 return false; | |
1755 | |
1756 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1757 /* Ask to know if it is necessary to apply any window | |
1758 * method. If it's necessary, we apply it */ | |
1759 if(scons->isDefinedWindow()) | |
1760 return true; | |
1761 else | |
1762 return scons->applyWindow(shWindow); | |
1763 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1764 } | |
1765 | |
1766 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1767 /* Print the similarity value for each column from the alignment */ | |
1768 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1769 void alignment::printStatisticsConservationColumns(void) { | |
1770 | |
1771 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1772 /* Check if the similarity statistics object has been | |
1773 * created */ | |
1774 if(calculateConservationStats()) | |
1775 /* then prints the information */ | |
1776 scons -> printConservationColumns(); | |
1777 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1778 } | |
1779 | |
1780 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1781 /* Print the accumulative similarity distribution values from the alignment */ | |
1782 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1783 void alignment::printStatisticsConservationTotal(void) { | |
1784 | |
1785 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1786 /* Check if the similarity statistics object has been | |
1787 * created */ | |
1788 if(calculateConservationStats()) | |
1789 /* then prints the information */ | |
1790 scons -> printConservationAcl(); | |
1791 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1792 } | |
1793 | |
1794 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1795 /* This method prints the correspondece between the columns in the original | |
1796 * and in the trimmed alignment */ | |
1797 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1798 void alignment::printCorrespondence(void) { | |
1799 int i; | |
1800 | |
1801 cout << "#ColumnsMap\t"; | |
1802 /* Print the saveResidues relathionship */ | |
1803 for(i = 0; i < residNumber - 1; i++) | |
1804 cout << saveResidues[i] << ", "; | |
1805 cout << saveResidues[i] << endl; | |
1806 | |
1807 } | |
1808 | |
1809 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1810 /* Return the number of the sequences in the alignment */ | |
1811 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1812 int alignment::getNumSpecies(void) { | |
1813 return sequenNumber; | |
1814 } | |
1815 | |
1816 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1817 /* Return the number of residues in the alignment */ | |
1818 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1819 int alignment::getNumAminos(void) { | |
1820 return residNumber; | |
1821 } | |
1822 | |
1823 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1824 /* Sets the condition to remove only terminal gaps after applying any | |
1825 * trimming method or not. */ | |
1826 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1827 void alignment::trimTerminalGaps(bool terminalOnly_, int *boundaries_) { | |
1828 terminalGapOnly = terminalOnly_; | |
1829 | |
1830 /* We are only interested in the first and last number of the vector - this | |
1831 * vector is generated by other function where it is important to store all | |
1832 * intervals */ | |
1833 if (boundaries_ != NULL) { | |
1834 left_boundary = boundaries_[0]; | |
1835 right_boundary = boundaries_[1]; | |
1836 } | |
1837 } | |
1838 | |
1839 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1840 /* This method stores the diferent windows values in the alignment object */ | |
1841 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1842 void alignment::setWindowsSize(int ghWindow_, int shWindow_) { | |
1843 ghWindow = ghWindow_; | |
1844 shWindow = shWindow_; | |
1845 } | |
1846 | |
1847 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1848 /* This method lets to change the output format */ | |
1849 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1850 void alignment::setOutputFormat(int format_, bool shortNames_) { | |
1851 oformat = format_; | |
1852 shortNames = shortNames_; | |
1853 } | |
1854 | |
1855 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1856 /* This method sets a new block size value */ | |
1857 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1858 void alignment::setBlockSize(int blockSize_) { | |
1859 blockSize = blockSize_; | |
1860 } | |
1861 | |
1862 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1863 /* Return true if the shortNames flag has been setted */ | |
1864 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1865 int alignment::getShortNames(void) { | |
1866 return shortNames; | |
1867 } | |
1868 | |
1869 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1870 /* Return true if the reverse flag has been setted. */ | |
1871 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1872 int alignment::getReverse(void) { | |
1873 return reverse; | |
1874 } | |
1875 | |
1876 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1877 /* This method lets to change the output alignment orientation */ | |
1878 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1879 void alignment::setReverse(void) { | |
1880 reverse = true; | |
1881 } | |
1882 | |
1883 | |
1884 /* Set appropiate flag to decide whether sequences composed only by gaps should | |
1885 * be kept or not */ | |
1886 void alignment::setKeepSequencesFlag(bool flag) { | |
1887 keepSequences = flag; | |
1888 } | |
1889 | |
1890 /* Set appropiate flag to decide whether original sequences header should be | |
1891 * dumped into the output stream with or without any preprocessing step */ | |
1892 void alignment::setKeepSeqsHeaderFlag(bool flag) { | |
1893 keepHeader = flag; | |
1894 } | |
1895 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1896 /* Return the block size value */ | |
1897 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1898 int alignment::getBlockSize(void) { | |
1899 return blockSize; | |
1900 } | |
1901 | |
1902 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1903 /* This method returns the sequences name */ | |
1904 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1905 void alignment::getSequences(string *Names) { | |
1906 for(int i = 0; i < sequenNumber; i++) | |
1907 Names[i] = seqsName[i]; | |
1908 } | |
1909 | |
1910 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1911 void alignment::getSequences(string *Names, int *lengths) { | |
1912 for(int i = 0; i < sequenNumber; i++) { | |
1913 lengths[i] = (int) utils::removeCharacter('-', sequences[i]).length(); | |
1914 Names[i] = seqsName[i]; | |
1915 } | |
1916 } | |
1917 | |
1918 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1919 void alignment::getSequences(string *Names, string *Sequences, int *Lengths) { | |
1920 for(int i = 0; i < sequenNumber; i++) { | |
1921 Names[i] = seqsName[i]; | |
1922 Sequences[i] = utils::removeCharacter('-', sequences[i]); | |
1923 Lengths[i] = (int) Sequences[i].length(); | |
1924 } | |
1925 } | |
1926 | |
1927 | |
1928 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1929 /* For a given set of sequences name, check for its correspondence with | |
1930 * the current sequences name. If there is not a complete correspondence | |
1931 * between both sets, the method return false, otherwise, return true */ | |
1932 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1933 bool alignment::getSeqNameOrder(string *names, int *orderVector) { | |
1934 int i, j, numNames; | |
1935 | |
1936 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1937 /* For each name in the current alignment, we look | |
1938 * for its correspondence in the input set */ | |
1939 for(i = 0, numNames = 0; i < sequenNumber; i++) { | |
1940 for(j = 0; j < sequenNumber; j++) { | |
1941 if(seqsName[i] == names[j]) { | |
1942 orderVector[i] = j; | |
1943 numNames++; | |
1944 break; | |
1945 } | |
1946 } | |
1947 } | |
1948 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1949 | |
1950 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1951 /* Depending on if we get the complete correspondence | |
1952 * between both sets of names, we return true or not */ | |
1953 if(numNames == sequenNumber) return true; | |
1954 else return false; | |
1955 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1956 } | |
1957 | |
1958 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1959 /* This method lets to build a sequence matrix. A sequence matrix contains | |
1960 * the residue position for each sequence without taking into account the | |
1961 * gaps in the sequence. This means that at position 10 we have the residue | |
1962 * 1 for that sequence */ | |
1963 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1964 void alignment::sequenMatrix(void) { | |
1965 if(seqMatrix == NULL) | |
1966 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
1967 } | |
1968 | |
1969 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1970 /* Use this method to destroy a given sequence matrix */ | |
1971 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1972 void alignment::destroySequenMatrix(void) { | |
1973 if(seqMatrix != NULL) | |
1974 delete seqMatrix; | |
1975 seqMatrix = NULL; | |
1976 } | |
1977 | |
1978 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1979 /* Print the alignment's sequence matrix */ | |
1980 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1981 void alignment::printSequenMatrix(void) { | |
1982 if(seqMatrix == NULL) | |
1983 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
1984 seqMatrix -> printMatrix(); | |
1985 } | |
1986 | |
1987 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1988 /* Given an index, this method returns the sequence matrix column for that | |
1989 * index */ | |
1990 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1991 void alignment::getColumnSeqMatrix(int column, int *columnSeqMatrix) { | |
1992 if(seqMatrix == NULL) | |
1993 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
1994 seqMatrix -> getColumn(column, columnSeqMatrix); | |
1995 } | |
1996 | |
1997 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
1998 /* This function looks if a given value belongs a given row. In the affirmative | |
1999 * case, returns the columns value for that row and value combination */ | |
2000 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2001 void alignment::getColumnSeqMatrix(int value, int row, int *columnSeqMatrix) { | |
2002 if(seqMatrix == NULL) | |
2003 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
2004 seqMatrix -> getColumn(value, row, columnSeqMatrix); | |
2005 } | |
2006 | |
2007 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2008 /* This function change the rows order in the sequence matrix */ | |
2009 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2010 void alignment::setSeqMatrixOrder(int *order) { | |
2011 if(seqMatrix == NULL) | |
2012 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
2013 seqMatrix -> setOrder(order); | |
2014 } | |
2015 | |
2016 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2017 /* This function change the rows order in the sequence matrix */ | |
2018 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2019 sequencesMatrix *alignment::getSeqMatrix(void) { | |
2020 if(seqMatrix == NULL) | |
2021 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
2022 return seqMatrix; | |
2023 } | |
2024 | |
2025 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2026 /* Computes, if it's necessary, and return the alignment's type */ | |
2027 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2028 int alignment::getTypeAlignment(void) { | |
2029 if(dataType == 0) | |
2030 dataType = utils::checkTypeAlignment(sequenNumber, residNumber, sequences); | |
2031 return dataType; | |
2032 } | |
2033 | |
2034 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2035 /* Returns the correspondence between the columns in the original and in the | |
2036 * trimmed alignment */ | |
2037 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2038 int *alignment::getCorrespResidues(void) { | |
2039 return saveResidues; | |
2040 } | |
2041 | |
2042 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2043 /* Returns the correspondence between the sequences in the original and in | |
2044 * the trimmed alignment */ | |
2045 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2046 int *alignment::getCorrespSequences(void) { | |
2047 return saveSequences; | |
2048 } | |
2049 | |
2050 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2051 /* Returns if the alignment is aligned or not */ | |
2052 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2053 bool alignment::isFileAligned(void) { | |
2054 return isAligned; | |
2055 } | |
2056 | |
2057 /* ***************************************************************************** | |
2058 * ***************************************************************************** | |
2059 * ***************************************************************************** | |
2060 * ************************************************************************** */ | |
2061 | |
2062 /* This method removes those columns that exceed a given threshold. If the | |
2063 * number of columns in the trimmed alignment is lower than a given percentage | |
2064 * of the original alignment. The program relaxes the threshold until to add | |
2065 * enough columns to achieve this percentage, to add those columns the program | |
2066 * start in the middle of the original alignment and make, alternatively, | |
2067 * movements to the left and right side from that point. This method also can | |
2068 * return the complementary alignment. */ | |
2069 alignment *alignment::cleanByCutValue(double cut, float baseLine, | |
2070 const int *gInCol, bool complementary) { | |
2071 | |
2072 int i, j, k, jn, oth, pos, block, *vectAux; | |
2073 string *matrixAux, *newSeqsName; | |
2074 alignment *newAlig; | |
2075 newValues counter; | |
2076 | |
2077 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2078 /* Select the columns with a gaps value less or equal | |
2079 * than the cut point. */ | |
2080 for(i = 0, counter.residues = 0; i < residNumber; i++) | |
2081 if(gInCol[i] <= cut) counter.residues++; | |
2082 else saveResidues[i] = -1; | |
2083 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2084 | |
2085 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2086 /* Compute, if it's necessary, the number of columns | |
2087 * necessary to achieve the minimum number of columns | |
2088 * fixed by coverage parameter. */ | |
2089 oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber); | |
2090 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2091 | |
2092 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2093 /* if it's necessary to recover some columns, we | |
2094 * applied this instructions to recover it */ | |
2095 if(oth > 0) { | |
2096 counter.residues += oth; | |
2097 | |
2098 /* Allocate memory */ | |
2099 vectAux = new int[residNumber]; | |
2100 | |
2101 /* Sort a copy of the gInCol vector, and take the value of the column that marks the % baseline */ | |
2102 utils::copyVect((int *) gInCol, vectAux, residNumber); | |
2103 utils::quicksort(vectAux, 0, residNumber-1); | |
2104 cut = vectAux[(int) ((float)(residNumber - 1) * (baseLine)/100.0)]; | |
2105 | |
2106 /* Deallocate memory */ | |
2107 delete [] vectAux; | |
2108 } | |
2109 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2110 | |
2111 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2112 /* Fixed the initial size of blocks as 0.5% of | |
2113 * alignment's length */ | |
2114 for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) { | |
2115 | |
2116 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2117 /* We start in the alignment middle then we move on | |
2118 * right and left side at the same time. */ | |
2119 for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) { | |
2120 | |
2121 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2122 /* Left side. Here, we compute the block's size. */ | |
2123 for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ; | |
2124 | |
2125 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2126 /* if block's size is greater or equal than the fixed | |
2127 * size then we save all columns that have not been | |
2128 * saved previously. */ | |
2129 if((i - jn) >= k) { | |
2130 for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) { | |
2131 if(gInCol[jn] <= cut) { | |
2132 saveResidues[jn] = jn; | |
2133 oth--; | |
2134 } else | |
2135 break; | |
2136 } | |
2137 } | |
2138 i = jn; | |
2139 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2140 | |
2141 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2142 /* Right side. Here, we compute the block's size. */ | |
2143 for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ; | |
2144 | |
2145 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2146 /* if block's size is greater or equal than the fixed | |
2147 * size then we save all columns that have not been | |
2148 * saved previously. */ | |
2149 if((jn - j) >= k) { | |
2150 for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) { | |
2151 if(gInCol[jn] <= cut) { | |
2152 saveResidues[jn] = jn; | |
2153 oth--; | |
2154 } else | |
2155 break; | |
2156 } | |
2157 } | |
2158 j = jn; | |
2159 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2160 } | |
2161 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2162 } | |
2163 | |
2164 /* Keep only columns blocks bigger than an input columns block size */ | |
2165 if(blockSize != 0) { | |
2166 | |
2167 /* Traverse all alignment looking for columns blocks greater than LONGBLOCK, | |
2168 * everytime than a column is not selected by the trimming method, check | |
2169 * whether the current block size until that point is big enough to be kept | |
2170 * or it should be removed from the final alignment */ | |
2171 for(i = 0, pos = 0, block = 0; i < residNumber; i++) { | |
2172 if(saveResidues[i] != -1) | |
2173 block++; | |
2174 else { | |
2175 /* Remove columns from blocks smaller than input blocks size */ | |
2176 if(block < blockSize) | |
2177 for(j = pos; j <= i; j++) | |
2178 saveResidues[j] = -1; | |
2179 pos = i + 1; | |
2180 block = 0; | |
2181 } | |
2182 } | |
2183 /* Check final block separately since it could happen than last block is not | |
2184 * big enough but because the loop end could provoke to ignore it */ | |
2185 if(block < blockSize) | |
2186 for(j = pos; j < i; j++) | |
2187 saveResidues[j] = -1; | |
2188 } | |
2189 | |
2190 /* If the flag -terminalony is set, apply a method to look for internal | |
2191 * boundaries and get back columns inbetween them, if they exist */ | |
2192 if(terminalGapOnly == true) | |
2193 if(!removeOnlyTerminal()) | |
2194 return NULL; | |
2195 | |
2196 /* Once the columns/sequences selection is done, turn it around | |
2197 * if complementary flag is active */ | |
2198 if(complementary == true) | |
2199 computeComplementaryAlig(true, false); | |
2200 | |
2201 /* Check for any additional column/sequence to be removed */ | |
2202 /* Compute new sequences and columns numbers */ | |
2203 counter = removeCols_SeqsAllGaps(); | |
2204 | |
2205 /* Allocate memory for selected sequences/columns */ | |
2206 matrixAux = new string[counter.sequences]; | |
2207 newSeqsName = new string[counter.sequences]; | |
2208 | |
2209 /* Fill local allocated memory with previously selected data */ | |
2210 fillNewDataStructure(matrixAux, newSeqsName); | |
2211 | |
2212 /* When we have all parameters, we create the new alignment */ | |
2213 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
2214 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
2215 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
2216 keepSequences, keepHeader, sequenNumber, residNumber, | |
2217 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
2218 identities, overlaps); | |
2219 | |
2220 /* Deallocate local memory */ | |
2221 delete[] matrixAux; | |
2222 delete[] newSeqsName; | |
2223 | |
2224 /* Return the new alignment reference */ | |
2225 return newAlig; | |
2226 } | |
2227 | |
2228 /* This method removes those columns that not achieve a given threshold. If the | |
2229 * number of columns in the trimmed alignment is lower than a given percentage | |
2230 * of the original alignment. The program relaxes the threshold until to add | |
2231 * enough columns to achieve this percentage, to add those columns the program | |
2232 * start in the middle of the original alignment and make, alternatively, | |
2233 * movements to the left and right side from that point. This method also can | |
2234 * return the complementary alignment. */ | |
2235 alignment *alignment::cleanByCutValue(float cut, float baseLine, | |
2236 const float *ValueVect, bool complementary) { | |
2237 | |
2238 int i, j, k, jn, oth, pos, block; | |
2239 string *matrixAux, *newSeqsName; | |
2240 alignment *newAlig; | |
2241 newValues counter; | |
2242 | |
2243 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2244 /* Select the columns with a conservation's value | |
2245 * greater than the cut point. */ | |
2246 for(i = 0, counter.residues = 0; i < residNumber; i++) | |
2247 if(ValueVect[i] > cut) counter.residues++; | |
2248 else saveResidues[i] = -1; | |
2249 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2250 | |
2251 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2252 /* Compute, if it's necessary, the number of columns | |
2253 * necessary to achieve the minimum number of columns | |
2254 * fixed by coverage value. */ | |
2255 oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber); | |
2256 if(oth > 0) counter.residues += oth; | |
2257 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2258 | |
2259 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2260 /* Fixed the initial size of blocks as 0.5% of | |
2261 * alignment's length */ | |
2262 for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) { | |
2263 | |
2264 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2265 /* We start in the alignment middle then we move on | |
2266 * right and left side at the same time. */ | |
2267 for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) { | |
2268 | |
2269 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2270 /* Left side. Here, we compute the block's size. */ | |
2271 for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ; | |
2272 | |
2273 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2274 /* if block's size is greater or equal than the fixed | |
2275 * size then we save all columns that have not been | |
2276 * saved previously. */ | |
2277 /* Here, we only accept column with a conservation's | |
2278 * value equal to conservation cut point. */ | |
2279 if((i - jn) >= k) { | |
2280 for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) { | |
2281 if(ValueVect[jn] == cut) { | |
2282 saveResidues[jn] = jn; | |
2283 oth--; | |
2284 } else | |
2285 break; | |
2286 } | |
2287 } | |
2288 i = jn; | |
2289 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2290 | |
2291 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2292 /* Right side. Here, we compute the block's size. */ | |
2293 for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ; | |
2294 | |
2295 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2296 /* if block's size is greater or equal than the fixed | |
2297 * size then we select the column and save the block's | |
2298 * size for the next iteraction. it's obvius that we | |
2299 * decrease the column's number needed to finish. */ | |
2300 /* Here, we only accept column with a conservation's | |
2301 * value equal to conservation cut point. */ | |
2302 if((jn - j) >= k) { | |
2303 for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) { | |
2304 if(ValueVect[jn] == cut) { | |
2305 saveResidues[jn] = jn; | |
2306 oth--; | |
2307 } else | |
2308 break; | |
2309 } | |
2310 } | |
2311 j = jn; | |
2312 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2313 } | |
2314 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2315 } | |
2316 | |
2317 /* Keep only columns blocks bigger than an input columns block size */ | |
2318 if(blockSize != 0) { | |
2319 | |
2320 /* Traverse all alignment looking for columns blocks greater than LONGBLOCK, | |
2321 * everytime than a column is not selected by the trimming method, check | |
2322 * whether the current block size until that point is big enough to be kept | |
2323 * or it should be removed from the final alignment */ | |
2324 for(i = 0, pos = 0, block = 0; i < residNumber; i++) { | |
2325 if(saveResidues[i] != -1) | |
2326 block++; | |
2327 else { | |
2328 /* Remove columns from blocks smaller than input blocks size */ | |
2329 if(block < blockSize) | |
2330 for(j = pos; j <= i; j++) | |
2331 saveResidues[j] = -1; | |
2332 pos = i + 1; | |
2333 block = 0; | |
2334 } | |
2335 } | |
2336 /* Check final block separately since it could happen than last block is not | |
2337 * big enough but because the loop end could provoke to ignore it */ | |
2338 if(block < blockSize) | |
2339 for(j = pos; j < i; j++) | |
2340 saveResidues[j] = -1; | |
2341 } | |
2342 | |
2343 /* If the flag -terminalony is set, apply a method to look for internal | |
2344 * boundaries and get back columns inbetween them, if they exist */ | |
2345 if(terminalGapOnly == true) | |
2346 if(!removeOnlyTerminal()) | |
2347 return NULL; | |
2348 | |
2349 /* Once the columns/sequences selection is done, turn it around | |
2350 * if complementary flag is active */ | |
2351 if(complementary == true) | |
2352 computeComplementaryAlig(true, false); | |
2353 | |
2354 /* Check for any additional column/sequence to be removed */ | |
2355 /* Compute new sequences and columns numbers */ | |
2356 counter = removeCols_SeqsAllGaps(); | |
2357 | |
2358 /* Allocate memory for selected sequences/columns */ | |
2359 matrixAux = new string[counter.sequences]; | |
2360 newSeqsName = new string[counter.sequences]; | |
2361 | |
2362 /* Fill local allocated memory with previously selected data */ | |
2363 fillNewDataStructure(matrixAux, newSeqsName); | |
2364 | |
2365 /* When we have all parameters, we create the new alignment */ | |
2366 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
2367 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
2368 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
2369 keepSequences, keepHeader, sequenNumber, residNumber, | |
2370 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
2371 identities, overlaps); | |
2372 | |
2373 /* Deallocate local memory */ | |
2374 delete[] matrixAux; | |
2375 delete[] newSeqsName; | |
2376 | |
2377 /* Return the new alignment reference */ | |
2378 return newAlig; | |
2379 } | |
2380 | |
2381 /* This method removes those columns that not achieve the similarity threshond, | |
2382 * by one hand, and exceed the gaps threshold. If the number of columns that | |
2383 * have been selected is lower that a given coverage, the program relaxes both | |
2384 * thresholds in order to get back some columns and achieve this minimum | |
2385 * number of columns in the new alignment. For that purpose, the program | |
2386 * starts at the middle of the original alignment and makes, alternatively, | |
2387 * movememts to the right and left sides looking for those columns necessary | |
2388 * to achieve the minimum number of columns set by the coverage parameter. | |
2389 * This method also can return the complementary alignmnet consists of those | |
2390 * columns that will be deleted from the original alignment applying the | |
2391 * standard method. */ | |
2392 alignment *alignment::cleanByCutValue(double cutGaps, const int *gInCol, | |
2393 float baseLine, float cutCons, const float *MDK_Win, bool complementary) { | |
2394 | |
2395 int i, j, k, oth, pos, block, jn, blGaps, *vectAuxGaps; | |
2396 string *matrixAux, *newSeqsName; | |
2397 float blCons, *vectAuxCons; | |
2398 alignment *newAlig; | |
2399 newValues counter; | |
2400 | |
2401 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2402 /* Select the columns with a conservation's value | |
2403 * greater than the conservation cut point AND | |
2404 * less or equal than the gap cut point. */ | |
2405 for(i = 0, counter.residues = 0; i < residNumber; i++) | |
2406 if((MDK_Win[i] > cutCons) && (gInCol[i] <= cutGaps)) counter.residues++; | |
2407 else saveResidues[i] = -1; | |
2408 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2409 | |
2410 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2411 /* Compute, if it's necessary, the number of columns | |
2412 * necessary to achieve the minimum number of it fixed | |
2413 * by the coverage parameter. */ | |
2414 oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber); | |
2415 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2416 | |
2417 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2418 /* If it's needed to add new columns, we compute the | |
2419 * news thresholds */ | |
2420 if(oth > 0) { | |
2421 counter.residues += oth; | |
2422 | |
2423 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2424 /* Allocate memory */ | |
2425 vectAuxCons = new float[residNumber]; | |
2426 vectAuxGaps = new int[residNumber]; | |
2427 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2428 | |
2429 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2430 /* Sort a copy of the MDK_Win vector and of the gInCol | |
2431 * vector, and take the value of the column that marks | |
2432 * the % baseline */ | |
2433 utils::copyVect((float *) MDK_Win, vectAuxCons, residNumber); | |
2434 utils::copyVect((int *) gInCol, vectAuxGaps, residNumber); | |
2435 | |
2436 utils::quicksort(vectAuxCons, 0, residNumber-1); | |
2437 utils::quicksort(vectAuxGaps, 0, residNumber-1); | |
2438 | |
2439 blCons = vectAuxCons[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)]; | |
2440 blGaps = vectAuxGaps[(int) ((float)(residNumber - 1) * (baseLine)/100.0)]; | |
2441 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2442 | |
2443 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2444 /* Deallocate memory */ | |
2445 delete [] vectAuxCons; | |
2446 delete [] vectAuxGaps; | |
2447 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2448 } | |
2449 | |
2450 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2451 /* Fixed the initial size of blocks as 0.5% of | |
2452 * alignment's length */ | |
2453 for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) { | |
2454 | |
2455 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2456 /* We start in the alignment middle then we move on | |
2457 * right and left side at the same time. */ | |
2458 for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) { | |
2459 | |
2460 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2461 /* Left side. Here, we compute the block's size. */ | |
2462 for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ; | |
2463 | |
2464 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2465 /* if block's size is greater or equal than the fixed | |
2466 * size then we select the column and save the block's | |
2467 * size for the next iteraction. it's obvius that we | |
2468 * decrease the column's number needed to finish. */ | |
2469 /* Here, we accept column with a conservation's value | |
2470 * greater or equal than the conservation cut point OR | |
2471 * less or equal than the gap cut point. */ | |
2472 if((i - jn) >= k) { | |
2473 for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) { | |
2474 if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) { | |
2475 saveResidues[jn] = jn; | |
2476 oth--; | |
2477 } else | |
2478 break; | |
2479 } | |
2480 } | |
2481 i = jn; | |
2482 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2483 | |
2484 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2485 /* Right side. Here, we compute the block's size. */ | |
2486 for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ; | |
2487 | |
2488 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2489 /* if block's size is greater or equal than the fixed | |
2490 * size then we select the column and save the block's | |
2491 * size for the next iteraction. it's obvius that we | |
2492 * decrease the column's number needed to finish. */ | |
2493 /* Here, we accept column with a conservation's value | |
2494 * greater or equal than the conservation cut point OR | |
2495 * less or equal than the gap cut point. */ | |
2496 if((jn - j) >= k) { | |
2497 for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) { | |
2498 if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) { | |
2499 saveResidues[jn] = jn; | |
2500 oth--; | |
2501 } else | |
2502 break; | |
2503 } | |
2504 } | |
2505 j = jn; | |
2506 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2507 } | |
2508 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
2509 } | |
2510 | |
2511 /* Keep only columns blocks bigger than an input columns block size */ | |
2512 if(blockSize != 0) { | |
2513 | |
2514 /* Traverse all alignment looking for columns blocks greater than LONGBLOCK, | |
2515 * everytime than a column is not selected by the trimming method, check | |
2516 * whether the current block size until that point is big enough to be kept | |
2517 * or it should be removed from the final alignment */ | |
2518 for(i = 0, pos = 0, block = 0; i < residNumber; i++) { | |
2519 if(saveResidues[i] != -1) | |
2520 block++; | |
2521 else { | |
2522 /* Remove columns from blocks smaller than input blocks size */ | |
2523 if(block < blockSize) | |
2524 for(j = pos; j <= i; j++) | |
2525 saveResidues[j] = -1; | |
2526 pos = i + 1; | |
2527 block = 0; | |
2528 } | |
2529 } | |
2530 /* Check final block separately since it could happen than last block is not | |
2531 * big enough but because the loop end could provoke to ignore it */ | |
2532 if(block < blockSize) | |
2533 for(j = pos; j < i; j++) | |
2534 saveResidues[j] = -1; | |
2535 } | |
2536 | |
2537 /* If the flag -terminalony is set, apply a method to look for internal | |
2538 * boundaries and get back columns inbetween them, if they exist */ | |
2539 if(terminalGapOnly == true) | |
2540 if(!removeOnlyTerminal()) | |
2541 return NULL; | |
2542 | |
2543 /* Once the columns/sequences selection is done, turn it around | |
2544 * if complementary flag is active */ | |
2545 if(complementary == true) | |
2546 computeComplementaryAlig(true, false); | |
2547 | |
2548 /* Check for any additional column/sequence to be removed */ | |
2549 /* Compute new sequences and columns numbers */ | |
2550 counter = removeCols_SeqsAllGaps(); | |
2551 | |
2552 /* Allocate memory for selected sequences/columns */ | |
2553 matrixAux = new string[counter.sequences]; | |
2554 newSeqsName = new string[counter.sequences]; | |
2555 | |
2556 /* Fill local allocated memory with previously selected data */ | |
2557 fillNewDataStructure(matrixAux, newSeqsName); | |
2558 | |
2559 /* When we have all parameters, we create the new alignment */ | |
2560 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
2561 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
2562 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
2563 keepSequences, keepHeader, sequenNumber, residNumber, | |
2564 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
2565 identities, overlaps); | |
2566 | |
2567 /* Deallocate local memory */ | |
2568 delete[] matrixAux; | |
2569 delete[] newSeqsName; | |
2570 | |
2571 /* Return the new alignment reference */ | |
2572 return newAlig; | |
2573 } | |
2574 | |
2575 /* This method carries out the strict and strict plus method. To trim the | |
2576 * alignment in an automated way, the method uses as an input a given gaps | |
2577 * thresholds over a gaps vector values, a given similarity threshold over a | |
2578 * similarity vector values. With a flag, we can decide which method the | |
2579 * program shall apply. With another one, the user can ask for the | |
2580 * complementary alignment. In this method, those columns that has been marked | |
2581 * to be deleted but has, at least, 3 of 4 surronding neighbour selected are | |
2582 * get back to the new alignmnet. At other part of the method, those column | |
2583 * blocks that do not have a minimum size fix by the method will be deleted | |
2584 * from the trimmed alignment. */ | |
2585 alignment *alignment::cleanStrict(int gapCut, const int *gInCol, float simCut, | |
2586 const float *MDK_W, bool complementary, bool variable) { | |
2587 | |
2588 int i, num, lenBlock; | |
2589 alignment *newAlig; | |
2590 | |
2591 deque<int> neighboursBlock; | |
2592 | |
2593 /* Reject columns with gaps number greater than the gap threshold. */ | |
2594 for(i = 0; i < residNumber; i++) | |
2595 if(gInCol[i] > gapCut) | |
2596 saveResidues[i] = -1; | |
2597 | |
2598 /* Reject columns with similarity score under the threshold. */ | |
2599 for(i = 0; i < residNumber; i++) | |
2600 if(MDK_W[i] < simCut) | |
2601 saveResidues[i] = -1; | |
2602 | |
2603 /* Search for those columns that have been removed but have, | |
2604 * at least, 3 out of 4 adjacent columns selected. */ | |
2605 | |
2606 /* Load the first 5 columns into the list */ | |
2607 for(i = 0; i < (residNumber) && i < 5; i++) | |
2608 neighboursBlock.push_back(saveResidues[i]); | |
2609 | |
2610 /* Special case #1 - we analyze it before processing the rest positions | |
2611 * Column: 1 - Save this column considering its neighbours */ | |
2612 if((saveResidues[0] == -1) && (neighboursBlock.size() > 2) && | |
2613 (neighboursBlock[1] != -1) && | |
2614 (neighboursBlock[2] != -1)) | |
2615 saveResidues[0] = 0; | |
2616 | |
2617 /* Special case #2A - we analyze it before processing the rest positions | |
2618 * Column: 2 - Save this column considering its neighbours */ | |
2619 if((saveResidues[1] == -1) && (neighboursBlock.size() > 3) && | |
2620 ((neighboursBlock[0] != -1) && | |
2621 (neighboursBlock[2] != -1) && | |
2622 (neighboursBlock[3] != -1))) | |
2623 saveResidues[1] = 1; | |
2624 | |
2625 /* Special case #2B - when alignment is 3 columns long - we analyze it before | |
2626 * processing the rest positions. | |
2627 * Column: 2 - Save this column considering its neighbours */ | |
2628 if((saveResidues[1] == -1) && (neighboursBlock.size() > 2) && | |
2629 ((neighboursBlock[0] != -1) && | |
2630 (neighboursBlock[2] != -1))) | |
2631 saveResidues[1] = 1; | |
2632 | |
2633 /* Normal cases */ | |
2634 for(i = 2, num = 0; i < (residNumber - 2); i++, num = 0) { | |
2635 if(saveResidues[i] == -1) { | |
2636 if(neighboursBlock[0] != -1) | |
2637 num++; | |
2638 if(neighboursBlock[1] != -1) | |
2639 num++; | |
2640 if(neighboursBlock[3] != -1) | |
2641 num++; | |
2642 if(neighboursBlock[4] != -1) | |
2643 num++; | |
2644 saveResidues[i] = (num >= 3) ? i : -1; | |
2645 | |
2646 } | |
2647 /* We slide the window one position to the right */ | |
2648 if((i+3) < residNumber) { | |
2649 neighboursBlock.pop_front(); | |
2650 neighboursBlock.push_back(saveResidues[i+3]); | |
2651 } | |
2652 } | |
2653 | |
2654 /* Special case #3 - | |
2655 * Column: 2nd last one - We consider the list rather than the saveResidues | |
2656 * vector in order to consider only unmodified values before the previous | |
2657 * loop */ | |
2658 if((saveResidues[(residNumber - 2)] == -1) && (neighboursBlock.size() > 3) && | |
2659 ((neighboursBlock[neighboursBlock.size() - 4] != -1) && | |
2660 (neighboursBlock[neighboursBlock.size() - 3] != -1) && | |
2661 (neighboursBlock[neighboursBlock.size() - 1] != -1))) | |
2662 saveResidues[(residNumber - 2)] = (residNumber - 2); | |
2663 | |
2664 /* Special case #4 - | |
2665 * Column: last one - We consider the list rather than the saveResidues | |
2666 * vector in order to consider only unmodified values before the previous | |
2667 * loop */ | |
2668 if((saveResidues[(residNumber - 1)] == -1) && (neighboursBlock.size() > 2) && | |
2669 ((neighboursBlock[neighboursBlock.size() - 3] != -1) && | |
2670 (neighboursBlock[neighboursBlock.size() - 2] != -1))) | |
2671 saveResidues[(residNumber - 1)] = (residNumber - 1); | |
2672 | |
2673 /* Select blocks size based on user input. It can be set either to 5 or to a | |
2674 * variable number between 3 and 12 depending on alignment length (1% alig) */ | |
2675 if(!variable) | |
2676 lenBlock = 5; | |
2677 else { | |
2678 lenBlock = utils::roundInt(residNumber * 0.01); | |
2679 lenBlock = lenBlock > 3 ? (lenBlock < 12 ? lenBlock : 12) : 3; | |
2680 } | |
2681 | |
2682 /* Allow to change minimal block size */ | |
2683 blockSize = blockSize > 0 ? blockSize : lenBlock; | |
2684 | |
2685 /* Keep only columns blocks bigger than either a computed dinamically or | |
2686 * set by the user block size */ | |
2687 removeSmallerBlocks(blockSize); | |
2688 | |
2689 /* If the flag -terminalony is set, apply a method to look for internal | |
2690 * boundaries and get back columns inbetween them, if they exist */ | |
2691 if(terminalGapOnly == true) | |
2692 if(!removeOnlyTerminal()) | |
2693 return NULL; | |
2694 | |
2695 /* Once the columns/sequences selection is done, turn it around | |
2696 * if complementary flag is active */ | |
2697 if(complementary == true) | |
2698 computeComplementaryAlig(true, false); | |
2699 | |
2700 /* Check for any additional column/sequence to be removed */ | |
2701 /* Compute new sequences and columns numbers */ | |
2702 | |
2703 newValues counter; | |
2704 counter = removeCols_SeqsAllGaps(); | |
2705 | |
2706 /* Allocate memory for selected sequences/columns */ | |
2707 //~ matrixAux = new string[counter.sequences]; | |
2708 //~ newSeqsName = new string[counter.sequences]; | |
2709 | |
2710 /* Fill local allocated memory with previously selected data */ | |
2711 //~ fillNewDataStructure(matrixAux, newSeqsName); | |
2712 | |
2713 fillNewDataStructure(&counter); | |
2714 | |
2715 //~ cerr << counter -> seqsName[counter -> sequences - 1] << endl; | |
2716 | |
2717 /* When we have all parameters, we create the new alignment */ | |
2718 //~ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues, | |
2719 newAlig = new alignment(filename, aligInfo, counter.matrix, counter.seqsName, | |
2720 seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames, | |
2721 dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
2722 keepSequences, keepHeader, sequenNumber, | |
2723 residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, | |
2724 blockSize, identities, overlaps); | |
2725 | |
2726 /* Deallocate local memory */ | |
2727 //~ delete[] matrixAux; | |
2728 //~ delete[] newSeqsName; | |
2729 | |
2730 /* Return the new alignment reference */ | |
2731 return newAlig; | |
2732 } | |
2733 | |
2734 /* Remove those sequences with an overlap less than a given threshold. It can | |
2735 * return the complementary alignment if appropiate flag is set. */ | |
2736 alignment *alignment::cleanOverlapSeq(float minimumOverlap, float *overlapSeq, | |
2737 bool complementary) { | |
2738 | |
2739 string *matrixAux, *newSeqsName; | |
2740 alignment *newAlig; | |
2741 newValues counter; | |
2742 int i; | |
2743 | |
2744 /* Keep only those sequences with an overlap value equal or greater than | |
2745 * the minimum overlap value set by the user. */ | |
2746 for(i = 0; i < sequenNumber; i++) | |
2747 if(overlapSeq[i] < minimumOverlap) | |
2748 saveSequences[i] = -1; | |
2749 | |
2750 /* Once the columns/sequences selection is done, turn it around | |
2751 * if complementary flag is active */ | |
2752 if(complementary == true) | |
2753 computeComplementaryAlig(false, true); | |
2754 | |
2755 /* Check for any additional column/sequence to be removed */ | |
2756 /* Compute new sequences and columns numbers */ | |
2757 counter = removeCols_SeqsAllGaps(); | |
2758 | |
2759 /* Allocate memory for selected sequences/columns */ | |
2760 matrixAux = new string[counter.sequences]; | |
2761 newSeqsName = new string[counter.sequences]; | |
2762 | |
2763 /* Fill local allocated memory with previously selected data */ | |
2764 fillNewDataStructure(matrixAux, newSeqsName); | |
2765 | |
2766 /* When we have all parameters, we create the new alignment */ | |
2767 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
2768 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
2769 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
2770 keepSequences, keepHeader, sequenNumber, residNumber, | |
2771 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
2772 identities, overlaps); | |
2773 | |
2774 /* Deallocate local memory */ | |
2775 delete [] matrixAux; | |
2776 delete [] newSeqsName; | |
2777 | |
2778 /* Return the new alignment reference */ | |
2779 return newAlig; | |
2780 } | |
2781 | |
2782 /* Remove those columns, expressed as range, set by the user. It can return | |
2783 * the complementary alignmnet if appropiate flags is set. */ | |
2784 alignment *alignment::removeColumns(int *columns, int init, int size, | |
2785 bool complementary) { | |
2786 | |
2787 string *matrixAux, *newSeqsName; | |
2788 alignment *newAlig; | |
2789 newValues counter; | |
2790 int i, j; | |
2791 | |
2792 /* Delete those range columns defines in the columns vector */ | |
2793 for(i = init; i < size + init; i += 2) | |
2794 for(j = columns[i]; j <= columns[i+1]; j++) | |
2795 saveResidues[j] = -1; | |
2796 | |
2797 /* Once the columns/sequences selection is done, turn it around | |
2798 * if complementary flag is active */ | |
2799 if(complementary == true) | |
2800 computeComplementaryAlig(true, false); | |
2801 | |
2802 /* Check for any additional column/sequence to be removed */ | |
2803 /* Compute new sequences and columns numbers */ | |
2804 counter = removeCols_SeqsAllGaps(); | |
2805 | |
2806 /* Allocate memory for selected sequences/columns */ | |
2807 matrixAux = new string[counter.sequences]; | |
2808 newSeqsName = new string[counter.sequences]; | |
2809 | |
2810 /* Fill local allocated memory with previously selected data */ | |
2811 fillNewDataStructure(matrixAux, newSeqsName); | |
2812 | |
2813 /* When we have all parameters, we create the new alignment */ | |
2814 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
2815 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
2816 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
2817 keepSequences, keepHeader, sequenNumber, residNumber, | |
2818 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
2819 identities, overlaps); | |
2820 | |
2821 /* Deallocate local memory */ | |
2822 delete[] matrixAux; | |
2823 delete[] newSeqsName; | |
2824 | |
2825 /* Return the new alignment reference */ | |
2826 return newAlig; | |
2827 } | |
2828 | |
2829 /* This method removes those sequences, expressed as range of sequences, set by | |
2830 * the user. The method also can return the complementary alignment, it means, | |
2831 * those sequences that originally should be removed from the input alignment */ | |
2832 alignment *alignment::removeSequences(int *seqs, int init, int size, | |
2833 bool complementary) { | |
2834 | |
2835 string *matrixAux, *newSeqsName; | |
2836 alignment *newAlig; | |
2837 newValues counter; | |
2838 int i, j; | |
2839 | |
2840 /* Delete those range of sequences defines by the seqs vector */ | |
2841 for(i = init; i < size + init; i += 2) | |
2842 for(j = seqs[i]; j <= seqs[i+1]; j++) | |
2843 saveSequences[j] = -1; | |
2844 | |
2845 /* Once the columns/sequences selection is done, turn it around | |
2846 * if complementary flag is active */ | |
2847 if(complementary == true) | |
2848 computeComplementaryAlig(false, true); | |
2849 | |
2850 /* Check for any additional column/sequence to be removed */ | |
2851 /* Compute new sequences and columns numbers */ | |
2852 counter = removeCols_SeqsAllGaps(); | |
2853 | |
2854 /* Allocate memory for selected sequences/columns */ | |
2855 matrixAux = new string[counter.sequences]; | |
2856 newSeqsName = new string[counter.sequences]; | |
2857 | |
2858 /* Fill local allocated memory with previously selected data */ | |
2859 fillNewDataStructure(matrixAux, newSeqsName); | |
2860 | |
2861 /* When we have all parameters, we create the new alignment */ | |
2862 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
2863 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
2864 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
2865 keepSequences, keepHeader, sequenNumber, residNumber, | |
2866 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
2867 identities, overlaps); | |
2868 | |
2869 /* Free local memory */ | |
2870 delete [] matrixAux; | |
2871 delete [] newSeqsName; | |
2872 | |
2873 /* Return the new alignment reference */ | |
2874 return newAlig; | |
2875 } | |
2876 | |
2877 /* Function for computing the complementary alignment. It just turn around the | |
2878 * current columns/sequences selection */ | |
2879 void alignment::computeComplementaryAlig(bool residues, bool sequences) { | |
2880 int i; | |
2881 | |
2882 for(i = 0; i < residNumber && residues; i++) | |
2883 saveResidues[i] = (saveResidues[i] == -1) ? i : -1; | |
2884 | |
2885 for(i = 0; i < sequenNumber && sequences; i++) | |
2886 saveSequences[i] = (saveSequences[i] == -1) ? i : -1; | |
2887 } | |
2888 | |
2889 /* Function designed for identifying right and left borders between central | |
2890 * and terminal regions in the alignment. The borders are those columns, first | |
2891 * and last, composed by only residues. Everything inbetween left and right | |
2892 * borders are keept independently of the applied methods */ | |
2893 bool alignment::removeOnlyTerminal(void) { | |
2894 | |
2895 int i; | |
2896 const int *gInCol; | |
2897 | |
2898 if((left_boundary == -1) and (right_boundary == -1)) { | |
2899 /* Get alignments gaps stats and copy it */ | |
2900 if(calculateGapStats() != true) { | |
2901 cerr << endl << "WARNING: Impossible to apply 'terminal-only' method" | |
2902 << endl << endl; | |
2903 return false; | |
2904 } | |
2905 gInCol = sgaps -> getGapsWindow(); | |
2906 | |
2907 /* Identify left and right borders. First and last columns with no gaps */ | |
2908 for(i = 0; i < residNumber && gInCol[i] != 0; i++) ; | |
2909 left_boundary = i; | |
2910 | |
2911 for(i = residNumber - 1; i > -1 && gInCol[i] != 0; i--) ; | |
2912 right_boundary = i; | |
2913 } | |
2914 | |
2915 else if(left_boundary >= right_boundary) { | |
2916 cerr << endl << "ERROR: Check your manually set left '"<< left_boundary | |
2917 << "' and right '" << right_boundary << "' boundaries'" << endl << endl; | |
2918 return false; | |
2919 } | |
2920 | |
2921 /* We increase the right boundary in one position because we use lower strict | |
2922 * condition to get back columns inbetween boundaries removed by any method */ | |
2923 right_boundary += 1; | |
2924 | |
2925 /* Once the interal boundaries have been established, if these limits exist | |
2926 * then retrieved all columns inbetween both boundaries. Columns out of these | |
2927 * limits will remain selected or not depending on the algorithm applied */ | |
2928 for(i = left_boundary; i < right_boundary; i++) | |
2929 saveResidues[i] = i; | |
2930 | |
2931 return true; | |
2932 } | |
2933 | |
2934 /* Function designed to identify and remove those columns blocks smaller than | |
2935 * a given size */ | |
2936 void alignment::removeSmallerBlocks(int blockSize) { | |
2937 | |
2938 int i, j, pos, block; | |
2939 | |
2940 if(blockSize == 0) | |
2941 return ; | |
2942 | |
2943 /* Traverse the alignment looking for blocks greater than BLOCKSIZE, everytime | |
2944 * than a column hasn't been selected, check whether the current block is big | |
2945 * enough to be kept or it should be removed from the final alignment */ | |
2946 for(i = 0, pos = 0, block = 0; i < residNumber; i++) { | |
2947 if(saveResidues[i] != -1) | |
2948 block++; | |
2949 else { | |
2950 /* Remove columns from blocks smaller than input blocks size */ | |
2951 if(block < blockSize) | |
2952 for(j = pos; j <= i; j++) | |
2953 saveResidues[j] = -1; | |
2954 pos = i + 1; | |
2955 block = 0; | |
2956 } | |
2957 } | |
2958 | |
2959 /* Check final block separately since it could happen than last block is not | |
2960 * big enough but because the loop end could provoke to ignore it */ | |
2961 if(block < blockSize) | |
2962 for(j = pos; j < i; j++) | |
2963 saveResidues[j] = -1; | |
2964 return ; | |
2965 } | |
2966 | |
2967 /* Function designed to identify columns and sequences composed only by gaps. | |
2968 * Once these columns/sequences have been identified, they are removed from | |
2969 * final alignment. */ | |
2970 newValues alignment::removeCols_SeqsAllGaps(void) { | |
2971 int i, j, valid, gaps; | |
2972 bool warnings = false; | |
2973 newValues counter; | |
2974 | |
2975 /* Check all valid columns looking for those composed by only gaps */ | |
2976 for(i = 0, counter.residues = 0; i < residNumber; i++) { | |
2977 if(saveResidues[i] == -1) | |
2978 continue; | |
2979 | |
2980 for(j = 0, valid = 0, gaps = 0; j < sequenNumber; j++) { | |
2981 if (saveSequences[j] == -1) | |
2982 continue; | |
2983 if (sequences[j][i] == '-') | |
2984 gaps ++; | |
2985 valid ++; | |
2986 } | |
2987 /* Once a column has been identified, warm about it and remove it */ | |
2988 if(gaps == valid) { | |
2989 if(!warnings) | |
2990 cerr << endl; | |
2991 warnings = true; | |
2992 cerr << "WARNING: Removing column '" << i << "' composed only by gaps" | |
2993 << endl; | |
2994 saveResidues[i] = -1; | |
2995 } else { | |
2996 counter.residues ++; | |
2997 } | |
2998 } | |
2999 | |
3000 /* Check for those selected sequences to see whether there is anyone with | |
3001 * only gaps */ | |
3002 for(i = 0, counter.sequences = 0; i < sequenNumber; i++) { | |
3003 if(saveSequences[i] == -1) | |
3004 continue; | |
3005 | |
3006 for(j = 0, valid = 0, gaps = 0; j < residNumber; j++) { | |
3007 if(saveResidues[j] == -1) | |
3008 continue; | |
3009 if (sequences[i][j] == '-') | |
3010 gaps ++; | |
3011 valid ++; | |
3012 } | |
3013 /* Warm about it and remove each sequence composed only by gaps */ | |
3014 if(gaps == valid) { | |
3015 if(!warnings) | |
3016 cerr << endl; | |
3017 warnings = true; | |
3018 | |
3019 if(keepSequences) { | |
3020 cerr << "WARNING: Keeping sequence '" << seqsName[i] | |
3021 << "' composed only by gaps" << endl; | |
3022 counter.sequences ++; | |
3023 } else { | |
3024 cerr << "WARNING: Removing sequence '" << seqsName[i] | |
3025 << "' composed only by gaps" << endl; | |
3026 saveSequences[i] = -1; | |
3027 } | |
3028 } else { | |
3029 counter.sequences ++; | |
3030 } | |
3031 } | |
3032 if(warnings) | |
3033 cerr << endl; | |
3034 | |
3035 counter.matrix = new string[counter.sequences]; | |
3036 counter.seqsName = new string[counter.sequences]; | |
3037 | |
3038 return counter; | |
3039 } | |
3040 | |
3041 /* Function for copying to previously allocated memory those data selected | |
3042 * for being in the final alignment */ | |
3043 void alignment::fillNewDataStructure(string *newMatrix, string *newNames) { | |
3044 int i, j, k; | |
3045 | |
3046 /* Copy only those sequences/columns selected */ | |
3047 for(i = 0, j = 0; i < sequenNumber; i++) { | |
3048 if(saveSequences[i] == -1) | |
3049 continue; | |
3050 | |
3051 newNames[j] = seqsName[i]; | |
3052 for(k = 0; k < residNumber; k++) { | |
3053 if(saveResidues[k] == -1) | |
3054 continue; | |
3055 newMatrix[j].resize(newMatrix[j].size() + 1, sequences[i][k]); | |
3056 } | |
3057 j++; | |
3058 } | |
3059 } | |
3060 | |
3061 /* Function for copying to previously allocated memory those data selected | |
3062 * for being in the final alignment */ | |
3063 void alignment::fillNewDataStructure(newValues *data) { | |
3064 int i, j, k; | |
3065 | |
3066 /* Copy only those sequences/columns selected */ | |
3067 for(i = 0, j = 0; i < sequenNumber; i++) { | |
3068 if(saveSequences[i] == -1) | |
3069 continue; | |
3070 | |
3071 data -> seqsName[j] = seqsName[i]; | |
3072 for(k = 0; k < residNumber; k++) { | |
3073 if(saveResidues[k] == -1) | |
3074 continue; | |
3075 data -> matrix[j].resize(data -> matrix[j].size() + 1, sequences[i][k]); | |
3076 } | |
3077 j++; | |
3078 } | |
3079 // cerr << data -> seqsName[j-1] << endl; | |
3080 } | |
3081 | |
3082 /* Check if CDS file is correct based on: Residues are DNA/RNA (at most). There | |
3083 * is not gaps in the whole dataset. Each sequence is multiple of 3. At the same | |
3084 * time, the function will remove stop codons if appropiate flags are used */ | |
3085 bool alignment::prepareCodingSequence(bool splitByStopCodon, bool ignStopCodon,\ | |
3086 alignment *proteinAlig) { | |
3087 | |
3088 bool warning = false; | |
3089 size_t found; | |
3090 int i; | |
3091 | |
3092 /* New code: We now care about the presence of wildcards/indeterminations | |
3093 * characters such as 'X' or 'B' into protein sequences as well as about the | |
3094 * presence of Selenocysteines ('U') or Pyrrolysines ('O'). It only works, by | |
3095 * now, with the universal genetic code */ | |
3096 int *protSeqsLengths, numbProtSeqs, current_prot; | |
3097 string *protSeqsNames, *protSequences; | |
3098 char aminoAcid; | |
3099 | |
3100 numbProtSeqs = proteinAlig -> getNumSpecies(); | |
3101 protSeqsNames = new string[numbProtSeqs]; | |
3102 protSequences = new string[numbProtSeqs]; | |
3103 protSeqsLengths = new int[numbProtSeqs]; | |
3104 | |
3105 proteinAlig -> getSequences(protSeqsNames, protSequences, protSeqsLengths); | |
3106 | |
3107 /* Check read sequences are real DNA/RNA */ | |
3108 if (getTypeAlignment() == AAType) { | |
3109 cerr << endl << "ERROR: Check input CDS file. It seems to content protein " | |
3110 << "residues." << endl << endl; | |
3111 return false; | |
3112 } | |
3113 | |
3114 for(i = 0; i < sequenNumber; i++) { | |
3115 | |
3116 /* Get protein sequence to compare against any potential stop codon in the | |
3117 * coding sequence. If there is not protein sequence for current coding | |
3118 * sequence, skip its analysis */ | |
3119 for(current_prot = 0; current_prot < numbProtSeqs; current_prot++) | |
3120 if(protSeqsNames[current_prot] == seqsName[i]) | |
3121 break; | |
3122 if(current_prot == numbProtSeqs) | |
3123 continue; | |
3124 | |
3125 if(sequences[i].find("-") != string::npos) { | |
3126 if (!warning) | |
3127 cerr << endl; | |
3128 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has, at least, one gap" | |
3129 << endl << endl; | |
3130 return false; | |
3131 } | |
3132 | |
3133 if((sequences[i].length() % 3) != 0) { | |
3134 if (!warning) | |
3135 cerr << endl; | |
3136 warning = true; | |
3137 cerr << "WARNING: Sequence length \"" << seqsName[i] << "\" is not " | |
3138 << "multiple of 3 (length: " << sequences[i].length() << ")" << endl; | |
3139 } | |
3140 | |
3141 /* Ignore stop codons from the CDS if set by the user */ | |
3142 if (ignStopCodon) | |
3143 continue; | |
3144 | |
3145 /* Detect universal stop codons in the CDS. Then, compare those stop codons | |
3146 * against the protein sequence to see whether they are real stop codons or | |
3147 * are representing rare amino-acids such as Selenocysteines or Pyrrolysines | |
3148 * It also allows stop-codons when there are wildcards/indet characters in | |
3149 * the protein sequence. CDS sequences could be splitted using stop codons | |
3150 * from the sequence itself */ | |
3151 | |
3152 /* Initialize first appearence of a given stop codon to -1. | |
3153 * That means that it has not been found yet */ | |
3154 found = -1; | |
3155 do { | |
3156 found = sequences[i].find("TGA", found + 1); | |
3157 | |
3158 /* If a stop codon has been found and its position is multiple of 3. | |
3159 * Analize it */ | |
3160 if((found != string::npos) && (((int) found % 3) == 0)) { | |
3161 | |
3162 aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]); | |
3163 /* It may be a Selenocysteine ('TGA') which should be represented as 'U' | |
3164 * or wildcard/indet characters such as 'X' or 'B' */ | |
3165 //~ if ((aminoAcid == 'U') || (aminoAcid == 'X') || (aminoAcid == 'B')) | |
3166 /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character | |
3167 * such as 'B'/'X' is present, skip current stop codon */ | |
3168 if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \ | |
3169 (aminoAcid == 'B')) | |
3170 continue; | |
3171 | |
3172 /* If split_by_stop_codon flag is activated then cut input CDS sequence | |
3173 * up to first appearance of a stop codon */ | |
3174 else if(splitByStopCodon) { | |
3175 if (!warning) | |
3176 cerr << endl; | |
3177 warning = true; | |
3178 cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first" | |
3179 << " appearance of stop codon \"TGA\" (residue \"" << aminoAcid | |
3180 << "\") at position " << (int) found + 1 << " (length: " | |
3181 << sequences[i].length() << ")" << endl; | |
3182 sequences[i].resize((int) found); | |
3183 } | |
3184 /* Otherwise, warn about it and return an error */ | |
3185 else { | |
3186 if (!warning) | |
3187 cerr << endl; | |
3188 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \"" | |
3189 << "TGA\" (residue \"" << aminoAcid << "\") at position " | |
3190 << (int) found + 1 << " (length: " << sequences[i].length() << ")" | |
3191 << endl << endl; | |
3192 return false; | |
3193 } | |
3194 } | |
3195 /* Iterate over the CDS until not stop codon is found */ | |
3196 } while(found != string::npos); | |
3197 | |
3198 /* Initialize first appearence of a given stop codon to -1. | |
3199 * That means that it has not been found yet */ | |
3200 found = -1; | |
3201 do { | |
3202 found = sequences[i].find("TAA", found + 1); | |
3203 | |
3204 /* If a stop codon has been found and its position is multiple of 3. | |
3205 * Analize it */ | |
3206 if((found != string::npos) && (((int) found % 3) == 0)) { | |
3207 | |
3208 aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]); | |
3209 /* Check if there is any wildcard/indet characters such as 'X' or 'B' */ | |
3210 //~ if ((aminoAcid == 'X') || (aminoAcid == 'B')) | |
3211 /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character | |
3212 * such as 'B'/'X' is present, skip current stop codon */ | |
3213 if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \ | |
3214 (aminoAcid == 'B')) | |
3215 continue; | |
3216 | |
3217 /* If split_by_stop_codon flag is activated then cut input CDS sequence | |
3218 * up to first appearance of a stop codon */ | |
3219 else if(splitByStopCodon) { | |
3220 if (!warning) | |
3221 cerr << endl; | |
3222 warning = true; | |
3223 cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first" | |
3224 << " appearance of stop codon \"TAA\" (residue \"" << aminoAcid | |
3225 << "\") at position " << (int) found + 1 << " (length: " | |
3226 << sequences[i].length() << ")" << endl; | |
3227 sequences[i].resize((int) found); | |
3228 } | |
3229 /* Otherwise, warn about it and return an error */ | |
3230 else { | |
3231 if (!warning) | |
3232 cerr << endl; | |
3233 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \"" | |
3234 << "TAA\" (residue \"" << aminoAcid << "\") at position " | |
3235 << (int) found + 1 << " (length: " << sequences[i].length() << ")" | |
3236 << endl << endl; | |
3237 return false; | |
3238 } | |
3239 } | |
3240 /* Iterate over the CDS until not stop codon is found */ | |
3241 } while(found != string::npos); | |
3242 | |
3243 /* Initialize first appearence of a given stop codon to -1. | |
3244 * That means that it has not been found yet */ | |
3245 found = -1; | |
3246 do { | |
3247 found = sequences[i].find("TAG", found + 1); | |
3248 /* If a stop codon has been found and its position is multiple of 3. | |
3249 * Analize it */ | |
3250 if((found != string::npos) && (((int) found % 3) == 0)) { | |
3251 | |
3252 aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]); | |
3253 /* It may be a Pyrrolysine ('TAG') which should be represented as 'O' | |
3254 * or wildcard/indet characters such as 'X' or 'B' */ | |
3255 //~ if ((aminoAcid == 'O') || (aminoAcid == 'X') || (aminoAcid == 'B')) | |
3256 /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character | |
3257 * such as 'B'/'X' is present, skip current stop codon */ | |
3258 if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \ | |
3259 (aminoAcid == 'B')) | |
3260 continue; | |
3261 | |
3262 /* If split_by_stop_codon flag is activated then cut input CDS sequence | |
3263 * up to first appearance of a stop codon */ | |
3264 else if(splitByStopCodon) { | |
3265 if (!warning) | |
3266 cerr << endl; | |
3267 warning = true; | |
3268 cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first" | |
3269 << " appearance of stop codon \"TAG\" (residue \"" << aminoAcid | |
3270 << "\") at position " << (int) found + 1 << " (length: " | |
3271 << sequences[i].length() << ")" << endl; | |
3272 sequences[i].resize((int) found); | |
3273 } | |
3274 /* Otherwise, warn about it and return an error */ | |
3275 else { | |
3276 if (!warning) | |
3277 cerr << endl; | |
3278 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \"" | |
3279 << "TAG\" (residue \"" << aminoAcid << "\") at position " | |
3280 << (int) found + 1 << " (length: " << sequences[i].length() << ")" | |
3281 << endl << endl; | |
3282 return false; | |
3283 } | |
3284 } | |
3285 /* Iterate over the CDS until not stop codon is found */ | |
3286 } while(found != string::npos); | |
3287 } | |
3288 | |
3289 /* If everything was return an OK to informat about it. */ | |
3290 return true; | |
3291 } | |
3292 | |
3293 /* Function designed to check whether input CDS file is correct or not based on | |
3294 * some features: Sequences are in both files (it could be more on CDS file), | |
3295 * they have (more or less) same ength. Otherwise, some nucleotides could be | |
3296 * excluded or some 'N's added to fit protein length. */ | |
3297 bool alignment::checkCorrespondence(string *names, int *lengths, int \ | |
3298 totalInputSeqs, int multiple = 1) { | |
3299 | |
3300 int i, j, seqLength, indet; | |
3301 bool warnings = false; | |
3302 string tmp; | |
3303 | |
3304 /* For each sequence in the current protein alignment, look for its coding | |
3305 * DNA sequence checking that they have the same size. */ | |
3306 for(i = 0; i < sequenNumber; i++) { | |
3307 | |
3308 /* Get protein sequence length removing any possible gap. Get as well last | |
3309 * residue from current sequence */ | |
3310 | |
3311 tmp = utils::removeCharacter('-', sequences[i]); | |
3312 seqLength = tmp.length() * multiple; | |
3313 indet = ((int) tmp.length() - utils::min((int) tmp.find_last_not_of("X"), \ | |
3314 (int) tmp.find_last_not_of("x"))) - 1; | |
3315 | |
3316 /* Go through all available CDS looking for the one with the same ID */ | |
3317 for(j = 0; j < totalInputSeqs; j++) { | |
3318 | |
3319 /* Once both ID matchs, compare its lengths */ | |
3320 if(seqsName[i] == names[j]) { | |
3321 | |
3322 /* If both sequences have the same length, stop the search */ | |
3323 if(seqLength == lengths[j]) | |
3324 break; | |
3325 | |
3326 /* If nucleotide sequence is larger than protein sequence, warn about | |
3327 * it and continue the verification process. It will used the 'Nth' | |
3328 * first nucleotides for the conversion */ | |
3329 else if(seqLength < lengths[j]) { | |
3330 if (!warnings) | |
3331 cerr << endl; | |
3332 warnings = true; | |
3333 cerr << "WARNING: Sequence \"" << seqsName[i] << "\" will be cut at " | |
3334 << "position " << seqLength << " (length: "<< lengths[j] << ")" | |
3335 << endl; | |
3336 break; | |
3337 } | |
3338 | |
3339 /* It has been detected some indeterminations at the end of the protein | |
3340 * sequence. That issue could be cause by some incomplete codons in the | |
3341 * nucleotide sequences. This issue is solved adding as much 'N' symbols | |
3342 * as it is needed to preserve the backtranslated alignment */ | |
3343 else if((indet > 0) && (indet > (seqLength - lengths[j])/3)) { | |
3344 if (!warnings) | |
3345 cerr << endl; | |
3346 warnings = true; | |
3347 cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has some inde" | |
3348 << "termination symbols 'X' at the end of sequence. They will be" | |
3349 << " included in the final alignment." << endl; | |
3350 break; | |
3351 } | |
3352 | |
3353 /* If nucleotide sequence is shorter than protein sequence, return an | |
3354 * error since it is not feasible to cut the input protein aligment to | |
3355 * fit it into CDNA sequences size */ | |
3356 else { | |
3357 if (!warnings) | |
3358 cerr << endl; | |
3359 warnings = true; | |
3360 cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has less nucleo" | |
3361 << "tides (" << lengths[j] << ") than expected (" << seqLength | |
3362 << "). It will be added N's to complete the sequence" << endl; | |
3363 break; | |
3364 } | |
3365 } | |
3366 } | |
3367 | |
3368 /* Warn about a mismatch a sequences name level */ | |
3369 if(j == totalInputSeqs) { | |
3370 cerr << endl << "ERROR: Sequence \"" << seqsName[i] << "\" is not in " | |
3371 << "CDS file." << endl << endl; | |
3372 return false; | |
3373 } | |
3374 } | |
3375 | |
3376 /* If everything is OK, return an appropiate flag */ | |
3377 return true; | |
3378 } |