comparison trimal_repo/source/alignment.cpp @ 0:b15a3147e604 draft

"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
author padge
date Fri, 25 Mar 2022 17:10:43 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b15a3147e604
1 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
2 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
3
4 trimAl v1.4: a tool for automated alignment trimming in large-scale
5 phylogenetics analyses.
6
7 readAl v1.4: a tool for automated alignment conversion among different
8 formats.
9
10 statAl v1.4: a tool for getting stats about multiple sequence alignments.
11
12
13 2009-2015 Capella-Gutierrez S. and Gabaldon, T.
14 [scapella, tgabaldon]@crg.es
15
16 This file is part of trimAl/readAl.
17
18 trimAl/readAl are free software: you can redistribute it and/or modify
19 it under the terms of the GNU General Public License as published by
20 the Free Software Foundation, the last available version.
21
22 trimAl/readAl are distributed in the hope that it will be useful,
23 but WITHOUT ANY WARRANTY; without even the implied warranty of
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 GNU General Public License for more details.
26
27 You should have received a copy of the GNU General Public License
28 along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>.
29
30 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
31 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
32 using namespace std;
33
34 #include <float.h>
35 #include "alignment.h"
36 #include "rwAlignment.cpp"
37 #include "autAlignment.cpp"
38
39 #include <deque>
40
41 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
42 /* Class constructor */
43 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
44
45 alignment::alignment(void) {
46
47 /* Alignment parameter */
48 sequenNumber = 0;
49 residNumber = 0;
50
51 /* Are the input sequences aligned? */
52 isAligned = false;
53
54 /* Should the output file be reversed? */
55 reverse = false;
56
57 /* Should be trimmed only terminal gaps? - set automated and manual boundaries
58 * values */
59 terminalGapOnly = false;
60 left_boundary = -1;
61 right_boundary = -1;
62
63 /* Input and output formats */
64 iformat = 0;
65 oformat = 0;
66 shortNames = false;
67
68 forceCaps = false;
69 upperCase = false;
70 lowerCase = false;
71
72 /* Indicate whether sequences composed only by gaps should be kept or not */
73 keepSequences = false;
74
75 /* Indicate whether original header, they may include non-alphanumerical
76 * characters, should be dumped into output stream without any preprocessing
77 * step */
78 keepHeader = false;
79
80 gapSymbol = "-";
81
82 /* Sequence datatype: DNA, RNA or Protein */
83 dataType = 0;
84
85 /* Window sizes to trim the input alignment */
86 ghWindow = 0;
87 shWindow = 0;
88
89 /* Minimum block size in the new alignment */
90 blockSize = 0;
91
92 /* Is this alignmnet new? */
93 oldAlignment = false;
94
95 /* Sequence residues number */
96 residuesNumber = NULL;
97
98 /* Columns and sequences that have been previously selected */
99 saveResidues = NULL;
100 saveSequences = NULL;
101
102 /* Input sequences as well other information such as sequences name, etc */
103 sequences = NULL;
104 seqsName = NULL;
105 seqsInfo = NULL;
106
107 /* Information about input alignment */
108 filename = "";
109 aligInfo = "";
110
111 /* Information computed from alignment */
112 sgaps = NULL;
113 scons = NULL;
114 seqMatrix = NULL;
115
116 identities = NULL;
117 overlaps = NULL;
118
119 /* ***** ***** ***** ***** ***** ***** ***** ***** */
120 }
121
122 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
123 /* Class constructor */
124 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
125
126 alignment::alignment(string o_filename, string o_aligInfo, string *o_sequences, string *o_seqsName,
127 string *o_seqsInfo, int o_sequenNumber, int o_residNumber, int o_iformat, int o_oformat,
128 bool o_shortNames, int o_dataType, int o_isAligned, bool o_reverse, bool o_terminalGapOnly,
129 int o_left_boundary, int o_right_boundary,
130 bool o_keepSeqs, bool o_keepHeader, int OldSequences, int OldResidues, int *o_residuesNumber,
131 int *o_saveResidues, int *o_saveSequences, int o_ghWindow, int o_shWindow, int o_blockSize,
132 float **o_identities, float **o_overlaps) {
133
134 /* ***** ***** ***** ***** ***** ***** ***** ***** */
135 int i, j, k, ll;
136 /* ***** ***** ***** ***** ***** ***** ***** ***** */
137
138 oldAlignment = true;
139
140 /* ***** ***** ***** ***** ***** ***** ***** ***** */
141 /* Assign the parameter values to the variables */
142 sequenNumber = o_sequenNumber;
143 residNumber = o_residNumber;
144
145 iformat = o_iformat;
146 oformat = o_oformat;
147 shortNames = o_shortNames;
148
149 dataType = o_dataType;
150
151 ghWindow = o_ghWindow;
152 shWindow = o_shWindow;
153
154 blockSize = o_blockSize;
155
156 isAligned = o_isAligned;
157 reverse = o_reverse;
158
159 terminalGapOnly = o_terminalGapOnly;
160 right_boundary = o_right_boundary;
161 left_boundary = o_left_boundary;
162
163 filename = o_filename;
164 aligInfo = o_aligInfo;
165 /* ***** ***** ***** ***** ***** ***** ***** ***** */
166
167 keepSequences = o_keepSeqs;
168 keepHeader = o_keepHeader;
169
170 forceCaps = false;
171 upperCase = false;
172 lowerCase = false;
173
174 gapSymbol = "-";
175
176
177 /* ***** ***** ***** ***** ***** ***** ***** ***** */
178 /* Basic information for the new alignment */
179 sequences = new string[sequenNumber];
180 for(i = 0; i < sequenNumber; i++)
181 sequences[i] = o_sequences[i];
182
183 seqsName = new string[sequenNumber];
184 for(i = 0; i < sequenNumber; i++)
185 seqsName[i] = o_seqsName[i];
186 /* ***** ***** ***** ***** ***** ***** ***** ***** */
187
188 /* ***** ***** ***** ***** ***** ***** ***** ***** */
189 residuesNumber = new int[sequenNumber];
190 if((isAligned) || (o_residuesNumber != NULL)) {
191 for(i = 0; i < sequenNumber; i++)
192 residuesNumber[i] = residNumber;
193 } else {
194 for(i = 0; i < sequenNumber; i++)
195 residuesNumber[i] = o_residuesNumber[i];
196 }
197 /* ***** ***** ***** ***** ***** ***** ***** ***** */
198
199 /* ***** ***** ***** ***** ***** ***** ***** ***** */
200 if(o_seqsInfo != NULL) {
201 seqsInfo = new string[sequenNumber];
202 for(i = 0; i < sequenNumber; i++)
203 seqsInfo[i] = o_seqsInfo[i];
204 } else seqsInfo = NULL;
205
206 saveResidues = NULL;
207 if(o_saveResidues != NULL) {
208 saveResidues = new int[residNumber];
209 for(i = 0, j = 0; i < OldResidues; i++)
210 if(o_saveResidues[i] != -1) {
211 saveResidues[j] = o_saveResidues[i];
212 j++;
213 }
214 }
215
216 saveSequences = NULL;
217 if(o_saveSequences != NULL) {
218 saveSequences = new int[sequenNumber];
219 for(i = 0, j = 0; i < OldSequences; i++)
220 if(o_saveSequences[i] != -1) {
221 saveSequences[j] = o_saveSequences[i];
222 j++;
223 }
224 }
225 /* ***** ***** ***** ***** ***** ***** ***** ***** */
226
227 /* ***** ***** ***** ***** ***** ***** ***** ***** */
228 identities = NULL;
229 if(o_identities != NULL) {
230 identities = new float*[sequenNumber];
231 for(i = 0, j = 0; i < OldSequences; i++) {
232 if(o_saveSequences[i] != -1) {
233 identities[j] = new float[sequenNumber];
234 for(k = 0, ll = 0; k < OldSequences; k++) {
235 if(o_saveSequences[k] != -1) {
236 identities[j][ll] = o_identities[i][k];
237 ll++;
238 }
239 }
240 j++;
241 }
242 }
243 }
244
245 overlaps = NULL;
246 if(o_overlaps != NULL) {
247 overlaps = new float*[sequenNumber];
248 for(i = 0, j = 0; i < OldSequences; i++) {
249 if(o_saveSequences[i] != -1) {
250 overlaps[j] = new float[sequenNumber];
251 for(k = 0, ll = 0; k < OldSequences; k++) {
252 if(o_saveSequences[k] != -1) {
253 overlaps[j][ll] = o_overlaps[i][k];
254 ll++;
255 }
256 }
257 j++;
258 }
259 }
260 }
261 /* ***** ***** ***** ***** ***** ***** ***** ***** */
262
263 /* ***** ***** ***** ***** ***** ***** ***** ***** */
264 /* Any structure associated to the new alignment is
265 * initialize to NULL. In this way, these structure,
266 * if it will be necessary, has to be computed */
267 sgaps = NULL;
268 scons = NULL;
269 seqMatrix = NULL;
270 identities = NULL;
271 /* ***** ***** ***** ***** ***** ***** ***** ***** */
272 }
273
274 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
275 /* Overlapping operator = to use it as a kind of class constructor */
276 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
277
278 alignment &alignment::operator=(const alignment &old) {
279 int i, j;
280
281 if(this != &old) {
282
283 oldAlignment = true;
284
285 /* ***** ***** ***** ***** ***** ***** ***** ***** */
286 /* Assign the parameter values to the variables */
287 sequenNumber = old.sequenNumber;
288 residNumber = old.residNumber;
289
290 isAligned = old.isAligned;
291 reverse = old.reverse;
292
293 terminalGapOnly = old.terminalGapOnly;
294 right_boundary = old.right_boundary;
295 left_boundary = old.left_boundary;
296
297 iformat = old.iformat;
298 oformat = old.oformat;
299 shortNames = old.shortNames;
300
301 dataType = old.dataType;
302
303 ghWindow = old.ghWindow;
304 shWindow = old.shWindow;
305
306 blockSize = old.blockSize;
307
308 filename = old.filename;
309 aligInfo = old.aligInfo;
310 /* ***** ***** ***** ***** ***** ***** ***** ***** */
311
312 keepSequences = old.keepSequences;
313 keepHeader = old.keepHeader;
314
315 forceCaps = old.forceCaps;
316 upperCase = old.upperCase;
317 lowerCase = old.lowerCase;
318
319 gapSymbol = old.gapSymbol;
320
321 /* ***** ***** ***** ***** ***** ***** ***** ***** */
322 sequences = new string[sequenNumber];
323 for(i = 0; i < sequenNumber; i++)
324 sequences[i] = old.sequences[i];
325
326 seqsName = new string[sequenNumber];
327 for(i = 0; i < sequenNumber; i++)
328 seqsName[i] = old.seqsName[i];
329 /* ***** ***** ***** ***** ***** ***** ***** ***** */
330
331 /* ***** ***** ***** ***** ***** ***** ***** ***** */
332 delete [] residuesNumber;
333 if(old.residuesNumber) {
334 residuesNumber = new int[sequenNumber];
335 for(i = 0; i < sequenNumber; i++)
336 residuesNumber[i] = old.residuesNumber[i];
337 } residuesNumber = NULL;
338 /* ***** ***** ***** ***** ***** ***** ***** ***** */
339
340 /* ***** ***** ***** ***** ***** ***** ***** ***** */
341 delete [] seqsInfo;
342 if(old.seqsInfo) {
343 seqsInfo = new string[sequenNumber];
344 for(i = 0; i < sequenNumber; i++)
345 seqsInfo[i] = old.seqsInfo[i];
346 } else seqsInfo = NULL;
347
348 delete [] saveResidues;
349 if(old.saveResidues) {
350 saveResidues = new int[residNumber];
351 for(i = 0; i < residNumber; i++)
352 saveResidues[i] = old.saveResidues[i];
353 } else saveResidues = NULL;
354
355 delete [] saveSequences;
356 if(old.saveSequences) {
357 saveSequences = new int[sequenNumber];
358 for(i = 0; i < sequenNumber; i++)
359 saveSequences[i] = old.saveSequences[i];
360 } else saveSequences = NULL;
361 /* ***** ***** ***** ***** ***** ***** ***** ***** */
362
363 /* ***** ***** ***** ***** ***** ***** ***** ***** */
364 delete [] identities;
365 if(old.identities) {
366 identities = new float*[sequenNumber];
367 for(i = 0; i < sequenNumber; i++) {
368 identities[i] = new float[sequenNumber];
369 for(j = 0; j < sequenNumber; j++)
370 identities[i][j] = old.identities[i][j];
371 }
372 } else identities = NULL;
373 /* ***** ***** ***** ***** ***** ***** ***** ***** */
374
375 /* ***** ***** ***** ***** ***** ***** ***** ***** */
376 delete [] overlaps;
377 if(old.overlaps) {
378 overlaps = new float*[sequenNumber];
379 for(i = 0; i < sequenNumber; i++) {
380 overlaps[i] = new float[sequenNumber];
381 for(j = 0; j < sequenNumber; j++)
382 overlaps[i][j] = old.overlaps[i][j];
383 }
384 } else overlaps = NULL;
385 /* ***** ***** ***** ***** ***** ***** ***** ***** */
386
387 /* ***** ***** ***** ***** ***** ***** ***** ***** */
388 delete sgaps;
389 sgaps = NULL;
390
391 delete scons;
392 scons = NULL;
393
394 delete seqMatrix;
395 seqMatrix = old.seqMatrix;
396 /* ***** ***** ***** ***** ***** ***** ***** ***** */
397 }
398 /* ***** ***** ***** ***** ***** ***** ***** ***** */
399 return *this;
400 }
401
402 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
403 /* Class destructor */
404 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
405
406 alignment::~alignment(void) {
407 int i;
408
409 /* ***** ***** ***** ***** ***** ***** ***** ***** */
410 if(sequences != NULL)
411 delete [] sequences;
412 sequences = NULL;
413
414 if(seqsName != NULL)
415 delete [] seqsName;
416 seqsName = NULL;
417 /* ***** ***** ***** ***** ***** ***** ***** ***** */
418
419 /* ***** ***** ***** ***** ***** ***** ***** ***** */
420 if(residuesNumber != NULL)
421 delete [] residuesNumber;
422 residuesNumber = NULL;
423 /* ***** ***** ***** ***** ***** ***** ***** ***** */
424
425 /* ***** ***** ***** ***** ***** ***** ***** ***** */
426 if(seqsInfo != NULL)
427 delete [] seqsInfo;
428 seqsInfo = NULL;
429
430 if(saveResidues != NULL)
431 delete[] saveResidues;
432 saveResidues = NULL;
433
434 if(saveSequences != NULL)
435 delete[] saveSequences;
436 saveSequences = NULL;
437 /* ***** ***** ***** ***** ***** ***** ***** ***** */
438
439 /* ***** ***** ***** ***** ***** ***** ***** ***** */
440 if(identities != NULL) {
441 for(i = 0; i < sequenNumber; i++)
442 delete [] identities[i];
443 delete [] identities;
444 }
445 identities = NULL;
446 /* ***** ***** ***** ***** ***** ***** ***** ***** */
447
448 /* ***** ***** ***** ***** ***** ***** ***** ***** */
449 if(overlaps != NULL) {
450 for(i = 0; i < sequenNumber; i++)
451 delete [] overlaps[i];
452 delete [] overlaps;
453 }
454 overlaps = NULL;
455 /* ***** ***** ***** ***** ***** ***** ***** ***** */
456
457 /* ***** ***** ***** ***** ***** ***** ***** ***** */
458 if(sgaps != NULL)
459 delete sgaps;
460 sgaps = NULL;
461
462 if(scons != NULL)
463 delete scons;
464 scons = NULL;
465
466 if(seqMatrix != NULL)
467 delete seqMatrix;
468 seqMatrix = NULL;
469 /* ***** ***** ***** ***** ***** ***** ***** ***** */
470
471 /* ***** ***** ***** ***** ***** ***** ***** ***** */
472 oldAlignment = false;
473
474 sequenNumber = 0;
475 residNumber = 0;
476
477 isAligned = false;
478 reverse = false;
479
480 terminalGapOnly = false;
481 right_boundary = -1;
482 left_boundary = -1;
483
484 iformat = 0;
485 oformat = 0;
486 shortNames = false;
487
488 dataType = 0;
489
490 ghWindow = 0;
491 shWindow = 0;
492
493 blockSize = 0;
494
495 filename.clear();
496 aligInfo.clear();
497 /* ***** ***** ***** ***** ***** ***** ***** ***** */
498 }
499
500 // Try to load current alignment and inform otherwise
501 bool alignment::loadAlignment(char *alignmentFile) {
502
503 // Detect input alignment format - it is an strict detection procedure
504 iformat = formatInputAlignment(alignmentFile);
505 // Unless it is indicated somewhere else, output alignment format will be
506 // the same as the input one
507 oformat = iformat;
508
509 // Use the appropiate function to read input alignment
510 switch(iformat) {
511 case 1:
512 return loadClustalAlignment(alignmentFile);
513 case 3:
514 return loadNBRF_PirAlignment(alignmentFile);
515 case 8:
516 return loadFastaAlignment(alignmentFile);
517 case 11:
518 return loadPhylip3_2Alignment(alignmentFile);
519 case 12:
520 return loadPhylipAlignment(alignmentFile);
521 case 17:
522 return loadNexusAlignment(alignmentFile);
523 case 21:
524 return loadMegaInterleavedAlignment(alignmentFile);
525 case 22:
526 return loadMegaNonInterleavedAlignment(alignmentFile);
527 // Return a FALSE value - meaning the input alignment was not loaded
528 default:
529 return false;
530 }
531 return false;
532 }
533
534 // Return input alignment format - it is useful to set-up output alignment one
535 int alignment::getInputFormat(void) {
536 return iformat;
537 }
538
539 // Return output alignment format
540 int alignment::getOutputFormat(void) {
541 return oformat;
542 }
543
544 // Return whether there is one or more alignments at the input file.
545 // Currently trimAl family only support single alignments per file.
546 int alignment::typeInputFile(void) {
547 return SINGLE;
548 }
549
550 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
551 /* This function prints the alignmnet to the standard output using an
552 * appropiate function depending on the output format */
553 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
554 bool alignment::printAlignment(void){
555
556 /* ***** ***** ***** ***** ***** ***** ***** ***** */
557 if(sequences == NULL)
558 return false;
559 /* ***** ***** ***** ***** ***** ***** ***** ***** */
560
561 if((residNumber == 0) || (sequenNumber == 0)) {
562 cerr << endl << "WARNING: Output alignment has not been generated. "
563 << "It is empty" << endl << endl;
564 return true;
565 }
566
567 /* ***** ***** ***** ***** ***** ***** ***** ***** */
568 switch(oformat) {
569 case 1:
570 alignmentClustalToFile(cout);
571 break;
572 case 3:
573 alignmentNBRF_PirToFile(cout);
574 break;
575 case 8:
576 alignmentFastaToFile(cout);
577 break;
578 case 11:
579 alignmentPhylip3_2ToFile(cout);
580 break;
581 case 12:
582 alignmentPhylipToFile(cout);
583 break;
584 case 13:
585 alignmentPhylip_PamlToFile(cout);
586 break;
587 case 17:
588 alignmentNexusToFile(cout);
589 break;
590 case 21: case 22:
591 alignmentMegaToFile(cout);
592 break;
593 case 99:
594 getSequences(cout);
595 break;
596 case 100:
597 alignmentColourHTML(cout);
598 break;
599 default:
600 return false;
601 }
602 /* ***** ***** ***** ***** ***** ***** ***** ***** */
603 return true;
604 }
605
606 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
607 /* This function puts the alignment in a given file */
608 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
609 bool alignment::saveAlignment(char *destFile) {
610
611 ofstream file;
612 /* ***** ***** ***** ***** ***** ***** ***** ***** */
613 if(sequences == NULL)
614 return false;
615 /* ***** ***** ***** ***** ***** ***** ***** ***** */
616
617 if((residNumber == 0) || (sequenNumber == 0)) {
618 cerr << endl << "WARNING: Output alignment has not been generated. "
619 << "It is empty." << endl << endl;
620 return true;
621 }
622
623 /* Check whether the input sequences file is aligned or not before creating
624 * a given output format. */
625 switch(oformat) {
626 case 1: case 11: case 12: case 13: case 17: case 21: case 22:
627 /* Check whether sequences in the alignment are aligned or not.
628 * Warn about it if there are not aligned. */
629 if (!isAligned) {
630 cerr << endl << "ERROR: Sequences are not aligned. Output format is "
631 << "not compatible with unaligned sequences.";
632 return false;
633 }
634 }
635
636 /* ***** ***** ***** ***** ***** ***** ***** ***** */
637 /* File open and correct open check */
638 file.open(destFile);
639 if(!file) return false;
640 /* ***** ***** ***** ***** ***** ***** ***** ***** */
641
642 /* ***** ***** ***** ***** ***** ***** ***** ***** */
643 /* Depending on the output format, we call to the
644 * appropiate function */
645 switch(oformat) {
646 case 1:
647 alignmentClustalToFile(file);
648 break;
649 case 3:
650 alignmentNBRF_PirToFile(file);
651 break;
652 case 8:
653 alignmentFastaToFile(file);
654 break;
655 case 11:
656 alignmentPhylip3_2ToFile(file);
657 break;
658 case 12:
659 alignmentPhylipToFile(file);
660 break;
661 case 13:
662 alignmentPhylip_PamlToFile(file);
663 break;
664 case 17:
665 alignmentNexusToFile(file);
666 break;
667 case 21: case 22:
668 alignmentMegaToFile(file);
669 break;
670 case 99:
671 getSequences(file);
672 break;
673 case 100:
674 alignmentColourHTML(file);
675 break;
676 default:
677 return false;
678 /* ***** ***** ***** ***** ***** ***** ***** ***** */
679 }
680
681 /* ***** ***** ***** ***** ***** ***** ***** ***** */
682 /* Close the output file */
683 file.close();
684
685 /* ***** ***** ***** ***** ***** ***** ***** ***** */
686 /* All is OK, return true */
687 return true;
688 }
689
690 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
691 /* This function trimms a given alignment based on the gap distribution
692 * values. To trim the alignment, this function uses two parameters:
693 *
694 * baseLine: Minimum percentage of columns that should
695 * be conserved in the new alignment.
696 * gapsPct: Maximum percentage of gaps per column that
697 * should be in the new alignment.
698 *
699 * The function selects that combination of parameters that maximize the
700 * final number of columns in the new alignment. There is an extra parameter
701 * to ask for the complementary alignment. A complementary alignment consits
702 * of those columns that would delete applying the standard approach. */
703 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
704 alignment *alignment::cleanGaps(float baseLine, float gapsPct, bool complementary) {
705
706 alignment *ret;
707 double cut;
708
709 /* ***** ***** ***** ***** ***** ***** ***** ***** */
710 /* If gaps statistics are not calculated, we
711 * calculate them */
712 if(calculateGapStats() != true)
713 return NULL;
714 /* ***** ***** ***** ***** ***** ***** ***** ***** */
715
716 /* ***** ***** ***** ***** ***** ***** ***** ***** */
717 /* Obtain the cut point using the given parameters */
718 cut = sgaps -> calcCutPoint(baseLine, gapsPct);
719 /* ***** ***** ***** ***** ***** ***** ***** ***** */
720
721 /* ***** ***** ***** ***** ***** ***** ***** ***** */
722 /* Once we have the cut value proposed, we call the
723 * appropiate method to clean the alignment and, then,
724 * generate the new alignment. */
725 ret = cleanByCutValue(cut, baseLine, sgaps -> getGapsWindow(), complementary);
726 /* ***** ***** ***** ***** ***** ***** ***** ***** */
727
728 /* ***** ***** ***** ***** ***** ***** ***** ***** */
729 /* Return a reference of the new alignment */
730 return ret;
731 /* ***** ***** ***** ***** ***** ***** ***** ***** */
732 }
733
734 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
735 /* This function trimms a given alignment based on the similarity distribution
736 * values. To trim the alignment, this function uses two parameters:
737 *
738 * baseLine: Minimum percentage of columns that should
739 * be conserved in the new alignment.
740 * conservat: Minimum value of similarity per column that
741 * should be in the new alignment.
742 *
743 * The function selects that combination of parameters that maximize the
744 * final number of columns in the new alignment. There is an extra parameter
745 * to ask for the complementary alignment. A complementary alignment consits
746 * of those columns that would delete applying the standard approach. */
747 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
748 alignment *alignment::cleanConservation(float baseLine, float conservationPct, bool complementary) {
749
750 alignment *ret;
751 float cut;
752
753 /* ***** ***** ***** ***** ***** ***** ***** ***** */
754 /* If conservation's statistics are not calculated,
755 * we calculate them */
756 if(calculateConservationStats() != true)
757 return NULL;
758 /* ***** ***** ***** ***** ***** ***** ***** ***** */
759
760 /* ***** ***** ***** ***** ***** ***** ***** ***** */
761 /* Calculate the cut point using the given parameters */
762 cut = (float) scons -> calcCutPoint(baseLine, conservationPct);
763 /* ***** ***** ***** ***** ***** ***** ***** ***** */
764
765 /* ***** ***** ***** ***** ***** ***** ***** ***** */
766 /* Once we have the cut value, we call the appropiate
767 * method to clean the alignment and, then, generate
768 the new alignment */
769 ret = cleanByCutValue(cut, baseLine, scons -> getMdkwVector(), complementary);
770 /* ***** ***** ***** ***** ***** ***** ***** ***** */
771
772 /* ***** ***** ***** ***** ***** ***** ***** ***** */
773 /* Return a reference of the new alignment */
774 return ret;
775 /* ***** ***** ***** ***** ***** ***** ***** ***** */
776 }
777
778 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
779 /* This function trimms a given alignment based on the similarity and gaps
780 * distribution values. To trim the alignment, this function uses three
781 * parameters:
782 *
783 * baseLine: Minimum percentage of columns that should
784 * be conserved in the new alignment.
785 * gapsPct: Maximum percentage of gaps per column that
786 * should be in the new alignment.
787 * conservat: Minimum value of similarity per column that
788 * should be in the new alignment.
789 *
790 * The function selects that combination of parameters that maximize the
791 * final number of columns in the new alignment. If the baseLine parameter
792 * is the most strict, the other two ones would be relaxed to keep columns
793 * that would not be selected. There is an extra parameter to ask for the
794 * complementary alignment. A complementary alignment consits of those
795 * columns that would delete applying the standard approach. */
796 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
797 alignment *alignment::clean(float baseLine, float GapsPct, float conservationPct, bool complementary) {
798
799 alignment *ret;
800 float cutCons;
801 double cutGaps;
802
803 /* ***** ***** ***** ***** ***** ***** ***** ***** */
804 /* If gaps statistics are not calculated, we calculate
805 * them */
806 if(calculateGapStats() != true)
807 return NULL;
808 /* ***** ***** ***** ***** ***** ***** ***** ***** */
809
810 /* ***** ***** ***** ***** ***** ***** ***** ***** */
811 /* If conservation's statistics are not calculated,
812 * we calculate them */
813 if(calculateConservationStats() != true)
814 return NULL;
815 /* ***** ***** ***** ***** ***** ***** ***** ***** */
816
817 /* ***** ***** ***** ***** ***** ***** ***** ***** */
818 /* Calculate the two cut points using the parameters */
819 cutGaps = sgaps->calcCutPoint(baseLine, GapsPct);
820 cutCons = scons->calcCutPoint(baseLine, conservationPct);
821 /* ***** ***** ***** ***** ***** ***** ***** ***** */
822
823 /* ***** ***** ***** ***** ***** ***** ***** ***** */
824 /* Clean the alingment using the two cut values, the
825 * gapsWindow and MDK_Windows vectors and the baseline
826 * value */
827 ret = cleanByCutValue(cutGaps, sgaps -> getGapsWindow(), baseLine, cutCons, scons -> getMdkwVector(), complementary);
828 /* ***** ***** ***** ***** ***** ***** ***** ***** */
829
830 /* ***** ***** ***** ***** ***** ***** ***** ***** */
831 /* Return a reference of the clean alignment object */
832 return ret;
833 /* ***** ***** ***** ***** ***** ***** ***** ***** */
834 }
835
836 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
837 /* This function cleans a given alignment based on the consistency values
838 * asses from a dataset of alignments. The method takes three parameters:
839 *
840 * cutpoint: Lower limit (0-1) of comparefile value admits in
841 * the new alignment.
842 * baseline: Minimum percentage of columns to have in the new alignment
843 * vectValues: A vector with alignment's consistency values.
844 *
845 * The function computes the optimal parameter combination value to trim
846 * an alignment based on the consistency value from the comparison among
847 * a dataset of alignments with the same sequences. */
848 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
849 alignment *alignment::cleanCompareFile(float cutpoint, float baseLine, float *vectValues, bool complementary) {
850
851 alignment *ret;
852 float cut, *vectAux;
853
854 /* ***** ***** ***** ***** ***** ***** ***** ***** */
855 /* Allocate memory */
856 vectAux = new float[residNumber];
857 /* ***** ***** ***** ***** ***** ***** ***** ***** */
858
859 /* ***** ***** ***** ***** ***** ***** ***** ***** */
860 /* Sort a copy of the vectValues vector, and take the
861 * value at 100% - baseline position. */
862 utils::copyVect((float *) vectValues, vectAux, residNumber);
863 utils::quicksort(vectAux, 0, residNumber-1);
864 cut = vectAux[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)];
865 /* ***** ***** ***** ***** ***** ***** ***** ***** */
866
867 /* ***** ***** ***** ***** ***** ***** ***** ***** */
868 /* We have to decide which is the smallest value
869 * between the cutpoint value and the value from
870 * the minimum percentage threshold */
871 cut = cutpoint < cut ? cutpoint : cut;
872 /* ***** ***** ***** ***** ***** ***** ***** ***** */
873
874 /* ***** ***** ***** ***** ***** ***** ***** ***** */
875 /* Deallocate memory */
876 delete [] vectAux;
877 /* ***** ***** ***** ***** ***** ***** ***** ***** */
878
879 /* ***** ***** ***** ***** ***** ***** ***** ***** */
880 /* Clean the selected alignment using the input parameters. */
881 ret = cleanByCutValue(cut, baseLine, vectValues, complementary);
882 /* ***** ***** ***** ***** ***** ***** ***** ***** */
883
884 /* ***** ***** ***** ***** ***** ***** ***** ***** */
885 /* Return a refernce of the new alignment */
886 return ret;
887 /* ***** ***** ***** ***** ***** ***** ***** ***** */
888 }
889
890 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
891 /* This function removes those sequences that are misaligned with the rest
892 * of sequences in the alignment at given sequence and residue overlaps.
893 *
894 * overlapColumn: Set the minimum similarity fraction value that has
895 * to have a position from a given sequence to be
896 * considered as an hit.
897 * minimumOverlap: Set the minimum proportion of hits that has to be
898 * a sequence to keep it in the new alignment.
899 *
900 * At the same time, it's possible to get an alignment with those columns
901 * that this method should remove setting for that purpose the complementary
902 * parameter to True */
903 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
904 alignment *alignment::cleanSpuriousSeq(float overlapColumn, float minimumOverlap, bool complementary) {
905
906 float *overlapVector;
907 alignment *newAlig;
908
909 /* ***** ***** ***** ***** ***** ***** ***** ***** */
910 /* Allocate local memory */
911 overlapVector = new float[sequenNumber];
912 /* ***** ***** ***** ***** ***** ***** ***** ***** */
913
914 /* ***** ***** ***** ***** ***** ***** ***** ***** */
915 /* Compute the overlap's vector using the overlap
916 * column's value */
917 if(!calculateSpuriousVector(overlapColumn, overlapVector))
918 return NULL;
919 /* ***** ***** ***** ***** ***** ***** ***** ***** */
920
921 /* ***** ***** ***** ***** ***** ***** ***** ***** */
922 /* Select and remove the sequences with a overlap less
923 * than threshold's overlap and create a new alignemnt */
924 newAlig = cleanOverlapSeq(minimumOverlap, overlapVector, complementary);
925 /* ***** ***** ***** ***** ***** ***** ***** ***** */
926
927 /* ***** ***** ***** ***** ***** ***** ***** ***** */
928 /* Deallocate local memory */
929 delete [] overlapVector;
930 /* ***** ***** ***** ***** ***** ***** ***** ***** */
931
932 /* ***** ***** ***** ***** ***** ***** ***** ***** */
933 /* Return a reference of the clean alignment object */
934 return newAlig;
935 /* ***** ***** ***** ***** ***** ***** ***** ***** */
936 }
937
938 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
939 /* This function implements the gappyout approach. To trim the alignment, this
940 * method computes the slope from the gaps distribution from the input alig.
941 * Using this information, the method asses the most abrupt change between
942 * three consecutive points from that distribution and selects the first of
943 * this point as aan cut-off point.
944 *
945 * The method also can return the complementary alignment that consists of
946 * those columns that would be delete applying this algorithm. */
947 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
948 alignment *alignment::clean2ndSlope(bool complementarity) {
949
950 alignment *ret;
951 int cut;
952
953 /* ***** ***** ***** ***** ***** ***** ***** ***** */
954 /* If gaps statistics are not calculated, we calculate
955 * them */
956 if(calculateGapStats() != true)
957 return NULL;
958 /* ***** ***** ***** ***** ***** ***** ***** ***** */
959
960 /* ***** ***** ***** ***** ***** ***** ***** ***** */
961 /* We get the cut point using a automatic method for
962 * this purpose. */
963 cut = sgaps -> calcCutPoint2ndSlope();
964 /* ***** ***** ***** ***** ***** ***** ***** ***** */
965
966 /* ***** ***** ***** ***** ***** ***** ***** ***** */
967 /* Using the cut point calculates in last steps, we
968 * clean the alignment and generate a new alignment */
969 ret = cleanByCutValue(cut, 0, sgaps->getGapsWindow(), complementarity);
970 /* ***** ***** ***** ***** ***** ***** ***** ***** */
971
972 /* ***** ***** ***** ***** ***** ***** ***** ***** */
973 /* Returns the new alignment. */
974 return ret;
975 /* ***** ***** ***** ***** ***** ***** ***** ***** */
976 }
977
978 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
979 /* This method implements the strict and strictplus approaches, to apply
980 * these algorithms, the program computes the gaps and similarity distribution
981 * values to decide which ones are the optimal cutoff points. To compute these
982 * cutoff points, the method applies those steps described in the manual. */
983 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
984 alignment *alignment::cleanCombMethods(bool complementarity, bool variable) {
985
986 float simCut, first20Point, last80Point, *simil, *vectAux;
987 int i, j, acm, gapCut, *positions, *gaps;
988 double inic, fin, vlr;
989
990 /* ***** ***** ***** ***** ***** ***** ***** ***** */
991 /* If gaps statistics are not calculated, we calculate
992 * them */
993 if(calculateGapStats() != true)
994 return NULL;
995 /* ***** ***** ***** ***** ***** ***** ***** ***** */
996
997 /* ***** ***** ***** ***** ***** ***** ***** ***** */
998 /* Computes the gap cut point using a automatic method
999 * and at the same time, we get the gaps values from
1000 * the alignment. */
1001 gapCut = sgaps -> calcCutPoint2ndSlope();
1002 gaps = sgaps -> getGapsWindow();
1003 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1004
1005 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1006 /* If conservation's statistics are not calculated,
1007 * we calculate them */
1008 if(calculateConservationStats() != true)
1009 return NULL;
1010
1011 // Once the conservation stats have been calculated, generate the vector
1012 // containing the similarity value for each column considering parameters
1013 // such as windows size
1014 simil = scons -> getMdkwVector();
1015 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1016
1017 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1018 /* Allocate local memory and initializate it to -1 */
1019 positions = new int[residNumber];
1020 utils::initlVect(positions, residNumber, -1);
1021 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1022
1023 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1024 /* The method only selects columns with gaps number
1025 * less or equal than the gap's cut point. Counts the
1026 * number of columns that have been selected */
1027 for(i = 0, acm = 0; i < residNumber; i++) {
1028 if(gaps[i] <= gapCut) {
1029 positions[i] = i;
1030 acm++;
1031 }
1032 }
1033 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1034
1035 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1036 /* Allocate local memory and save the similaritys
1037 * values for the columns that have been selected */
1038 vectAux = new float[acm];
1039 for(i = 0, j = 0; i < residNumber; i++)
1040 if(positions[i] != -1)
1041 vectAux[j++] = simil[i];
1042 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1043
1044 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1045 /* Sort the conservation's value vector. */
1046 utils::quicksort(vectAux, 0, acm-1);
1047
1048 /* ...and search for the vector points at the 20 and
1049 * 80% of length. */
1050 first20Point = 0;
1051 last80Point = 0;
1052 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1053
1054 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1055 for(i = acm - 1, j = 1; i >= 0; i--, j++) {
1056 if((((float) j/acm) * 100.0) <= 20.0)
1057 first20Point = vectAux[i];
1058 if((((float) j/acm) * 100.0) <= 80.0)
1059 last80Point = vectAux[i];
1060 }
1061 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1062
1063 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1064 /* Computes the logaritmic's values for those points.
1065 * Finally the method computes the similarity cut
1066 * point using these values. */
1067 inic = log10(first20Point);
1068 fin = log10(last80Point);
1069 vlr = ((inic - fin) / 10) + fin;
1070 simCut = (float) pow(10, vlr);
1071 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1072
1073 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1074 /* Clean the alignment and generate a new alignment
1075 * object using the gaps cut and the similaritys cut
1076 * values */
1077 alignment *ret = cleanStrict(gapCut, sgaps -> getGapsWindow(),
1078 simCut, scons -> getMdkwVector(), complementarity, variable);
1079 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1080
1081 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1082 /* Deallocate local memory */
1083 delete [] vectAux;
1084 delete [] positions;
1085 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1086
1087 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1088 /* Return a reference of the new alignment */
1089 return ret;
1090 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1091 }
1092
1093 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1094 /* This function allows to delete those columns consistents of only gaps.
1095 * This function is specially useful when we remove misaligned sequences from
1096 * a given alignmnet. As the rest of functions, this function can return the
1097 * complementary alignment but this alignment would have only columns with
1098 * all gaps. */
1099 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1100 alignment *alignment::cleanNoAllGaps(bool complementarity) {
1101
1102 alignment *ret;
1103
1104 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1105 /* If gaps statistics are not calculated, we calculate
1106 * them */
1107 if(calculateGapStats() != true)
1108 return NULL;
1109 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1110
1111 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1112 /* We want to conserve the columns with gaps' number
1113 * less or equal than sequences' number - 1 */
1114 ret = cleanByCutValue((sequenNumber - 1), 0, sgaps->getGapsWindow(), complementarity);
1115 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1116
1117 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1118 /* Returns the new alignment. */
1119 return ret;
1120 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1121 }
1122
1123 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1124 /* This method computes the overlap for each sequence given a overlap value.
1125 * This overlap set the minimum fraction that has to be a position for the
1126 * selected sequence to count as an hit. This proportion measures how much
1127 * is similar, (in terms of residues, indetermination and gaps), a given
1128 * position respect to the element on its column. */
1129 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1130 bool alignment::calculateSpuriousVector(float overlap, float *spuriousVector) {
1131
1132 int i, j, k, seqValue, ovrlap, hit;
1133 float floatOverlap;
1134 char indet;
1135
1136 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1137 /* Compute the overlap */
1138 floatOverlap = overlap * float(sequenNumber-1);
1139 ovrlap = int(overlap * (sequenNumber-1));
1140
1141 if(floatOverlap > float(ovrlap))
1142 ovrlap++;
1143 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1144
1145 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1146 /* If the spurious vectos is NULL, returns false. */
1147 if(spuriousVector == NULL)
1148 return false;
1149 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1150
1151 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1152 /* Depending on the kind of alignment, we have
1153 * different indetermination symbol */
1154 if(getTypeAlignment() == AAType)
1155 indet = 'X';
1156 else
1157 indet = 'N';
1158 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1159
1160 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1161 /* For each alignment's sequence, computes its overlap */
1162 for(i = 0, seqValue = 0; i < sequenNumber; i++, seqValue = 0) {
1163
1164 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1165 /* For each alignment's column, computes the overlap
1166 * between the selected sequence and the other ones */
1167 for(j = 0; j < residNumber; j++) {
1168
1169 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1170 /* For sequences are before the sequence selected */
1171 for(k = 0, hit = 0; k < i; k++) {
1172
1173 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1174 /* If the element of sequence selected is the same
1175 * that the element of sequence considered, computes
1176 * a hit */
1177 if(sequences[i][j] == sequences[k][j])
1178 hit++;
1179
1180 /* If the element of sequence selected isn't a 'X' nor
1181 * 'N' (indetermination) or a '-' (gap) and the element
1182 * of sequence considered isn't a a 'X' nor 'N'
1183 * (indetermination) or a '-' (gap), computes a hit */
1184 else if((sequences[i][j] != indet) && (sequences[i][j] != '-')
1185 && (sequences[k][j] != indet) && (sequences[k][j] != '-'))
1186 hit++;
1187 }
1188 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1189
1190 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1191 /* For sequences are after the sequence selected */
1192 for(k = (i + 1); k < sequenNumber; k++) {
1193
1194 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1195 /* If the element of sequence selected is the same
1196 * that the element of sequence considered, computes
1197 * a hit */
1198 if(sequences[i][j] == sequences[k][j])
1199 hit++;
1200
1201 /* If the element of sequence selected isn't a 'X' nor
1202 * 'N' (indetermination) or a '-' (gap) and the element
1203 * of sequence considered isn't a a 'X' nor 'N'
1204 * (indetermination) or a '-' (gap), computes a hit */
1205 else if((sequences[i][j] != indet) && (sequences[i][j] != '-')
1206 && (sequences[k][j] != indet) && (sequences[k][j] != '-'))
1207 hit++;
1208 }
1209 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1210
1211 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1212 /* Finally, if the hit's number divided by number of
1213 * sequences minus one is greater or equal than
1214 * overlap's value, computes a column's hit. */
1215 if(hit >= ovrlap)
1216 seqValue++;
1217 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1218 }
1219
1220 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1221 /* For each alignment's sequence, computes its spurious's
1222 * or overlap's value as the column's hits -for that
1223 sequence- divided by column's number. */
1224 spuriousVector[i] = ((float) seqValue / residNumber);
1225 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1226 }
1227
1228 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1229 /* If there is not problem in the method, return true */
1230 return true;
1231 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1232 }
1233
1234 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1235 /* This method compute the cluster belongs to a given alignment at certain
1236 * identity threshold. To know the clusters from the alignment, those
1237 * clusters are calculated as follow: 1) Select the longest sequences. 2)
1238 * Using the previous computed identity values, compare if the second
1239 * longest sequence belongs, because the identity value with the cluster
1240 * representative is equal or greater than a given threshold, to this cluster
1241 * or not. 3) If the sequence belongs to this cluster, we add it, if not
1242 * belongs, we create a new cluster and fix this sequence as its representative
1243 * 4) Continue with the rest of sequences. In the case that a given sequence
1244 * can belong to more than one cluster, we choose the cluster which one
1245 * maximize the identity value respect to its representative sequence */
1246 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1247 int *alignment::calculateRepresentativeSeq(float maximumIdent) {
1248
1249 int i, j, pos, clusterNum, **seqs;
1250 int *cluster;
1251 static int *repres;
1252 float max;
1253
1254 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1255 /* Ask for the sequence identities assesment */
1256 if(identities == NULL)
1257 calculateSeqIdentity();
1258 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1259
1260 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1261 seqs = new int*[sequenNumber];
1262 for(i = 0; i < sequenNumber; i++) {
1263 seqs[i] = new int[2];
1264 seqs[i][0] = utils::removeCharacter('-', sequences[i]).size();
1265 seqs[i][1] = i;
1266 }
1267 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1268
1269 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1270 utils::quicksort(seqs, 0, sequenNumber-1);
1271 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1272
1273 cluster = new int[sequenNumber];
1274 cluster[0] = seqs[sequenNumber - 1][1];
1275 clusterNum = 1;
1276
1277 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1278 for(i = sequenNumber - 2; i >= 0; i--) {
1279
1280 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1281 for(j = 0, max = 0, pos = -1; j < clusterNum; j++) {
1282 if(identities[seqs[i][1]][cluster[j]] > maximumIdent) {
1283 if(identities[seqs[i][1]][cluster[j]] > max) {
1284 max = identities[seqs[i][1]][cluster[j]];
1285 pos = j;
1286 }
1287 }
1288 }
1289 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1290
1291 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1292 if(pos == -1) {
1293 cluster[j] = seqs[i][1];
1294 clusterNum++;
1295 }
1296 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1297 }
1298
1299 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1300 repres = new int[clusterNum + 1];
1301 repres[0] = clusterNum;
1302 for(i = 0; i < clusterNum; i++)
1303 repres[i+1] = cluster[i];
1304 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1305
1306 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1307 /* Deallocate dinamic memory */
1308 for(i = 0; i < sequenNumber; i++)
1309 delete [] seqs[i];
1310
1311 delete [] cluster;
1312 delete [] seqs;
1313 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1314
1315 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1316 return repres;
1317 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1318 }
1319
1320 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1321 /* This method looks for the optimal cut point for a given clusters number.
1322 * The idea is to find a identity cut-off point that can be used to get a
1323 * number of representative sequences similar to the input parameter */
1324 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1325 float alignment::getCutPointClusters(int clusterNumber) {
1326
1327 float max, min, avg, gMax, gMin, startingPoint, prevValue = 0, iter = 0;
1328 int i, j, clusterNum, *cluster, **seqs;
1329
1330 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1331 /* If the user wants only one cluster means that all
1332 * of sequences have to be in the same clauster.
1333 * Otherwise, if the users wants the maximum number of
1334 * clusters means that each sequence have to be in their
1335 * own cluster */
1336 if(clusterNumber == sequenNumber) return 1;
1337 else if(clusterNumber == 1) return 0;
1338 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1339
1340 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1341 /* Ask for the sequence identities assesment */
1342 if(identities == NULL)
1343 calculateSeqIdentity();
1344 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1345
1346 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1347 /* Compute the maximum, the minimum and the average
1348 * identity values from the sequences */
1349 for(i = 0,gMax = 0, gMin = 1, startingPoint = 0; i < sequenNumber; i++) {
1350 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1351 for(j = 0, max = 0, avg = 0, min = 1; j < i; j++) {
1352 if(max < identities[i][j])
1353 max = identities[i][j];
1354 if(min > identities[i][j])
1355 min = identities[i][j];
1356 avg += identities[i][j];
1357 }
1358 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1359 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1360 for(j = i + 1; j < sequenNumber; j++) {
1361 if(max < identities[i][j])
1362 max = identities[i][j];
1363 if(min > identities[i][j])
1364 min = identities[i][j];
1365 avg += identities[i][j];
1366 }
1367 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1368 startingPoint += avg / (sequenNumber - 1);
1369 if(max > gMax) gMax = max;
1370 if(min < gMin) gMin = min;
1371 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1372 }
1373 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1374
1375 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1376 /* Take the starting point as the average value */
1377 startingPoint /= sequenNumber;
1378 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1379
1380 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1381 /* Compute and sort the sequence length */
1382 seqs = new int*[sequenNumber];
1383 for(i = 0; i < sequenNumber; i++) {
1384 seqs[i] = new int[2];
1385 seqs[i][0] = utils::removeCharacter('-', sequences[i]).size();
1386 seqs[i][1] = i;
1387 }
1388 utils::quicksort(seqs, 0, sequenNumber-1);
1389 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1390
1391 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1392 /* Create the data structure to store the different
1393 * clusters for a given thresholds */
1394 cluster = new int[sequenNumber];
1395 cluster[0] = seqs[sequenNumber - 1][1];
1396 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1397
1398 /**** ***** ***** ***** ***** ***** ***** ***** */
1399 /* Look for the optimal identity value to get the
1400 * number of cluster set by the user. To do that, the
1401 * method starts in the average identity value and moves
1402 * to the right or lefe side of this value depending on
1403 * if this value is so strict or not. We set an flag to
1404 * avoid enter in an infinite loop, if we get the same
1405 * value for more than 10 consecutive point without get
1406 * the given cluster number means that it's impossible
1407 * to get this cut-off and we need to stop the search */
1408 do {
1409 clusterNum = 1;
1410 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1411 /* Start the search */
1412 for(i = sequenNumber - 2; i >= 0; i--) {
1413 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1414 for(j = 0; j < clusterNum; j++)
1415 if(identities[seqs[i][1]][cluster[j]] > startingPoint)
1416 break;
1417 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1418
1419 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1420 if(j == clusterNum) {
1421 cluster[j] = seqs[i][1];
1422 clusterNum++;
1423 }
1424 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1425 }
1426 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1427
1428 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1429 /* Given the cutoff point, if we get the same cluster
1430 * number or we have been getting for more than 10
1431 * consecutive times the same cutoff point, we stop */
1432 if((clusterNum == clusterNumber) || (iter > 10))
1433 break;
1434 else {
1435 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1436 /* We have to move to the left side from the average
1437 * to decrease the cutoff value and get a smaller number
1438 * of clusters */
1439 if(clusterNum > clusterNumber) {
1440 gMax = startingPoint;
1441 startingPoint = (gMax + gMin)/2;
1442 } else {
1443 /* In the opposite side, we have to move to the right
1444 * side from the cutting point to be more strict and get
1445 * a bigger number of clusters */
1446 gMin = startingPoint;
1447 startingPoint = (gMax + gMin)/2;
1448 }
1449 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1450
1451 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1452 /* If the clusters number from the previous iteration
1453 * is different from the current number, we put the
1454 * iteration number to 0 and store this new value */
1455 if(prevValue != clusterNum) {
1456 iter = 0;
1457 prevValue = clusterNum;
1458 /* Otherwise, we increase the iteration number to
1459 * avoid enter in an infinitve loop */
1460 } else iter++;
1461 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1462 }
1463 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1464 } while(true);
1465
1466 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1467 /* Deallocate dinamic memory */
1468 for(i = 0; i < sequenNumber; i++) delete [] seqs[i];
1469 delete [] seqs;
1470 delete [] cluster;
1471 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1472
1473 return startingPoint;
1474 }
1475
1476 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1477 /* This method select one representative sequence (the longest one) per each
1478 * cluster from the input alignment and generate a new alignment */
1479 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1480 alignment *alignment::getClustering(float identityThreshold) {
1481
1482 string *matrixAux, *newSeqsName;
1483 int i, j, *clustering;
1484 alignment *newAlig;
1485
1486 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1487 /* Get the representative member for each cluster
1488 * given a maximum identity threshold */
1489 clustering = calculateRepresentativeSeq(identityThreshold);
1490 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1491
1492 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1493 /* Put all sequences to be deleted and get back those
1494 * sequences that are representative for each cluster
1495 * */
1496 for(i = 0; i < sequenNumber; i ++)
1497 saveSequences[i] = -1;
1498 for(i = 1; i <= clustering[0]; i ++)
1499 saveSequences[clustering[i]] = clustering[i];
1500 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1501
1502 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1503 /* We allocate memory to save the sequences selected */
1504 matrixAux = new string[clustering[0]];
1505 newSeqsName = new string[clustering[0]];
1506
1507 /* Copy to new structures the information that have
1508 * been selected previously. */
1509 for(i = 0, j = 0; i < sequenNumber; i++)
1510 if(saveSequences[i] != -1) {
1511 newSeqsName[j] = seqsName[i];
1512 matrixAux[j] = sequences[i];
1513 j++;
1514 }
1515 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1516
1517 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1518 /* When we have all parameters, we create the new
1519 * alignment */
1520 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
1521 clustering[0], residNumber, iformat, oformat, shortNames, dataType, isAligned,
1522 reverse, terminalGapOnly, left_boundary, right_boundary,
1523 keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber,
1524 saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities,
1525 overlaps);
1526 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1527
1528 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1529 /* Deallocated auxiliar memory */
1530 delete [] matrixAux;
1531 delete [] newSeqsName;
1532 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1533
1534 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1535 /* Return the new alignment reference */
1536 return newAlig;
1537 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1538 }
1539
1540 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1541 /* This function returns the backtranslation for a given protein processed
1542 * alignment into its CDS alignment. To do this convertion, the function needs
1543 * the Coding sequences as well the original composition of each protein
1544 * sequence. Also, the function needs to know which columns/sequences will be
1545 * in the final alignment to carray out the conversion. */
1546 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1547 alignment *alignment::getTranslationCDS(int newResidues, int newSequences, int *ColumnsToKeep, string *oldSeqsName, sequencesMatrix *seqMatrix, alignment *ProtAlig) {
1548
1549 string *matrixAux;
1550 alignment *newAlig;
1551 int i, j, k, l, oldResidues;
1552 int *mappedSeqs, *tmpSequence, *selectedRes;
1553
1554 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1555 /* Map the selected protein sequences to the input
1556 * coding sequences */
1557 mappedSeqs = new int[newSequences];
1558 for(i = 0; i < sequenNumber; i++)
1559 for(j = 0; j < newSequences; j++)
1560 if(!seqsName[i].compare(oldSeqsName[j])) {
1561 mappedSeqs[j] = i;
1562 break;
1563 }
1564 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1565
1566 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1567 /* Get the original alignment size as well the original
1568 * selected/non-selected columns */
1569 oldResidues = seqMatrix -> getResidNumber();
1570 selectedRes = new int[oldResidues];
1571
1572 for(i = 0; i < oldResidues; i++)
1573 selectedRes[i] = i;
1574
1575 for(j = 0; j < ColumnsToKeep[0]; j++)
1576 selectedRes[j] = -1;
1577
1578 for(i = 0; i < newResidues - 1; i++)
1579 for(j = ColumnsToKeep[i] + 1; j < ColumnsToKeep[i+1]; j++)
1580 selectedRes[j] = -1;
1581
1582 for(j = ColumnsToKeep[newResidues - 1] + 1; j < oldResidues; j++)
1583 selectedRes[j] = -1;
1584 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1585
1586 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1587 /* We allocate memory to save the columns selected */
1588 matrixAux = new string[newSequences];
1589 tmpSequence = new int[oldResidues];
1590
1591 /* Using the information about which residues for each
1592 * sequence was selected by others function, we process
1593 * these residues to recover the corresponding codons */
1594 for(i = 0; i < newSequences; i++)
1595 if(seqMatrix -> getSequence(oldSeqsName[i], tmpSequence)) {
1596 for(j = 0; j < oldResidues; j++) {
1597 if((selectedRes[j] != -1) && (tmpSequence[j] != 0)) {
1598
1599 for(k = 3 * (tmpSequence[j] - 1), l = 0; l < 3; k++, l++) {
1600 /* Check whether the nucleotide sequences end has been reached or not.
1601 * If it has been reached, complete backtranslation using indetermination
1602 * symbols 'N' */
1603 if((int) sequences[mappedSeqs[i]].length() > k)
1604 matrixAux[i].resize(matrixAux[i].size() + 1, sequences[mappedSeqs[i]][k]);
1605 else
1606 matrixAux[i].resize(matrixAux[i].size() + 1, 'N');
1607 }
1608 } else if(selectedRes[j] != -1) {
1609 matrixAux[i].resize(matrixAux[i].size() + 3, '-');
1610 }
1611 }
1612 /* If there is any problems with a sequence then
1613 * the function returns an error */
1614 } else {
1615 return NULL;
1616 }
1617 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1618
1619
1620 /* Remove columns/sequences composed only by gaps before making the retrotranslation */
1621
1622
1623 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1624 /* When we have all parameters, we create the new
1625 * alignment */
1626 newAlig = new alignment(filename, "", matrixAux, oldSeqsName, NULL, newSequences,
1627 newResidues * 3, ProtAlig -> getInputFormat(), ProtAlig -> getOutputFormat(),
1628 ProtAlig -> getShortNames(), DNAType, true, ProtAlig -> getReverse(),
1629 terminalGapOnly, left_boundary, right_boundary,
1630 keepSequences, keepHeader, sequenNumber, oldResidues * 3, NULL, NULL,
1631 NULL, 0, 0, ProtAlig -> getBlockSize(), NULL, NULL);
1632 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1633
1634 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1635 /* Deallocated auxiliar memory */
1636 delete [] matrixAux;
1637 delete mappedSeqs;
1638 delete tmpSequence;
1639 delete selectedRes;
1640 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1641
1642 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1643 /* Return the new alignment reference */
1644 return newAlig;
1645 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1646 }
1647
1648 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1649 /* This function computes the gaps statistics for the input alignment. */
1650 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1651 bool alignment::calculateGapStats(void) {
1652
1653 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1654 /* If alignment matrix is not created, return false */
1655 if(sequences == NULL)
1656 return false;
1657 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1658
1659 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1660 /* If sgaps object is not created, we create them
1661 and calculate the statistics */
1662 if(sgaps == NULL) {
1663 sgaps = new statisticsGaps(sequences, sequenNumber, residNumber, dataType);
1664 sgaps -> applyWindow(ghWindow);
1665 }
1666 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1667
1668 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1669 return true;
1670 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1671 }
1672
1673 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1674 /* Print the gaps value for each column in the alignment */
1675 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1676 void alignment::printStatisticsGapsColumns(void) {
1677
1678 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1679 /* Check if there is computed the gaps statistics */
1680 if(calculateGapStats())
1681 /* then prints the information */
1682 sgaps -> printGapsColumns();
1683 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1684 }
1685
1686 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1687 /* Print the acumulative gaps distribution value from the input alignment */
1688 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1689 void alignment::printStatisticsGapsTotal(void) {
1690
1691 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1692 /* Check it there is computed the gaps statistics */
1693 if(calculateGapStats())
1694 /* then prints the information */
1695 sgaps -> printGapsAcl();
1696 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1697 }
1698
1699 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1700 /* Set the similarity matrix. This matrix is necessary for some methods in
1701 * the program */
1702 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1703 bool alignment::setSimilarityMatrix(similarityMatrix *sm) {
1704
1705 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1706 /* If scons object is not created, we create them */
1707 if(scons == NULL)
1708 scons = new statisticsConservation(sequences, sequenNumber, residNumber, dataType);
1709 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1710
1711 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1712 /* Associate the matrix to the similarity statistics
1713 * object */
1714 if(!scons -> setSimilarityMatrix(sm))
1715 return false;
1716 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1717
1718 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1719 /* If it's OK, we return true */
1720 return true;
1721 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1722 }
1723
1724 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1725 /* This method compute the similarity values from the input alignment if it
1726 * has not been computed before */
1727 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1728 bool alignment::calculateConservationStats(void) {
1729
1730 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1731 /* It the gaps statistics object has not been created
1732 * we create it */
1733 if(calculateGapStats() != true)
1734 return false;
1735 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1736
1737 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1738 /* It the similarity statistics object has not been
1739 * created we create it */
1740 if(scons == NULL)
1741 return false;
1742 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1743
1744 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1745 /* Ask for the similarity matrix */
1746 if(scons -> isSimMatrixDef() != true)
1747 return false;
1748 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1749
1750 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1751 /* Compute the similarity statistics from the input
1752 * alignment */
1753 if(!scons -> calculateVectors(sequences, sgaps->getGapsWindow()))
1754 return false;
1755
1756 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1757 /* Ask to know if it is necessary to apply any window
1758 * method. If it's necessary, we apply it */
1759 if(scons->isDefinedWindow())
1760 return true;
1761 else
1762 return scons->applyWindow(shWindow);
1763 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1764 }
1765
1766 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1767 /* Print the similarity value for each column from the alignment */
1768 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1769 void alignment::printStatisticsConservationColumns(void) {
1770
1771 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1772 /* Check if the similarity statistics object has been
1773 * created */
1774 if(calculateConservationStats())
1775 /* then prints the information */
1776 scons -> printConservationColumns();
1777 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1778 }
1779
1780 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1781 /* Print the accumulative similarity distribution values from the alignment */
1782 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1783 void alignment::printStatisticsConservationTotal(void) {
1784
1785 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1786 /* Check if the similarity statistics object has been
1787 * created */
1788 if(calculateConservationStats())
1789 /* then prints the information */
1790 scons -> printConservationAcl();
1791 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1792 }
1793
1794 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1795 /* This method prints the correspondece between the columns in the original
1796 * and in the trimmed alignment */
1797 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1798 void alignment::printCorrespondence(void) {
1799 int i;
1800
1801 cout << "#ColumnsMap\t";
1802 /* Print the saveResidues relathionship */
1803 for(i = 0; i < residNumber - 1; i++)
1804 cout << saveResidues[i] << ", ";
1805 cout << saveResidues[i] << endl;
1806
1807 }
1808
1809 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1810 /* Return the number of the sequences in the alignment */
1811 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1812 int alignment::getNumSpecies(void) {
1813 return sequenNumber;
1814 }
1815
1816 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1817 /* Return the number of residues in the alignment */
1818 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1819 int alignment::getNumAminos(void) {
1820 return residNumber;
1821 }
1822
1823 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1824 /* Sets the condition to remove only terminal gaps after applying any
1825 * trimming method or not. */
1826 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1827 void alignment::trimTerminalGaps(bool terminalOnly_, int *boundaries_) {
1828 terminalGapOnly = terminalOnly_;
1829
1830 /* We are only interested in the first and last number of the vector - this
1831 * vector is generated by other function where it is important to store all
1832 * intervals */
1833 if (boundaries_ != NULL) {
1834 left_boundary = boundaries_[0];
1835 right_boundary = boundaries_[1];
1836 }
1837 }
1838
1839 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1840 /* This method stores the diferent windows values in the alignment object */
1841 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1842 void alignment::setWindowsSize(int ghWindow_, int shWindow_) {
1843 ghWindow = ghWindow_;
1844 shWindow = shWindow_;
1845 }
1846
1847 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1848 /* This method lets to change the output format */
1849 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1850 void alignment::setOutputFormat(int format_, bool shortNames_) {
1851 oformat = format_;
1852 shortNames = shortNames_;
1853 }
1854
1855 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1856 /* This method sets a new block size value */
1857 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1858 void alignment::setBlockSize(int blockSize_) {
1859 blockSize = blockSize_;
1860 }
1861
1862 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1863 /* Return true if the shortNames flag has been setted */
1864 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1865 int alignment::getShortNames(void) {
1866 return shortNames;
1867 }
1868
1869 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1870 /* Return true if the reverse flag has been setted. */
1871 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1872 int alignment::getReverse(void) {
1873 return reverse;
1874 }
1875
1876 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1877 /* This method lets to change the output alignment orientation */
1878 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1879 void alignment::setReverse(void) {
1880 reverse = true;
1881 }
1882
1883
1884 /* Set appropiate flag to decide whether sequences composed only by gaps should
1885 * be kept or not */
1886 void alignment::setKeepSequencesFlag(bool flag) {
1887 keepSequences = flag;
1888 }
1889
1890 /* Set appropiate flag to decide whether original sequences header should be
1891 * dumped into the output stream with or without any preprocessing step */
1892 void alignment::setKeepSeqsHeaderFlag(bool flag) {
1893 keepHeader = flag;
1894 }
1895 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1896 /* Return the block size value */
1897 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1898 int alignment::getBlockSize(void) {
1899 return blockSize;
1900 }
1901
1902 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1903 /* This method returns the sequences name */
1904 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1905 void alignment::getSequences(string *Names) {
1906 for(int i = 0; i < sequenNumber; i++)
1907 Names[i] = seqsName[i];
1908 }
1909
1910 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1911 void alignment::getSequences(string *Names, int *lengths) {
1912 for(int i = 0; i < sequenNumber; i++) {
1913 lengths[i] = (int) utils::removeCharacter('-', sequences[i]).length();
1914 Names[i] = seqsName[i];
1915 }
1916 }
1917
1918 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1919 void alignment::getSequences(string *Names, string *Sequences, int *Lengths) {
1920 for(int i = 0; i < sequenNumber; i++) {
1921 Names[i] = seqsName[i];
1922 Sequences[i] = utils::removeCharacter('-', sequences[i]);
1923 Lengths[i] = (int) Sequences[i].length();
1924 }
1925 }
1926
1927
1928 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1929 /* For a given set of sequences name, check for its correspondence with
1930 * the current sequences name. If there is not a complete correspondence
1931 * between both sets, the method return false, otherwise, return true */
1932 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1933 bool alignment::getSeqNameOrder(string *names, int *orderVector) {
1934 int i, j, numNames;
1935
1936 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1937 /* For each name in the current alignment, we look
1938 * for its correspondence in the input set */
1939 for(i = 0, numNames = 0; i < sequenNumber; i++) {
1940 for(j = 0; j < sequenNumber; j++) {
1941 if(seqsName[i] == names[j]) {
1942 orderVector[i] = j;
1943 numNames++;
1944 break;
1945 }
1946 }
1947 }
1948 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1949
1950 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1951 /* Depending on if we get the complete correspondence
1952 * between both sets of names, we return true or not */
1953 if(numNames == sequenNumber) return true;
1954 else return false;
1955 /* ***** ***** ***** ***** ***** ***** ***** ***** */
1956 }
1957
1958 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1959 /* This method lets to build a sequence matrix. A sequence matrix contains
1960 * the residue position for each sequence without taking into account the
1961 * gaps in the sequence. This means that at position 10 we have the residue
1962 * 1 for that sequence */
1963 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1964 void alignment::sequenMatrix(void) {
1965 if(seqMatrix == NULL)
1966 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
1967 }
1968
1969 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1970 /* Use this method to destroy a given sequence matrix */
1971 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1972 void alignment::destroySequenMatrix(void) {
1973 if(seqMatrix != NULL)
1974 delete seqMatrix;
1975 seqMatrix = NULL;
1976 }
1977
1978 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1979 /* Print the alignment's sequence matrix */
1980 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1981 void alignment::printSequenMatrix(void) {
1982 if(seqMatrix == NULL)
1983 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
1984 seqMatrix -> printMatrix();
1985 }
1986
1987 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1988 /* Given an index, this method returns the sequence matrix column for that
1989 * index */
1990 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1991 void alignment::getColumnSeqMatrix(int column, int *columnSeqMatrix) {
1992 if(seqMatrix == NULL)
1993 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
1994 seqMatrix -> getColumn(column, columnSeqMatrix);
1995 }
1996
1997 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
1998 /* This function looks if a given value belongs a given row. In the affirmative
1999 * case, returns the columns value for that row and value combination */
2000 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2001 void alignment::getColumnSeqMatrix(int value, int row, int *columnSeqMatrix) {
2002 if(seqMatrix == NULL)
2003 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
2004 seqMatrix -> getColumn(value, row, columnSeqMatrix);
2005 }
2006
2007 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2008 /* This function change the rows order in the sequence matrix */
2009 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2010 void alignment::setSeqMatrixOrder(int *order) {
2011 if(seqMatrix == NULL)
2012 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
2013 seqMatrix -> setOrder(order);
2014 }
2015
2016 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2017 /* This function change the rows order in the sequence matrix */
2018 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2019 sequencesMatrix *alignment::getSeqMatrix(void) {
2020 if(seqMatrix == NULL)
2021 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
2022 return seqMatrix;
2023 }
2024
2025 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2026 /* Computes, if it's necessary, and return the alignment's type */
2027 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2028 int alignment::getTypeAlignment(void) {
2029 if(dataType == 0)
2030 dataType = utils::checkTypeAlignment(sequenNumber, residNumber, sequences);
2031 return dataType;
2032 }
2033
2034 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2035 /* Returns the correspondence between the columns in the original and in the
2036 * trimmed alignment */
2037 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2038 int *alignment::getCorrespResidues(void) {
2039 return saveResidues;
2040 }
2041
2042 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2043 /* Returns the correspondence between the sequences in the original and in
2044 * the trimmed alignment */
2045 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2046 int *alignment::getCorrespSequences(void) {
2047 return saveSequences;
2048 }
2049
2050 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2051 /* Returns if the alignment is aligned or not */
2052 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
2053 bool alignment::isFileAligned(void) {
2054 return isAligned;
2055 }
2056
2057 /* *****************************************************************************
2058 * *****************************************************************************
2059 * *****************************************************************************
2060 * ************************************************************************** */
2061
2062 /* This method removes those columns that exceed a given threshold. If the
2063 * number of columns in the trimmed alignment is lower than a given percentage
2064 * of the original alignment. The program relaxes the threshold until to add
2065 * enough columns to achieve this percentage, to add those columns the program
2066 * start in the middle of the original alignment and make, alternatively,
2067 * movements to the left and right side from that point. This method also can
2068 * return the complementary alignment. */
2069 alignment *alignment::cleanByCutValue(double cut, float baseLine,
2070 const int *gInCol, bool complementary) {
2071
2072 int i, j, k, jn, oth, pos, block, *vectAux;
2073 string *matrixAux, *newSeqsName;
2074 alignment *newAlig;
2075 newValues counter;
2076
2077 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2078 /* Select the columns with a gaps value less or equal
2079 * than the cut point. */
2080 for(i = 0, counter.residues = 0; i < residNumber; i++)
2081 if(gInCol[i] <= cut) counter.residues++;
2082 else saveResidues[i] = -1;
2083 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2084
2085 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2086 /* Compute, if it's necessary, the number of columns
2087 * necessary to achieve the minimum number of columns
2088 * fixed by coverage parameter. */
2089 oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber);
2090 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2091
2092 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2093 /* if it's necessary to recover some columns, we
2094 * applied this instructions to recover it */
2095 if(oth > 0) {
2096 counter.residues += oth;
2097
2098 /* Allocate memory */
2099 vectAux = new int[residNumber];
2100
2101 /* Sort a copy of the gInCol vector, and take the value of the column that marks the % baseline */
2102 utils::copyVect((int *) gInCol, vectAux, residNumber);
2103 utils::quicksort(vectAux, 0, residNumber-1);
2104 cut = vectAux[(int) ((float)(residNumber - 1) * (baseLine)/100.0)];
2105
2106 /* Deallocate memory */
2107 delete [] vectAux;
2108 }
2109 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2110
2111 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2112 /* Fixed the initial size of blocks as 0.5% of
2113 * alignment's length */
2114 for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) {
2115
2116 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2117 /* We start in the alignment middle then we move on
2118 * right and left side at the same time. */
2119 for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) {
2120
2121 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2122 /* Left side. Here, we compute the block's size. */
2123 for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ;
2124
2125 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2126 /* if block's size is greater or equal than the fixed
2127 * size then we save all columns that have not been
2128 * saved previously. */
2129 if((i - jn) >= k) {
2130 for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) {
2131 if(gInCol[jn] <= cut) {
2132 saveResidues[jn] = jn;
2133 oth--;
2134 } else
2135 break;
2136 }
2137 }
2138 i = jn;
2139 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2140
2141 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2142 /* Right side. Here, we compute the block's size. */
2143 for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ;
2144
2145 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2146 /* if block's size is greater or equal than the fixed
2147 * size then we save all columns that have not been
2148 * saved previously. */
2149 if((jn - j) >= k) {
2150 for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) {
2151 if(gInCol[jn] <= cut) {
2152 saveResidues[jn] = jn;
2153 oth--;
2154 } else
2155 break;
2156 }
2157 }
2158 j = jn;
2159 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2160 }
2161 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2162 }
2163
2164 /* Keep only columns blocks bigger than an input columns block size */
2165 if(blockSize != 0) {
2166
2167 /* Traverse all alignment looking for columns blocks greater than LONGBLOCK,
2168 * everytime than a column is not selected by the trimming method, check
2169 * whether the current block size until that point is big enough to be kept
2170 * or it should be removed from the final alignment */
2171 for(i = 0, pos = 0, block = 0; i < residNumber; i++) {
2172 if(saveResidues[i] != -1)
2173 block++;
2174 else {
2175 /* Remove columns from blocks smaller than input blocks size */
2176 if(block < blockSize)
2177 for(j = pos; j <= i; j++)
2178 saveResidues[j] = -1;
2179 pos = i + 1;
2180 block = 0;
2181 }
2182 }
2183 /* Check final block separately since it could happen than last block is not
2184 * big enough but because the loop end could provoke to ignore it */
2185 if(block < blockSize)
2186 for(j = pos; j < i; j++)
2187 saveResidues[j] = -1;
2188 }
2189
2190 /* If the flag -terminalony is set, apply a method to look for internal
2191 * boundaries and get back columns inbetween them, if they exist */
2192 if(terminalGapOnly == true)
2193 if(!removeOnlyTerminal())
2194 return NULL;
2195
2196 /* Once the columns/sequences selection is done, turn it around
2197 * if complementary flag is active */
2198 if(complementary == true)
2199 computeComplementaryAlig(true, false);
2200
2201 /* Check for any additional column/sequence to be removed */
2202 /* Compute new sequences and columns numbers */
2203 counter = removeCols_SeqsAllGaps();
2204
2205 /* Allocate memory for selected sequences/columns */
2206 matrixAux = new string[counter.sequences];
2207 newSeqsName = new string[counter.sequences];
2208
2209 /* Fill local allocated memory with previously selected data */
2210 fillNewDataStructure(matrixAux, newSeqsName);
2211
2212 /* When we have all parameters, we create the new alignment */
2213 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
2214 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
2215 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
2216 keepSequences, keepHeader, sequenNumber, residNumber,
2217 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
2218 identities, overlaps);
2219
2220 /* Deallocate local memory */
2221 delete[] matrixAux;
2222 delete[] newSeqsName;
2223
2224 /* Return the new alignment reference */
2225 return newAlig;
2226 }
2227
2228 /* This method removes those columns that not achieve a given threshold. If the
2229 * number of columns in the trimmed alignment is lower than a given percentage
2230 * of the original alignment. The program relaxes the threshold until to add
2231 * enough columns to achieve this percentage, to add those columns the program
2232 * start in the middle of the original alignment and make, alternatively,
2233 * movements to the left and right side from that point. This method also can
2234 * return the complementary alignment. */
2235 alignment *alignment::cleanByCutValue(float cut, float baseLine,
2236 const float *ValueVect, bool complementary) {
2237
2238 int i, j, k, jn, oth, pos, block;
2239 string *matrixAux, *newSeqsName;
2240 alignment *newAlig;
2241 newValues counter;
2242
2243 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2244 /* Select the columns with a conservation's value
2245 * greater than the cut point. */
2246 for(i = 0, counter.residues = 0; i < residNumber; i++)
2247 if(ValueVect[i] > cut) counter.residues++;
2248 else saveResidues[i] = -1;
2249 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2250
2251 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2252 /* Compute, if it's necessary, the number of columns
2253 * necessary to achieve the minimum number of columns
2254 * fixed by coverage value. */
2255 oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber);
2256 if(oth > 0) counter.residues += oth;
2257 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2258
2259 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2260 /* Fixed the initial size of blocks as 0.5% of
2261 * alignment's length */
2262 for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) {
2263
2264 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2265 /* We start in the alignment middle then we move on
2266 * right and left side at the same time. */
2267 for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) {
2268
2269 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2270 /* Left side. Here, we compute the block's size. */
2271 for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ;
2272
2273 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2274 /* if block's size is greater or equal than the fixed
2275 * size then we save all columns that have not been
2276 * saved previously. */
2277 /* Here, we only accept column with a conservation's
2278 * value equal to conservation cut point. */
2279 if((i - jn) >= k) {
2280 for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) {
2281 if(ValueVect[jn] == cut) {
2282 saveResidues[jn] = jn;
2283 oth--;
2284 } else
2285 break;
2286 }
2287 }
2288 i = jn;
2289 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2290
2291 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2292 /* Right side. Here, we compute the block's size. */
2293 for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ;
2294
2295 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2296 /* if block's size is greater or equal than the fixed
2297 * size then we select the column and save the block's
2298 * size for the next iteraction. it's obvius that we
2299 * decrease the column's number needed to finish. */
2300 /* Here, we only accept column with a conservation's
2301 * value equal to conservation cut point. */
2302 if((jn - j) >= k) {
2303 for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) {
2304 if(ValueVect[jn] == cut) {
2305 saveResidues[jn] = jn;
2306 oth--;
2307 } else
2308 break;
2309 }
2310 }
2311 j = jn;
2312 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2313 }
2314 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2315 }
2316
2317 /* Keep only columns blocks bigger than an input columns block size */
2318 if(blockSize != 0) {
2319
2320 /* Traverse all alignment looking for columns blocks greater than LONGBLOCK,
2321 * everytime than a column is not selected by the trimming method, check
2322 * whether the current block size until that point is big enough to be kept
2323 * or it should be removed from the final alignment */
2324 for(i = 0, pos = 0, block = 0; i < residNumber; i++) {
2325 if(saveResidues[i] != -1)
2326 block++;
2327 else {
2328 /* Remove columns from blocks smaller than input blocks size */
2329 if(block < blockSize)
2330 for(j = pos; j <= i; j++)
2331 saveResidues[j] = -1;
2332 pos = i + 1;
2333 block = 0;
2334 }
2335 }
2336 /* Check final block separately since it could happen than last block is not
2337 * big enough but because the loop end could provoke to ignore it */
2338 if(block < blockSize)
2339 for(j = pos; j < i; j++)
2340 saveResidues[j] = -1;
2341 }
2342
2343 /* If the flag -terminalony is set, apply a method to look for internal
2344 * boundaries and get back columns inbetween them, if they exist */
2345 if(terminalGapOnly == true)
2346 if(!removeOnlyTerminal())
2347 return NULL;
2348
2349 /* Once the columns/sequences selection is done, turn it around
2350 * if complementary flag is active */
2351 if(complementary == true)
2352 computeComplementaryAlig(true, false);
2353
2354 /* Check for any additional column/sequence to be removed */
2355 /* Compute new sequences and columns numbers */
2356 counter = removeCols_SeqsAllGaps();
2357
2358 /* Allocate memory for selected sequences/columns */
2359 matrixAux = new string[counter.sequences];
2360 newSeqsName = new string[counter.sequences];
2361
2362 /* Fill local allocated memory with previously selected data */
2363 fillNewDataStructure(matrixAux, newSeqsName);
2364
2365 /* When we have all parameters, we create the new alignment */
2366 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
2367 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
2368 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
2369 keepSequences, keepHeader, sequenNumber, residNumber,
2370 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
2371 identities, overlaps);
2372
2373 /* Deallocate local memory */
2374 delete[] matrixAux;
2375 delete[] newSeqsName;
2376
2377 /* Return the new alignment reference */
2378 return newAlig;
2379 }
2380
2381 /* This method removes those columns that not achieve the similarity threshond,
2382 * by one hand, and exceed the gaps threshold. If the number of columns that
2383 * have been selected is lower that a given coverage, the program relaxes both
2384 * thresholds in order to get back some columns and achieve this minimum
2385 * number of columns in the new alignment. For that purpose, the program
2386 * starts at the middle of the original alignment and makes, alternatively,
2387 * movememts to the right and left sides looking for those columns necessary
2388 * to achieve the minimum number of columns set by the coverage parameter.
2389 * This method also can return the complementary alignmnet consists of those
2390 * columns that will be deleted from the original alignment applying the
2391 * standard method. */
2392 alignment *alignment::cleanByCutValue(double cutGaps, const int *gInCol,
2393 float baseLine, float cutCons, const float *MDK_Win, bool complementary) {
2394
2395 int i, j, k, oth, pos, block, jn, blGaps, *vectAuxGaps;
2396 string *matrixAux, *newSeqsName;
2397 float blCons, *vectAuxCons;
2398 alignment *newAlig;
2399 newValues counter;
2400
2401 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2402 /* Select the columns with a conservation's value
2403 * greater than the conservation cut point AND
2404 * less or equal than the gap cut point. */
2405 for(i = 0, counter.residues = 0; i < residNumber; i++)
2406 if((MDK_Win[i] > cutCons) && (gInCol[i] <= cutGaps)) counter.residues++;
2407 else saveResidues[i] = -1;
2408 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2409
2410 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2411 /* Compute, if it's necessary, the number of columns
2412 * necessary to achieve the minimum number of it fixed
2413 * by the coverage parameter. */
2414 oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber);
2415 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2416
2417 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2418 /* If it's needed to add new columns, we compute the
2419 * news thresholds */
2420 if(oth > 0) {
2421 counter.residues += oth;
2422
2423 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2424 /* Allocate memory */
2425 vectAuxCons = new float[residNumber];
2426 vectAuxGaps = new int[residNumber];
2427 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2428
2429 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2430 /* Sort a copy of the MDK_Win vector and of the gInCol
2431 * vector, and take the value of the column that marks
2432 * the % baseline */
2433 utils::copyVect((float *) MDK_Win, vectAuxCons, residNumber);
2434 utils::copyVect((int *) gInCol, vectAuxGaps, residNumber);
2435
2436 utils::quicksort(vectAuxCons, 0, residNumber-1);
2437 utils::quicksort(vectAuxGaps, 0, residNumber-1);
2438
2439 blCons = vectAuxCons[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)];
2440 blGaps = vectAuxGaps[(int) ((float)(residNumber - 1) * (baseLine)/100.0)];
2441 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2442
2443 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2444 /* Deallocate memory */
2445 delete [] vectAuxCons;
2446 delete [] vectAuxGaps;
2447 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2448 }
2449
2450 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2451 /* Fixed the initial size of blocks as 0.5% of
2452 * alignment's length */
2453 for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) {
2454
2455 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2456 /* We start in the alignment middle then we move on
2457 * right and left side at the same time. */
2458 for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) {
2459
2460 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2461 /* Left side. Here, we compute the block's size. */
2462 for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ;
2463
2464 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2465 /* if block's size is greater or equal than the fixed
2466 * size then we select the column and save the block's
2467 * size for the next iteraction. it's obvius that we
2468 * decrease the column's number needed to finish. */
2469 /* Here, we accept column with a conservation's value
2470 * greater or equal than the conservation cut point OR
2471 * less or equal than the gap cut point. */
2472 if((i - jn) >= k) {
2473 for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) {
2474 if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) {
2475 saveResidues[jn] = jn;
2476 oth--;
2477 } else
2478 break;
2479 }
2480 }
2481 i = jn;
2482 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2483
2484 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2485 /* Right side. Here, we compute the block's size. */
2486 for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ;
2487
2488 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2489 /* if block's size is greater or equal than the fixed
2490 * size then we select the column and save the block's
2491 * size for the next iteraction. it's obvius that we
2492 * decrease the column's number needed to finish. */
2493 /* Here, we accept column with a conservation's value
2494 * greater or equal than the conservation cut point OR
2495 * less or equal than the gap cut point. */
2496 if((jn - j) >= k) {
2497 for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) {
2498 if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) {
2499 saveResidues[jn] = jn;
2500 oth--;
2501 } else
2502 break;
2503 }
2504 }
2505 j = jn;
2506 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2507 }
2508 /* ***** ***** ***** ***** ***** ***** ***** ***** */
2509 }
2510
2511 /* Keep only columns blocks bigger than an input columns block size */
2512 if(blockSize != 0) {
2513
2514 /* Traverse all alignment looking for columns blocks greater than LONGBLOCK,
2515 * everytime than a column is not selected by the trimming method, check
2516 * whether the current block size until that point is big enough to be kept
2517 * or it should be removed from the final alignment */
2518 for(i = 0, pos = 0, block = 0; i < residNumber; i++) {
2519 if(saveResidues[i] != -1)
2520 block++;
2521 else {
2522 /* Remove columns from blocks smaller than input blocks size */
2523 if(block < blockSize)
2524 for(j = pos; j <= i; j++)
2525 saveResidues[j] = -1;
2526 pos = i + 1;
2527 block = 0;
2528 }
2529 }
2530 /* Check final block separately since it could happen than last block is not
2531 * big enough but because the loop end could provoke to ignore it */
2532 if(block < blockSize)
2533 for(j = pos; j < i; j++)
2534 saveResidues[j] = -1;
2535 }
2536
2537 /* If the flag -terminalony is set, apply a method to look for internal
2538 * boundaries and get back columns inbetween them, if they exist */
2539 if(terminalGapOnly == true)
2540 if(!removeOnlyTerminal())
2541 return NULL;
2542
2543 /* Once the columns/sequences selection is done, turn it around
2544 * if complementary flag is active */
2545 if(complementary == true)
2546 computeComplementaryAlig(true, false);
2547
2548 /* Check for any additional column/sequence to be removed */
2549 /* Compute new sequences and columns numbers */
2550 counter = removeCols_SeqsAllGaps();
2551
2552 /* Allocate memory for selected sequences/columns */
2553 matrixAux = new string[counter.sequences];
2554 newSeqsName = new string[counter.sequences];
2555
2556 /* Fill local allocated memory with previously selected data */
2557 fillNewDataStructure(matrixAux, newSeqsName);
2558
2559 /* When we have all parameters, we create the new alignment */
2560 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
2561 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
2562 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
2563 keepSequences, keepHeader, sequenNumber, residNumber,
2564 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
2565 identities, overlaps);
2566
2567 /* Deallocate local memory */
2568 delete[] matrixAux;
2569 delete[] newSeqsName;
2570
2571 /* Return the new alignment reference */
2572 return newAlig;
2573 }
2574
2575 /* This method carries out the strict and strict plus method. To trim the
2576 * alignment in an automated way, the method uses as an input a given gaps
2577 * thresholds over a gaps vector values, a given similarity threshold over a
2578 * similarity vector values. With a flag, we can decide which method the
2579 * program shall apply. With another one, the user can ask for the
2580 * complementary alignment. In this method, those columns that has been marked
2581 * to be deleted but has, at least, 3 of 4 surronding neighbour selected are
2582 * get back to the new alignmnet. At other part of the method, those column
2583 * blocks that do not have a minimum size fix by the method will be deleted
2584 * from the trimmed alignment. */
2585 alignment *alignment::cleanStrict(int gapCut, const int *gInCol, float simCut,
2586 const float *MDK_W, bool complementary, bool variable) {
2587
2588 int i, num, lenBlock;
2589 alignment *newAlig;
2590
2591 deque<int> neighboursBlock;
2592
2593 /* Reject columns with gaps number greater than the gap threshold. */
2594 for(i = 0; i < residNumber; i++)
2595 if(gInCol[i] > gapCut)
2596 saveResidues[i] = -1;
2597
2598 /* Reject columns with similarity score under the threshold. */
2599 for(i = 0; i < residNumber; i++)
2600 if(MDK_W[i] < simCut)
2601 saveResidues[i] = -1;
2602
2603 /* Search for those columns that have been removed but have,
2604 * at least, 3 out of 4 adjacent columns selected. */
2605
2606 /* Load the first 5 columns into the list */
2607 for(i = 0; i < (residNumber) && i < 5; i++)
2608 neighboursBlock.push_back(saveResidues[i]);
2609
2610 /* Special case #1 - we analyze it before processing the rest positions
2611 * Column: 1 - Save this column considering its neighbours */
2612 if((saveResidues[0] == -1) && (neighboursBlock.size() > 2) &&
2613 (neighboursBlock[1] != -1) &&
2614 (neighboursBlock[2] != -1))
2615 saveResidues[0] = 0;
2616
2617 /* Special case #2A - we analyze it before processing the rest positions
2618 * Column: 2 - Save this column considering its neighbours */
2619 if((saveResidues[1] == -1) && (neighboursBlock.size() > 3) &&
2620 ((neighboursBlock[0] != -1) &&
2621 (neighboursBlock[2] != -1) &&
2622 (neighboursBlock[3] != -1)))
2623 saveResidues[1] = 1;
2624
2625 /* Special case #2B - when alignment is 3 columns long - we analyze it before
2626 * processing the rest positions.
2627 * Column: 2 - Save this column considering its neighbours */
2628 if((saveResidues[1] == -1) && (neighboursBlock.size() > 2) &&
2629 ((neighboursBlock[0] != -1) &&
2630 (neighboursBlock[2] != -1)))
2631 saveResidues[1] = 1;
2632
2633 /* Normal cases */
2634 for(i = 2, num = 0; i < (residNumber - 2); i++, num = 0) {
2635 if(saveResidues[i] == -1) {
2636 if(neighboursBlock[0] != -1)
2637 num++;
2638 if(neighboursBlock[1] != -1)
2639 num++;
2640 if(neighboursBlock[3] != -1)
2641 num++;
2642 if(neighboursBlock[4] != -1)
2643 num++;
2644 saveResidues[i] = (num >= 3) ? i : -1;
2645
2646 }
2647 /* We slide the window one position to the right */
2648 if((i+3) < residNumber) {
2649 neighboursBlock.pop_front();
2650 neighboursBlock.push_back(saveResidues[i+3]);
2651 }
2652 }
2653
2654 /* Special case #3 -
2655 * Column: 2nd last one - We consider the list rather than the saveResidues
2656 * vector in order to consider only unmodified values before the previous
2657 * loop */
2658 if((saveResidues[(residNumber - 2)] == -1) && (neighboursBlock.size() > 3) &&
2659 ((neighboursBlock[neighboursBlock.size() - 4] != -1) &&
2660 (neighboursBlock[neighboursBlock.size() - 3] != -1) &&
2661 (neighboursBlock[neighboursBlock.size() - 1] != -1)))
2662 saveResidues[(residNumber - 2)] = (residNumber - 2);
2663
2664 /* Special case #4 -
2665 * Column: last one - We consider the list rather than the saveResidues
2666 * vector in order to consider only unmodified values before the previous
2667 * loop */
2668 if((saveResidues[(residNumber - 1)] == -1) && (neighboursBlock.size() > 2) &&
2669 ((neighboursBlock[neighboursBlock.size() - 3] != -1) &&
2670 (neighboursBlock[neighboursBlock.size() - 2] != -1)))
2671 saveResidues[(residNumber - 1)] = (residNumber - 1);
2672
2673 /* Select blocks size based on user input. It can be set either to 5 or to a
2674 * variable number between 3 and 12 depending on alignment length (1% alig) */
2675 if(!variable)
2676 lenBlock = 5;
2677 else {
2678 lenBlock = utils::roundInt(residNumber * 0.01);
2679 lenBlock = lenBlock > 3 ? (lenBlock < 12 ? lenBlock : 12) : 3;
2680 }
2681
2682 /* Allow to change minimal block size */
2683 blockSize = blockSize > 0 ? blockSize : lenBlock;
2684
2685 /* Keep only columns blocks bigger than either a computed dinamically or
2686 * set by the user block size */
2687 removeSmallerBlocks(blockSize);
2688
2689 /* If the flag -terminalony is set, apply a method to look for internal
2690 * boundaries and get back columns inbetween them, if they exist */
2691 if(terminalGapOnly == true)
2692 if(!removeOnlyTerminal())
2693 return NULL;
2694
2695 /* Once the columns/sequences selection is done, turn it around
2696 * if complementary flag is active */
2697 if(complementary == true)
2698 computeComplementaryAlig(true, false);
2699
2700 /* Check for any additional column/sequence to be removed */
2701 /* Compute new sequences and columns numbers */
2702
2703 newValues counter;
2704 counter = removeCols_SeqsAllGaps();
2705
2706 /* Allocate memory for selected sequences/columns */
2707 //~ matrixAux = new string[counter.sequences];
2708 //~ newSeqsName = new string[counter.sequences];
2709
2710 /* Fill local allocated memory with previously selected data */
2711 //~ fillNewDataStructure(matrixAux, newSeqsName);
2712
2713 fillNewDataStructure(&counter);
2714
2715 //~ cerr << counter -> seqsName[counter -> sequences - 1] << endl;
2716
2717 /* When we have all parameters, we create the new alignment */
2718 //~ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues,
2719 newAlig = new alignment(filename, aligInfo, counter.matrix, counter.seqsName,
2720 seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames,
2721 dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
2722 keepSequences, keepHeader, sequenNumber,
2723 residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow,
2724 blockSize, identities, overlaps);
2725
2726 /* Deallocate local memory */
2727 //~ delete[] matrixAux;
2728 //~ delete[] newSeqsName;
2729
2730 /* Return the new alignment reference */
2731 return newAlig;
2732 }
2733
2734 /* Remove those sequences with an overlap less than a given threshold. It can
2735 * return the complementary alignment if appropiate flag is set. */
2736 alignment *alignment::cleanOverlapSeq(float minimumOverlap, float *overlapSeq,
2737 bool complementary) {
2738
2739 string *matrixAux, *newSeqsName;
2740 alignment *newAlig;
2741 newValues counter;
2742 int i;
2743
2744 /* Keep only those sequences with an overlap value equal or greater than
2745 * the minimum overlap value set by the user. */
2746 for(i = 0; i < sequenNumber; i++)
2747 if(overlapSeq[i] < minimumOverlap)
2748 saveSequences[i] = -1;
2749
2750 /* Once the columns/sequences selection is done, turn it around
2751 * if complementary flag is active */
2752 if(complementary == true)
2753 computeComplementaryAlig(false, true);
2754
2755 /* Check for any additional column/sequence to be removed */
2756 /* Compute new sequences and columns numbers */
2757 counter = removeCols_SeqsAllGaps();
2758
2759 /* Allocate memory for selected sequences/columns */
2760 matrixAux = new string[counter.sequences];
2761 newSeqsName = new string[counter.sequences];
2762
2763 /* Fill local allocated memory with previously selected data */
2764 fillNewDataStructure(matrixAux, newSeqsName);
2765
2766 /* When we have all parameters, we create the new alignment */
2767 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
2768 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
2769 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
2770 keepSequences, keepHeader, sequenNumber, residNumber,
2771 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
2772 identities, overlaps);
2773
2774 /* Deallocate local memory */
2775 delete [] matrixAux;
2776 delete [] newSeqsName;
2777
2778 /* Return the new alignment reference */
2779 return newAlig;
2780 }
2781
2782 /* Remove those columns, expressed as range, set by the user. It can return
2783 * the complementary alignmnet if appropiate flags is set. */
2784 alignment *alignment::removeColumns(int *columns, int init, int size,
2785 bool complementary) {
2786
2787 string *matrixAux, *newSeqsName;
2788 alignment *newAlig;
2789 newValues counter;
2790 int i, j;
2791
2792 /* Delete those range columns defines in the columns vector */
2793 for(i = init; i < size + init; i += 2)
2794 for(j = columns[i]; j <= columns[i+1]; j++)
2795 saveResidues[j] = -1;
2796
2797 /* Once the columns/sequences selection is done, turn it around
2798 * if complementary flag is active */
2799 if(complementary == true)
2800 computeComplementaryAlig(true, false);
2801
2802 /* Check for any additional column/sequence to be removed */
2803 /* Compute new sequences and columns numbers */
2804 counter = removeCols_SeqsAllGaps();
2805
2806 /* Allocate memory for selected sequences/columns */
2807 matrixAux = new string[counter.sequences];
2808 newSeqsName = new string[counter.sequences];
2809
2810 /* Fill local allocated memory with previously selected data */
2811 fillNewDataStructure(matrixAux, newSeqsName);
2812
2813 /* When we have all parameters, we create the new alignment */
2814 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
2815 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
2816 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
2817 keepSequences, keepHeader, sequenNumber, residNumber,
2818 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
2819 identities, overlaps);
2820
2821 /* Deallocate local memory */
2822 delete[] matrixAux;
2823 delete[] newSeqsName;
2824
2825 /* Return the new alignment reference */
2826 return newAlig;
2827 }
2828
2829 /* This method removes those sequences, expressed as range of sequences, set by
2830 * the user. The method also can return the complementary alignment, it means,
2831 * those sequences that originally should be removed from the input alignment */
2832 alignment *alignment::removeSequences(int *seqs, int init, int size,
2833 bool complementary) {
2834
2835 string *matrixAux, *newSeqsName;
2836 alignment *newAlig;
2837 newValues counter;
2838 int i, j;
2839
2840 /* Delete those range of sequences defines by the seqs vector */
2841 for(i = init; i < size + init; i += 2)
2842 for(j = seqs[i]; j <= seqs[i+1]; j++)
2843 saveSequences[j] = -1;
2844
2845 /* Once the columns/sequences selection is done, turn it around
2846 * if complementary flag is active */
2847 if(complementary == true)
2848 computeComplementaryAlig(false, true);
2849
2850 /* Check for any additional column/sequence to be removed */
2851 /* Compute new sequences and columns numbers */
2852 counter = removeCols_SeqsAllGaps();
2853
2854 /* Allocate memory for selected sequences/columns */
2855 matrixAux = new string[counter.sequences];
2856 newSeqsName = new string[counter.sequences];
2857
2858 /* Fill local allocated memory with previously selected data */
2859 fillNewDataStructure(matrixAux, newSeqsName);
2860
2861 /* When we have all parameters, we create the new alignment */
2862 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
2863 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
2864 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
2865 keepSequences, keepHeader, sequenNumber, residNumber,
2866 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
2867 identities, overlaps);
2868
2869 /* Free local memory */
2870 delete [] matrixAux;
2871 delete [] newSeqsName;
2872
2873 /* Return the new alignment reference */
2874 return newAlig;
2875 }
2876
2877 /* Function for computing the complementary alignment. It just turn around the
2878 * current columns/sequences selection */
2879 void alignment::computeComplementaryAlig(bool residues, bool sequences) {
2880 int i;
2881
2882 for(i = 0; i < residNumber && residues; i++)
2883 saveResidues[i] = (saveResidues[i] == -1) ? i : -1;
2884
2885 for(i = 0; i < sequenNumber && sequences; i++)
2886 saveSequences[i] = (saveSequences[i] == -1) ? i : -1;
2887 }
2888
2889 /* Function designed for identifying right and left borders between central
2890 * and terminal regions in the alignment. The borders are those columns, first
2891 * and last, composed by only residues. Everything inbetween left and right
2892 * borders are keept independently of the applied methods */
2893 bool alignment::removeOnlyTerminal(void) {
2894
2895 int i;
2896 const int *gInCol;
2897
2898 if((left_boundary == -1) and (right_boundary == -1)) {
2899 /* Get alignments gaps stats and copy it */
2900 if(calculateGapStats() != true) {
2901 cerr << endl << "WARNING: Impossible to apply 'terminal-only' method"
2902 << endl << endl;
2903 return false;
2904 }
2905 gInCol = sgaps -> getGapsWindow();
2906
2907 /* Identify left and right borders. First and last columns with no gaps */
2908 for(i = 0; i < residNumber && gInCol[i] != 0; i++) ;
2909 left_boundary = i;
2910
2911 for(i = residNumber - 1; i > -1 && gInCol[i] != 0; i--) ;
2912 right_boundary = i;
2913 }
2914
2915 else if(left_boundary >= right_boundary) {
2916 cerr << endl << "ERROR: Check your manually set left '"<< left_boundary
2917 << "' and right '" << right_boundary << "' boundaries'" << endl << endl;
2918 return false;
2919 }
2920
2921 /* We increase the right boundary in one position because we use lower strict
2922 * condition to get back columns inbetween boundaries removed by any method */
2923 right_boundary += 1;
2924
2925 /* Once the interal boundaries have been established, if these limits exist
2926 * then retrieved all columns inbetween both boundaries. Columns out of these
2927 * limits will remain selected or not depending on the algorithm applied */
2928 for(i = left_boundary; i < right_boundary; i++)
2929 saveResidues[i] = i;
2930
2931 return true;
2932 }
2933
2934 /* Function designed to identify and remove those columns blocks smaller than
2935 * a given size */
2936 void alignment::removeSmallerBlocks(int blockSize) {
2937
2938 int i, j, pos, block;
2939
2940 if(blockSize == 0)
2941 return ;
2942
2943 /* Traverse the alignment looking for blocks greater than BLOCKSIZE, everytime
2944 * than a column hasn't been selected, check whether the current block is big
2945 * enough to be kept or it should be removed from the final alignment */
2946 for(i = 0, pos = 0, block = 0; i < residNumber; i++) {
2947 if(saveResidues[i] != -1)
2948 block++;
2949 else {
2950 /* Remove columns from blocks smaller than input blocks size */
2951 if(block < blockSize)
2952 for(j = pos; j <= i; j++)
2953 saveResidues[j] = -1;
2954 pos = i + 1;
2955 block = 0;
2956 }
2957 }
2958
2959 /* Check final block separately since it could happen than last block is not
2960 * big enough but because the loop end could provoke to ignore it */
2961 if(block < blockSize)
2962 for(j = pos; j < i; j++)
2963 saveResidues[j] = -1;
2964 return ;
2965 }
2966
2967 /* Function designed to identify columns and sequences composed only by gaps.
2968 * Once these columns/sequences have been identified, they are removed from
2969 * final alignment. */
2970 newValues alignment::removeCols_SeqsAllGaps(void) {
2971 int i, j, valid, gaps;
2972 bool warnings = false;
2973 newValues counter;
2974
2975 /* Check all valid columns looking for those composed by only gaps */
2976 for(i = 0, counter.residues = 0; i < residNumber; i++) {
2977 if(saveResidues[i] == -1)
2978 continue;
2979
2980 for(j = 0, valid = 0, gaps = 0; j < sequenNumber; j++) {
2981 if (saveSequences[j] == -1)
2982 continue;
2983 if (sequences[j][i] == '-')
2984 gaps ++;
2985 valid ++;
2986 }
2987 /* Once a column has been identified, warm about it and remove it */
2988 if(gaps == valid) {
2989 if(!warnings)
2990 cerr << endl;
2991 warnings = true;
2992 cerr << "WARNING: Removing column '" << i << "' composed only by gaps"
2993 << endl;
2994 saveResidues[i] = -1;
2995 } else {
2996 counter.residues ++;
2997 }
2998 }
2999
3000 /* Check for those selected sequences to see whether there is anyone with
3001 * only gaps */
3002 for(i = 0, counter.sequences = 0; i < sequenNumber; i++) {
3003 if(saveSequences[i] == -1)
3004 continue;
3005
3006 for(j = 0, valid = 0, gaps = 0; j < residNumber; j++) {
3007 if(saveResidues[j] == -1)
3008 continue;
3009 if (sequences[i][j] == '-')
3010 gaps ++;
3011 valid ++;
3012 }
3013 /* Warm about it and remove each sequence composed only by gaps */
3014 if(gaps == valid) {
3015 if(!warnings)
3016 cerr << endl;
3017 warnings = true;
3018
3019 if(keepSequences) {
3020 cerr << "WARNING: Keeping sequence '" << seqsName[i]
3021 << "' composed only by gaps" << endl;
3022 counter.sequences ++;
3023 } else {
3024 cerr << "WARNING: Removing sequence '" << seqsName[i]
3025 << "' composed only by gaps" << endl;
3026 saveSequences[i] = -1;
3027 }
3028 } else {
3029 counter.sequences ++;
3030 }
3031 }
3032 if(warnings)
3033 cerr << endl;
3034
3035 counter.matrix = new string[counter.sequences];
3036 counter.seqsName = new string[counter.sequences];
3037
3038 return counter;
3039 }
3040
3041 /* Function for copying to previously allocated memory those data selected
3042 * for being in the final alignment */
3043 void alignment::fillNewDataStructure(string *newMatrix, string *newNames) {
3044 int i, j, k;
3045
3046 /* Copy only those sequences/columns selected */
3047 for(i = 0, j = 0; i < sequenNumber; i++) {
3048 if(saveSequences[i] == -1)
3049 continue;
3050
3051 newNames[j] = seqsName[i];
3052 for(k = 0; k < residNumber; k++) {
3053 if(saveResidues[k] == -1)
3054 continue;
3055 newMatrix[j].resize(newMatrix[j].size() + 1, sequences[i][k]);
3056 }
3057 j++;
3058 }
3059 }
3060
3061 /* Function for copying to previously allocated memory those data selected
3062 * for being in the final alignment */
3063 void alignment::fillNewDataStructure(newValues *data) {
3064 int i, j, k;
3065
3066 /* Copy only those sequences/columns selected */
3067 for(i = 0, j = 0; i < sequenNumber; i++) {
3068 if(saveSequences[i] == -1)
3069 continue;
3070
3071 data -> seqsName[j] = seqsName[i];
3072 for(k = 0; k < residNumber; k++) {
3073 if(saveResidues[k] == -1)
3074 continue;
3075 data -> matrix[j].resize(data -> matrix[j].size() + 1, sequences[i][k]);
3076 }
3077 j++;
3078 }
3079 // cerr << data -> seqsName[j-1] << endl;
3080 }
3081
3082 /* Check if CDS file is correct based on: Residues are DNA/RNA (at most). There
3083 * is not gaps in the whole dataset. Each sequence is multiple of 3. At the same
3084 * time, the function will remove stop codons if appropiate flags are used */
3085 bool alignment::prepareCodingSequence(bool splitByStopCodon, bool ignStopCodon,\
3086 alignment *proteinAlig) {
3087
3088 bool warning = false;
3089 size_t found;
3090 int i;
3091
3092 /* New code: We now care about the presence of wildcards/indeterminations
3093 * characters such as 'X' or 'B' into protein sequences as well as about the
3094 * presence of Selenocysteines ('U') or Pyrrolysines ('O'). It only works, by
3095 * now, with the universal genetic code */
3096 int *protSeqsLengths, numbProtSeqs, current_prot;
3097 string *protSeqsNames, *protSequences;
3098 char aminoAcid;
3099
3100 numbProtSeqs = proteinAlig -> getNumSpecies();
3101 protSeqsNames = new string[numbProtSeqs];
3102 protSequences = new string[numbProtSeqs];
3103 protSeqsLengths = new int[numbProtSeqs];
3104
3105 proteinAlig -> getSequences(protSeqsNames, protSequences, protSeqsLengths);
3106
3107 /* Check read sequences are real DNA/RNA */
3108 if (getTypeAlignment() == AAType) {
3109 cerr << endl << "ERROR: Check input CDS file. It seems to content protein "
3110 << "residues." << endl << endl;
3111 return false;
3112 }
3113
3114 for(i = 0; i < sequenNumber; i++) {
3115
3116 /* Get protein sequence to compare against any potential stop codon in the
3117 * coding sequence. If there is not protein sequence for current coding
3118 * sequence, skip its analysis */
3119 for(current_prot = 0; current_prot < numbProtSeqs; current_prot++)
3120 if(protSeqsNames[current_prot] == seqsName[i])
3121 break;
3122 if(current_prot == numbProtSeqs)
3123 continue;
3124
3125 if(sequences[i].find("-") != string::npos) {
3126 if (!warning)
3127 cerr << endl;
3128 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has, at least, one gap"
3129 << endl << endl;
3130 return false;
3131 }
3132
3133 if((sequences[i].length() % 3) != 0) {
3134 if (!warning)
3135 cerr << endl;
3136 warning = true;
3137 cerr << "WARNING: Sequence length \"" << seqsName[i] << "\" is not "
3138 << "multiple of 3 (length: " << sequences[i].length() << ")" << endl;
3139 }
3140
3141 /* Ignore stop codons from the CDS if set by the user */
3142 if (ignStopCodon)
3143 continue;
3144
3145 /* Detect universal stop codons in the CDS. Then, compare those stop codons
3146 * against the protein sequence to see whether they are real stop codons or
3147 * are representing rare amino-acids such as Selenocysteines or Pyrrolysines
3148 * It also allows stop-codons when there are wildcards/indet characters in
3149 * the protein sequence. CDS sequences could be splitted using stop codons
3150 * from the sequence itself */
3151
3152 /* Initialize first appearence of a given stop codon to -1.
3153 * That means that it has not been found yet */
3154 found = -1;
3155 do {
3156 found = sequences[i].find("TGA", found + 1);
3157
3158 /* If a stop codon has been found and its position is multiple of 3.
3159 * Analize it */
3160 if((found != string::npos) && (((int) found % 3) == 0)) {
3161
3162 aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]);
3163 /* It may be a Selenocysteine ('TGA') which should be represented as 'U'
3164 * or wildcard/indet characters such as 'X' or 'B' */
3165 //~ if ((aminoAcid == 'U') || (aminoAcid == 'X') || (aminoAcid == 'B'))
3166 /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character
3167 * such as 'B'/'X' is present, skip current stop codon */
3168 if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \
3169 (aminoAcid == 'B'))
3170 continue;
3171
3172 /* If split_by_stop_codon flag is activated then cut input CDS sequence
3173 * up to first appearance of a stop codon */
3174 else if(splitByStopCodon) {
3175 if (!warning)
3176 cerr << endl;
3177 warning = true;
3178 cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first"
3179 << " appearance of stop codon \"TGA\" (residue \"" << aminoAcid
3180 << "\") at position " << (int) found + 1 << " (length: "
3181 << sequences[i].length() << ")" << endl;
3182 sequences[i].resize((int) found);
3183 }
3184 /* Otherwise, warn about it and return an error */
3185 else {
3186 if (!warning)
3187 cerr << endl;
3188 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \""
3189 << "TGA\" (residue \"" << aminoAcid << "\") at position "
3190 << (int) found + 1 << " (length: " << sequences[i].length() << ")"
3191 << endl << endl;
3192 return false;
3193 }
3194 }
3195 /* Iterate over the CDS until not stop codon is found */
3196 } while(found != string::npos);
3197
3198 /* Initialize first appearence of a given stop codon to -1.
3199 * That means that it has not been found yet */
3200 found = -1;
3201 do {
3202 found = sequences[i].find("TAA", found + 1);
3203
3204 /* If a stop codon has been found and its position is multiple of 3.
3205 * Analize it */
3206 if((found != string::npos) && (((int) found % 3) == 0)) {
3207
3208 aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]);
3209 /* Check if there is any wildcard/indet characters such as 'X' or 'B' */
3210 //~ if ((aminoAcid == 'X') || (aminoAcid == 'B'))
3211 /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character
3212 * such as 'B'/'X' is present, skip current stop codon */
3213 if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \
3214 (aminoAcid == 'B'))
3215 continue;
3216
3217 /* If split_by_stop_codon flag is activated then cut input CDS sequence
3218 * up to first appearance of a stop codon */
3219 else if(splitByStopCodon) {
3220 if (!warning)
3221 cerr << endl;
3222 warning = true;
3223 cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first"
3224 << " appearance of stop codon \"TAA\" (residue \"" << aminoAcid
3225 << "\") at position " << (int) found + 1 << " (length: "
3226 << sequences[i].length() << ")" << endl;
3227 sequences[i].resize((int) found);
3228 }
3229 /* Otherwise, warn about it and return an error */
3230 else {
3231 if (!warning)
3232 cerr << endl;
3233 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \""
3234 << "TAA\" (residue \"" << aminoAcid << "\") at position "
3235 << (int) found + 1 << " (length: " << sequences[i].length() << ")"
3236 << endl << endl;
3237 return false;
3238 }
3239 }
3240 /* Iterate over the CDS until not stop codon is found */
3241 } while(found != string::npos);
3242
3243 /* Initialize first appearence of a given stop codon to -1.
3244 * That means that it has not been found yet */
3245 found = -1;
3246 do {
3247 found = sequences[i].find("TAG", found + 1);
3248 /* If a stop codon has been found and its position is multiple of 3.
3249 * Analize it */
3250 if((found != string::npos) && (((int) found % 3) == 0)) {
3251
3252 aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]);
3253 /* It may be a Pyrrolysine ('TAG') which should be represented as 'O'
3254 * or wildcard/indet characters such as 'X' or 'B' */
3255 //~ if ((aminoAcid == 'O') || (aminoAcid == 'X') || (aminoAcid == 'B'))
3256 /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character
3257 * such as 'B'/'X' is present, skip current stop codon */
3258 if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \
3259 (aminoAcid == 'B'))
3260 continue;
3261
3262 /* If split_by_stop_codon flag is activated then cut input CDS sequence
3263 * up to first appearance of a stop codon */
3264 else if(splitByStopCodon) {
3265 if (!warning)
3266 cerr << endl;
3267 warning = true;
3268 cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first"
3269 << " appearance of stop codon \"TAG\" (residue \"" << aminoAcid
3270 << "\") at position " << (int) found + 1 << " (length: "
3271 << sequences[i].length() << ")" << endl;
3272 sequences[i].resize((int) found);
3273 }
3274 /* Otherwise, warn about it and return an error */
3275 else {
3276 if (!warning)
3277 cerr << endl;
3278 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \""
3279 << "TAG\" (residue \"" << aminoAcid << "\") at position "
3280 << (int) found + 1 << " (length: " << sequences[i].length() << ")"
3281 << endl << endl;
3282 return false;
3283 }
3284 }
3285 /* Iterate over the CDS until not stop codon is found */
3286 } while(found != string::npos);
3287 }
3288
3289 /* If everything was return an OK to informat about it. */
3290 return true;
3291 }
3292
3293 /* Function designed to check whether input CDS file is correct or not based on
3294 * some features: Sequences are in both files (it could be more on CDS file),
3295 * they have (more or less) same ength. Otherwise, some nucleotides could be
3296 * excluded or some 'N's added to fit protein length. */
3297 bool alignment::checkCorrespondence(string *names, int *lengths, int \
3298 totalInputSeqs, int multiple = 1) {
3299
3300 int i, j, seqLength, indet;
3301 bool warnings = false;
3302 string tmp;
3303
3304 /* For each sequence in the current protein alignment, look for its coding
3305 * DNA sequence checking that they have the same size. */
3306 for(i = 0; i < sequenNumber; i++) {
3307
3308 /* Get protein sequence length removing any possible gap. Get as well last
3309 * residue from current sequence */
3310
3311 tmp = utils::removeCharacter('-', sequences[i]);
3312 seqLength = tmp.length() * multiple;
3313 indet = ((int) tmp.length() - utils::min((int) tmp.find_last_not_of("X"), \
3314 (int) tmp.find_last_not_of("x"))) - 1;
3315
3316 /* Go through all available CDS looking for the one with the same ID */
3317 for(j = 0; j < totalInputSeqs; j++) {
3318
3319 /* Once both ID matchs, compare its lengths */
3320 if(seqsName[i] == names[j]) {
3321
3322 /* If both sequences have the same length, stop the search */
3323 if(seqLength == lengths[j])
3324 break;
3325
3326 /* If nucleotide sequence is larger than protein sequence, warn about
3327 * it and continue the verification process. It will used the 'Nth'
3328 * first nucleotides for the conversion */
3329 else if(seqLength < lengths[j]) {
3330 if (!warnings)
3331 cerr << endl;
3332 warnings = true;
3333 cerr << "WARNING: Sequence \"" << seqsName[i] << "\" will be cut at "
3334 << "position " << seqLength << " (length: "<< lengths[j] << ")"
3335 << endl;
3336 break;
3337 }
3338
3339 /* It has been detected some indeterminations at the end of the protein
3340 * sequence. That issue could be cause by some incomplete codons in the
3341 * nucleotide sequences. This issue is solved adding as much 'N' symbols
3342 * as it is needed to preserve the backtranslated alignment */
3343 else if((indet > 0) && (indet > (seqLength - lengths[j])/3)) {
3344 if (!warnings)
3345 cerr << endl;
3346 warnings = true;
3347 cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has some inde"
3348 << "termination symbols 'X' at the end of sequence. They will be"
3349 << " included in the final alignment." << endl;
3350 break;
3351 }
3352
3353 /* If nucleotide sequence is shorter than protein sequence, return an
3354 * error since it is not feasible to cut the input protein aligment to
3355 * fit it into CDNA sequences size */
3356 else {
3357 if (!warnings)
3358 cerr << endl;
3359 warnings = true;
3360 cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has less nucleo"
3361 << "tides (" << lengths[j] << ") than expected (" << seqLength
3362 << "). It will be added N's to complete the sequence" << endl;
3363 break;
3364 }
3365 }
3366 }
3367
3368 /* Warn about a mismatch a sequences name level */
3369 if(j == totalInputSeqs) {
3370 cerr << endl << "ERROR: Sequence \"" << seqsName[i] << "\" is not in "
3371 << "CDS file." << endl << endl;
3372 return false;
3373 }
3374 }
3375
3376 /* If everything is OK, return an appropiate flag */
3377 return true;
3378 }