Mercurial > repos > padge > trimal
comparison trimal_repo/source/alignment.cpp @ 0:b15a3147e604 draft
"planemo upload for repository https://github.com/inab/trimal commit cbe1e8577ecb1a46709034a40dff36052e876e7a-dirty"
| author | padge |
|---|---|
| date | Fri, 25 Mar 2022 17:10:43 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b15a3147e604 |
|---|---|
| 1 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** | |
| 2 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** | |
| 3 | |
| 4 trimAl v1.4: a tool for automated alignment trimming in large-scale | |
| 5 phylogenetics analyses. | |
| 6 | |
| 7 readAl v1.4: a tool for automated alignment conversion among different | |
| 8 formats. | |
| 9 | |
| 10 statAl v1.4: a tool for getting stats about multiple sequence alignments. | |
| 11 | |
| 12 | |
| 13 2009-2015 Capella-Gutierrez S. and Gabaldon, T. | |
| 14 [scapella, tgabaldon]@crg.es | |
| 15 | |
| 16 This file is part of trimAl/readAl. | |
| 17 | |
| 18 trimAl/readAl are free software: you can redistribute it and/or modify | |
| 19 it under the terms of the GNU General Public License as published by | |
| 20 the Free Software Foundation, the last available version. | |
| 21 | |
| 22 trimAl/readAl are distributed in the hope that it will be useful, | |
| 23 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 25 GNU General Public License for more details. | |
| 26 | |
| 27 You should have received a copy of the GNU General Public License | |
| 28 along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>. | |
| 29 | |
| 30 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** | |
| 31 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 32 using namespace std; | |
| 33 | |
| 34 #include <float.h> | |
| 35 #include "alignment.h" | |
| 36 #include "rwAlignment.cpp" | |
| 37 #include "autAlignment.cpp" | |
| 38 | |
| 39 #include <deque> | |
| 40 | |
| 41 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 42 /* Class constructor */ | |
| 43 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 44 | |
| 45 alignment::alignment(void) { | |
| 46 | |
| 47 /* Alignment parameter */ | |
| 48 sequenNumber = 0; | |
| 49 residNumber = 0; | |
| 50 | |
| 51 /* Are the input sequences aligned? */ | |
| 52 isAligned = false; | |
| 53 | |
| 54 /* Should the output file be reversed? */ | |
| 55 reverse = false; | |
| 56 | |
| 57 /* Should be trimmed only terminal gaps? - set automated and manual boundaries | |
| 58 * values */ | |
| 59 terminalGapOnly = false; | |
| 60 left_boundary = -1; | |
| 61 right_boundary = -1; | |
| 62 | |
| 63 /* Input and output formats */ | |
| 64 iformat = 0; | |
| 65 oformat = 0; | |
| 66 shortNames = false; | |
| 67 | |
| 68 forceCaps = false; | |
| 69 upperCase = false; | |
| 70 lowerCase = false; | |
| 71 | |
| 72 /* Indicate whether sequences composed only by gaps should be kept or not */ | |
| 73 keepSequences = false; | |
| 74 | |
| 75 /* Indicate whether original header, they may include non-alphanumerical | |
| 76 * characters, should be dumped into output stream without any preprocessing | |
| 77 * step */ | |
| 78 keepHeader = false; | |
| 79 | |
| 80 gapSymbol = "-"; | |
| 81 | |
| 82 /* Sequence datatype: DNA, RNA or Protein */ | |
| 83 dataType = 0; | |
| 84 | |
| 85 /* Window sizes to trim the input alignment */ | |
| 86 ghWindow = 0; | |
| 87 shWindow = 0; | |
| 88 | |
| 89 /* Minimum block size in the new alignment */ | |
| 90 blockSize = 0; | |
| 91 | |
| 92 /* Is this alignmnet new? */ | |
| 93 oldAlignment = false; | |
| 94 | |
| 95 /* Sequence residues number */ | |
| 96 residuesNumber = NULL; | |
| 97 | |
| 98 /* Columns and sequences that have been previously selected */ | |
| 99 saveResidues = NULL; | |
| 100 saveSequences = NULL; | |
| 101 | |
| 102 /* Input sequences as well other information such as sequences name, etc */ | |
| 103 sequences = NULL; | |
| 104 seqsName = NULL; | |
| 105 seqsInfo = NULL; | |
| 106 | |
| 107 /* Information about input alignment */ | |
| 108 filename = ""; | |
| 109 aligInfo = ""; | |
| 110 | |
| 111 /* Information computed from alignment */ | |
| 112 sgaps = NULL; | |
| 113 scons = NULL; | |
| 114 seqMatrix = NULL; | |
| 115 | |
| 116 identities = NULL; | |
| 117 overlaps = NULL; | |
| 118 | |
| 119 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 120 } | |
| 121 | |
| 122 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 123 /* Class constructor */ | |
| 124 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 125 | |
| 126 alignment::alignment(string o_filename, string o_aligInfo, string *o_sequences, string *o_seqsName, | |
| 127 string *o_seqsInfo, int o_sequenNumber, int o_residNumber, int o_iformat, int o_oformat, | |
| 128 bool o_shortNames, int o_dataType, int o_isAligned, bool o_reverse, bool o_terminalGapOnly, | |
| 129 int o_left_boundary, int o_right_boundary, | |
| 130 bool o_keepSeqs, bool o_keepHeader, int OldSequences, int OldResidues, int *o_residuesNumber, | |
| 131 int *o_saveResidues, int *o_saveSequences, int o_ghWindow, int o_shWindow, int o_blockSize, | |
| 132 float **o_identities, float **o_overlaps) { | |
| 133 | |
| 134 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 135 int i, j, k, ll; | |
| 136 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 137 | |
| 138 oldAlignment = true; | |
| 139 | |
| 140 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 141 /* Assign the parameter values to the variables */ | |
| 142 sequenNumber = o_sequenNumber; | |
| 143 residNumber = o_residNumber; | |
| 144 | |
| 145 iformat = o_iformat; | |
| 146 oformat = o_oformat; | |
| 147 shortNames = o_shortNames; | |
| 148 | |
| 149 dataType = o_dataType; | |
| 150 | |
| 151 ghWindow = o_ghWindow; | |
| 152 shWindow = o_shWindow; | |
| 153 | |
| 154 blockSize = o_blockSize; | |
| 155 | |
| 156 isAligned = o_isAligned; | |
| 157 reverse = o_reverse; | |
| 158 | |
| 159 terminalGapOnly = o_terminalGapOnly; | |
| 160 right_boundary = o_right_boundary; | |
| 161 left_boundary = o_left_boundary; | |
| 162 | |
| 163 filename = o_filename; | |
| 164 aligInfo = o_aligInfo; | |
| 165 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 166 | |
| 167 keepSequences = o_keepSeqs; | |
| 168 keepHeader = o_keepHeader; | |
| 169 | |
| 170 forceCaps = false; | |
| 171 upperCase = false; | |
| 172 lowerCase = false; | |
| 173 | |
| 174 gapSymbol = "-"; | |
| 175 | |
| 176 | |
| 177 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 178 /* Basic information for the new alignment */ | |
| 179 sequences = new string[sequenNumber]; | |
| 180 for(i = 0; i < sequenNumber; i++) | |
| 181 sequences[i] = o_sequences[i]; | |
| 182 | |
| 183 seqsName = new string[sequenNumber]; | |
| 184 for(i = 0; i < sequenNumber; i++) | |
| 185 seqsName[i] = o_seqsName[i]; | |
| 186 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 187 | |
| 188 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 189 residuesNumber = new int[sequenNumber]; | |
| 190 if((isAligned) || (o_residuesNumber != NULL)) { | |
| 191 for(i = 0; i < sequenNumber; i++) | |
| 192 residuesNumber[i] = residNumber; | |
| 193 } else { | |
| 194 for(i = 0; i < sequenNumber; i++) | |
| 195 residuesNumber[i] = o_residuesNumber[i]; | |
| 196 } | |
| 197 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 198 | |
| 199 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 200 if(o_seqsInfo != NULL) { | |
| 201 seqsInfo = new string[sequenNumber]; | |
| 202 for(i = 0; i < sequenNumber; i++) | |
| 203 seqsInfo[i] = o_seqsInfo[i]; | |
| 204 } else seqsInfo = NULL; | |
| 205 | |
| 206 saveResidues = NULL; | |
| 207 if(o_saveResidues != NULL) { | |
| 208 saveResidues = new int[residNumber]; | |
| 209 for(i = 0, j = 0; i < OldResidues; i++) | |
| 210 if(o_saveResidues[i] != -1) { | |
| 211 saveResidues[j] = o_saveResidues[i]; | |
| 212 j++; | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 saveSequences = NULL; | |
| 217 if(o_saveSequences != NULL) { | |
| 218 saveSequences = new int[sequenNumber]; | |
| 219 for(i = 0, j = 0; i < OldSequences; i++) | |
| 220 if(o_saveSequences[i] != -1) { | |
| 221 saveSequences[j] = o_saveSequences[i]; | |
| 222 j++; | |
| 223 } | |
| 224 } | |
| 225 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 226 | |
| 227 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 228 identities = NULL; | |
| 229 if(o_identities != NULL) { | |
| 230 identities = new float*[sequenNumber]; | |
| 231 for(i = 0, j = 0; i < OldSequences; i++) { | |
| 232 if(o_saveSequences[i] != -1) { | |
| 233 identities[j] = new float[sequenNumber]; | |
| 234 for(k = 0, ll = 0; k < OldSequences; k++) { | |
| 235 if(o_saveSequences[k] != -1) { | |
| 236 identities[j][ll] = o_identities[i][k]; | |
| 237 ll++; | |
| 238 } | |
| 239 } | |
| 240 j++; | |
| 241 } | |
| 242 } | |
| 243 } | |
| 244 | |
| 245 overlaps = NULL; | |
| 246 if(o_overlaps != NULL) { | |
| 247 overlaps = new float*[sequenNumber]; | |
| 248 for(i = 0, j = 0; i < OldSequences; i++) { | |
| 249 if(o_saveSequences[i] != -1) { | |
| 250 overlaps[j] = new float[sequenNumber]; | |
| 251 for(k = 0, ll = 0; k < OldSequences; k++) { | |
| 252 if(o_saveSequences[k] != -1) { | |
| 253 overlaps[j][ll] = o_overlaps[i][k]; | |
| 254 ll++; | |
| 255 } | |
| 256 } | |
| 257 j++; | |
| 258 } | |
| 259 } | |
| 260 } | |
| 261 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 262 | |
| 263 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 264 /* Any structure associated to the new alignment is | |
| 265 * initialize to NULL. In this way, these structure, | |
| 266 * if it will be necessary, has to be computed */ | |
| 267 sgaps = NULL; | |
| 268 scons = NULL; | |
| 269 seqMatrix = NULL; | |
| 270 identities = NULL; | |
| 271 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 272 } | |
| 273 | |
| 274 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 275 /* Overlapping operator = to use it as a kind of class constructor */ | |
| 276 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 277 | |
| 278 alignment &alignment::operator=(const alignment &old) { | |
| 279 int i, j; | |
| 280 | |
| 281 if(this != &old) { | |
| 282 | |
| 283 oldAlignment = true; | |
| 284 | |
| 285 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 286 /* Assign the parameter values to the variables */ | |
| 287 sequenNumber = old.sequenNumber; | |
| 288 residNumber = old.residNumber; | |
| 289 | |
| 290 isAligned = old.isAligned; | |
| 291 reverse = old.reverse; | |
| 292 | |
| 293 terminalGapOnly = old.terminalGapOnly; | |
| 294 right_boundary = old.right_boundary; | |
| 295 left_boundary = old.left_boundary; | |
| 296 | |
| 297 iformat = old.iformat; | |
| 298 oformat = old.oformat; | |
| 299 shortNames = old.shortNames; | |
| 300 | |
| 301 dataType = old.dataType; | |
| 302 | |
| 303 ghWindow = old.ghWindow; | |
| 304 shWindow = old.shWindow; | |
| 305 | |
| 306 blockSize = old.blockSize; | |
| 307 | |
| 308 filename = old.filename; | |
| 309 aligInfo = old.aligInfo; | |
| 310 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 311 | |
| 312 keepSequences = old.keepSequences; | |
| 313 keepHeader = old.keepHeader; | |
| 314 | |
| 315 forceCaps = old.forceCaps; | |
| 316 upperCase = old.upperCase; | |
| 317 lowerCase = old.lowerCase; | |
| 318 | |
| 319 gapSymbol = old.gapSymbol; | |
| 320 | |
| 321 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 322 sequences = new string[sequenNumber]; | |
| 323 for(i = 0; i < sequenNumber; i++) | |
| 324 sequences[i] = old.sequences[i]; | |
| 325 | |
| 326 seqsName = new string[sequenNumber]; | |
| 327 for(i = 0; i < sequenNumber; i++) | |
| 328 seqsName[i] = old.seqsName[i]; | |
| 329 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 330 | |
| 331 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 332 delete [] residuesNumber; | |
| 333 if(old.residuesNumber) { | |
| 334 residuesNumber = new int[sequenNumber]; | |
| 335 for(i = 0; i < sequenNumber; i++) | |
| 336 residuesNumber[i] = old.residuesNumber[i]; | |
| 337 } residuesNumber = NULL; | |
| 338 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 339 | |
| 340 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 341 delete [] seqsInfo; | |
| 342 if(old.seqsInfo) { | |
| 343 seqsInfo = new string[sequenNumber]; | |
| 344 for(i = 0; i < sequenNumber; i++) | |
| 345 seqsInfo[i] = old.seqsInfo[i]; | |
| 346 } else seqsInfo = NULL; | |
| 347 | |
| 348 delete [] saveResidues; | |
| 349 if(old.saveResidues) { | |
| 350 saveResidues = new int[residNumber]; | |
| 351 for(i = 0; i < residNumber; i++) | |
| 352 saveResidues[i] = old.saveResidues[i]; | |
| 353 } else saveResidues = NULL; | |
| 354 | |
| 355 delete [] saveSequences; | |
| 356 if(old.saveSequences) { | |
| 357 saveSequences = new int[sequenNumber]; | |
| 358 for(i = 0; i < sequenNumber; i++) | |
| 359 saveSequences[i] = old.saveSequences[i]; | |
| 360 } else saveSequences = NULL; | |
| 361 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 362 | |
| 363 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 364 delete [] identities; | |
| 365 if(old.identities) { | |
| 366 identities = new float*[sequenNumber]; | |
| 367 for(i = 0; i < sequenNumber; i++) { | |
| 368 identities[i] = new float[sequenNumber]; | |
| 369 for(j = 0; j < sequenNumber; j++) | |
| 370 identities[i][j] = old.identities[i][j]; | |
| 371 } | |
| 372 } else identities = NULL; | |
| 373 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 374 | |
| 375 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 376 delete [] overlaps; | |
| 377 if(old.overlaps) { | |
| 378 overlaps = new float*[sequenNumber]; | |
| 379 for(i = 0; i < sequenNumber; i++) { | |
| 380 overlaps[i] = new float[sequenNumber]; | |
| 381 for(j = 0; j < sequenNumber; j++) | |
| 382 overlaps[i][j] = old.overlaps[i][j]; | |
| 383 } | |
| 384 } else overlaps = NULL; | |
| 385 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 386 | |
| 387 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 388 delete sgaps; | |
| 389 sgaps = NULL; | |
| 390 | |
| 391 delete scons; | |
| 392 scons = NULL; | |
| 393 | |
| 394 delete seqMatrix; | |
| 395 seqMatrix = old.seqMatrix; | |
| 396 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 397 } | |
| 398 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 399 return *this; | |
| 400 } | |
| 401 | |
| 402 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 403 /* Class destructor */ | |
| 404 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 405 | |
| 406 alignment::~alignment(void) { | |
| 407 int i; | |
| 408 | |
| 409 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 410 if(sequences != NULL) | |
| 411 delete [] sequences; | |
| 412 sequences = NULL; | |
| 413 | |
| 414 if(seqsName != NULL) | |
| 415 delete [] seqsName; | |
| 416 seqsName = NULL; | |
| 417 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 418 | |
| 419 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 420 if(residuesNumber != NULL) | |
| 421 delete [] residuesNumber; | |
| 422 residuesNumber = NULL; | |
| 423 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 424 | |
| 425 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 426 if(seqsInfo != NULL) | |
| 427 delete [] seqsInfo; | |
| 428 seqsInfo = NULL; | |
| 429 | |
| 430 if(saveResidues != NULL) | |
| 431 delete[] saveResidues; | |
| 432 saveResidues = NULL; | |
| 433 | |
| 434 if(saveSequences != NULL) | |
| 435 delete[] saveSequences; | |
| 436 saveSequences = NULL; | |
| 437 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 438 | |
| 439 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 440 if(identities != NULL) { | |
| 441 for(i = 0; i < sequenNumber; i++) | |
| 442 delete [] identities[i]; | |
| 443 delete [] identities; | |
| 444 } | |
| 445 identities = NULL; | |
| 446 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 447 | |
| 448 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 449 if(overlaps != NULL) { | |
| 450 for(i = 0; i < sequenNumber; i++) | |
| 451 delete [] overlaps[i]; | |
| 452 delete [] overlaps; | |
| 453 } | |
| 454 overlaps = NULL; | |
| 455 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 456 | |
| 457 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 458 if(sgaps != NULL) | |
| 459 delete sgaps; | |
| 460 sgaps = NULL; | |
| 461 | |
| 462 if(scons != NULL) | |
| 463 delete scons; | |
| 464 scons = NULL; | |
| 465 | |
| 466 if(seqMatrix != NULL) | |
| 467 delete seqMatrix; | |
| 468 seqMatrix = NULL; | |
| 469 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 470 | |
| 471 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 472 oldAlignment = false; | |
| 473 | |
| 474 sequenNumber = 0; | |
| 475 residNumber = 0; | |
| 476 | |
| 477 isAligned = false; | |
| 478 reverse = false; | |
| 479 | |
| 480 terminalGapOnly = false; | |
| 481 right_boundary = -1; | |
| 482 left_boundary = -1; | |
| 483 | |
| 484 iformat = 0; | |
| 485 oformat = 0; | |
| 486 shortNames = false; | |
| 487 | |
| 488 dataType = 0; | |
| 489 | |
| 490 ghWindow = 0; | |
| 491 shWindow = 0; | |
| 492 | |
| 493 blockSize = 0; | |
| 494 | |
| 495 filename.clear(); | |
| 496 aligInfo.clear(); | |
| 497 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 498 } | |
| 499 | |
| 500 // Try to load current alignment and inform otherwise | |
| 501 bool alignment::loadAlignment(char *alignmentFile) { | |
| 502 | |
| 503 // Detect input alignment format - it is an strict detection procedure | |
| 504 iformat = formatInputAlignment(alignmentFile); | |
| 505 // Unless it is indicated somewhere else, output alignment format will be | |
| 506 // the same as the input one | |
| 507 oformat = iformat; | |
| 508 | |
| 509 // Use the appropiate function to read input alignment | |
| 510 switch(iformat) { | |
| 511 case 1: | |
| 512 return loadClustalAlignment(alignmentFile); | |
| 513 case 3: | |
| 514 return loadNBRF_PirAlignment(alignmentFile); | |
| 515 case 8: | |
| 516 return loadFastaAlignment(alignmentFile); | |
| 517 case 11: | |
| 518 return loadPhylip3_2Alignment(alignmentFile); | |
| 519 case 12: | |
| 520 return loadPhylipAlignment(alignmentFile); | |
| 521 case 17: | |
| 522 return loadNexusAlignment(alignmentFile); | |
| 523 case 21: | |
| 524 return loadMegaInterleavedAlignment(alignmentFile); | |
| 525 case 22: | |
| 526 return loadMegaNonInterleavedAlignment(alignmentFile); | |
| 527 // Return a FALSE value - meaning the input alignment was not loaded | |
| 528 default: | |
| 529 return false; | |
| 530 } | |
| 531 return false; | |
| 532 } | |
| 533 | |
| 534 // Return input alignment format - it is useful to set-up output alignment one | |
| 535 int alignment::getInputFormat(void) { | |
| 536 return iformat; | |
| 537 } | |
| 538 | |
| 539 // Return output alignment format | |
| 540 int alignment::getOutputFormat(void) { | |
| 541 return oformat; | |
| 542 } | |
| 543 | |
| 544 // Return whether there is one or more alignments at the input file. | |
| 545 // Currently trimAl family only support single alignments per file. | |
| 546 int alignment::typeInputFile(void) { | |
| 547 return SINGLE; | |
| 548 } | |
| 549 | |
| 550 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 551 /* This function prints the alignmnet to the standard output using an | |
| 552 * appropiate function depending on the output format */ | |
| 553 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 554 bool alignment::printAlignment(void){ | |
| 555 | |
| 556 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 557 if(sequences == NULL) | |
| 558 return false; | |
| 559 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 560 | |
| 561 if((residNumber == 0) || (sequenNumber == 0)) { | |
| 562 cerr << endl << "WARNING: Output alignment has not been generated. " | |
| 563 << "It is empty" << endl << endl; | |
| 564 return true; | |
| 565 } | |
| 566 | |
| 567 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 568 switch(oformat) { | |
| 569 case 1: | |
| 570 alignmentClustalToFile(cout); | |
| 571 break; | |
| 572 case 3: | |
| 573 alignmentNBRF_PirToFile(cout); | |
| 574 break; | |
| 575 case 8: | |
| 576 alignmentFastaToFile(cout); | |
| 577 break; | |
| 578 case 11: | |
| 579 alignmentPhylip3_2ToFile(cout); | |
| 580 break; | |
| 581 case 12: | |
| 582 alignmentPhylipToFile(cout); | |
| 583 break; | |
| 584 case 13: | |
| 585 alignmentPhylip_PamlToFile(cout); | |
| 586 break; | |
| 587 case 17: | |
| 588 alignmentNexusToFile(cout); | |
| 589 break; | |
| 590 case 21: case 22: | |
| 591 alignmentMegaToFile(cout); | |
| 592 break; | |
| 593 case 99: | |
| 594 getSequences(cout); | |
| 595 break; | |
| 596 case 100: | |
| 597 alignmentColourHTML(cout); | |
| 598 break; | |
| 599 default: | |
| 600 return false; | |
| 601 } | |
| 602 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 603 return true; | |
| 604 } | |
| 605 | |
| 606 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 607 /* This function puts the alignment in a given file */ | |
| 608 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 609 bool alignment::saveAlignment(char *destFile) { | |
| 610 | |
| 611 ofstream file; | |
| 612 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 613 if(sequences == NULL) | |
| 614 return false; | |
| 615 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 616 | |
| 617 if((residNumber == 0) || (sequenNumber == 0)) { | |
| 618 cerr << endl << "WARNING: Output alignment has not been generated. " | |
| 619 << "It is empty." << endl << endl; | |
| 620 return true; | |
| 621 } | |
| 622 | |
| 623 /* Check whether the input sequences file is aligned or not before creating | |
| 624 * a given output format. */ | |
| 625 switch(oformat) { | |
| 626 case 1: case 11: case 12: case 13: case 17: case 21: case 22: | |
| 627 /* Check whether sequences in the alignment are aligned or not. | |
| 628 * Warn about it if there are not aligned. */ | |
| 629 if (!isAligned) { | |
| 630 cerr << endl << "ERROR: Sequences are not aligned. Output format is " | |
| 631 << "not compatible with unaligned sequences."; | |
| 632 return false; | |
| 633 } | |
| 634 } | |
| 635 | |
| 636 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 637 /* File open and correct open check */ | |
| 638 file.open(destFile); | |
| 639 if(!file) return false; | |
| 640 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 641 | |
| 642 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 643 /* Depending on the output format, we call to the | |
| 644 * appropiate function */ | |
| 645 switch(oformat) { | |
| 646 case 1: | |
| 647 alignmentClustalToFile(file); | |
| 648 break; | |
| 649 case 3: | |
| 650 alignmentNBRF_PirToFile(file); | |
| 651 break; | |
| 652 case 8: | |
| 653 alignmentFastaToFile(file); | |
| 654 break; | |
| 655 case 11: | |
| 656 alignmentPhylip3_2ToFile(file); | |
| 657 break; | |
| 658 case 12: | |
| 659 alignmentPhylipToFile(file); | |
| 660 break; | |
| 661 case 13: | |
| 662 alignmentPhylip_PamlToFile(file); | |
| 663 break; | |
| 664 case 17: | |
| 665 alignmentNexusToFile(file); | |
| 666 break; | |
| 667 case 21: case 22: | |
| 668 alignmentMegaToFile(file); | |
| 669 break; | |
| 670 case 99: | |
| 671 getSequences(file); | |
| 672 break; | |
| 673 case 100: | |
| 674 alignmentColourHTML(file); | |
| 675 break; | |
| 676 default: | |
| 677 return false; | |
| 678 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 679 } | |
| 680 | |
| 681 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 682 /* Close the output file */ | |
| 683 file.close(); | |
| 684 | |
| 685 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 686 /* All is OK, return true */ | |
| 687 return true; | |
| 688 } | |
| 689 | |
| 690 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 691 /* This function trimms a given alignment based on the gap distribution | |
| 692 * values. To trim the alignment, this function uses two parameters: | |
| 693 * | |
| 694 * baseLine: Minimum percentage of columns that should | |
| 695 * be conserved in the new alignment. | |
| 696 * gapsPct: Maximum percentage of gaps per column that | |
| 697 * should be in the new alignment. | |
| 698 * | |
| 699 * The function selects that combination of parameters that maximize the | |
| 700 * final number of columns in the new alignment. There is an extra parameter | |
| 701 * to ask for the complementary alignment. A complementary alignment consits | |
| 702 * of those columns that would delete applying the standard approach. */ | |
| 703 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 704 alignment *alignment::cleanGaps(float baseLine, float gapsPct, bool complementary) { | |
| 705 | |
| 706 alignment *ret; | |
| 707 double cut; | |
| 708 | |
| 709 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 710 /* If gaps statistics are not calculated, we | |
| 711 * calculate them */ | |
| 712 if(calculateGapStats() != true) | |
| 713 return NULL; | |
| 714 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 715 | |
| 716 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 717 /* Obtain the cut point using the given parameters */ | |
| 718 cut = sgaps -> calcCutPoint(baseLine, gapsPct); | |
| 719 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 720 | |
| 721 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 722 /* Once we have the cut value proposed, we call the | |
| 723 * appropiate method to clean the alignment and, then, | |
| 724 * generate the new alignment. */ | |
| 725 ret = cleanByCutValue(cut, baseLine, sgaps -> getGapsWindow(), complementary); | |
| 726 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 727 | |
| 728 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 729 /* Return a reference of the new alignment */ | |
| 730 return ret; | |
| 731 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 732 } | |
| 733 | |
| 734 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 735 /* This function trimms a given alignment based on the similarity distribution | |
| 736 * values. To trim the alignment, this function uses two parameters: | |
| 737 * | |
| 738 * baseLine: Minimum percentage of columns that should | |
| 739 * be conserved in the new alignment. | |
| 740 * conservat: Minimum value of similarity per column that | |
| 741 * should be in the new alignment. | |
| 742 * | |
| 743 * The function selects that combination of parameters that maximize the | |
| 744 * final number of columns in the new alignment. There is an extra parameter | |
| 745 * to ask for the complementary alignment. A complementary alignment consits | |
| 746 * of those columns that would delete applying the standard approach. */ | |
| 747 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 748 alignment *alignment::cleanConservation(float baseLine, float conservationPct, bool complementary) { | |
| 749 | |
| 750 alignment *ret; | |
| 751 float cut; | |
| 752 | |
| 753 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 754 /* If conservation's statistics are not calculated, | |
| 755 * we calculate them */ | |
| 756 if(calculateConservationStats() != true) | |
| 757 return NULL; | |
| 758 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 759 | |
| 760 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 761 /* Calculate the cut point using the given parameters */ | |
| 762 cut = (float) scons -> calcCutPoint(baseLine, conservationPct); | |
| 763 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 764 | |
| 765 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 766 /* Once we have the cut value, we call the appropiate | |
| 767 * method to clean the alignment and, then, generate | |
| 768 the new alignment */ | |
| 769 ret = cleanByCutValue(cut, baseLine, scons -> getMdkwVector(), complementary); | |
| 770 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 771 | |
| 772 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 773 /* Return a reference of the new alignment */ | |
| 774 return ret; | |
| 775 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 776 } | |
| 777 | |
| 778 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 779 /* This function trimms a given alignment based on the similarity and gaps | |
| 780 * distribution values. To trim the alignment, this function uses three | |
| 781 * parameters: | |
| 782 * | |
| 783 * baseLine: Minimum percentage of columns that should | |
| 784 * be conserved in the new alignment. | |
| 785 * gapsPct: Maximum percentage of gaps per column that | |
| 786 * should be in the new alignment. | |
| 787 * conservat: Minimum value of similarity per column that | |
| 788 * should be in the new alignment. | |
| 789 * | |
| 790 * The function selects that combination of parameters that maximize the | |
| 791 * final number of columns in the new alignment. If the baseLine parameter | |
| 792 * is the most strict, the other two ones would be relaxed to keep columns | |
| 793 * that would not be selected. There is an extra parameter to ask for the | |
| 794 * complementary alignment. A complementary alignment consits of those | |
| 795 * columns that would delete applying the standard approach. */ | |
| 796 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 797 alignment *alignment::clean(float baseLine, float GapsPct, float conservationPct, bool complementary) { | |
| 798 | |
| 799 alignment *ret; | |
| 800 float cutCons; | |
| 801 double cutGaps; | |
| 802 | |
| 803 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 804 /* If gaps statistics are not calculated, we calculate | |
| 805 * them */ | |
| 806 if(calculateGapStats() != true) | |
| 807 return NULL; | |
| 808 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 809 | |
| 810 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 811 /* If conservation's statistics are not calculated, | |
| 812 * we calculate them */ | |
| 813 if(calculateConservationStats() != true) | |
| 814 return NULL; | |
| 815 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 816 | |
| 817 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 818 /* Calculate the two cut points using the parameters */ | |
| 819 cutGaps = sgaps->calcCutPoint(baseLine, GapsPct); | |
| 820 cutCons = scons->calcCutPoint(baseLine, conservationPct); | |
| 821 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 822 | |
| 823 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 824 /* Clean the alingment using the two cut values, the | |
| 825 * gapsWindow and MDK_Windows vectors and the baseline | |
| 826 * value */ | |
| 827 ret = cleanByCutValue(cutGaps, sgaps -> getGapsWindow(), baseLine, cutCons, scons -> getMdkwVector(), complementary); | |
| 828 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 829 | |
| 830 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 831 /* Return a reference of the clean alignment object */ | |
| 832 return ret; | |
| 833 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 834 } | |
| 835 | |
| 836 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 837 /* This function cleans a given alignment based on the consistency values | |
| 838 * asses from a dataset of alignments. The method takes three parameters: | |
| 839 * | |
| 840 * cutpoint: Lower limit (0-1) of comparefile value admits in | |
| 841 * the new alignment. | |
| 842 * baseline: Minimum percentage of columns to have in the new alignment | |
| 843 * vectValues: A vector with alignment's consistency values. | |
| 844 * | |
| 845 * The function computes the optimal parameter combination value to trim | |
| 846 * an alignment based on the consistency value from the comparison among | |
| 847 * a dataset of alignments with the same sequences. */ | |
| 848 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 849 alignment *alignment::cleanCompareFile(float cutpoint, float baseLine, float *vectValues, bool complementary) { | |
| 850 | |
| 851 alignment *ret; | |
| 852 float cut, *vectAux; | |
| 853 | |
| 854 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 855 /* Allocate memory */ | |
| 856 vectAux = new float[residNumber]; | |
| 857 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 858 | |
| 859 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 860 /* Sort a copy of the vectValues vector, and take the | |
| 861 * value at 100% - baseline position. */ | |
| 862 utils::copyVect((float *) vectValues, vectAux, residNumber); | |
| 863 utils::quicksort(vectAux, 0, residNumber-1); | |
| 864 cut = vectAux[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)]; | |
| 865 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 866 | |
| 867 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 868 /* We have to decide which is the smallest value | |
| 869 * between the cutpoint value and the value from | |
| 870 * the minimum percentage threshold */ | |
| 871 cut = cutpoint < cut ? cutpoint : cut; | |
| 872 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 873 | |
| 874 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 875 /* Deallocate memory */ | |
| 876 delete [] vectAux; | |
| 877 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 878 | |
| 879 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 880 /* Clean the selected alignment using the input parameters. */ | |
| 881 ret = cleanByCutValue(cut, baseLine, vectValues, complementary); | |
| 882 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 883 | |
| 884 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 885 /* Return a refernce of the new alignment */ | |
| 886 return ret; | |
| 887 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 888 } | |
| 889 | |
| 890 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 891 /* This function removes those sequences that are misaligned with the rest | |
| 892 * of sequences in the alignment at given sequence and residue overlaps. | |
| 893 * | |
| 894 * overlapColumn: Set the minimum similarity fraction value that has | |
| 895 * to have a position from a given sequence to be | |
| 896 * considered as an hit. | |
| 897 * minimumOverlap: Set the minimum proportion of hits that has to be | |
| 898 * a sequence to keep it in the new alignment. | |
| 899 * | |
| 900 * At the same time, it's possible to get an alignment with those columns | |
| 901 * that this method should remove setting for that purpose the complementary | |
| 902 * parameter to True */ | |
| 903 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 904 alignment *alignment::cleanSpuriousSeq(float overlapColumn, float minimumOverlap, bool complementary) { | |
| 905 | |
| 906 float *overlapVector; | |
| 907 alignment *newAlig; | |
| 908 | |
| 909 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 910 /* Allocate local memory */ | |
| 911 overlapVector = new float[sequenNumber]; | |
| 912 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 913 | |
| 914 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 915 /* Compute the overlap's vector using the overlap | |
| 916 * column's value */ | |
| 917 if(!calculateSpuriousVector(overlapColumn, overlapVector)) | |
| 918 return NULL; | |
| 919 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 920 | |
| 921 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 922 /* Select and remove the sequences with a overlap less | |
| 923 * than threshold's overlap and create a new alignemnt */ | |
| 924 newAlig = cleanOverlapSeq(minimumOverlap, overlapVector, complementary); | |
| 925 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 926 | |
| 927 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 928 /* Deallocate local memory */ | |
| 929 delete [] overlapVector; | |
| 930 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 931 | |
| 932 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 933 /* Return a reference of the clean alignment object */ | |
| 934 return newAlig; | |
| 935 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 936 } | |
| 937 | |
| 938 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 939 /* This function implements the gappyout approach. To trim the alignment, this | |
| 940 * method computes the slope from the gaps distribution from the input alig. | |
| 941 * Using this information, the method asses the most abrupt change between | |
| 942 * three consecutive points from that distribution and selects the first of | |
| 943 * this point as aan cut-off point. | |
| 944 * | |
| 945 * The method also can return the complementary alignment that consists of | |
| 946 * those columns that would be delete applying this algorithm. */ | |
| 947 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 948 alignment *alignment::clean2ndSlope(bool complementarity) { | |
| 949 | |
| 950 alignment *ret; | |
| 951 int cut; | |
| 952 | |
| 953 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 954 /* If gaps statistics are not calculated, we calculate | |
| 955 * them */ | |
| 956 if(calculateGapStats() != true) | |
| 957 return NULL; | |
| 958 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 959 | |
| 960 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 961 /* We get the cut point using a automatic method for | |
| 962 * this purpose. */ | |
| 963 cut = sgaps -> calcCutPoint2ndSlope(); | |
| 964 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 965 | |
| 966 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 967 /* Using the cut point calculates in last steps, we | |
| 968 * clean the alignment and generate a new alignment */ | |
| 969 ret = cleanByCutValue(cut, 0, sgaps->getGapsWindow(), complementarity); | |
| 970 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 971 | |
| 972 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 973 /* Returns the new alignment. */ | |
| 974 return ret; | |
| 975 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 976 } | |
| 977 | |
| 978 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 979 /* This method implements the strict and strictplus approaches, to apply | |
| 980 * these algorithms, the program computes the gaps and similarity distribution | |
| 981 * values to decide which ones are the optimal cutoff points. To compute these | |
| 982 * cutoff points, the method applies those steps described in the manual. */ | |
| 983 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 984 alignment *alignment::cleanCombMethods(bool complementarity, bool variable) { | |
| 985 | |
| 986 float simCut, first20Point, last80Point, *simil, *vectAux; | |
| 987 int i, j, acm, gapCut, *positions, *gaps; | |
| 988 double inic, fin, vlr; | |
| 989 | |
| 990 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 991 /* If gaps statistics are not calculated, we calculate | |
| 992 * them */ | |
| 993 if(calculateGapStats() != true) | |
| 994 return NULL; | |
| 995 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 996 | |
| 997 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 998 /* Computes the gap cut point using a automatic method | |
| 999 * and at the same time, we get the gaps values from | |
| 1000 * the alignment. */ | |
| 1001 gapCut = sgaps -> calcCutPoint2ndSlope(); | |
| 1002 gaps = sgaps -> getGapsWindow(); | |
| 1003 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1004 | |
| 1005 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1006 /* If conservation's statistics are not calculated, | |
| 1007 * we calculate them */ | |
| 1008 if(calculateConservationStats() != true) | |
| 1009 return NULL; | |
| 1010 | |
| 1011 // Once the conservation stats have been calculated, generate the vector | |
| 1012 // containing the similarity value for each column considering parameters | |
| 1013 // such as windows size | |
| 1014 simil = scons -> getMdkwVector(); | |
| 1015 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1016 | |
| 1017 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1018 /* Allocate local memory and initializate it to -1 */ | |
| 1019 positions = new int[residNumber]; | |
| 1020 utils::initlVect(positions, residNumber, -1); | |
| 1021 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1022 | |
| 1023 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1024 /* The method only selects columns with gaps number | |
| 1025 * less or equal than the gap's cut point. Counts the | |
| 1026 * number of columns that have been selected */ | |
| 1027 for(i = 0, acm = 0; i < residNumber; i++) { | |
| 1028 if(gaps[i] <= gapCut) { | |
| 1029 positions[i] = i; | |
| 1030 acm++; | |
| 1031 } | |
| 1032 } | |
| 1033 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1034 | |
| 1035 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1036 /* Allocate local memory and save the similaritys | |
| 1037 * values for the columns that have been selected */ | |
| 1038 vectAux = new float[acm]; | |
| 1039 for(i = 0, j = 0; i < residNumber; i++) | |
| 1040 if(positions[i] != -1) | |
| 1041 vectAux[j++] = simil[i]; | |
| 1042 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1043 | |
| 1044 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1045 /* Sort the conservation's value vector. */ | |
| 1046 utils::quicksort(vectAux, 0, acm-1); | |
| 1047 | |
| 1048 /* ...and search for the vector points at the 20 and | |
| 1049 * 80% of length. */ | |
| 1050 first20Point = 0; | |
| 1051 last80Point = 0; | |
| 1052 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1053 | |
| 1054 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1055 for(i = acm - 1, j = 1; i >= 0; i--, j++) { | |
| 1056 if((((float) j/acm) * 100.0) <= 20.0) | |
| 1057 first20Point = vectAux[i]; | |
| 1058 if((((float) j/acm) * 100.0) <= 80.0) | |
| 1059 last80Point = vectAux[i]; | |
| 1060 } | |
| 1061 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1062 | |
| 1063 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1064 /* Computes the logaritmic's values for those points. | |
| 1065 * Finally the method computes the similarity cut | |
| 1066 * point using these values. */ | |
| 1067 inic = log10(first20Point); | |
| 1068 fin = log10(last80Point); | |
| 1069 vlr = ((inic - fin) / 10) + fin; | |
| 1070 simCut = (float) pow(10, vlr); | |
| 1071 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1072 | |
| 1073 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1074 /* Clean the alignment and generate a new alignment | |
| 1075 * object using the gaps cut and the similaritys cut | |
| 1076 * values */ | |
| 1077 alignment *ret = cleanStrict(gapCut, sgaps -> getGapsWindow(), | |
| 1078 simCut, scons -> getMdkwVector(), complementarity, variable); | |
| 1079 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1080 | |
| 1081 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1082 /* Deallocate local memory */ | |
| 1083 delete [] vectAux; | |
| 1084 delete [] positions; | |
| 1085 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1086 | |
| 1087 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1088 /* Return a reference of the new alignment */ | |
| 1089 return ret; | |
| 1090 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1091 } | |
| 1092 | |
| 1093 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1094 /* This function allows to delete those columns consistents of only gaps. | |
| 1095 * This function is specially useful when we remove misaligned sequences from | |
| 1096 * a given alignmnet. As the rest of functions, this function can return the | |
| 1097 * complementary alignment but this alignment would have only columns with | |
| 1098 * all gaps. */ | |
| 1099 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1100 alignment *alignment::cleanNoAllGaps(bool complementarity) { | |
| 1101 | |
| 1102 alignment *ret; | |
| 1103 | |
| 1104 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1105 /* If gaps statistics are not calculated, we calculate | |
| 1106 * them */ | |
| 1107 if(calculateGapStats() != true) | |
| 1108 return NULL; | |
| 1109 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1110 | |
| 1111 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1112 /* We want to conserve the columns with gaps' number | |
| 1113 * less or equal than sequences' number - 1 */ | |
| 1114 ret = cleanByCutValue((sequenNumber - 1), 0, sgaps->getGapsWindow(), complementarity); | |
| 1115 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1116 | |
| 1117 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1118 /* Returns the new alignment. */ | |
| 1119 return ret; | |
| 1120 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1121 } | |
| 1122 | |
| 1123 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1124 /* This method computes the overlap for each sequence given a overlap value. | |
| 1125 * This overlap set the minimum fraction that has to be a position for the | |
| 1126 * selected sequence to count as an hit. This proportion measures how much | |
| 1127 * is similar, (in terms of residues, indetermination and gaps), a given | |
| 1128 * position respect to the element on its column. */ | |
| 1129 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1130 bool alignment::calculateSpuriousVector(float overlap, float *spuriousVector) { | |
| 1131 | |
| 1132 int i, j, k, seqValue, ovrlap, hit; | |
| 1133 float floatOverlap; | |
| 1134 char indet; | |
| 1135 | |
| 1136 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1137 /* Compute the overlap */ | |
| 1138 floatOverlap = overlap * float(sequenNumber-1); | |
| 1139 ovrlap = int(overlap * (sequenNumber-1)); | |
| 1140 | |
| 1141 if(floatOverlap > float(ovrlap)) | |
| 1142 ovrlap++; | |
| 1143 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1144 | |
| 1145 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1146 /* If the spurious vectos is NULL, returns false. */ | |
| 1147 if(spuriousVector == NULL) | |
| 1148 return false; | |
| 1149 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1150 | |
| 1151 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1152 /* Depending on the kind of alignment, we have | |
| 1153 * different indetermination symbol */ | |
| 1154 if(getTypeAlignment() == AAType) | |
| 1155 indet = 'X'; | |
| 1156 else | |
| 1157 indet = 'N'; | |
| 1158 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1159 | |
| 1160 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1161 /* For each alignment's sequence, computes its overlap */ | |
| 1162 for(i = 0, seqValue = 0; i < sequenNumber; i++, seqValue = 0) { | |
| 1163 | |
| 1164 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1165 /* For each alignment's column, computes the overlap | |
| 1166 * between the selected sequence and the other ones */ | |
| 1167 for(j = 0; j < residNumber; j++) { | |
| 1168 | |
| 1169 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1170 /* For sequences are before the sequence selected */ | |
| 1171 for(k = 0, hit = 0; k < i; k++) { | |
| 1172 | |
| 1173 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1174 /* If the element of sequence selected is the same | |
| 1175 * that the element of sequence considered, computes | |
| 1176 * a hit */ | |
| 1177 if(sequences[i][j] == sequences[k][j]) | |
| 1178 hit++; | |
| 1179 | |
| 1180 /* If the element of sequence selected isn't a 'X' nor | |
| 1181 * 'N' (indetermination) or a '-' (gap) and the element | |
| 1182 * of sequence considered isn't a a 'X' nor 'N' | |
| 1183 * (indetermination) or a '-' (gap), computes a hit */ | |
| 1184 else if((sequences[i][j] != indet) && (sequences[i][j] != '-') | |
| 1185 && (sequences[k][j] != indet) && (sequences[k][j] != '-')) | |
| 1186 hit++; | |
| 1187 } | |
| 1188 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1189 | |
| 1190 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1191 /* For sequences are after the sequence selected */ | |
| 1192 for(k = (i + 1); k < sequenNumber; k++) { | |
| 1193 | |
| 1194 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1195 /* If the element of sequence selected is the same | |
| 1196 * that the element of sequence considered, computes | |
| 1197 * a hit */ | |
| 1198 if(sequences[i][j] == sequences[k][j]) | |
| 1199 hit++; | |
| 1200 | |
| 1201 /* If the element of sequence selected isn't a 'X' nor | |
| 1202 * 'N' (indetermination) or a '-' (gap) and the element | |
| 1203 * of sequence considered isn't a a 'X' nor 'N' | |
| 1204 * (indetermination) or a '-' (gap), computes a hit */ | |
| 1205 else if((sequences[i][j] != indet) && (sequences[i][j] != '-') | |
| 1206 && (sequences[k][j] != indet) && (sequences[k][j] != '-')) | |
| 1207 hit++; | |
| 1208 } | |
| 1209 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1210 | |
| 1211 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1212 /* Finally, if the hit's number divided by number of | |
| 1213 * sequences minus one is greater or equal than | |
| 1214 * overlap's value, computes a column's hit. */ | |
| 1215 if(hit >= ovrlap) | |
| 1216 seqValue++; | |
| 1217 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1218 } | |
| 1219 | |
| 1220 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1221 /* For each alignment's sequence, computes its spurious's | |
| 1222 * or overlap's value as the column's hits -for that | |
| 1223 sequence- divided by column's number. */ | |
| 1224 spuriousVector[i] = ((float) seqValue / residNumber); | |
| 1225 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1226 } | |
| 1227 | |
| 1228 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1229 /* If there is not problem in the method, return true */ | |
| 1230 return true; | |
| 1231 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1232 } | |
| 1233 | |
| 1234 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1235 /* This method compute the cluster belongs to a given alignment at certain | |
| 1236 * identity threshold. To know the clusters from the alignment, those | |
| 1237 * clusters are calculated as follow: 1) Select the longest sequences. 2) | |
| 1238 * Using the previous computed identity values, compare if the second | |
| 1239 * longest sequence belongs, because the identity value with the cluster | |
| 1240 * representative is equal or greater than a given threshold, to this cluster | |
| 1241 * or not. 3) If the sequence belongs to this cluster, we add it, if not | |
| 1242 * belongs, we create a new cluster and fix this sequence as its representative | |
| 1243 * 4) Continue with the rest of sequences. In the case that a given sequence | |
| 1244 * can belong to more than one cluster, we choose the cluster which one | |
| 1245 * maximize the identity value respect to its representative sequence */ | |
| 1246 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1247 int *alignment::calculateRepresentativeSeq(float maximumIdent) { | |
| 1248 | |
| 1249 int i, j, pos, clusterNum, **seqs; | |
| 1250 int *cluster; | |
| 1251 static int *repres; | |
| 1252 float max; | |
| 1253 | |
| 1254 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1255 /* Ask for the sequence identities assesment */ | |
| 1256 if(identities == NULL) | |
| 1257 calculateSeqIdentity(); | |
| 1258 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1259 | |
| 1260 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1261 seqs = new int*[sequenNumber]; | |
| 1262 for(i = 0; i < sequenNumber; i++) { | |
| 1263 seqs[i] = new int[2]; | |
| 1264 seqs[i][0] = utils::removeCharacter('-', sequences[i]).size(); | |
| 1265 seqs[i][1] = i; | |
| 1266 } | |
| 1267 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1268 | |
| 1269 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1270 utils::quicksort(seqs, 0, sequenNumber-1); | |
| 1271 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1272 | |
| 1273 cluster = new int[sequenNumber]; | |
| 1274 cluster[0] = seqs[sequenNumber - 1][1]; | |
| 1275 clusterNum = 1; | |
| 1276 | |
| 1277 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1278 for(i = sequenNumber - 2; i >= 0; i--) { | |
| 1279 | |
| 1280 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1281 for(j = 0, max = 0, pos = -1; j < clusterNum; j++) { | |
| 1282 if(identities[seqs[i][1]][cluster[j]] > maximumIdent) { | |
| 1283 if(identities[seqs[i][1]][cluster[j]] > max) { | |
| 1284 max = identities[seqs[i][1]][cluster[j]]; | |
| 1285 pos = j; | |
| 1286 } | |
| 1287 } | |
| 1288 } | |
| 1289 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1290 | |
| 1291 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1292 if(pos == -1) { | |
| 1293 cluster[j] = seqs[i][1]; | |
| 1294 clusterNum++; | |
| 1295 } | |
| 1296 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1297 } | |
| 1298 | |
| 1299 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1300 repres = new int[clusterNum + 1]; | |
| 1301 repres[0] = clusterNum; | |
| 1302 for(i = 0; i < clusterNum; i++) | |
| 1303 repres[i+1] = cluster[i]; | |
| 1304 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1305 | |
| 1306 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1307 /* Deallocate dinamic memory */ | |
| 1308 for(i = 0; i < sequenNumber; i++) | |
| 1309 delete [] seqs[i]; | |
| 1310 | |
| 1311 delete [] cluster; | |
| 1312 delete [] seqs; | |
| 1313 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1314 | |
| 1315 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1316 return repres; | |
| 1317 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1318 } | |
| 1319 | |
| 1320 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1321 /* This method looks for the optimal cut point for a given clusters number. | |
| 1322 * The idea is to find a identity cut-off point that can be used to get a | |
| 1323 * number of representative sequences similar to the input parameter */ | |
| 1324 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1325 float alignment::getCutPointClusters(int clusterNumber) { | |
| 1326 | |
| 1327 float max, min, avg, gMax, gMin, startingPoint, prevValue = 0, iter = 0; | |
| 1328 int i, j, clusterNum, *cluster, **seqs; | |
| 1329 | |
| 1330 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1331 /* If the user wants only one cluster means that all | |
| 1332 * of sequences have to be in the same clauster. | |
| 1333 * Otherwise, if the users wants the maximum number of | |
| 1334 * clusters means that each sequence have to be in their | |
| 1335 * own cluster */ | |
| 1336 if(clusterNumber == sequenNumber) return 1; | |
| 1337 else if(clusterNumber == 1) return 0; | |
| 1338 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1339 | |
| 1340 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1341 /* Ask for the sequence identities assesment */ | |
| 1342 if(identities == NULL) | |
| 1343 calculateSeqIdentity(); | |
| 1344 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1345 | |
| 1346 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1347 /* Compute the maximum, the minimum and the average | |
| 1348 * identity values from the sequences */ | |
| 1349 for(i = 0,gMax = 0, gMin = 1, startingPoint = 0; i < sequenNumber; i++) { | |
| 1350 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1351 for(j = 0, max = 0, avg = 0, min = 1; j < i; j++) { | |
| 1352 if(max < identities[i][j]) | |
| 1353 max = identities[i][j]; | |
| 1354 if(min > identities[i][j]) | |
| 1355 min = identities[i][j]; | |
| 1356 avg += identities[i][j]; | |
| 1357 } | |
| 1358 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1359 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1360 for(j = i + 1; j < sequenNumber; j++) { | |
| 1361 if(max < identities[i][j]) | |
| 1362 max = identities[i][j]; | |
| 1363 if(min > identities[i][j]) | |
| 1364 min = identities[i][j]; | |
| 1365 avg += identities[i][j]; | |
| 1366 } | |
| 1367 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1368 startingPoint += avg / (sequenNumber - 1); | |
| 1369 if(max > gMax) gMax = max; | |
| 1370 if(min < gMin) gMin = min; | |
| 1371 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1372 } | |
| 1373 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1374 | |
| 1375 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1376 /* Take the starting point as the average value */ | |
| 1377 startingPoint /= sequenNumber; | |
| 1378 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1379 | |
| 1380 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1381 /* Compute and sort the sequence length */ | |
| 1382 seqs = new int*[sequenNumber]; | |
| 1383 for(i = 0; i < sequenNumber; i++) { | |
| 1384 seqs[i] = new int[2]; | |
| 1385 seqs[i][0] = utils::removeCharacter('-', sequences[i]).size(); | |
| 1386 seqs[i][1] = i; | |
| 1387 } | |
| 1388 utils::quicksort(seqs, 0, sequenNumber-1); | |
| 1389 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1390 | |
| 1391 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1392 /* Create the data structure to store the different | |
| 1393 * clusters for a given thresholds */ | |
| 1394 cluster = new int[sequenNumber]; | |
| 1395 cluster[0] = seqs[sequenNumber - 1][1]; | |
| 1396 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1397 | |
| 1398 /**** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1399 /* Look for the optimal identity value to get the | |
| 1400 * number of cluster set by the user. To do that, the | |
| 1401 * method starts in the average identity value and moves | |
| 1402 * to the right or lefe side of this value depending on | |
| 1403 * if this value is so strict or not. We set an flag to | |
| 1404 * avoid enter in an infinite loop, if we get the same | |
| 1405 * value for more than 10 consecutive point without get | |
| 1406 * the given cluster number means that it's impossible | |
| 1407 * to get this cut-off and we need to stop the search */ | |
| 1408 do { | |
| 1409 clusterNum = 1; | |
| 1410 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1411 /* Start the search */ | |
| 1412 for(i = sequenNumber - 2; i >= 0; i--) { | |
| 1413 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1414 for(j = 0; j < clusterNum; j++) | |
| 1415 if(identities[seqs[i][1]][cluster[j]] > startingPoint) | |
| 1416 break; | |
| 1417 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1418 | |
| 1419 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1420 if(j == clusterNum) { | |
| 1421 cluster[j] = seqs[i][1]; | |
| 1422 clusterNum++; | |
| 1423 } | |
| 1424 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1425 } | |
| 1426 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1427 | |
| 1428 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1429 /* Given the cutoff point, if we get the same cluster | |
| 1430 * number or we have been getting for more than 10 | |
| 1431 * consecutive times the same cutoff point, we stop */ | |
| 1432 if((clusterNum == clusterNumber) || (iter > 10)) | |
| 1433 break; | |
| 1434 else { | |
| 1435 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1436 /* We have to move to the left side from the average | |
| 1437 * to decrease the cutoff value and get a smaller number | |
| 1438 * of clusters */ | |
| 1439 if(clusterNum > clusterNumber) { | |
| 1440 gMax = startingPoint; | |
| 1441 startingPoint = (gMax + gMin)/2; | |
| 1442 } else { | |
| 1443 /* In the opposite side, we have to move to the right | |
| 1444 * side from the cutting point to be more strict and get | |
| 1445 * a bigger number of clusters */ | |
| 1446 gMin = startingPoint; | |
| 1447 startingPoint = (gMax + gMin)/2; | |
| 1448 } | |
| 1449 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1450 | |
| 1451 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1452 /* If the clusters number from the previous iteration | |
| 1453 * is different from the current number, we put the | |
| 1454 * iteration number to 0 and store this new value */ | |
| 1455 if(prevValue != clusterNum) { | |
| 1456 iter = 0; | |
| 1457 prevValue = clusterNum; | |
| 1458 /* Otherwise, we increase the iteration number to | |
| 1459 * avoid enter in an infinitve loop */ | |
| 1460 } else iter++; | |
| 1461 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1462 } | |
| 1463 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1464 } while(true); | |
| 1465 | |
| 1466 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1467 /* Deallocate dinamic memory */ | |
| 1468 for(i = 0; i < sequenNumber; i++) delete [] seqs[i]; | |
| 1469 delete [] seqs; | |
| 1470 delete [] cluster; | |
| 1471 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1472 | |
| 1473 return startingPoint; | |
| 1474 } | |
| 1475 | |
| 1476 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1477 /* This method select one representative sequence (the longest one) per each | |
| 1478 * cluster from the input alignment and generate a new alignment */ | |
| 1479 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1480 alignment *alignment::getClustering(float identityThreshold) { | |
| 1481 | |
| 1482 string *matrixAux, *newSeqsName; | |
| 1483 int i, j, *clustering; | |
| 1484 alignment *newAlig; | |
| 1485 | |
| 1486 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1487 /* Get the representative member for each cluster | |
| 1488 * given a maximum identity threshold */ | |
| 1489 clustering = calculateRepresentativeSeq(identityThreshold); | |
| 1490 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1491 | |
| 1492 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1493 /* Put all sequences to be deleted and get back those | |
| 1494 * sequences that are representative for each cluster | |
| 1495 * */ | |
| 1496 for(i = 0; i < sequenNumber; i ++) | |
| 1497 saveSequences[i] = -1; | |
| 1498 for(i = 1; i <= clustering[0]; i ++) | |
| 1499 saveSequences[clustering[i]] = clustering[i]; | |
| 1500 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1501 | |
| 1502 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1503 /* We allocate memory to save the sequences selected */ | |
| 1504 matrixAux = new string[clustering[0]]; | |
| 1505 newSeqsName = new string[clustering[0]]; | |
| 1506 | |
| 1507 /* Copy to new structures the information that have | |
| 1508 * been selected previously. */ | |
| 1509 for(i = 0, j = 0; i < sequenNumber; i++) | |
| 1510 if(saveSequences[i] != -1) { | |
| 1511 newSeqsName[j] = seqsName[i]; | |
| 1512 matrixAux[j] = sequences[i]; | |
| 1513 j++; | |
| 1514 } | |
| 1515 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1516 | |
| 1517 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1518 /* When we have all parameters, we create the new | |
| 1519 * alignment */ | |
| 1520 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
| 1521 clustering[0], residNumber, iformat, oformat, shortNames, dataType, isAligned, | |
| 1522 reverse, terminalGapOnly, left_boundary, right_boundary, | |
| 1523 keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber, | |
| 1524 saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities, | |
| 1525 overlaps); | |
| 1526 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1527 | |
| 1528 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1529 /* Deallocated auxiliar memory */ | |
| 1530 delete [] matrixAux; | |
| 1531 delete [] newSeqsName; | |
| 1532 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1533 | |
| 1534 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1535 /* Return the new alignment reference */ | |
| 1536 return newAlig; | |
| 1537 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1538 } | |
| 1539 | |
| 1540 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1541 /* This function returns the backtranslation for a given protein processed | |
| 1542 * alignment into its CDS alignment. To do this convertion, the function needs | |
| 1543 * the Coding sequences as well the original composition of each protein | |
| 1544 * sequence. Also, the function needs to know which columns/sequences will be | |
| 1545 * in the final alignment to carray out the conversion. */ | |
| 1546 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1547 alignment *alignment::getTranslationCDS(int newResidues, int newSequences, int *ColumnsToKeep, string *oldSeqsName, sequencesMatrix *seqMatrix, alignment *ProtAlig) { | |
| 1548 | |
| 1549 string *matrixAux; | |
| 1550 alignment *newAlig; | |
| 1551 int i, j, k, l, oldResidues; | |
| 1552 int *mappedSeqs, *tmpSequence, *selectedRes; | |
| 1553 | |
| 1554 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1555 /* Map the selected protein sequences to the input | |
| 1556 * coding sequences */ | |
| 1557 mappedSeqs = new int[newSequences]; | |
| 1558 for(i = 0; i < sequenNumber; i++) | |
| 1559 for(j = 0; j < newSequences; j++) | |
| 1560 if(!seqsName[i].compare(oldSeqsName[j])) { | |
| 1561 mappedSeqs[j] = i; | |
| 1562 break; | |
| 1563 } | |
| 1564 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1565 | |
| 1566 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1567 /* Get the original alignment size as well the original | |
| 1568 * selected/non-selected columns */ | |
| 1569 oldResidues = seqMatrix -> getResidNumber(); | |
| 1570 selectedRes = new int[oldResidues]; | |
| 1571 | |
| 1572 for(i = 0; i < oldResidues; i++) | |
| 1573 selectedRes[i] = i; | |
| 1574 | |
| 1575 for(j = 0; j < ColumnsToKeep[0]; j++) | |
| 1576 selectedRes[j] = -1; | |
| 1577 | |
| 1578 for(i = 0; i < newResidues - 1; i++) | |
| 1579 for(j = ColumnsToKeep[i] + 1; j < ColumnsToKeep[i+1]; j++) | |
| 1580 selectedRes[j] = -1; | |
| 1581 | |
| 1582 for(j = ColumnsToKeep[newResidues - 1] + 1; j < oldResidues; j++) | |
| 1583 selectedRes[j] = -1; | |
| 1584 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1585 | |
| 1586 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1587 /* We allocate memory to save the columns selected */ | |
| 1588 matrixAux = new string[newSequences]; | |
| 1589 tmpSequence = new int[oldResidues]; | |
| 1590 | |
| 1591 /* Using the information about which residues for each | |
| 1592 * sequence was selected by others function, we process | |
| 1593 * these residues to recover the corresponding codons */ | |
| 1594 for(i = 0; i < newSequences; i++) | |
| 1595 if(seqMatrix -> getSequence(oldSeqsName[i], tmpSequence)) { | |
| 1596 for(j = 0; j < oldResidues; j++) { | |
| 1597 if((selectedRes[j] != -1) && (tmpSequence[j] != 0)) { | |
| 1598 | |
| 1599 for(k = 3 * (tmpSequence[j] - 1), l = 0; l < 3; k++, l++) { | |
| 1600 /* Check whether the nucleotide sequences end has been reached or not. | |
| 1601 * If it has been reached, complete backtranslation using indetermination | |
| 1602 * symbols 'N' */ | |
| 1603 if((int) sequences[mappedSeqs[i]].length() > k) | |
| 1604 matrixAux[i].resize(matrixAux[i].size() + 1, sequences[mappedSeqs[i]][k]); | |
| 1605 else | |
| 1606 matrixAux[i].resize(matrixAux[i].size() + 1, 'N'); | |
| 1607 } | |
| 1608 } else if(selectedRes[j] != -1) { | |
| 1609 matrixAux[i].resize(matrixAux[i].size() + 3, '-'); | |
| 1610 } | |
| 1611 } | |
| 1612 /* If there is any problems with a sequence then | |
| 1613 * the function returns an error */ | |
| 1614 } else { | |
| 1615 return NULL; | |
| 1616 } | |
| 1617 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1618 | |
| 1619 | |
| 1620 /* Remove columns/sequences composed only by gaps before making the retrotranslation */ | |
| 1621 | |
| 1622 | |
| 1623 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1624 /* When we have all parameters, we create the new | |
| 1625 * alignment */ | |
| 1626 newAlig = new alignment(filename, "", matrixAux, oldSeqsName, NULL, newSequences, | |
| 1627 newResidues * 3, ProtAlig -> getInputFormat(), ProtAlig -> getOutputFormat(), | |
| 1628 ProtAlig -> getShortNames(), DNAType, true, ProtAlig -> getReverse(), | |
| 1629 terminalGapOnly, left_boundary, right_boundary, | |
| 1630 keepSequences, keepHeader, sequenNumber, oldResidues * 3, NULL, NULL, | |
| 1631 NULL, 0, 0, ProtAlig -> getBlockSize(), NULL, NULL); | |
| 1632 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1633 | |
| 1634 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1635 /* Deallocated auxiliar memory */ | |
| 1636 delete [] matrixAux; | |
| 1637 delete mappedSeqs; | |
| 1638 delete tmpSequence; | |
| 1639 delete selectedRes; | |
| 1640 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1641 | |
| 1642 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1643 /* Return the new alignment reference */ | |
| 1644 return newAlig; | |
| 1645 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1646 } | |
| 1647 | |
| 1648 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1649 /* This function computes the gaps statistics for the input alignment. */ | |
| 1650 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1651 bool alignment::calculateGapStats(void) { | |
| 1652 | |
| 1653 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1654 /* If alignment matrix is not created, return false */ | |
| 1655 if(sequences == NULL) | |
| 1656 return false; | |
| 1657 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1658 | |
| 1659 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1660 /* If sgaps object is not created, we create them | |
| 1661 and calculate the statistics */ | |
| 1662 if(sgaps == NULL) { | |
| 1663 sgaps = new statisticsGaps(sequences, sequenNumber, residNumber, dataType); | |
| 1664 sgaps -> applyWindow(ghWindow); | |
| 1665 } | |
| 1666 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1667 | |
| 1668 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1669 return true; | |
| 1670 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1671 } | |
| 1672 | |
| 1673 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1674 /* Print the gaps value for each column in the alignment */ | |
| 1675 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1676 void alignment::printStatisticsGapsColumns(void) { | |
| 1677 | |
| 1678 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1679 /* Check if there is computed the gaps statistics */ | |
| 1680 if(calculateGapStats()) | |
| 1681 /* then prints the information */ | |
| 1682 sgaps -> printGapsColumns(); | |
| 1683 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1684 } | |
| 1685 | |
| 1686 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1687 /* Print the acumulative gaps distribution value from the input alignment */ | |
| 1688 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1689 void alignment::printStatisticsGapsTotal(void) { | |
| 1690 | |
| 1691 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1692 /* Check it there is computed the gaps statistics */ | |
| 1693 if(calculateGapStats()) | |
| 1694 /* then prints the information */ | |
| 1695 sgaps -> printGapsAcl(); | |
| 1696 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1697 } | |
| 1698 | |
| 1699 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1700 /* Set the similarity matrix. This matrix is necessary for some methods in | |
| 1701 * the program */ | |
| 1702 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1703 bool alignment::setSimilarityMatrix(similarityMatrix *sm) { | |
| 1704 | |
| 1705 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1706 /* If scons object is not created, we create them */ | |
| 1707 if(scons == NULL) | |
| 1708 scons = new statisticsConservation(sequences, sequenNumber, residNumber, dataType); | |
| 1709 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1710 | |
| 1711 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1712 /* Associate the matrix to the similarity statistics | |
| 1713 * object */ | |
| 1714 if(!scons -> setSimilarityMatrix(sm)) | |
| 1715 return false; | |
| 1716 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1717 | |
| 1718 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1719 /* If it's OK, we return true */ | |
| 1720 return true; | |
| 1721 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1722 } | |
| 1723 | |
| 1724 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1725 /* This method compute the similarity values from the input alignment if it | |
| 1726 * has not been computed before */ | |
| 1727 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1728 bool alignment::calculateConservationStats(void) { | |
| 1729 | |
| 1730 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1731 /* It the gaps statistics object has not been created | |
| 1732 * we create it */ | |
| 1733 if(calculateGapStats() != true) | |
| 1734 return false; | |
| 1735 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1736 | |
| 1737 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1738 /* It the similarity statistics object has not been | |
| 1739 * created we create it */ | |
| 1740 if(scons == NULL) | |
| 1741 return false; | |
| 1742 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1743 | |
| 1744 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1745 /* Ask for the similarity matrix */ | |
| 1746 if(scons -> isSimMatrixDef() != true) | |
| 1747 return false; | |
| 1748 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1749 | |
| 1750 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1751 /* Compute the similarity statistics from the input | |
| 1752 * alignment */ | |
| 1753 if(!scons -> calculateVectors(sequences, sgaps->getGapsWindow())) | |
| 1754 return false; | |
| 1755 | |
| 1756 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1757 /* Ask to know if it is necessary to apply any window | |
| 1758 * method. If it's necessary, we apply it */ | |
| 1759 if(scons->isDefinedWindow()) | |
| 1760 return true; | |
| 1761 else | |
| 1762 return scons->applyWindow(shWindow); | |
| 1763 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1764 } | |
| 1765 | |
| 1766 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1767 /* Print the similarity value for each column from the alignment */ | |
| 1768 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1769 void alignment::printStatisticsConservationColumns(void) { | |
| 1770 | |
| 1771 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1772 /* Check if the similarity statistics object has been | |
| 1773 * created */ | |
| 1774 if(calculateConservationStats()) | |
| 1775 /* then prints the information */ | |
| 1776 scons -> printConservationColumns(); | |
| 1777 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1778 } | |
| 1779 | |
| 1780 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1781 /* Print the accumulative similarity distribution values from the alignment */ | |
| 1782 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1783 void alignment::printStatisticsConservationTotal(void) { | |
| 1784 | |
| 1785 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1786 /* Check if the similarity statistics object has been | |
| 1787 * created */ | |
| 1788 if(calculateConservationStats()) | |
| 1789 /* then prints the information */ | |
| 1790 scons -> printConservationAcl(); | |
| 1791 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1792 } | |
| 1793 | |
| 1794 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1795 /* This method prints the correspondece between the columns in the original | |
| 1796 * and in the trimmed alignment */ | |
| 1797 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1798 void alignment::printCorrespondence(void) { | |
| 1799 int i; | |
| 1800 | |
| 1801 cout << "#ColumnsMap\t"; | |
| 1802 /* Print the saveResidues relathionship */ | |
| 1803 for(i = 0; i < residNumber - 1; i++) | |
| 1804 cout << saveResidues[i] << ", "; | |
| 1805 cout << saveResidues[i] << endl; | |
| 1806 | |
| 1807 } | |
| 1808 | |
| 1809 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1810 /* Return the number of the sequences in the alignment */ | |
| 1811 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1812 int alignment::getNumSpecies(void) { | |
| 1813 return sequenNumber; | |
| 1814 } | |
| 1815 | |
| 1816 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1817 /* Return the number of residues in the alignment */ | |
| 1818 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1819 int alignment::getNumAminos(void) { | |
| 1820 return residNumber; | |
| 1821 } | |
| 1822 | |
| 1823 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1824 /* Sets the condition to remove only terminal gaps after applying any | |
| 1825 * trimming method or not. */ | |
| 1826 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1827 void alignment::trimTerminalGaps(bool terminalOnly_, int *boundaries_) { | |
| 1828 terminalGapOnly = terminalOnly_; | |
| 1829 | |
| 1830 /* We are only interested in the first and last number of the vector - this | |
| 1831 * vector is generated by other function where it is important to store all | |
| 1832 * intervals */ | |
| 1833 if (boundaries_ != NULL) { | |
| 1834 left_boundary = boundaries_[0]; | |
| 1835 right_boundary = boundaries_[1]; | |
| 1836 } | |
| 1837 } | |
| 1838 | |
| 1839 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1840 /* This method stores the diferent windows values in the alignment object */ | |
| 1841 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1842 void alignment::setWindowsSize(int ghWindow_, int shWindow_) { | |
| 1843 ghWindow = ghWindow_; | |
| 1844 shWindow = shWindow_; | |
| 1845 } | |
| 1846 | |
| 1847 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1848 /* This method lets to change the output format */ | |
| 1849 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1850 void alignment::setOutputFormat(int format_, bool shortNames_) { | |
| 1851 oformat = format_; | |
| 1852 shortNames = shortNames_; | |
| 1853 } | |
| 1854 | |
| 1855 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1856 /* This method sets a new block size value */ | |
| 1857 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1858 void alignment::setBlockSize(int blockSize_) { | |
| 1859 blockSize = blockSize_; | |
| 1860 } | |
| 1861 | |
| 1862 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1863 /* Return true if the shortNames flag has been setted */ | |
| 1864 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1865 int alignment::getShortNames(void) { | |
| 1866 return shortNames; | |
| 1867 } | |
| 1868 | |
| 1869 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1870 /* Return true if the reverse flag has been setted. */ | |
| 1871 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1872 int alignment::getReverse(void) { | |
| 1873 return reverse; | |
| 1874 } | |
| 1875 | |
| 1876 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1877 /* This method lets to change the output alignment orientation */ | |
| 1878 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1879 void alignment::setReverse(void) { | |
| 1880 reverse = true; | |
| 1881 } | |
| 1882 | |
| 1883 | |
| 1884 /* Set appropiate flag to decide whether sequences composed only by gaps should | |
| 1885 * be kept or not */ | |
| 1886 void alignment::setKeepSequencesFlag(bool flag) { | |
| 1887 keepSequences = flag; | |
| 1888 } | |
| 1889 | |
| 1890 /* Set appropiate flag to decide whether original sequences header should be | |
| 1891 * dumped into the output stream with or without any preprocessing step */ | |
| 1892 void alignment::setKeepSeqsHeaderFlag(bool flag) { | |
| 1893 keepHeader = flag; | |
| 1894 } | |
| 1895 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1896 /* Return the block size value */ | |
| 1897 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1898 int alignment::getBlockSize(void) { | |
| 1899 return blockSize; | |
| 1900 } | |
| 1901 | |
| 1902 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1903 /* This method returns the sequences name */ | |
| 1904 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1905 void alignment::getSequences(string *Names) { | |
| 1906 for(int i = 0; i < sequenNumber; i++) | |
| 1907 Names[i] = seqsName[i]; | |
| 1908 } | |
| 1909 | |
| 1910 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1911 void alignment::getSequences(string *Names, int *lengths) { | |
| 1912 for(int i = 0; i < sequenNumber; i++) { | |
| 1913 lengths[i] = (int) utils::removeCharacter('-', sequences[i]).length(); | |
| 1914 Names[i] = seqsName[i]; | |
| 1915 } | |
| 1916 } | |
| 1917 | |
| 1918 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1919 void alignment::getSequences(string *Names, string *Sequences, int *Lengths) { | |
| 1920 for(int i = 0; i < sequenNumber; i++) { | |
| 1921 Names[i] = seqsName[i]; | |
| 1922 Sequences[i] = utils::removeCharacter('-', sequences[i]); | |
| 1923 Lengths[i] = (int) Sequences[i].length(); | |
| 1924 } | |
| 1925 } | |
| 1926 | |
| 1927 | |
| 1928 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1929 /* For a given set of sequences name, check for its correspondence with | |
| 1930 * the current sequences name. If there is not a complete correspondence | |
| 1931 * between both sets, the method return false, otherwise, return true */ | |
| 1932 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1933 bool alignment::getSeqNameOrder(string *names, int *orderVector) { | |
| 1934 int i, j, numNames; | |
| 1935 | |
| 1936 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1937 /* For each name in the current alignment, we look | |
| 1938 * for its correspondence in the input set */ | |
| 1939 for(i = 0, numNames = 0; i < sequenNumber; i++) { | |
| 1940 for(j = 0; j < sequenNumber; j++) { | |
| 1941 if(seqsName[i] == names[j]) { | |
| 1942 orderVector[i] = j; | |
| 1943 numNames++; | |
| 1944 break; | |
| 1945 } | |
| 1946 } | |
| 1947 } | |
| 1948 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1949 | |
| 1950 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1951 /* Depending on if we get the complete correspondence | |
| 1952 * between both sets of names, we return true or not */ | |
| 1953 if(numNames == sequenNumber) return true; | |
| 1954 else return false; | |
| 1955 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1956 } | |
| 1957 | |
| 1958 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1959 /* This method lets to build a sequence matrix. A sequence matrix contains | |
| 1960 * the residue position for each sequence without taking into account the | |
| 1961 * gaps in the sequence. This means that at position 10 we have the residue | |
| 1962 * 1 for that sequence */ | |
| 1963 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1964 void alignment::sequenMatrix(void) { | |
| 1965 if(seqMatrix == NULL) | |
| 1966 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
| 1967 } | |
| 1968 | |
| 1969 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1970 /* Use this method to destroy a given sequence matrix */ | |
| 1971 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1972 void alignment::destroySequenMatrix(void) { | |
| 1973 if(seqMatrix != NULL) | |
| 1974 delete seqMatrix; | |
| 1975 seqMatrix = NULL; | |
| 1976 } | |
| 1977 | |
| 1978 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1979 /* Print the alignment's sequence matrix */ | |
| 1980 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1981 void alignment::printSequenMatrix(void) { | |
| 1982 if(seqMatrix == NULL) | |
| 1983 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
| 1984 seqMatrix -> printMatrix(); | |
| 1985 } | |
| 1986 | |
| 1987 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1988 /* Given an index, this method returns the sequence matrix column for that | |
| 1989 * index */ | |
| 1990 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1991 void alignment::getColumnSeqMatrix(int column, int *columnSeqMatrix) { | |
| 1992 if(seqMatrix == NULL) | |
| 1993 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
| 1994 seqMatrix -> getColumn(column, columnSeqMatrix); | |
| 1995 } | |
| 1996 | |
| 1997 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 1998 /* This function looks if a given value belongs a given row. In the affirmative | |
| 1999 * case, returns the columns value for that row and value combination */ | |
| 2000 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2001 void alignment::getColumnSeqMatrix(int value, int row, int *columnSeqMatrix) { | |
| 2002 if(seqMatrix == NULL) | |
| 2003 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
| 2004 seqMatrix -> getColumn(value, row, columnSeqMatrix); | |
| 2005 } | |
| 2006 | |
| 2007 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2008 /* This function change the rows order in the sequence matrix */ | |
| 2009 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2010 void alignment::setSeqMatrixOrder(int *order) { | |
| 2011 if(seqMatrix == NULL) | |
| 2012 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
| 2013 seqMatrix -> setOrder(order); | |
| 2014 } | |
| 2015 | |
| 2016 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2017 /* This function change the rows order in the sequence matrix */ | |
| 2018 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2019 sequencesMatrix *alignment::getSeqMatrix(void) { | |
| 2020 if(seqMatrix == NULL) | |
| 2021 seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber); | |
| 2022 return seqMatrix; | |
| 2023 } | |
| 2024 | |
| 2025 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2026 /* Computes, if it's necessary, and return the alignment's type */ | |
| 2027 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2028 int alignment::getTypeAlignment(void) { | |
| 2029 if(dataType == 0) | |
| 2030 dataType = utils::checkTypeAlignment(sequenNumber, residNumber, sequences); | |
| 2031 return dataType; | |
| 2032 } | |
| 2033 | |
| 2034 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2035 /* Returns the correspondence between the columns in the original and in the | |
| 2036 * trimmed alignment */ | |
| 2037 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2038 int *alignment::getCorrespResidues(void) { | |
| 2039 return saveResidues; | |
| 2040 } | |
| 2041 | |
| 2042 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2043 /* Returns the correspondence between the sequences in the original and in | |
| 2044 * the trimmed alignment */ | |
| 2045 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2046 int *alignment::getCorrespSequences(void) { | |
| 2047 return saveSequences; | |
| 2048 } | |
| 2049 | |
| 2050 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2051 /* Returns if the alignment is aligned or not */ | |
| 2052 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2053 bool alignment::isFileAligned(void) { | |
| 2054 return isAligned; | |
| 2055 } | |
| 2056 | |
| 2057 /* ***************************************************************************** | |
| 2058 * ***************************************************************************** | |
| 2059 * ***************************************************************************** | |
| 2060 * ************************************************************************** */ | |
| 2061 | |
| 2062 /* This method removes those columns that exceed a given threshold. If the | |
| 2063 * number of columns in the trimmed alignment is lower than a given percentage | |
| 2064 * of the original alignment. The program relaxes the threshold until to add | |
| 2065 * enough columns to achieve this percentage, to add those columns the program | |
| 2066 * start in the middle of the original alignment and make, alternatively, | |
| 2067 * movements to the left and right side from that point. This method also can | |
| 2068 * return the complementary alignment. */ | |
| 2069 alignment *alignment::cleanByCutValue(double cut, float baseLine, | |
| 2070 const int *gInCol, bool complementary) { | |
| 2071 | |
| 2072 int i, j, k, jn, oth, pos, block, *vectAux; | |
| 2073 string *matrixAux, *newSeqsName; | |
| 2074 alignment *newAlig; | |
| 2075 newValues counter; | |
| 2076 | |
| 2077 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2078 /* Select the columns with a gaps value less or equal | |
| 2079 * than the cut point. */ | |
| 2080 for(i = 0, counter.residues = 0; i < residNumber; i++) | |
| 2081 if(gInCol[i] <= cut) counter.residues++; | |
| 2082 else saveResidues[i] = -1; | |
| 2083 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2084 | |
| 2085 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2086 /* Compute, if it's necessary, the number of columns | |
| 2087 * necessary to achieve the minimum number of columns | |
| 2088 * fixed by coverage parameter. */ | |
| 2089 oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber); | |
| 2090 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2091 | |
| 2092 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2093 /* if it's necessary to recover some columns, we | |
| 2094 * applied this instructions to recover it */ | |
| 2095 if(oth > 0) { | |
| 2096 counter.residues += oth; | |
| 2097 | |
| 2098 /* Allocate memory */ | |
| 2099 vectAux = new int[residNumber]; | |
| 2100 | |
| 2101 /* Sort a copy of the gInCol vector, and take the value of the column that marks the % baseline */ | |
| 2102 utils::copyVect((int *) gInCol, vectAux, residNumber); | |
| 2103 utils::quicksort(vectAux, 0, residNumber-1); | |
| 2104 cut = vectAux[(int) ((float)(residNumber - 1) * (baseLine)/100.0)]; | |
| 2105 | |
| 2106 /* Deallocate memory */ | |
| 2107 delete [] vectAux; | |
| 2108 } | |
| 2109 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2110 | |
| 2111 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2112 /* Fixed the initial size of blocks as 0.5% of | |
| 2113 * alignment's length */ | |
| 2114 for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) { | |
| 2115 | |
| 2116 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2117 /* We start in the alignment middle then we move on | |
| 2118 * right and left side at the same time. */ | |
| 2119 for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) { | |
| 2120 | |
| 2121 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2122 /* Left side. Here, we compute the block's size. */ | |
| 2123 for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ; | |
| 2124 | |
| 2125 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2126 /* if block's size is greater or equal than the fixed | |
| 2127 * size then we save all columns that have not been | |
| 2128 * saved previously. */ | |
| 2129 if((i - jn) >= k) { | |
| 2130 for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) { | |
| 2131 if(gInCol[jn] <= cut) { | |
| 2132 saveResidues[jn] = jn; | |
| 2133 oth--; | |
| 2134 } else | |
| 2135 break; | |
| 2136 } | |
| 2137 } | |
| 2138 i = jn; | |
| 2139 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2140 | |
| 2141 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2142 /* Right side. Here, we compute the block's size. */ | |
| 2143 for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ; | |
| 2144 | |
| 2145 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2146 /* if block's size is greater or equal than the fixed | |
| 2147 * size then we save all columns that have not been | |
| 2148 * saved previously. */ | |
| 2149 if((jn - j) >= k) { | |
| 2150 for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) { | |
| 2151 if(gInCol[jn] <= cut) { | |
| 2152 saveResidues[jn] = jn; | |
| 2153 oth--; | |
| 2154 } else | |
| 2155 break; | |
| 2156 } | |
| 2157 } | |
| 2158 j = jn; | |
| 2159 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2160 } | |
| 2161 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2162 } | |
| 2163 | |
| 2164 /* Keep only columns blocks bigger than an input columns block size */ | |
| 2165 if(blockSize != 0) { | |
| 2166 | |
| 2167 /* Traverse all alignment looking for columns blocks greater than LONGBLOCK, | |
| 2168 * everytime than a column is not selected by the trimming method, check | |
| 2169 * whether the current block size until that point is big enough to be kept | |
| 2170 * or it should be removed from the final alignment */ | |
| 2171 for(i = 0, pos = 0, block = 0; i < residNumber; i++) { | |
| 2172 if(saveResidues[i] != -1) | |
| 2173 block++; | |
| 2174 else { | |
| 2175 /* Remove columns from blocks smaller than input blocks size */ | |
| 2176 if(block < blockSize) | |
| 2177 for(j = pos; j <= i; j++) | |
| 2178 saveResidues[j] = -1; | |
| 2179 pos = i + 1; | |
| 2180 block = 0; | |
| 2181 } | |
| 2182 } | |
| 2183 /* Check final block separately since it could happen than last block is not | |
| 2184 * big enough but because the loop end could provoke to ignore it */ | |
| 2185 if(block < blockSize) | |
| 2186 for(j = pos; j < i; j++) | |
| 2187 saveResidues[j] = -1; | |
| 2188 } | |
| 2189 | |
| 2190 /* If the flag -terminalony is set, apply a method to look for internal | |
| 2191 * boundaries and get back columns inbetween them, if they exist */ | |
| 2192 if(terminalGapOnly == true) | |
| 2193 if(!removeOnlyTerminal()) | |
| 2194 return NULL; | |
| 2195 | |
| 2196 /* Once the columns/sequences selection is done, turn it around | |
| 2197 * if complementary flag is active */ | |
| 2198 if(complementary == true) | |
| 2199 computeComplementaryAlig(true, false); | |
| 2200 | |
| 2201 /* Check for any additional column/sequence to be removed */ | |
| 2202 /* Compute new sequences and columns numbers */ | |
| 2203 counter = removeCols_SeqsAllGaps(); | |
| 2204 | |
| 2205 /* Allocate memory for selected sequences/columns */ | |
| 2206 matrixAux = new string[counter.sequences]; | |
| 2207 newSeqsName = new string[counter.sequences]; | |
| 2208 | |
| 2209 /* Fill local allocated memory with previously selected data */ | |
| 2210 fillNewDataStructure(matrixAux, newSeqsName); | |
| 2211 | |
| 2212 /* When we have all parameters, we create the new alignment */ | |
| 2213 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
| 2214 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
| 2215 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
| 2216 keepSequences, keepHeader, sequenNumber, residNumber, | |
| 2217 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
| 2218 identities, overlaps); | |
| 2219 | |
| 2220 /* Deallocate local memory */ | |
| 2221 delete[] matrixAux; | |
| 2222 delete[] newSeqsName; | |
| 2223 | |
| 2224 /* Return the new alignment reference */ | |
| 2225 return newAlig; | |
| 2226 } | |
| 2227 | |
| 2228 /* This method removes those columns that not achieve a given threshold. If the | |
| 2229 * number of columns in the trimmed alignment is lower than a given percentage | |
| 2230 * of the original alignment. The program relaxes the threshold until to add | |
| 2231 * enough columns to achieve this percentage, to add those columns the program | |
| 2232 * start in the middle of the original alignment and make, alternatively, | |
| 2233 * movements to the left and right side from that point. This method also can | |
| 2234 * return the complementary alignment. */ | |
| 2235 alignment *alignment::cleanByCutValue(float cut, float baseLine, | |
| 2236 const float *ValueVect, bool complementary) { | |
| 2237 | |
| 2238 int i, j, k, jn, oth, pos, block; | |
| 2239 string *matrixAux, *newSeqsName; | |
| 2240 alignment *newAlig; | |
| 2241 newValues counter; | |
| 2242 | |
| 2243 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2244 /* Select the columns with a conservation's value | |
| 2245 * greater than the cut point. */ | |
| 2246 for(i = 0, counter.residues = 0; i < residNumber; i++) | |
| 2247 if(ValueVect[i] > cut) counter.residues++; | |
| 2248 else saveResidues[i] = -1; | |
| 2249 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2250 | |
| 2251 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2252 /* Compute, if it's necessary, the number of columns | |
| 2253 * necessary to achieve the minimum number of columns | |
| 2254 * fixed by coverage value. */ | |
| 2255 oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber); | |
| 2256 if(oth > 0) counter.residues += oth; | |
| 2257 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2258 | |
| 2259 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2260 /* Fixed the initial size of blocks as 0.5% of | |
| 2261 * alignment's length */ | |
| 2262 for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) { | |
| 2263 | |
| 2264 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2265 /* We start in the alignment middle then we move on | |
| 2266 * right and left side at the same time. */ | |
| 2267 for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) { | |
| 2268 | |
| 2269 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2270 /* Left side. Here, we compute the block's size. */ | |
| 2271 for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ; | |
| 2272 | |
| 2273 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2274 /* if block's size is greater or equal than the fixed | |
| 2275 * size then we save all columns that have not been | |
| 2276 * saved previously. */ | |
| 2277 /* Here, we only accept column with a conservation's | |
| 2278 * value equal to conservation cut point. */ | |
| 2279 if((i - jn) >= k) { | |
| 2280 for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) { | |
| 2281 if(ValueVect[jn] == cut) { | |
| 2282 saveResidues[jn] = jn; | |
| 2283 oth--; | |
| 2284 } else | |
| 2285 break; | |
| 2286 } | |
| 2287 } | |
| 2288 i = jn; | |
| 2289 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2290 | |
| 2291 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2292 /* Right side. Here, we compute the block's size. */ | |
| 2293 for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ; | |
| 2294 | |
| 2295 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2296 /* if block's size is greater or equal than the fixed | |
| 2297 * size then we select the column and save the block's | |
| 2298 * size for the next iteraction. it's obvius that we | |
| 2299 * decrease the column's number needed to finish. */ | |
| 2300 /* Here, we only accept column with a conservation's | |
| 2301 * value equal to conservation cut point. */ | |
| 2302 if((jn - j) >= k) { | |
| 2303 for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) { | |
| 2304 if(ValueVect[jn] == cut) { | |
| 2305 saveResidues[jn] = jn; | |
| 2306 oth--; | |
| 2307 } else | |
| 2308 break; | |
| 2309 } | |
| 2310 } | |
| 2311 j = jn; | |
| 2312 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2313 } | |
| 2314 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2315 } | |
| 2316 | |
| 2317 /* Keep only columns blocks bigger than an input columns block size */ | |
| 2318 if(blockSize != 0) { | |
| 2319 | |
| 2320 /* Traverse all alignment looking for columns blocks greater than LONGBLOCK, | |
| 2321 * everytime than a column is not selected by the trimming method, check | |
| 2322 * whether the current block size until that point is big enough to be kept | |
| 2323 * or it should be removed from the final alignment */ | |
| 2324 for(i = 0, pos = 0, block = 0; i < residNumber; i++) { | |
| 2325 if(saveResidues[i] != -1) | |
| 2326 block++; | |
| 2327 else { | |
| 2328 /* Remove columns from blocks smaller than input blocks size */ | |
| 2329 if(block < blockSize) | |
| 2330 for(j = pos; j <= i; j++) | |
| 2331 saveResidues[j] = -1; | |
| 2332 pos = i + 1; | |
| 2333 block = 0; | |
| 2334 } | |
| 2335 } | |
| 2336 /* Check final block separately since it could happen than last block is not | |
| 2337 * big enough but because the loop end could provoke to ignore it */ | |
| 2338 if(block < blockSize) | |
| 2339 for(j = pos; j < i; j++) | |
| 2340 saveResidues[j] = -1; | |
| 2341 } | |
| 2342 | |
| 2343 /* If the flag -terminalony is set, apply a method to look for internal | |
| 2344 * boundaries and get back columns inbetween them, if they exist */ | |
| 2345 if(terminalGapOnly == true) | |
| 2346 if(!removeOnlyTerminal()) | |
| 2347 return NULL; | |
| 2348 | |
| 2349 /* Once the columns/sequences selection is done, turn it around | |
| 2350 * if complementary flag is active */ | |
| 2351 if(complementary == true) | |
| 2352 computeComplementaryAlig(true, false); | |
| 2353 | |
| 2354 /* Check for any additional column/sequence to be removed */ | |
| 2355 /* Compute new sequences and columns numbers */ | |
| 2356 counter = removeCols_SeqsAllGaps(); | |
| 2357 | |
| 2358 /* Allocate memory for selected sequences/columns */ | |
| 2359 matrixAux = new string[counter.sequences]; | |
| 2360 newSeqsName = new string[counter.sequences]; | |
| 2361 | |
| 2362 /* Fill local allocated memory with previously selected data */ | |
| 2363 fillNewDataStructure(matrixAux, newSeqsName); | |
| 2364 | |
| 2365 /* When we have all parameters, we create the new alignment */ | |
| 2366 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
| 2367 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
| 2368 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
| 2369 keepSequences, keepHeader, sequenNumber, residNumber, | |
| 2370 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
| 2371 identities, overlaps); | |
| 2372 | |
| 2373 /* Deallocate local memory */ | |
| 2374 delete[] matrixAux; | |
| 2375 delete[] newSeqsName; | |
| 2376 | |
| 2377 /* Return the new alignment reference */ | |
| 2378 return newAlig; | |
| 2379 } | |
| 2380 | |
| 2381 /* This method removes those columns that not achieve the similarity threshond, | |
| 2382 * by one hand, and exceed the gaps threshold. If the number of columns that | |
| 2383 * have been selected is lower that a given coverage, the program relaxes both | |
| 2384 * thresholds in order to get back some columns and achieve this minimum | |
| 2385 * number of columns in the new alignment. For that purpose, the program | |
| 2386 * starts at the middle of the original alignment and makes, alternatively, | |
| 2387 * movememts to the right and left sides looking for those columns necessary | |
| 2388 * to achieve the minimum number of columns set by the coverage parameter. | |
| 2389 * This method also can return the complementary alignmnet consists of those | |
| 2390 * columns that will be deleted from the original alignment applying the | |
| 2391 * standard method. */ | |
| 2392 alignment *alignment::cleanByCutValue(double cutGaps, const int *gInCol, | |
| 2393 float baseLine, float cutCons, const float *MDK_Win, bool complementary) { | |
| 2394 | |
| 2395 int i, j, k, oth, pos, block, jn, blGaps, *vectAuxGaps; | |
| 2396 string *matrixAux, *newSeqsName; | |
| 2397 float blCons, *vectAuxCons; | |
| 2398 alignment *newAlig; | |
| 2399 newValues counter; | |
| 2400 | |
| 2401 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2402 /* Select the columns with a conservation's value | |
| 2403 * greater than the conservation cut point AND | |
| 2404 * less or equal than the gap cut point. */ | |
| 2405 for(i = 0, counter.residues = 0; i < residNumber; i++) | |
| 2406 if((MDK_Win[i] > cutCons) && (gInCol[i] <= cutGaps)) counter.residues++; | |
| 2407 else saveResidues[i] = -1; | |
| 2408 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2409 | |
| 2410 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2411 /* Compute, if it's necessary, the number of columns | |
| 2412 * necessary to achieve the minimum number of it fixed | |
| 2413 * by the coverage parameter. */ | |
| 2414 oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber); | |
| 2415 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2416 | |
| 2417 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2418 /* If it's needed to add new columns, we compute the | |
| 2419 * news thresholds */ | |
| 2420 if(oth > 0) { | |
| 2421 counter.residues += oth; | |
| 2422 | |
| 2423 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2424 /* Allocate memory */ | |
| 2425 vectAuxCons = new float[residNumber]; | |
| 2426 vectAuxGaps = new int[residNumber]; | |
| 2427 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2428 | |
| 2429 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2430 /* Sort a copy of the MDK_Win vector and of the gInCol | |
| 2431 * vector, and take the value of the column that marks | |
| 2432 * the % baseline */ | |
| 2433 utils::copyVect((float *) MDK_Win, vectAuxCons, residNumber); | |
| 2434 utils::copyVect((int *) gInCol, vectAuxGaps, residNumber); | |
| 2435 | |
| 2436 utils::quicksort(vectAuxCons, 0, residNumber-1); | |
| 2437 utils::quicksort(vectAuxGaps, 0, residNumber-1); | |
| 2438 | |
| 2439 blCons = vectAuxCons[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)]; | |
| 2440 blGaps = vectAuxGaps[(int) ((float)(residNumber - 1) * (baseLine)/100.0)]; | |
| 2441 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2442 | |
| 2443 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2444 /* Deallocate memory */ | |
| 2445 delete [] vectAuxCons; | |
| 2446 delete [] vectAuxGaps; | |
| 2447 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2448 } | |
| 2449 | |
| 2450 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2451 /* Fixed the initial size of blocks as 0.5% of | |
| 2452 * alignment's length */ | |
| 2453 for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) { | |
| 2454 | |
| 2455 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2456 /* We start in the alignment middle then we move on | |
| 2457 * right and left side at the same time. */ | |
| 2458 for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) { | |
| 2459 | |
| 2460 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2461 /* Left side. Here, we compute the block's size. */ | |
| 2462 for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ; | |
| 2463 | |
| 2464 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2465 /* if block's size is greater or equal than the fixed | |
| 2466 * size then we select the column and save the block's | |
| 2467 * size for the next iteraction. it's obvius that we | |
| 2468 * decrease the column's number needed to finish. */ | |
| 2469 /* Here, we accept column with a conservation's value | |
| 2470 * greater or equal than the conservation cut point OR | |
| 2471 * less or equal than the gap cut point. */ | |
| 2472 if((i - jn) >= k) { | |
| 2473 for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) { | |
| 2474 if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) { | |
| 2475 saveResidues[jn] = jn; | |
| 2476 oth--; | |
| 2477 } else | |
| 2478 break; | |
| 2479 } | |
| 2480 } | |
| 2481 i = jn; | |
| 2482 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2483 | |
| 2484 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2485 /* Right side. Here, we compute the block's size. */ | |
| 2486 for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ; | |
| 2487 | |
| 2488 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2489 /* if block's size is greater or equal than the fixed | |
| 2490 * size then we select the column and save the block's | |
| 2491 * size for the next iteraction. it's obvius that we | |
| 2492 * decrease the column's number needed to finish. */ | |
| 2493 /* Here, we accept column with a conservation's value | |
| 2494 * greater or equal than the conservation cut point OR | |
| 2495 * less or equal than the gap cut point. */ | |
| 2496 if((jn - j) >= k) { | |
| 2497 for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) { | |
| 2498 if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) { | |
| 2499 saveResidues[jn] = jn; | |
| 2500 oth--; | |
| 2501 } else | |
| 2502 break; | |
| 2503 } | |
| 2504 } | |
| 2505 j = jn; | |
| 2506 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2507 } | |
| 2508 /* ***** ***** ***** ***** ***** ***** ***** ***** */ | |
| 2509 } | |
| 2510 | |
| 2511 /* Keep only columns blocks bigger than an input columns block size */ | |
| 2512 if(blockSize != 0) { | |
| 2513 | |
| 2514 /* Traverse all alignment looking for columns blocks greater than LONGBLOCK, | |
| 2515 * everytime than a column is not selected by the trimming method, check | |
| 2516 * whether the current block size until that point is big enough to be kept | |
| 2517 * or it should be removed from the final alignment */ | |
| 2518 for(i = 0, pos = 0, block = 0; i < residNumber; i++) { | |
| 2519 if(saveResidues[i] != -1) | |
| 2520 block++; | |
| 2521 else { | |
| 2522 /* Remove columns from blocks smaller than input blocks size */ | |
| 2523 if(block < blockSize) | |
| 2524 for(j = pos; j <= i; j++) | |
| 2525 saveResidues[j] = -1; | |
| 2526 pos = i + 1; | |
| 2527 block = 0; | |
| 2528 } | |
| 2529 } | |
| 2530 /* Check final block separately since it could happen than last block is not | |
| 2531 * big enough but because the loop end could provoke to ignore it */ | |
| 2532 if(block < blockSize) | |
| 2533 for(j = pos; j < i; j++) | |
| 2534 saveResidues[j] = -1; | |
| 2535 } | |
| 2536 | |
| 2537 /* If the flag -terminalony is set, apply a method to look for internal | |
| 2538 * boundaries and get back columns inbetween them, if they exist */ | |
| 2539 if(terminalGapOnly == true) | |
| 2540 if(!removeOnlyTerminal()) | |
| 2541 return NULL; | |
| 2542 | |
| 2543 /* Once the columns/sequences selection is done, turn it around | |
| 2544 * if complementary flag is active */ | |
| 2545 if(complementary == true) | |
| 2546 computeComplementaryAlig(true, false); | |
| 2547 | |
| 2548 /* Check for any additional column/sequence to be removed */ | |
| 2549 /* Compute new sequences and columns numbers */ | |
| 2550 counter = removeCols_SeqsAllGaps(); | |
| 2551 | |
| 2552 /* Allocate memory for selected sequences/columns */ | |
| 2553 matrixAux = new string[counter.sequences]; | |
| 2554 newSeqsName = new string[counter.sequences]; | |
| 2555 | |
| 2556 /* Fill local allocated memory with previously selected data */ | |
| 2557 fillNewDataStructure(matrixAux, newSeqsName); | |
| 2558 | |
| 2559 /* When we have all parameters, we create the new alignment */ | |
| 2560 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
| 2561 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
| 2562 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
| 2563 keepSequences, keepHeader, sequenNumber, residNumber, | |
| 2564 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
| 2565 identities, overlaps); | |
| 2566 | |
| 2567 /* Deallocate local memory */ | |
| 2568 delete[] matrixAux; | |
| 2569 delete[] newSeqsName; | |
| 2570 | |
| 2571 /* Return the new alignment reference */ | |
| 2572 return newAlig; | |
| 2573 } | |
| 2574 | |
| 2575 /* This method carries out the strict and strict plus method. To trim the | |
| 2576 * alignment in an automated way, the method uses as an input a given gaps | |
| 2577 * thresholds over a gaps vector values, a given similarity threshold over a | |
| 2578 * similarity vector values. With a flag, we can decide which method the | |
| 2579 * program shall apply. With another one, the user can ask for the | |
| 2580 * complementary alignment. In this method, those columns that has been marked | |
| 2581 * to be deleted but has, at least, 3 of 4 surronding neighbour selected are | |
| 2582 * get back to the new alignmnet. At other part of the method, those column | |
| 2583 * blocks that do not have a minimum size fix by the method will be deleted | |
| 2584 * from the trimmed alignment. */ | |
| 2585 alignment *alignment::cleanStrict(int gapCut, const int *gInCol, float simCut, | |
| 2586 const float *MDK_W, bool complementary, bool variable) { | |
| 2587 | |
| 2588 int i, num, lenBlock; | |
| 2589 alignment *newAlig; | |
| 2590 | |
| 2591 deque<int> neighboursBlock; | |
| 2592 | |
| 2593 /* Reject columns with gaps number greater than the gap threshold. */ | |
| 2594 for(i = 0; i < residNumber; i++) | |
| 2595 if(gInCol[i] > gapCut) | |
| 2596 saveResidues[i] = -1; | |
| 2597 | |
| 2598 /* Reject columns with similarity score under the threshold. */ | |
| 2599 for(i = 0; i < residNumber; i++) | |
| 2600 if(MDK_W[i] < simCut) | |
| 2601 saveResidues[i] = -1; | |
| 2602 | |
| 2603 /* Search for those columns that have been removed but have, | |
| 2604 * at least, 3 out of 4 adjacent columns selected. */ | |
| 2605 | |
| 2606 /* Load the first 5 columns into the list */ | |
| 2607 for(i = 0; i < (residNumber) && i < 5; i++) | |
| 2608 neighboursBlock.push_back(saveResidues[i]); | |
| 2609 | |
| 2610 /* Special case #1 - we analyze it before processing the rest positions | |
| 2611 * Column: 1 - Save this column considering its neighbours */ | |
| 2612 if((saveResidues[0] == -1) && (neighboursBlock.size() > 2) && | |
| 2613 (neighboursBlock[1] != -1) && | |
| 2614 (neighboursBlock[2] != -1)) | |
| 2615 saveResidues[0] = 0; | |
| 2616 | |
| 2617 /* Special case #2A - we analyze it before processing the rest positions | |
| 2618 * Column: 2 - Save this column considering its neighbours */ | |
| 2619 if((saveResidues[1] == -1) && (neighboursBlock.size() > 3) && | |
| 2620 ((neighboursBlock[0] != -1) && | |
| 2621 (neighboursBlock[2] != -1) && | |
| 2622 (neighboursBlock[3] != -1))) | |
| 2623 saveResidues[1] = 1; | |
| 2624 | |
| 2625 /* Special case #2B - when alignment is 3 columns long - we analyze it before | |
| 2626 * processing the rest positions. | |
| 2627 * Column: 2 - Save this column considering its neighbours */ | |
| 2628 if((saveResidues[1] == -1) && (neighboursBlock.size() > 2) && | |
| 2629 ((neighboursBlock[0] != -1) && | |
| 2630 (neighboursBlock[2] != -1))) | |
| 2631 saveResidues[1] = 1; | |
| 2632 | |
| 2633 /* Normal cases */ | |
| 2634 for(i = 2, num = 0; i < (residNumber - 2); i++, num = 0) { | |
| 2635 if(saveResidues[i] == -1) { | |
| 2636 if(neighboursBlock[0] != -1) | |
| 2637 num++; | |
| 2638 if(neighboursBlock[1] != -1) | |
| 2639 num++; | |
| 2640 if(neighboursBlock[3] != -1) | |
| 2641 num++; | |
| 2642 if(neighboursBlock[4] != -1) | |
| 2643 num++; | |
| 2644 saveResidues[i] = (num >= 3) ? i : -1; | |
| 2645 | |
| 2646 } | |
| 2647 /* We slide the window one position to the right */ | |
| 2648 if((i+3) < residNumber) { | |
| 2649 neighboursBlock.pop_front(); | |
| 2650 neighboursBlock.push_back(saveResidues[i+3]); | |
| 2651 } | |
| 2652 } | |
| 2653 | |
| 2654 /* Special case #3 - | |
| 2655 * Column: 2nd last one - We consider the list rather than the saveResidues | |
| 2656 * vector in order to consider only unmodified values before the previous | |
| 2657 * loop */ | |
| 2658 if((saveResidues[(residNumber - 2)] == -1) && (neighboursBlock.size() > 3) && | |
| 2659 ((neighboursBlock[neighboursBlock.size() - 4] != -1) && | |
| 2660 (neighboursBlock[neighboursBlock.size() - 3] != -1) && | |
| 2661 (neighboursBlock[neighboursBlock.size() - 1] != -1))) | |
| 2662 saveResidues[(residNumber - 2)] = (residNumber - 2); | |
| 2663 | |
| 2664 /* Special case #4 - | |
| 2665 * Column: last one - We consider the list rather than the saveResidues | |
| 2666 * vector in order to consider only unmodified values before the previous | |
| 2667 * loop */ | |
| 2668 if((saveResidues[(residNumber - 1)] == -1) && (neighboursBlock.size() > 2) && | |
| 2669 ((neighboursBlock[neighboursBlock.size() - 3] != -1) && | |
| 2670 (neighboursBlock[neighboursBlock.size() - 2] != -1))) | |
| 2671 saveResidues[(residNumber - 1)] = (residNumber - 1); | |
| 2672 | |
| 2673 /* Select blocks size based on user input. It can be set either to 5 or to a | |
| 2674 * variable number between 3 and 12 depending on alignment length (1% alig) */ | |
| 2675 if(!variable) | |
| 2676 lenBlock = 5; | |
| 2677 else { | |
| 2678 lenBlock = utils::roundInt(residNumber * 0.01); | |
| 2679 lenBlock = lenBlock > 3 ? (lenBlock < 12 ? lenBlock : 12) : 3; | |
| 2680 } | |
| 2681 | |
| 2682 /* Allow to change minimal block size */ | |
| 2683 blockSize = blockSize > 0 ? blockSize : lenBlock; | |
| 2684 | |
| 2685 /* Keep only columns blocks bigger than either a computed dinamically or | |
| 2686 * set by the user block size */ | |
| 2687 removeSmallerBlocks(blockSize); | |
| 2688 | |
| 2689 /* If the flag -terminalony is set, apply a method to look for internal | |
| 2690 * boundaries and get back columns inbetween them, if they exist */ | |
| 2691 if(terminalGapOnly == true) | |
| 2692 if(!removeOnlyTerminal()) | |
| 2693 return NULL; | |
| 2694 | |
| 2695 /* Once the columns/sequences selection is done, turn it around | |
| 2696 * if complementary flag is active */ | |
| 2697 if(complementary == true) | |
| 2698 computeComplementaryAlig(true, false); | |
| 2699 | |
| 2700 /* Check for any additional column/sequence to be removed */ | |
| 2701 /* Compute new sequences and columns numbers */ | |
| 2702 | |
| 2703 newValues counter; | |
| 2704 counter = removeCols_SeqsAllGaps(); | |
| 2705 | |
| 2706 /* Allocate memory for selected sequences/columns */ | |
| 2707 //~ matrixAux = new string[counter.sequences]; | |
| 2708 //~ newSeqsName = new string[counter.sequences]; | |
| 2709 | |
| 2710 /* Fill local allocated memory with previously selected data */ | |
| 2711 //~ fillNewDataStructure(matrixAux, newSeqsName); | |
| 2712 | |
| 2713 fillNewDataStructure(&counter); | |
| 2714 | |
| 2715 //~ cerr << counter -> seqsName[counter -> sequences - 1] << endl; | |
| 2716 | |
| 2717 /* When we have all parameters, we create the new alignment */ | |
| 2718 //~ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues, | |
| 2719 newAlig = new alignment(filename, aligInfo, counter.matrix, counter.seqsName, | |
| 2720 seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames, | |
| 2721 dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
| 2722 keepSequences, keepHeader, sequenNumber, | |
| 2723 residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, | |
| 2724 blockSize, identities, overlaps); | |
| 2725 | |
| 2726 /* Deallocate local memory */ | |
| 2727 //~ delete[] matrixAux; | |
| 2728 //~ delete[] newSeqsName; | |
| 2729 | |
| 2730 /* Return the new alignment reference */ | |
| 2731 return newAlig; | |
| 2732 } | |
| 2733 | |
| 2734 /* Remove those sequences with an overlap less than a given threshold. It can | |
| 2735 * return the complementary alignment if appropiate flag is set. */ | |
| 2736 alignment *alignment::cleanOverlapSeq(float minimumOverlap, float *overlapSeq, | |
| 2737 bool complementary) { | |
| 2738 | |
| 2739 string *matrixAux, *newSeqsName; | |
| 2740 alignment *newAlig; | |
| 2741 newValues counter; | |
| 2742 int i; | |
| 2743 | |
| 2744 /* Keep only those sequences with an overlap value equal or greater than | |
| 2745 * the minimum overlap value set by the user. */ | |
| 2746 for(i = 0; i < sequenNumber; i++) | |
| 2747 if(overlapSeq[i] < minimumOverlap) | |
| 2748 saveSequences[i] = -1; | |
| 2749 | |
| 2750 /* Once the columns/sequences selection is done, turn it around | |
| 2751 * if complementary flag is active */ | |
| 2752 if(complementary == true) | |
| 2753 computeComplementaryAlig(false, true); | |
| 2754 | |
| 2755 /* Check for any additional column/sequence to be removed */ | |
| 2756 /* Compute new sequences and columns numbers */ | |
| 2757 counter = removeCols_SeqsAllGaps(); | |
| 2758 | |
| 2759 /* Allocate memory for selected sequences/columns */ | |
| 2760 matrixAux = new string[counter.sequences]; | |
| 2761 newSeqsName = new string[counter.sequences]; | |
| 2762 | |
| 2763 /* Fill local allocated memory with previously selected data */ | |
| 2764 fillNewDataStructure(matrixAux, newSeqsName); | |
| 2765 | |
| 2766 /* When we have all parameters, we create the new alignment */ | |
| 2767 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
| 2768 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
| 2769 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
| 2770 keepSequences, keepHeader, sequenNumber, residNumber, | |
| 2771 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
| 2772 identities, overlaps); | |
| 2773 | |
| 2774 /* Deallocate local memory */ | |
| 2775 delete [] matrixAux; | |
| 2776 delete [] newSeqsName; | |
| 2777 | |
| 2778 /* Return the new alignment reference */ | |
| 2779 return newAlig; | |
| 2780 } | |
| 2781 | |
| 2782 /* Remove those columns, expressed as range, set by the user. It can return | |
| 2783 * the complementary alignmnet if appropiate flags is set. */ | |
| 2784 alignment *alignment::removeColumns(int *columns, int init, int size, | |
| 2785 bool complementary) { | |
| 2786 | |
| 2787 string *matrixAux, *newSeqsName; | |
| 2788 alignment *newAlig; | |
| 2789 newValues counter; | |
| 2790 int i, j; | |
| 2791 | |
| 2792 /* Delete those range columns defines in the columns vector */ | |
| 2793 for(i = init; i < size + init; i += 2) | |
| 2794 for(j = columns[i]; j <= columns[i+1]; j++) | |
| 2795 saveResidues[j] = -1; | |
| 2796 | |
| 2797 /* Once the columns/sequences selection is done, turn it around | |
| 2798 * if complementary flag is active */ | |
| 2799 if(complementary == true) | |
| 2800 computeComplementaryAlig(true, false); | |
| 2801 | |
| 2802 /* Check for any additional column/sequence to be removed */ | |
| 2803 /* Compute new sequences and columns numbers */ | |
| 2804 counter = removeCols_SeqsAllGaps(); | |
| 2805 | |
| 2806 /* Allocate memory for selected sequences/columns */ | |
| 2807 matrixAux = new string[counter.sequences]; | |
| 2808 newSeqsName = new string[counter.sequences]; | |
| 2809 | |
| 2810 /* Fill local allocated memory with previously selected data */ | |
| 2811 fillNewDataStructure(matrixAux, newSeqsName); | |
| 2812 | |
| 2813 /* When we have all parameters, we create the new alignment */ | |
| 2814 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
| 2815 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
| 2816 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
| 2817 keepSequences, keepHeader, sequenNumber, residNumber, | |
| 2818 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
| 2819 identities, overlaps); | |
| 2820 | |
| 2821 /* Deallocate local memory */ | |
| 2822 delete[] matrixAux; | |
| 2823 delete[] newSeqsName; | |
| 2824 | |
| 2825 /* Return the new alignment reference */ | |
| 2826 return newAlig; | |
| 2827 } | |
| 2828 | |
| 2829 /* This method removes those sequences, expressed as range of sequences, set by | |
| 2830 * the user. The method also can return the complementary alignment, it means, | |
| 2831 * those sequences that originally should be removed from the input alignment */ | |
| 2832 alignment *alignment::removeSequences(int *seqs, int init, int size, | |
| 2833 bool complementary) { | |
| 2834 | |
| 2835 string *matrixAux, *newSeqsName; | |
| 2836 alignment *newAlig; | |
| 2837 newValues counter; | |
| 2838 int i, j; | |
| 2839 | |
| 2840 /* Delete those range of sequences defines by the seqs vector */ | |
| 2841 for(i = init; i < size + init; i += 2) | |
| 2842 for(j = seqs[i]; j <= seqs[i+1]; j++) | |
| 2843 saveSequences[j] = -1; | |
| 2844 | |
| 2845 /* Once the columns/sequences selection is done, turn it around | |
| 2846 * if complementary flag is active */ | |
| 2847 if(complementary == true) | |
| 2848 computeComplementaryAlig(false, true); | |
| 2849 | |
| 2850 /* Check for any additional column/sequence to be removed */ | |
| 2851 /* Compute new sequences and columns numbers */ | |
| 2852 counter = removeCols_SeqsAllGaps(); | |
| 2853 | |
| 2854 /* Allocate memory for selected sequences/columns */ | |
| 2855 matrixAux = new string[counter.sequences]; | |
| 2856 newSeqsName = new string[counter.sequences]; | |
| 2857 | |
| 2858 /* Fill local allocated memory with previously selected data */ | |
| 2859 fillNewDataStructure(matrixAux, newSeqsName); | |
| 2860 | |
| 2861 /* When we have all parameters, we create the new alignment */ | |
| 2862 newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, | |
| 2863 counter.sequences, counter.residues, iformat, oformat, shortNames, dataType, | |
| 2864 isAligned, reverse, terminalGapOnly, left_boundary, right_boundary, | |
| 2865 keepSequences, keepHeader, sequenNumber, residNumber, | |
| 2866 residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize, | |
| 2867 identities, overlaps); | |
| 2868 | |
| 2869 /* Free local memory */ | |
| 2870 delete [] matrixAux; | |
| 2871 delete [] newSeqsName; | |
| 2872 | |
| 2873 /* Return the new alignment reference */ | |
| 2874 return newAlig; | |
| 2875 } | |
| 2876 | |
| 2877 /* Function for computing the complementary alignment. It just turn around the | |
| 2878 * current columns/sequences selection */ | |
| 2879 void alignment::computeComplementaryAlig(bool residues, bool sequences) { | |
| 2880 int i; | |
| 2881 | |
| 2882 for(i = 0; i < residNumber && residues; i++) | |
| 2883 saveResidues[i] = (saveResidues[i] == -1) ? i : -1; | |
| 2884 | |
| 2885 for(i = 0; i < sequenNumber && sequences; i++) | |
| 2886 saveSequences[i] = (saveSequences[i] == -1) ? i : -1; | |
| 2887 } | |
| 2888 | |
| 2889 /* Function designed for identifying right and left borders between central | |
| 2890 * and terminal regions in the alignment. The borders are those columns, first | |
| 2891 * and last, composed by only residues. Everything inbetween left and right | |
| 2892 * borders are keept independently of the applied methods */ | |
| 2893 bool alignment::removeOnlyTerminal(void) { | |
| 2894 | |
| 2895 int i; | |
| 2896 const int *gInCol; | |
| 2897 | |
| 2898 if((left_boundary == -1) and (right_boundary == -1)) { | |
| 2899 /* Get alignments gaps stats and copy it */ | |
| 2900 if(calculateGapStats() != true) { | |
| 2901 cerr << endl << "WARNING: Impossible to apply 'terminal-only' method" | |
| 2902 << endl << endl; | |
| 2903 return false; | |
| 2904 } | |
| 2905 gInCol = sgaps -> getGapsWindow(); | |
| 2906 | |
| 2907 /* Identify left and right borders. First and last columns with no gaps */ | |
| 2908 for(i = 0; i < residNumber && gInCol[i] != 0; i++) ; | |
| 2909 left_boundary = i; | |
| 2910 | |
| 2911 for(i = residNumber - 1; i > -1 && gInCol[i] != 0; i--) ; | |
| 2912 right_boundary = i; | |
| 2913 } | |
| 2914 | |
| 2915 else if(left_boundary >= right_boundary) { | |
| 2916 cerr << endl << "ERROR: Check your manually set left '"<< left_boundary | |
| 2917 << "' and right '" << right_boundary << "' boundaries'" << endl << endl; | |
| 2918 return false; | |
| 2919 } | |
| 2920 | |
| 2921 /* We increase the right boundary in one position because we use lower strict | |
| 2922 * condition to get back columns inbetween boundaries removed by any method */ | |
| 2923 right_boundary += 1; | |
| 2924 | |
| 2925 /* Once the interal boundaries have been established, if these limits exist | |
| 2926 * then retrieved all columns inbetween both boundaries. Columns out of these | |
| 2927 * limits will remain selected or not depending on the algorithm applied */ | |
| 2928 for(i = left_boundary; i < right_boundary; i++) | |
| 2929 saveResidues[i] = i; | |
| 2930 | |
| 2931 return true; | |
| 2932 } | |
| 2933 | |
| 2934 /* Function designed to identify and remove those columns blocks smaller than | |
| 2935 * a given size */ | |
| 2936 void alignment::removeSmallerBlocks(int blockSize) { | |
| 2937 | |
| 2938 int i, j, pos, block; | |
| 2939 | |
| 2940 if(blockSize == 0) | |
| 2941 return ; | |
| 2942 | |
| 2943 /* Traverse the alignment looking for blocks greater than BLOCKSIZE, everytime | |
| 2944 * than a column hasn't been selected, check whether the current block is big | |
| 2945 * enough to be kept or it should be removed from the final alignment */ | |
| 2946 for(i = 0, pos = 0, block = 0; i < residNumber; i++) { | |
| 2947 if(saveResidues[i] != -1) | |
| 2948 block++; | |
| 2949 else { | |
| 2950 /* Remove columns from blocks smaller than input blocks size */ | |
| 2951 if(block < blockSize) | |
| 2952 for(j = pos; j <= i; j++) | |
| 2953 saveResidues[j] = -1; | |
| 2954 pos = i + 1; | |
| 2955 block = 0; | |
| 2956 } | |
| 2957 } | |
| 2958 | |
| 2959 /* Check final block separately since it could happen than last block is not | |
| 2960 * big enough but because the loop end could provoke to ignore it */ | |
| 2961 if(block < blockSize) | |
| 2962 for(j = pos; j < i; j++) | |
| 2963 saveResidues[j] = -1; | |
| 2964 return ; | |
| 2965 } | |
| 2966 | |
| 2967 /* Function designed to identify columns and sequences composed only by gaps. | |
| 2968 * Once these columns/sequences have been identified, they are removed from | |
| 2969 * final alignment. */ | |
| 2970 newValues alignment::removeCols_SeqsAllGaps(void) { | |
| 2971 int i, j, valid, gaps; | |
| 2972 bool warnings = false; | |
| 2973 newValues counter; | |
| 2974 | |
| 2975 /* Check all valid columns looking for those composed by only gaps */ | |
| 2976 for(i = 0, counter.residues = 0; i < residNumber; i++) { | |
| 2977 if(saveResidues[i] == -1) | |
| 2978 continue; | |
| 2979 | |
| 2980 for(j = 0, valid = 0, gaps = 0; j < sequenNumber; j++) { | |
| 2981 if (saveSequences[j] == -1) | |
| 2982 continue; | |
| 2983 if (sequences[j][i] == '-') | |
| 2984 gaps ++; | |
| 2985 valid ++; | |
| 2986 } | |
| 2987 /* Once a column has been identified, warm about it and remove it */ | |
| 2988 if(gaps == valid) { | |
| 2989 if(!warnings) | |
| 2990 cerr << endl; | |
| 2991 warnings = true; | |
| 2992 cerr << "WARNING: Removing column '" << i << "' composed only by gaps" | |
| 2993 << endl; | |
| 2994 saveResidues[i] = -1; | |
| 2995 } else { | |
| 2996 counter.residues ++; | |
| 2997 } | |
| 2998 } | |
| 2999 | |
| 3000 /* Check for those selected sequences to see whether there is anyone with | |
| 3001 * only gaps */ | |
| 3002 for(i = 0, counter.sequences = 0; i < sequenNumber; i++) { | |
| 3003 if(saveSequences[i] == -1) | |
| 3004 continue; | |
| 3005 | |
| 3006 for(j = 0, valid = 0, gaps = 0; j < residNumber; j++) { | |
| 3007 if(saveResidues[j] == -1) | |
| 3008 continue; | |
| 3009 if (sequences[i][j] == '-') | |
| 3010 gaps ++; | |
| 3011 valid ++; | |
| 3012 } | |
| 3013 /* Warm about it and remove each sequence composed only by gaps */ | |
| 3014 if(gaps == valid) { | |
| 3015 if(!warnings) | |
| 3016 cerr << endl; | |
| 3017 warnings = true; | |
| 3018 | |
| 3019 if(keepSequences) { | |
| 3020 cerr << "WARNING: Keeping sequence '" << seqsName[i] | |
| 3021 << "' composed only by gaps" << endl; | |
| 3022 counter.sequences ++; | |
| 3023 } else { | |
| 3024 cerr << "WARNING: Removing sequence '" << seqsName[i] | |
| 3025 << "' composed only by gaps" << endl; | |
| 3026 saveSequences[i] = -1; | |
| 3027 } | |
| 3028 } else { | |
| 3029 counter.sequences ++; | |
| 3030 } | |
| 3031 } | |
| 3032 if(warnings) | |
| 3033 cerr << endl; | |
| 3034 | |
| 3035 counter.matrix = new string[counter.sequences]; | |
| 3036 counter.seqsName = new string[counter.sequences]; | |
| 3037 | |
| 3038 return counter; | |
| 3039 } | |
| 3040 | |
| 3041 /* Function for copying to previously allocated memory those data selected | |
| 3042 * for being in the final alignment */ | |
| 3043 void alignment::fillNewDataStructure(string *newMatrix, string *newNames) { | |
| 3044 int i, j, k; | |
| 3045 | |
| 3046 /* Copy only those sequences/columns selected */ | |
| 3047 for(i = 0, j = 0; i < sequenNumber; i++) { | |
| 3048 if(saveSequences[i] == -1) | |
| 3049 continue; | |
| 3050 | |
| 3051 newNames[j] = seqsName[i]; | |
| 3052 for(k = 0; k < residNumber; k++) { | |
| 3053 if(saveResidues[k] == -1) | |
| 3054 continue; | |
| 3055 newMatrix[j].resize(newMatrix[j].size() + 1, sequences[i][k]); | |
| 3056 } | |
| 3057 j++; | |
| 3058 } | |
| 3059 } | |
| 3060 | |
| 3061 /* Function for copying to previously allocated memory those data selected | |
| 3062 * for being in the final alignment */ | |
| 3063 void alignment::fillNewDataStructure(newValues *data) { | |
| 3064 int i, j, k; | |
| 3065 | |
| 3066 /* Copy only those sequences/columns selected */ | |
| 3067 for(i = 0, j = 0; i < sequenNumber; i++) { | |
| 3068 if(saveSequences[i] == -1) | |
| 3069 continue; | |
| 3070 | |
| 3071 data -> seqsName[j] = seqsName[i]; | |
| 3072 for(k = 0; k < residNumber; k++) { | |
| 3073 if(saveResidues[k] == -1) | |
| 3074 continue; | |
| 3075 data -> matrix[j].resize(data -> matrix[j].size() + 1, sequences[i][k]); | |
| 3076 } | |
| 3077 j++; | |
| 3078 } | |
| 3079 // cerr << data -> seqsName[j-1] << endl; | |
| 3080 } | |
| 3081 | |
| 3082 /* Check if CDS file is correct based on: Residues are DNA/RNA (at most). There | |
| 3083 * is not gaps in the whole dataset. Each sequence is multiple of 3. At the same | |
| 3084 * time, the function will remove stop codons if appropiate flags are used */ | |
| 3085 bool alignment::prepareCodingSequence(bool splitByStopCodon, bool ignStopCodon,\ | |
| 3086 alignment *proteinAlig) { | |
| 3087 | |
| 3088 bool warning = false; | |
| 3089 size_t found; | |
| 3090 int i; | |
| 3091 | |
| 3092 /* New code: We now care about the presence of wildcards/indeterminations | |
| 3093 * characters such as 'X' or 'B' into protein sequences as well as about the | |
| 3094 * presence of Selenocysteines ('U') or Pyrrolysines ('O'). It only works, by | |
| 3095 * now, with the universal genetic code */ | |
| 3096 int *protSeqsLengths, numbProtSeqs, current_prot; | |
| 3097 string *protSeqsNames, *protSequences; | |
| 3098 char aminoAcid; | |
| 3099 | |
| 3100 numbProtSeqs = proteinAlig -> getNumSpecies(); | |
| 3101 protSeqsNames = new string[numbProtSeqs]; | |
| 3102 protSequences = new string[numbProtSeqs]; | |
| 3103 protSeqsLengths = new int[numbProtSeqs]; | |
| 3104 | |
| 3105 proteinAlig -> getSequences(protSeqsNames, protSequences, protSeqsLengths); | |
| 3106 | |
| 3107 /* Check read sequences are real DNA/RNA */ | |
| 3108 if (getTypeAlignment() == AAType) { | |
| 3109 cerr << endl << "ERROR: Check input CDS file. It seems to content protein " | |
| 3110 << "residues." << endl << endl; | |
| 3111 return false; | |
| 3112 } | |
| 3113 | |
| 3114 for(i = 0; i < sequenNumber; i++) { | |
| 3115 | |
| 3116 /* Get protein sequence to compare against any potential stop codon in the | |
| 3117 * coding sequence. If there is not protein sequence for current coding | |
| 3118 * sequence, skip its analysis */ | |
| 3119 for(current_prot = 0; current_prot < numbProtSeqs; current_prot++) | |
| 3120 if(protSeqsNames[current_prot] == seqsName[i]) | |
| 3121 break; | |
| 3122 if(current_prot == numbProtSeqs) | |
| 3123 continue; | |
| 3124 | |
| 3125 if(sequences[i].find("-") != string::npos) { | |
| 3126 if (!warning) | |
| 3127 cerr << endl; | |
| 3128 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has, at least, one gap" | |
| 3129 << endl << endl; | |
| 3130 return false; | |
| 3131 } | |
| 3132 | |
| 3133 if((sequences[i].length() % 3) != 0) { | |
| 3134 if (!warning) | |
| 3135 cerr << endl; | |
| 3136 warning = true; | |
| 3137 cerr << "WARNING: Sequence length \"" << seqsName[i] << "\" is not " | |
| 3138 << "multiple of 3 (length: " << sequences[i].length() << ")" << endl; | |
| 3139 } | |
| 3140 | |
| 3141 /* Ignore stop codons from the CDS if set by the user */ | |
| 3142 if (ignStopCodon) | |
| 3143 continue; | |
| 3144 | |
| 3145 /* Detect universal stop codons in the CDS. Then, compare those stop codons | |
| 3146 * against the protein sequence to see whether they are real stop codons or | |
| 3147 * are representing rare amino-acids such as Selenocysteines or Pyrrolysines | |
| 3148 * It also allows stop-codons when there are wildcards/indet characters in | |
| 3149 * the protein sequence. CDS sequences could be splitted using stop codons | |
| 3150 * from the sequence itself */ | |
| 3151 | |
| 3152 /* Initialize first appearence of a given stop codon to -1. | |
| 3153 * That means that it has not been found yet */ | |
| 3154 found = -1; | |
| 3155 do { | |
| 3156 found = sequences[i].find("TGA", found + 1); | |
| 3157 | |
| 3158 /* If a stop codon has been found and its position is multiple of 3. | |
| 3159 * Analize it */ | |
| 3160 if((found != string::npos) && (((int) found % 3) == 0)) { | |
| 3161 | |
| 3162 aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]); | |
| 3163 /* It may be a Selenocysteine ('TGA') which should be represented as 'U' | |
| 3164 * or wildcard/indet characters such as 'X' or 'B' */ | |
| 3165 //~ if ((aminoAcid == 'U') || (aminoAcid == 'X') || (aminoAcid == 'B')) | |
| 3166 /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character | |
| 3167 * such as 'B'/'X' is present, skip current stop codon */ | |
| 3168 if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \ | |
| 3169 (aminoAcid == 'B')) | |
| 3170 continue; | |
| 3171 | |
| 3172 /* If split_by_stop_codon flag is activated then cut input CDS sequence | |
| 3173 * up to first appearance of a stop codon */ | |
| 3174 else if(splitByStopCodon) { | |
| 3175 if (!warning) | |
| 3176 cerr << endl; | |
| 3177 warning = true; | |
| 3178 cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first" | |
| 3179 << " appearance of stop codon \"TGA\" (residue \"" << aminoAcid | |
| 3180 << "\") at position " << (int) found + 1 << " (length: " | |
| 3181 << sequences[i].length() << ")" << endl; | |
| 3182 sequences[i].resize((int) found); | |
| 3183 } | |
| 3184 /* Otherwise, warn about it and return an error */ | |
| 3185 else { | |
| 3186 if (!warning) | |
| 3187 cerr << endl; | |
| 3188 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \"" | |
| 3189 << "TGA\" (residue \"" << aminoAcid << "\") at position " | |
| 3190 << (int) found + 1 << " (length: " << sequences[i].length() << ")" | |
| 3191 << endl << endl; | |
| 3192 return false; | |
| 3193 } | |
| 3194 } | |
| 3195 /* Iterate over the CDS until not stop codon is found */ | |
| 3196 } while(found != string::npos); | |
| 3197 | |
| 3198 /* Initialize first appearence of a given stop codon to -1. | |
| 3199 * That means that it has not been found yet */ | |
| 3200 found = -1; | |
| 3201 do { | |
| 3202 found = sequences[i].find("TAA", found + 1); | |
| 3203 | |
| 3204 /* If a stop codon has been found and its position is multiple of 3. | |
| 3205 * Analize it */ | |
| 3206 if((found != string::npos) && (((int) found % 3) == 0)) { | |
| 3207 | |
| 3208 aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]); | |
| 3209 /* Check if there is any wildcard/indet characters such as 'X' or 'B' */ | |
| 3210 //~ if ((aminoAcid == 'X') || (aminoAcid == 'B')) | |
| 3211 /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character | |
| 3212 * such as 'B'/'X' is present, skip current stop codon */ | |
| 3213 if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \ | |
| 3214 (aminoAcid == 'B')) | |
| 3215 continue; | |
| 3216 | |
| 3217 /* If split_by_stop_codon flag is activated then cut input CDS sequence | |
| 3218 * up to first appearance of a stop codon */ | |
| 3219 else if(splitByStopCodon) { | |
| 3220 if (!warning) | |
| 3221 cerr << endl; | |
| 3222 warning = true; | |
| 3223 cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first" | |
| 3224 << " appearance of stop codon \"TAA\" (residue \"" << aminoAcid | |
| 3225 << "\") at position " << (int) found + 1 << " (length: " | |
| 3226 << sequences[i].length() << ")" << endl; | |
| 3227 sequences[i].resize((int) found); | |
| 3228 } | |
| 3229 /* Otherwise, warn about it and return an error */ | |
| 3230 else { | |
| 3231 if (!warning) | |
| 3232 cerr << endl; | |
| 3233 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \"" | |
| 3234 << "TAA\" (residue \"" << aminoAcid << "\") at position " | |
| 3235 << (int) found + 1 << " (length: " << sequences[i].length() << ")" | |
| 3236 << endl << endl; | |
| 3237 return false; | |
| 3238 } | |
| 3239 } | |
| 3240 /* Iterate over the CDS until not stop codon is found */ | |
| 3241 } while(found != string::npos); | |
| 3242 | |
| 3243 /* Initialize first appearence of a given stop codon to -1. | |
| 3244 * That means that it has not been found yet */ | |
| 3245 found = -1; | |
| 3246 do { | |
| 3247 found = sequences[i].find("TAG", found + 1); | |
| 3248 /* If a stop codon has been found and its position is multiple of 3. | |
| 3249 * Analize it */ | |
| 3250 if((found != string::npos) && (((int) found % 3) == 0)) { | |
| 3251 | |
| 3252 aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]); | |
| 3253 /* It may be a Pyrrolysine ('TAG') which should be represented as 'O' | |
| 3254 * or wildcard/indet characters such as 'X' or 'B' */ | |
| 3255 //~ if ((aminoAcid == 'O') || (aminoAcid == 'X') || (aminoAcid == 'B')) | |
| 3256 /* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character | |
| 3257 * such as 'B'/'X' is present, skip current stop codon */ | |
| 3258 if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \ | |
| 3259 (aminoAcid == 'B')) | |
| 3260 continue; | |
| 3261 | |
| 3262 /* If split_by_stop_codon flag is activated then cut input CDS sequence | |
| 3263 * up to first appearance of a stop codon */ | |
| 3264 else if(splitByStopCodon) { | |
| 3265 if (!warning) | |
| 3266 cerr << endl; | |
| 3267 warning = true; | |
| 3268 cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first" | |
| 3269 << " appearance of stop codon \"TAG\" (residue \"" << aminoAcid | |
| 3270 << "\") at position " << (int) found + 1 << " (length: " | |
| 3271 << sequences[i].length() << ")" << endl; | |
| 3272 sequences[i].resize((int) found); | |
| 3273 } | |
| 3274 /* Otherwise, warn about it and return an error */ | |
| 3275 else { | |
| 3276 if (!warning) | |
| 3277 cerr << endl; | |
| 3278 cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \"" | |
| 3279 << "TAG\" (residue \"" << aminoAcid << "\") at position " | |
| 3280 << (int) found + 1 << " (length: " << sequences[i].length() << ")" | |
| 3281 << endl << endl; | |
| 3282 return false; | |
| 3283 } | |
| 3284 } | |
| 3285 /* Iterate over the CDS until not stop codon is found */ | |
| 3286 } while(found != string::npos); | |
| 3287 } | |
| 3288 | |
| 3289 /* If everything was return an OK to informat about it. */ | |
| 3290 return true; | |
| 3291 } | |
| 3292 | |
| 3293 /* Function designed to check whether input CDS file is correct or not based on | |
| 3294 * some features: Sequences are in both files (it could be more on CDS file), | |
| 3295 * they have (more or less) same ength. Otherwise, some nucleotides could be | |
| 3296 * excluded or some 'N's added to fit protein length. */ | |
| 3297 bool alignment::checkCorrespondence(string *names, int *lengths, int \ | |
| 3298 totalInputSeqs, int multiple = 1) { | |
| 3299 | |
| 3300 int i, j, seqLength, indet; | |
| 3301 bool warnings = false; | |
| 3302 string tmp; | |
| 3303 | |
| 3304 /* For each sequence in the current protein alignment, look for its coding | |
| 3305 * DNA sequence checking that they have the same size. */ | |
| 3306 for(i = 0; i < sequenNumber; i++) { | |
| 3307 | |
| 3308 /* Get protein sequence length removing any possible gap. Get as well last | |
| 3309 * residue from current sequence */ | |
| 3310 | |
| 3311 tmp = utils::removeCharacter('-', sequences[i]); | |
| 3312 seqLength = tmp.length() * multiple; | |
| 3313 indet = ((int) tmp.length() - utils::min((int) tmp.find_last_not_of("X"), \ | |
| 3314 (int) tmp.find_last_not_of("x"))) - 1; | |
| 3315 | |
| 3316 /* Go through all available CDS looking for the one with the same ID */ | |
| 3317 for(j = 0; j < totalInputSeqs; j++) { | |
| 3318 | |
| 3319 /* Once both ID matchs, compare its lengths */ | |
| 3320 if(seqsName[i] == names[j]) { | |
| 3321 | |
| 3322 /* If both sequences have the same length, stop the search */ | |
| 3323 if(seqLength == lengths[j]) | |
| 3324 break; | |
| 3325 | |
| 3326 /* If nucleotide sequence is larger than protein sequence, warn about | |
| 3327 * it and continue the verification process. It will used the 'Nth' | |
| 3328 * first nucleotides for the conversion */ | |
| 3329 else if(seqLength < lengths[j]) { | |
| 3330 if (!warnings) | |
| 3331 cerr << endl; | |
| 3332 warnings = true; | |
| 3333 cerr << "WARNING: Sequence \"" << seqsName[i] << "\" will be cut at " | |
| 3334 << "position " << seqLength << " (length: "<< lengths[j] << ")" | |
| 3335 << endl; | |
| 3336 break; | |
| 3337 } | |
| 3338 | |
| 3339 /* It has been detected some indeterminations at the end of the protein | |
| 3340 * sequence. That issue could be cause by some incomplete codons in the | |
| 3341 * nucleotide sequences. This issue is solved adding as much 'N' symbols | |
| 3342 * as it is needed to preserve the backtranslated alignment */ | |
| 3343 else if((indet > 0) && (indet > (seqLength - lengths[j])/3)) { | |
| 3344 if (!warnings) | |
| 3345 cerr << endl; | |
| 3346 warnings = true; | |
| 3347 cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has some inde" | |
| 3348 << "termination symbols 'X' at the end of sequence. They will be" | |
| 3349 << " included in the final alignment." << endl; | |
| 3350 break; | |
| 3351 } | |
| 3352 | |
| 3353 /* If nucleotide sequence is shorter than protein sequence, return an | |
| 3354 * error since it is not feasible to cut the input protein aligment to | |
| 3355 * fit it into CDNA sequences size */ | |
| 3356 else { | |
| 3357 if (!warnings) | |
| 3358 cerr << endl; | |
| 3359 warnings = true; | |
| 3360 cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has less nucleo" | |
| 3361 << "tides (" << lengths[j] << ") than expected (" << seqLength | |
| 3362 << "). It will be added N's to complete the sequence" << endl; | |
| 3363 break; | |
| 3364 } | |
| 3365 } | |
| 3366 } | |
| 3367 | |
| 3368 /* Warn about a mismatch a sequences name level */ | |
| 3369 if(j == totalInputSeqs) { | |
| 3370 cerr << endl << "ERROR: Sequence \"" << seqsName[i] << "\" is not in " | |
| 3371 << "CDS file." << endl << endl; | |
| 3372 return false; | |
| 3373 } | |
| 3374 } | |
| 3375 | |
| 3376 /* If everything is OK, return an appropiate flag */ | |
| 3377 return true; | |
| 3378 } |
