comparison clustalomega/clustal-omega-0.2.0/src/clustal-omega.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ff1768533a07
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
5 *
6 * Copyright (C) 2010 University College Dublin
7 *
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This file is part of Clustal-Omega.
14 *
15 ********************************************************************/
16
17 /*
18 * RCS $Id: clustal-omega.c 241 2011-05-04 14:37:17Z fabian $
19 */
20
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24
25 #include <assert.h>
26
27 #include "clustal-omega.h"
28 #include "hhalign/general.h"
29
30 /* The following comment block contains the frontpage/mainpage of the doxygen
31 * documentation. Please add some more info. FIXME add more
32 */
33
34 /**
35 *
36 * @mainpage Clustal-Omega Documentation
37 *
38 * @section intro_sec Introduction
39 *
40 * For more information see http://www.clustal.org/
41 *
42 * @section api_section API
43 *
44 * @subsection example_prog_subsection An Example Program
45 *
46 * To use libclustalo you will have to include the clustal-omega.h header and
47 * link against libclustalo. For linking against libclustalo you will have to
48 * use a C++ compiler, no matter if your program was written in C or C++.
49 *
50 * First compile (no linking) your source (for an example see section "\ref
51 * example_src_subsubsec"):
52 *
53 * @code
54 * $ gcc -c -ansi -Wall clustalo-api-test.c
55 * @endcode
56 *
57 * Then link against libclustalo (we recommend the use of pkg-config as
58 * explained in \ref pkgconfig_subsubsec). Assuming Clustal Omega was installed
59 * in system-wide default directory (e.g. /usr) just type:
60 *
61 * @code
62 * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o -lclustalo
63 * @endcode
64 *
65 * Voila! Now you have your own alignment program which can be run with
66 *
67 * @code
68 * $ ./clustalo-api-test <your-sequence-input>
69 * @endcode
70 *
71 * It's best to use the same compiler that you used for compiling libclustal.
72 * If libclustal was compiled with OpenMP support, you will have to use OpenMP
73 * flags for you program as well.
74 *
75 *
76 * @subsubsection pkgconfig_subsubsec Using pkg-config / Figuring out compiler flags
77 *
78 * Clustal Omega comes with support for <a
79 * href="http://pkg-config.freedesktop.org">pkg-config</a>, which means you
80 * can run
81 *
82 * @code
83 * $ pkg-config --cflags --libs clustalo
84 * @endcode
85 *
86 * to figure out cflags and library flags needed to compile and link against
87 * libclustalo. This is especially handy if Clustal Omega was installed to a
88 * non-standard directory.
89 *
90 * You might have to change PKG_CONFIG_PATH. For example, if you used the prefix $HOME/local/ for
91 * installation then you will first need to set PKG_CONFIG_PATH:
92 *
93 * @code
94 * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig
95 * $ pkg-config --cflags --libs clustalo
96 * @endcode
97 *
98 *
99 * To compile your source use:
100 *
101 * @code
102 * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig
103 * $ gcc -c -ansi -Wall clustalo-api-test.c $(pkg-config --cflags clustalo)
104 * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o $(pkg-config --libs clustalo)
105 * @endcode
106 *
107 *
108 * @subsubsection example_src_subsubsec Example Source Code
109 *
110 * @include "clustalo-api-test.c"
111 *
112 *
113 */
114
115
116
117
118 /* FIXME: doc */
119 /* the following are temporary flags while the code is still under construction;
120 had problems internalising hhmake, so as temporary crutch
121 write alignment to file and get external hmmer/hhmake via system call
122 to read alignment and convert into HMM
123 All this will go, once hhmake is properly internalised */
124 #define INDIRECT_HMM 0 /* temp flag: (1) write aln to file, use system(hmmer/hhmake), (0) internal hhmake */
125 #define USEHMMER 1 /* temp flag: use system(hmmer) to build HMM */
126 #define USEHHMAKE (!USEHMMER) /* temp flag: use system(hhmake) to build HMM */
127
128
129 /* shuffle order of input sequences */
130 #define SHUFFLE_INPUT_SEQ_ORDER 0
131
132 /* sort input sequences by length */
133 #define SORT_INPUT_SEQS 0
134
135
136 int iNumberOfThreads;
137
138 /* broken, unused and lonely */
139 static const int ITERATION_SCORE_IMPROVEMENT_THRESHOLD = 0.01;
140
141
142 /**
143 * @brief Print Long version information to pre-allocated char.
144 *
145 * @note short version
146 * information is equivalent to PACKAGE_VERSION
147 *
148 * @param[out] pcStr
149 * char pointer to write to preallocated to hold iSize chars.
150 * @param[in] iSize
151 * size of pcStr
152 */
153 void
154 PrintLongVersion(char *pcStr, int iSize)
155 {
156 snprintf(pcStr, iSize, "version %s; code-name '%s'; build date %s",
157 PACKAGE_VERSION, PACKAGE_CODENAME, __DATE__);
158 }
159 /* end of PrintLongVersion() */
160
161
162
163 /**
164 * @brief free aln opts members
165 *
166 */
167 void
168 FreeAlnOpts(opts_t *prAlnOpts) {
169 if (NULL != prAlnOpts->pcGuidetreeInfile) {
170 CKFREE(prAlnOpts->pcGuidetreeInfile);
171 }
172 if (NULL != prAlnOpts->pcGuidetreeOutfile) {
173 CKFREE(prAlnOpts->pcGuidetreeOutfile);
174 }
175 if (NULL != prAlnOpts->pcDistmatOutfile) {
176 CKFREE(prAlnOpts->pcDistmatOutfile);
177 }
178 if (NULL != prAlnOpts->pcDistmatInfile) {
179 CKFREE(prAlnOpts->pcDistmatInfile);
180 }
181 }
182 /* end of FreeAlnOpts() */
183
184
185
186 /**
187 * @brief Sets members of given user opts struct to default values
188 *
189 * @param[out] prOpts
190 * User opt struct to initialise
191 *
192 */
193 void
194 SetDefaultAlnOpts(opts_t *prOpts) {
195 prOpts->bAutoOptions = FALSE;
196
197 prOpts->pcDistmatInfile = NULL;
198 prOpts->pcDistmatOutfile = NULL;
199
200 prOpts->iClusteringType = CLUSTERING_UPGMA;
201 prOpts->iPairDistType = PAIRDIST_KTUPLE;
202 prOpts->bUseMbed = FALSE;
203 prOpts->bUseMbedForIteration = FALSE;
204 prOpts->pcGuidetreeOutfile = NULL;
205 prOpts->pcGuidetreeInfile = NULL;
206
207 prOpts->ppcHMMInput = NULL;
208 prOpts->iHMMInputFiles = 0;
209
210 prOpts->iNumIterations = 0;
211 prOpts->bIterationsAuto = FALSE;
212 prOpts->iMaxGuidetreeIterations = INT_MAX;
213 prOpts->iMaxHMMIterations = INT_MAX;
214 prOpts->iMacRam = 2048; /* give 2GB to MAC algorithm. FS, r240 -> r241 */
215 }
216 /* end of SetDefaultAlnOpts() */
217
218
219
220 /**
221 * @brief Check logic of parsed user options. Will exit (call Log(&rLog, LOG_FATAL, ))
222 * on Fatal logic error
223 *
224 * @param[in] prOpts
225 * Already parsed user options
226 *
227 */
228 void
229 AlnOptsLogicCheck(opts_t *prOpts)
230 {
231 /* guide-tree & distmat
232 *
233 */
234 if (prOpts->pcDistmatInfile && prOpts->pcGuidetreeInfile) {
235 Log(&rLog, LOG_FATAL, "Read distances *and* guide-tree from file doesn't make sense.");
236 }
237
238 if (prOpts->pcDistmatOutfile && prOpts->pcGuidetreeInfile) {
239 Log(&rLog, LOG_FATAL, "Won't be able to save distances to file, because I got a guide-tree as input.");
240 }
241
242 /* combination of options that don't make sense when not iterating
243 */
244 if (prOpts->iNumIterations==0 && prOpts->bIterationsAuto != TRUE) {
245
246 if (prOpts->pcGuidetreeInfile && prOpts->pcGuidetreeOutfile) {
247 Log(&rLog, LOG_FATAL, "Got a guide-tree as input and output which doesn't make sense when not iterating.");
248 }
249 /*
250 if (prOpts->pcGuidetreeInfile && prOpts->bUseMbed > 0) {
251 Log(&rLog, LOG_FATAL, "Got a guide-tree as input and was requested to cluster with mBed, which doesn't make sense when not iterating.");
252 }
253 */
254 if (prOpts->bUseMbedForIteration > 0) {
255 Log(&rLog, LOG_FATAL, "No iteration requested, but mbed for iteration was set. Paranoia exit.");
256 }
257 }
258
259 if (prOpts->iMacRam < 512) {
260
261 Log(&rLog, LOG_INFO, "Memory for MAC Algorithm quite low, Viterbi Algorithm may be triggered.");
262
263 if (prOpts->iMacRam < 1) {
264
265 Log(&rLog, LOG_WARN, "Viterbi Algorithm always turned on, increase MAC-RAM to turn on MAC.");
266 }
267 }
268
269 return;
270 }
271 /* end of AlnOptsLogicCheck() */
272
273
274 /**
275 * @brief FIXME doc
276 */
277 void
278 PrintAlnOpts(FILE *prFile, opts_t *prOpts)
279 {
280 int iAux;
281
282
283 /* keep in same order as struct */
284 fprintf(prFile, "option: auto-options = %d\n", prOpts->bAutoOptions);
285 fprintf(prFile, "option: distmat-infile = %s\n",
286 NULL != prOpts->pcDistmatInfile? prOpts->pcDistmatInfile: "(null)");
287 fprintf(prFile, "option: distmat-outfile = %s\n",
288 NULL != prOpts->pcDistmatOutfile? prOpts->pcDistmatOutfile: "(null)");
289 fprintf(prFile, "option: clustering-type = %d\n", prOpts->iClusteringType);
290 fprintf(prFile, "option: pair-dist-type = %d\n", prOpts->iPairDistType);
291 fprintf(prFile, "option: use-mbed = %d\n", prOpts->bUseMbed);
292 fprintf(prFile, "option: use-mbed-for-iteration = %d\n", prOpts->bUseMbedForIteration);
293 fprintf(prFile, "option: guidetree-outfile = %s\n",
294 NULL != prOpts->pcGuidetreeOutfile? prOpts->pcGuidetreeOutfile: "(null)");
295 fprintf(prFile, "option: guidetree-infile = %s\n",
296 NULL != prOpts->pcGuidetreeInfile? prOpts->pcGuidetreeInfile: "(null)");
297 for (iAux=0; iAux<prOpts->iHMMInputFiles; iAux++) {
298 fprintf(prFile, "option: hmm-input no %d = %s\n", iAux, prOpts->ppcHMMInput[iAux]);
299 }
300 fprintf(prFile, "option: hmm-input-files = %d\n", prOpts->iHMMInputFiles);
301 fprintf(prFile, "option: num-iterations = %d\n", prOpts->iNumIterations);
302 fprintf(prFile, "option: iterations-auto = %d\n", prOpts->bIterationsAuto);
303 fprintf(prFile, "option: max-hmm-iterations = %d\n", prOpts->iMaxHMMIterations);
304 fprintf(prFile, "option: max-guidetree-iterations = %d\n", prOpts->iMaxGuidetreeIterations);
305 }
306 /* end of PrintAlnOpts() */
307
308
309
310 /**
311 * @brief Returns major version of HMMER. Whichever hmmbuild version
312 * is found first in your PATH will be used
313 *
314 * @return -1 on error, major hmmer version otherwise
315 *
316 */
317 int
318 HmmerVersion()
319 {
320 char zcHmmerTestCall[] = "hmmbuild -h";
321 FILE *fp = NULL;
322 int iMajorVersion = 0;
323 char zcLine[16384];
324
325 if (NULL == (fp = popen(zcHmmerTestCall, "r"))) {
326 Log(&rLog, LOG_ERROR, "Couldn't exec %s", zcHmmerTestCall);
327 return -1;
328 }
329 while (fgets(zcLine, sizeof(zcLine), fp)) {
330 char *pcLocate;
331 if ((pcLocate = strstr(zcLine, "HMMER "))) {
332 iMajorVersion = atoi(&pcLocate[6]);
333 break;
334 }
335 }
336 pclose(fp);
337
338 return iMajorVersion;
339 }
340 /* end of HmmerVersion() */
341
342
343
344 /**
345 * @brief Create a HHM file from aligned sequences
346 *
347 * @warning Should be eliminated in the future
348 * as building routine should not create intermediate files
349 *
350 * @param[in] prMSeq
351 * Aligned mseq_t
352 * @param[in] pcHMMOut
353 * HMM output file name
354 *
355 * @return Non-zero on error
356 *
357 */
358 int
359 AlnToHHMFile(mseq_t *prMSeq, char *pcHMMOut)
360 {
361 char *tmp_aln = NULL;
362 int retcode = OK;
363
364 assert(NULL!=prMSeq);
365 assert(NULL!=pcHMMOut);
366
367 if (FALSE == prMSeq->aligned) {
368 Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM");
369 return FAILURE;
370 }
371
372 /* Convert alignment to a2m, and call hhmake
373 *
374 * can't be static templates, or mktemp fails (at least on os x
375 * (with a bus error))
376 *
377 * gcc says we should use mkstemp to avoid race conditions,
378 * but that returns a file descriptor, which is of no use to
379 * us
380 */
381 /* NOTE: the following won't work on windows: missing /tmp/ */
382 tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX");
383 if (NULL == mktemp(tmp_aln)) {
384 Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename");
385 retcode = FAILURE;
386 goto cleanup_and_return;
387 }
388 if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_A2M)) {
389 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln);
390 retcode = FAILURE;
391 goto cleanup_and_return;
392 }
393
394 if (HHMake_Wrapper(tmp_aln, pcHMMOut)){
395 Log(&rLog, LOG_ERROR, "Could not convert alignment %s into HHM", tmp_aln);
396 retcode = FAILURE;
397 goto cleanup_and_return;
398 }
399
400
401 cleanup_and_return:
402
403 if (NULL != tmp_aln) {
404 if (FileExists(tmp_aln)) {
405 if (remove(tmp_aln)) {
406 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln);
407 }
408 }
409 CKFREE(tmp_aln);
410 }
411
412 return retcode;
413
414 } /* end of AlnToHHMFile() */
415
416
417
418 /**
419 * @brief Create a HMM file from aligned sequences
420 *
421 * @warning Should be replaced in the future by some internal HMM
422 * building routine that does not call external programs
423 *
424 * @param[in] prMSeq
425 * Aligned mseq_t
426 * @param[in] pcHMMOut
427 * HMM output file name
428 *
429 * @return Non-zero on error
430 *
431
432 */
433 int
434 AlnToHMMFile(mseq_t *prMSeq, const char *pcHMMOut)
435 {
436 char *tmp_aln = NULL;
437 char *tmp_hmm = NULL; /* only needed for hmmer3 to hmmer2 conversion */
438 char cmdbuf[16384];
439 int iHmmerVersion = 0;
440 int retcode = OK;
441
442 assert(NULL!=prMSeq);
443 assert(NULL!=pcHMMOut);
444
445 if (FALSE == prMSeq->aligned) {
446 Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM");
447 return FAILURE;
448 }
449
450 iHmmerVersion = HmmerVersion();
451 if (2 != iHmmerVersion && 3 != iHmmerVersion) {
452 Log(&rLog, LOG_ERROR, "Could not find suitable HMMER binaries");
453 return FAILURE;
454 }
455
456 /* Convert alignment to stockholm, call hmmbuild and then
457 * either hmmconvert (hmmer3) or hmmcalibrate (hmmer2)
458 *
459 * can't be static templates, or mktemp fails (at least on os x
460 * (with a bus error))
461 *
462 * gcc says we should use mkstemp to avoid race conditions,
463 * but that returns a file descriptor, which is of no use to
464 * us
465 */
466 /* NOTE: the following won't work on windows: missing /tmp/ */
467 tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX");
468 if (NULL == mktemp(tmp_aln)) {
469 Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename");
470 retcode = FAILURE;
471 goto cleanup_and_return;
472 }
473 if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_STOCKHOLM)) {
474 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln);
475 retcode = FAILURE;
476 goto cleanup_and_return;
477 }
478
479 if (2 == iHmmerVersion) {
480 sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmcalibrate %s >/dev/null",
481 pcHMMOut, tmp_aln, pcHMMOut);
482 if (system(cmdbuf)) {
483 Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf);
484 retcode = FAILURE;
485 goto cleanup_and_return;
486 }
487 } else if (3 == iHmmerVersion) {
488 /* NOTE: the following won't work on windows: missing /tmp/ */
489 tmp_hmm = CkStrdup("/tmp/clustalo_tmphmm2_XXXXXX");
490 if (NULL == mktemp(tmp_hmm)) {
491 Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename");
492 retcode = FAILURE;
493 goto cleanup_and_return;
494 }
495 sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmconvert -2 %s > %s",
496 tmp_hmm, tmp_aln, tmp_hmm, pcHMMOut);
497 if (system(cmdbuf)) {
498 Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf);
499 retcode = FAILURE;
500 goto cleanup_and_return;
501 }
502 } else {
503 CKFREE(tmp_aln);
504 Log(&rLog, LOG_FATAL, "Internal error: Unknown Hmmer version %d", iHmmerVersion);
505 }
506
507
508 cleanup_and_return:
509
510 if (NULL != tmp_aln) {
511 if (FileExists(tmp_aln)) {
512 if (remove(tmp_aln)) {
513 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln);
514 }
515 }
516 CKFREE(tmp_aln);
517 }
518 if (NULL != tmp_hmm) {
519 if (FileExists(tmp_hmm)) {
520 if (remove(tmp_hmm)) {
521 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_hmm);
522 }
523 }
524 CKFREE(tmp_hmm);
525 }
526
527 return retcode;
528 }
529 /* end of AlnToHMMFile() */
530
531
532
533 /**
534 * @brief Convert a multiple sequence structure into a HMM
535 *
536 * @param[out] prHMM
537 * Pointer to preallocted HMM which will be set here
538 * @param[in] prMSeq
539 * Pointer to an alignment
540 *
541 * @return 0 on error, non-0 otherwise
542 *
543 * @see AlnToHMMFile()
544 *
545 */
546 int
547 AlnToHMM(hmm_light *prHMM, mseq_t *prMSeq)
548 {
549 char *pcHMM; /* temp hmm file */
550
551 Log(&rLog, LOG_INFO,
552 "Using HMMER version %d to calculate a new HMM.",
553 HmmerVersion());
554 /* FIXME replace all this with internal HMM computation (HHmake) */
555
556 /**
557 * @warning the following probably won't work on windows: missing
558 * /tmp/. Should be ok on Cygwin though
559 */
560 pcHMM = CkStrdup("/tmp/clustalo-hmm-iter_XXXXXX");
561 if (NULL == mktemp(pcHMM)) {
562 Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename");
563 CKFREE(pcHMM);
564 return FAILURE;
565 }
566
567 /* Create a HMM representing the current alignment
568 */
569 #if USEHMMER
570 if (AlnToHMMFile(prMSeq, pcHMM)) {
571 Log(&rLog, LOG_ERROR, "AlnToHMMFile() on %s failed.", pcHMM);
572 CKFREE(pcHMM);
573 return FAILURE;
574 }
575 #elif USEHHMAKE
576 if (AlnToHHMFile(prMSeq, pcHMM)) {
577 Log(&rLog, LOG_ERROR, "AlnToHHMFile() on %s failed.", pcHMM);
578 CKFREE(pcHMM);
579 return FAILURE;
580 }
581 /* Log(&rLog, LOG_FATAL, "Method to create HHM (HMM using hhmake) not installed yet"); */
582 #else
583 Log(&rLog, LOG_FATAL, "Unknown method to create temporary HMM");
584 #endif
585
586 /* Read HMM information
587 */
588 if (OK != readHMMWrapper(prHMM, pcHMM)){
589 Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMM);
590 CKFREE(pcHMM);
591 return FAILURE;
592 }
593
594 if (remove(pcHMM)) {
595 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", pcHMM);
596 }
597 CKFREE(pcHMM);
598
599 return OK;
600 }
601 /* end of AlnToHMM() */
602
603
604
605 /**
606 * @brief FIXME
607 *
608 */
609 void
610 InitClustalOmega(int iNumThreadsRequested)
611 {
612
613 #ifdef HAVE_OPENMP
614 iNumberOfThreads = iNumThreadsRequested;
615 omp_set_num_threads(iNumberOfThreads);
616 #else
617 if (iNumThreadsRequested>1) {
618 Log(&rLog, LOG_FATAL, "Cannot change number of threads to %d. %s was build without OpenMP support.",
619 iNumThreadsRequested, PACKAGE_NAME);
620 }
621 iNumberOfThreads = 1; /* need to set this, even if build without support */
622 #endif
623
624 Log(&rLog, LOG_INFO, "Using %d threads",
625 iNumberOfThreads);
626
627 }
628 /* end of InitClustalOmega() */
629
630
631
632 /**
633 * @brief Defines an alignment order, which adds sequences
634 * sequentially, i.e. one at a time starting with seq 1 & 2
635 *
636 * @param[out] piOrderLR_p
637 * order in which nodes/profiles are to be merged/aligned
638 * @param[in] iNumSeq
639 * Number of sequences
640 *
641 * @see TraverseTree()
642 *
643 */
644 void
645 SequentialAlignmentOrder(int **piOrderLR_p, int iNumSeq)
646 {
647 unsigned int uNodes = iNumSeq*2-1;
648 unsigned int uNodeCounter = 0;
649 unsigned int uSeqCounter = 0;
650
651 Log(&rLog, LOG_FATAL, "FIXME: Untested...");
652
653 (*piOrderLR_p) = (int *)CKCALLOC(DIFF_NODE * uNodes, sizeof(int));
654 /* loop over merge nodes, which have per definition even indices
655 * and set up children which have odd indices
656 */
657 uSeqCounter = 0;
658 for (uNodeCounter=iNumSeq; uNodeCounter<uNodes; uNodeCounter+=1) {
659 unsigned int uLeftChildNodeIndex = uNodeCounter-1;
660 unsigned int uRightChildNodeIndex = uNodeCounter-iNumSeq+1;
661 unsigned int uParentNodeIndex = uNodeCounter+1;
662
663 /* merge node setup */
664 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = uLeftChildNodeIndex;
665 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE] = uRightChildNodeIndex;
666 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE] = uParentNodeIndex;
667 /* only setup left child if at first merge node, all other left childs
668 * should be merge nodes that are already set up. also correct
669 * left node number here.
670 */
671 if (uNodeCounter==iNumSeq) {
672 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = 0;
673
674 (*piOrderLR_p)[0+LEFT_NODE] = 0;
675 (*piOrderLR_p)[0+RGHT_NODE] = 0;
676 (*piOrderLR_p)[0+PRNT_NODE] = uNodeCounter;
677 uSeqCounter++;
678
679 Log(&rLog, LOG_FORCED_DEBUG, "Set up first leaf with node counter %d: left=%d right=%d parent=%d",
680 0,
681 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+LEFT_NODE],
682 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+RGHT_NODE],
683 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+PRNT_NODE]);
684 }
685 Log(&rLog, LOG_FORCED_DEBUG, "Set up merge node with node counter %d: left=%d right=%d parent=%d",
686 uNodeCounter, (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE],
687 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE],
688 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE]);
689
690 /* right child */
691 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE] = uSeqCounter;
692 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE] = uSeqCounter;
693 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE] = uNodeCounter;
694 uSeqCounter++;
695
696 Log(&rLog, LOG_FORCED_DEBUG, "Set up leaf with node counter %d: left=%d right=%d parent=%d",
697 uRightChildNodeIndex, (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE],
698 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE],
699 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE]);
700 }
701 }
702 /* end of SequentialAlignmentOrder() */
703
704
705
706 /**
707 * @brief Defines the alignment order by calculating a guide tree. In
708 * a first-step pairwise distances will be calculated (or read from a
709 * file). In a second step those distances will be clustered and a
710 * guide-tree created. Steps 1 and 2 will be skipped if a guide-tree
711 * file was given, in which case the guide-tree will be just read from
712 * the file.
713 *
714 * @param[out] piOrderLR_p
715 * order in which nodes/profiles are to be merged/aligned
716 * @param[out] pdSeqWeights_p
717 * Sequence weights
718 * @param[out] pdSeqWeights_p
719 * Sequence weights
720 * @param[in] prMSeq
721 * The sequences from which the alignment order is to be calculated
722 * @param[in] iPairDistType
723 * Method of pairwise distance comparison
724 * @param[in] pcDistmatInfile
725 * If not NULL distances will be read from this file instead of being
726 * calculated
727 * @param[in] pcDistmatOutfile
728 * If not NULL computed pairwise distances will be written to this file
729 * @param[in] iClusteringType
730 * Clustering method to be used to cluster the pairwise distances
731 * @param[in] pcGuidetreeInfile
732 * If not NULL guidetree will be read from this file. Skips pairwise
733 * distance and guidetree computation
734 * @param[in] pcGuidetreeOutfile
735 * If not NULL computed guidetree will be written to this file
736 * @param[in] bUseMbed
737 * If TRUE, fast mBed guidetree computation will be employed
738 *
739 * @return Non-zero on error
740 *
741 */
742 int
743 AlignmentOrder(int **piOrderLR_p, double **pdSeqWeights_p, mseq_t *prMSeq,
744 int iPairDistType, char *pcDistmatInfile, char *pcDistmatOutfile,
745 int iClusteringType, char *pcGuidetreeInfile, char *pcGuidetreeOutfile,
746 bool bUseMbed)
747 {
748 /* pairwise distance matrix (tmat in 1.83) */
749 symmatrix_t *distmat = NULL;
750 /* guide tree */
751 tree_t *prTree = NULL;
752 int i = 0;
753
754
755 /* Shortcut for only two sequences: Do not compute k-tuple
756 * distances. Use the same logic as in TraverseTree() to setup
757 * piOrderLR_p. Changes there will have to be reflected here as
758 * well. */
759 if (2==prMSeq->nseqs) {
760 Log(&rLog, LOG_VERBOSE,
761 "Have only two sequences: No need to compute pairwise score and compute a tree.");
762
763 (*piOrderLR_p) = (int*) CKMALLOC(DIFF_NODE * 3 * sizeof(int));
764 (*piOrderLR_p)[DIFF_NODE*0+LEFT_NODE] = 0;
765 (*piOrderLR_p)[DIFF_NODE*0+RGHT_NODE] = 0;
766 (*piOrderLR_p)[DIFF_NODE*0+PRNT_NODE] = 0;
767
768 (*piOrderLR_p)[DIFF_NODE*1+LEFT_NODE] = 1;
769 (*piOrderLR_p)[DIFF_NODE*1+RGHT_NODE] = 1;
770 (*piOrderLR_p)[DIFF_NODE*1+PRNT_NODE] = 1;
771
772 /* root */
773 (*piOrderLR_p)[DIFF_NODE*2+LEFT_NODE] = 0;
774 (*piOrderLR_p)[DIFF_NODE*2+RGHT_NODE] = 1;
775 (*piOrderLR_p)[DIFF_NODE*2+PRNT_NODE] = 2;
776
777 /* Same logic as CalcClustalWeights(). Changes there will
778 have to be reflected here as well. */
779 #if USE_WEIGHTS
780 (*pdWeights_p) = (double *) CKMALLOC(uNodeCount * sizeof(double));
781 (*pdWeights_p)[0] = 0.5;
782 (*pdWeights_p)[1] = 0.5;
783 #endif
784
785 return OK;
786 }
787
788
789 /* compute distance & guide tree, alternatively read distances or
790 * guide tree from file
791 *
792 */
793 if (NULL != pcGuidetreeInfile) {
794 Log(&rLog, LOG_INFO, "Reading guide-tree from %s", pcGuidetreeInfile);
795 if (GuideTreeFromFile(&prTree, prMSeq, pcGuidetreeInfile)) {
796 Log(&rLog, LOG_ERROR, "Reading of guide tree %s failed.", pcGuidetreeInfile);
797 return FAILURE;
798 }
799
800 } else {
801
802 if (bUseMbed) {
803 if (Mbed(&prTree, prMSeq, iPairDistType, pcGuidetreeOutfile)) {
804 Log(&rLog, LOG_ERROR, "mbed execution failed.");
805 return FAILURE;
806 }
807 Log(&rLog, LOG_INFO, "Guide-tree computation (mBed) done.");
808 if (NULL != pcDistmatOutfile) {
809 Log(&rLog, LOG_INFO,
810 "Ignoring request to write distance matrix (am in mBed mode)");
811 }
812 } else {
813
814 if (PairDistances(&distmat, prMSeq, iPairDistType,
815 0, prMSeq->nseqs, 0, prMSeq->nseqs,
816 pcDistmatInfile, pcDistmatOutfile)) {
817 Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
818 return FAILURE;
819 }
820
821 /* clustering of distances to get guide tree
822 */
823 if (CLUSTERING_UPGMA == iClusteringType) {
824 char **labels;
825 labels = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char*));
826 for (i=0; i<prMSeq->nseqs; i++) {
827 labels[i] = prMSeq->sqinfo[i].name;
828 }
829
830 GuideTreeUpgma(&prTree, labels, distmat, pcGuidetreeOutfile);
831 Log(&rLog, LOG_INFO, "Guide-tree computation done.");
832
833 CKFREE(labels);
834 } else {
835 Log(&rLog, LOG_FATAL, "INTERNAL ERROR %s",
836 "clustering method should have been checked before");
837 }
838 }
839 }
840
841 #if USE_WEIGHTS
842 /* derive sequence weights from tree
843 *
844 */
845 Log(&rLog, LOG_INFO, "Calculating sequence weights");
846 CalcClustalWeights(pdSeqWeights_p, prTree);
847 for (i = 0; i < GetLeafCount(prTree); i++) {
848 Log(&rLog, LOG_VERBOSE,
849 "Weight for seq no %d: %s = %f",
850 i, prMSeq->sqinfo[i].name, (*pdSeqWeights_p)[i]);
851 }
852 #else
853 Log(&rLog, LOG_DEBUG, "Not using weights");
854 #endif
855
856
857 /* define traversing order of tree
858 *
859 */
860 TraverseTree(piOrderLR_p, prTree, prMSeq);
861 if (rLog.iLogLevelEnabled <= LOG_DEBUG) {
862 /* FIXME: debug only, FS */
863 uint uNodeIndex;
864 FILE *fp = LogGetFP(&rLog, LOG_INFO);
865 Log(&rLog, LOG_DEBUG, "left/right order after tree traversal");
866 for (uNodeIndex = 0; uNodeIndex < GetNodeCount(prTree); uNodeIndex++) {
867 fprintf(fp, "%3d:\t%2d/%2d -> %d\n", i,
868 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+LEFT_NODE],
869 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+RGHT_NODE],
870 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+PRNT_NODE]);
871 }
872 }
873
874 FreeMuscleTree(prTree);
875 FreeSymMatrix(&distmat);
876
877 #if 0
878 Log(&rLog, LOG_FATAL, "DEBUG EXIT before leaving %s", __FUNCTION__);
879 #endif
880 return OK;
881 }
882 /* end of AlignmentOrder() */
883
884
885
886 /**
887 * @brief Set some options automatically based on number of sequences. Might
888 * overwrite some user-set options.
889 *
890 * @param[out] prOpts
891 * Pointer to alignment options structure
892 * @param[in] iNumSeq
893 * Number of sequences to align
894 */
895 void
896 SetAutoOptions(opts_t *prOpts, int iNumSeq) {
897
898 Log(&rLog, LOG_INFO,
899 "Setting options automatically based on input sequence characteristics (might overwrite some of your options).");
900
901 if (iNumSeq >= 10000) {
902 if (0 != prOpts->iNumIterations) {
903 Log(&rLog, LOG_INFO, "Auto settings: Disabling iterations.");
904 prOpts->iNumIterations = 0;
905 }
906 if (FALSE == prOpts->bUseMbed) {
907 Log(&rLog, LOG_INFO, "Auto settings: Enabling mBed.");
908 prOpts->bUseMbed = TRUE;
909 }
910
911 } else if (iNumSeq >= 1000) {
912 if (1 != prOpts->iNumIterations) {
913 Log(&rLog, LOG_INFO, "Auto settings: Setting iteration to 1.");
914 prOpts->iNumIterations = 1;
915 }
916 if (FALSE == prOpts->bUseMbed) {
917 Log(&rLog, LOG_INFO, "Auto settings: Enabling mBed.");
918 prOpts->bUseMbed = TRUE;
919 }
920
921
922 } else {
923 if (0 != prOpts->iNumIterations) {
924 Log(&rLog, LOG_INFO, "Auto settings: Disabling iterations.");
925 prOpts->iNumIterations = 0;
926 }
927 if (TRUE == prOpts->bUseMbed) {
928 Log(&rLog, LOG_INFO, "Auto settings: Disabling mBed.");
929 prOpts->bUseMbed = FALSE;
930 }
931 }
932 }
933 /* end of */
934
935
936
937 /**
938 * @brief The main alignment function which wraps everything else.
939 *
940 * @param[out] prMSeq
941 * *the* multiple sequences structure
942 * @param[in] prMSeqProfile
943 * optional profile to align against
944 * @param[in] prOpts
945 * alignmemnt options to use
946 *
947 * @return 0 on success, -1 on failure
948 *
949 */
950 int
951 Align(mseq_t *prMSeq,
952 mseq_t *prMSeqProfile,
953 opts_t *prOpts,
954 hhalign_para rHhalignPara) {
955
956 /* HMM
957 */
958 /* structs with pseudocounts etc; one for each HMM infile, i.e.
959 * index range: 0..iHMMInputFiles */
960 hmm_light *prHMMs = NULL;
961
962 /* MSA order in which nodes/profiles are to be merged/aligned
963 (order of nodes in guide tree (left/right)*/
964 int *piOrderLR = NULL;
965
966 /* weights per sequence */
967 double *pdSeqWeights = NULL;
968
969 /* Iteration
970 */
971 int iIterationCounter = 0;
972 double dAlnScore;
973 /* last dAlnScore for iteration */
974 double dLastAlnScore = -666.666;
975
976 int i, j; /* aux */
977
978 assert(NULL != prMSeq);
979 if (NULL != prMSeqProfile) {
980 assert(TRUE == prMSeqProfile->aligned);
981 }
982
983
984 /* automatic setting of options
985 *
986 */
987 if (prOpts->bAutoOptions) {
988 SetAutoOptions(prOpts, prMSeq->nseqs);
989 }
990
991
992 #if SHUFFLE_INPUT_SEQ_ORDER
993 /*
994 * shuffle input: only useful for testing/debugging
995 */
996 Log(&rLog, LOG_WARN, "Shuffling input sequences! (Will also change output order)");
997 ShuffleMSeq(prMSeq);
998 #endif
999
1000
1001 #if SORT_INPUT_SEQS
1002 /*
1003 * sort input:
1004 *
1005 * would ensure we *always* (unless we get into the mbed k-means stage)
1006 * get the same answer. usually you don't, because most pairwise alignment
1007 * scores are in theory not symmetric, therefore sequence ordering might
1008 * have an effect on the guide-tree. Sorting by length should get rid of
1009 * this (and takes no time even for 100k seqs). Benchmark results on
1010 * Balibase show almost no difference after sorting.
1011 */
1012 Log(&rLog, LOG_WARN, "Sorting input seq by length! This will also change the output order");
1013 SortMSeqByLength(prMSeq, 'd');
1014
1015 #endif
1016
1017
1018 /* Read backgrounds HMMs and store in prHMMs
1019 *
1020 */
1021 if (0 < prOpts->iHMMInputFiles) {
1022 int iHMMInfileIndex;
1023
1024 /**
1025 * @warning old structure used to be initialised like this:
1026 * hmm_light rHMM = {0};
1027 */
1028 prHMMs = (hmm_light *) CKMALLOC(prOpts->iHMMInputFiles * sizeof(hmm_light));
1029
1030 for (iHMMInfileIndex=0; iHMMInfileIndex<prOpts->iHMMInputFiles; iHMMInfileIndex++) {
1031 char *pcHMMInput = prOpts->ppcHMMInput[iHMMInfileIndex];
1032 if (OK != readHMMWrapper(&prHMMs[iHMMInfileIndex], pcHMMInput)){
1033 Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMMInput);
1034 return -1;
1035 }
1036
1037 #if 0
1038 Log(&rLog, LOG_FORCED_DEBUG, "HMM length is %d", prHMMs[iHMMInfileIndex].L);
1039 Log(&rLog, LOG_FORCED_DEBUG, "n-display is %d", prHMMs[iHMMInfileIndex].n_display);
1040 for (i = 0; NULL != prHMMs[prOpts->iHMMInputFiles].seq[i]; i++){
1041 printf("seq[%d]: %s\n", i, prHMMs[iHMMInfileIndex].seq[i]);
1042 }
1043 Log(&rLog, LOG_FORCED_DEBUG, "Neff_HMM is %f", prHMMs[iHMMInfileIndex].Neff_HMM);
1044 #endif
1045 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
1046 Log(&rLog, LOG_DEBUG, "print frequencies");
1047 for (i = 0; i < prHMMs[iHMMInfileIndex].L; i++){
1048 #define PRINT_TAIL 5
1049 if ( (PRINT_TAIL+1 == i) && (prHMMs[iHMMInfileIndex].L-PRINT_TAIL != i) ){
1050 printf("....\n");
1051 }
1052 if ( (i > PRINT_TAIL) && (i < prHMMs[iHMMInfileIndex].L-PRINT_TAIL) ){
1053 continue;
1054 }
1055 printf("%3d:", i);
1056 for (j = 0; j < 20; j++){
1057 printf("\t%1.3f", prHMMs[iHMMInfileIndex].f[i][j]);
1058 }
1059 printf("\n");
1060 }
1061 } /* debug print block */
1062
1063 CKFREE(prOpts->ppcHMMInput[iHMMInfileIndex]);
1064 } /* for each background HMM file */
1065 CKFREE(prOpts->ppcHMMInput);
1066 } /* there were background HMM files */
1067
1068
1069
1070 /* If the input ("non-profile") sequences are aligned, then turn
1071 * the alignment into a HMM and add to the list of background HMMs
1072 *
1073 */
1074 if (TRUE == prMSeq->aligned) {
1075 /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */
1076 hmm_light rHMMLocal = {0};
1077
1078 Log(&rLog, LOG_INFO,
1079 "Input sequences are aligned. Will turn alignment into HMM and add it to the user provided background HMMs.");
1080 if (OK !=
1081 #if INDIRECT_HMM
1082 AlnToHMM(&rHMMLocal, prMSeq)
1083 #else
1084 AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs)
1085 #endif
1086 ) {
1087 Log(&rLog, LOG_ERROR, "Couldn't convert aligned input sequences to HMM. Will try to continue");
1088 } else {
1089 prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light)));
1090 memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light));
1091 prOpts->iHMMInputFiles++;
1092 }
1093 }
1094
1095
1096 /* If we have a profile turn it into a HMM and add to
1097 * the list of background HMMs.
1098 *
1099 */
1100 if (NULL != prMSeqProfile) {
1101 /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */
1102 hmm_light rHMMLocal = {0};
1103 Log(&rLog, LOG_INFO,
1104 "Turning profile1 into HMM and will use it during progressive alignment.");
1105 if (OK !=
1106 #if INDIRECT_HMM
1107 AlnToHMM(&rHMMLocal, prMSeqProfile)
1108 #else
1109 AlnToHMM2(&rHMMLocal, prMSeqProfile->seq, prMSeqProfile->nseqs)
1110 #endif
1111 ) {
1112 Log(&rLog, LOG_ERROR, "Couldn't convert profile1 to HMM. Will try to continue");
1113 } else {
1114 prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light)));
1115 memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light));
1116 prOpts->iHMMInputFiles++;
1117 }
1118 }
1119
1120
1121 /* Now do a first alignment of the input sequences (prMSeq) adding
1122 * all collected background HMMs
1123 *
1124 */
1125 /* Determine progressive alignment order
1126 */
1127 if (TRUE == prMSeq->aligned) {
1128 Log(&rLog, LOG_INFO, "%s %s",
1129 "Input sequences are aligned.",
1130 "Will use Kimura distances of aligned sequences.");
1131 prOpts->iPairDistType = PAIRDIST_SQUIDID_KIMURA;
1132 }
1133
1134 #if 0
1135 Log(&rLog, LOG_WARN, "Using a sequential alignment order.");
1136 SequentialAlignmentOrder(&piOrderLR, prMSeq->nseqs);
1137 #else
1138 if (OK != AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq,
1139 prOpts->iPairDistType,
1140 prOpts->pcDistmatInfile, prOpts->pcDistmatOutfile,
1141 prOpts->iClusteringType,
1142 prOpts->pcGuidetreeInfile, prOpts->pcGuidetreeOutfile,
1143 prOpts->bUseMbed)) {
1144 Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue");
1145 return -1;
1146 }
1147 #endif
1148
1149 /* Progressive alignment of input sequences. Order defined by
1150 * branching of guide tree (piOrderLR). Use optional
1151 * background HMM information (prHMMs[0..prOpts->iHMMInputFiles-1])
1152 *
1153 */
1154 dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights,
1155 2*prMSeq->nseqs -1/* nodes */,
1156 prHMMs, prOpts->iHMMInputFiles, -1, rHhalignPara);
1157 dLastAlnScore = dAlnScore;
1158 Log(&rLog, LOG_VERBOSE,
1159 "Alignment score for first alignment = %f", dAlnScore);
1160
1161
1162
1163
1164 /* ------------------------------------------------------------
1165 *
1166 * prMSeq is aligned now. Now start iterations if requested and save the
1167 * alignment at the very end.
1168 *
1169 * @note We discard the background HMM information at this point,
1170 * because it was already used. Could consider to make this choice
1171 * optional. FIXME
1172 *
1173 * ------------------------------------------------------------ */
1174
1175
1176 /* iteration after first alignment was computed (if not profile-profile
1177 * alignment)
1178 *
1179 */
1180 for (iIterationCounter=0;
1181 (iIterationCounter < prOpts->iNumIterations || prOpts->bIterationsAuto);
1182 iIterationCounter++) {
1183
1184 hmm_light rHMMLocal = {0};
1185 /* FIXME Keep copy of old alignment in case new one sucks? */
1186
1187
1188 if (iIterationCounter >= prOpts->iMaxHMMIterations
1189 &&
1190 iIterationCounter >= prOpts->iMaxGuidetreeIterations) {
1191 Log(&rLog, LOG_VERBOSE, "Reached maximum number of HMM and guide-tree iterations");
1192 break;
1193 }
1194
1195 if (! prOpts->bIterationsAuto) {
1196 Log(&rLog, LOG_INFO, "Iteration step %d out of %d",
1197 iIterationCounter+1, prOpts->iNumIterations);
1198 } else {
1199 Log(&rLog, LOG_INFO, "Iteration step %d out of <auto>",
1200 iIterationCounter+1);
1201 }
1202 #if 0
1203 if (rLog.iLogLevelEnabled <= LOG_VERBOSE) {
1204 char zcIntermediate[1000] = {0};
1205 char *pcFormat = "fasta";
1206 sprintf(zcIntermediate, "clustalo-aln-iter~%d~", iIterationCounter);
1207 if (WriteAlignment(prMSeq, zcIntermediate, MSAFILE_A2M)) {
1208 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", zcIntermediate);
1209 return -1;
1210 }
1211 }
1212 #endif
1213
1214
1215 /* new guide-tree
1216 *
1217 */
1218 if (iIterationCounter < prOpts->iMaxGuidetreeIterations) {
1219 /* determine progressive alignment order
1220 *
1221 * few things are different now when calling AlignmentOrder:
1222 * - we have to ignore prOpts->pcDistmatInfile and pcGuidetreeInfile
1223 * as they were used before
1224 * - the corresponding outfiles are still valid though
1225 */
1226 /* Free stuff that has already been allocated by or further
1227 * downstream of AlignmentOrder()
1228 */
1229 if (NULL != piOrderLR)
1230 CKFREE(piOrderLR);
1231 if (NULL != pdSeqWeights)
1232 CKFREE(pdSeqWeights);
1233 if (AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq,
1234 PAIRDIST_SQUIDID_KIMURA /* override */, NULL, prOpts->pcDistmatOutfile,
1235 prOpts->iClusteringType, NULL, prOpts->pcGuidetreeOutfile,
1236 prOpts->bUseMbedForIteration)) {
1237 Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue");
1238 return -1;
1239 }
1240 } else {
1241 Log(&rLog, LOG_INFO, "Skipping guide-tree iteration at iteration step %d (reached maximum)",
1242 iIterationCounter);
1243 }
1244
1245
1246 /* new local hmm iteration
1247 *
1248 */
1249 if (iIterationCounter < prOpts->iMaxHMMIterations) {
1250 if (OK !=
1251 #if INDIRECT_HMM
1252 AlnToHMM(&rHMMLocal, prMSeq)
1253 #else
1254 AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs)
1255 #endif
1256 ) {
1257 Log(&rLog, LOG_ERROR, "Couldn't convert alignment to HMM. Will stop iterating now...");
1258 break;
1259 }
1260 } else {
1261 Log(&rLog, LOG_INFO, "Skipping HMM iteration at iteration step %d (reached maximum)",
1262 iIterationCounter);
1263 }
1264
1265
1266 /* align the sequences (again)
1267 */
1268 dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights,
1269 2*prMSeq->nseqs -1/* nodes */, &rHMMLocal, 1, -1, rHhalignPara);
1270 Log(&rLog, LOG_VERBOSE,
1271 "Alignment score for alignmnent in hmm-iteration no %d = %f (last score = %f)",
1272 iIterationCounter+1, dAlnScore, dLastAlnScore);
1273
1274
1275 FreeHMMstruct(&rHMMLocal);
1276
1277 #if 0
1278 /* FIXME: need a better score for automatic iteration */
1279 if (prOpts->bIterationsAuto) {
1280 /* automatic iteration: break if score improvement was not
1281 * big enough
1282 */
1283 double dScoreImprovement = (dAlnScore-dLastAlnScore)/dLastAlnScore;
1284 if (dScoreImprovement < ITERATION_SCORE_IMPROVEMENT_THRESHOLD) {
1285 Log(&rLog, LOG_INFO,
1286 "Stopping after %d guide-tree iterations. No further alignment score improvement achieved.",
1287 iIterationCounter+1);
1288 /* use previous alignment */
1289 FreeMSeq(&prMSeq);
1290 Log(&rLog, LOG_FORCED_DEBUG, "FIXME: %s", "CopyMSeq breaks things in this context");
1291 CopyMSeq(&prMSeq, prMSeqCopy);
1292 /* FIXME: prOpts->pcDistmatOutfile and pcGuidetreeOutfile
1293 * might have been updated, but then discarded here?
1294 */
1295 break;
1296 } else {
1297 Log(&rLog, LOG_INFO,
1298 "Got a %d%% better score in iteration step %d",
1299 (int)dScoreImprovement*100, iIterationCounter+1);
1300 FreeMSeq(&prMSeqCopy);
1301 }
1302 }
1303 dLastAlnScore = dAlnScore;
1304 #endif
1305
1306 }
1307 /* end of iterations */
1308
1309
1310
1311 /* Last step: if a profile was also provided then align now-aligned mseq
1312 * with this profile
1313 *
1314 * Don't use the backgrounds HMMs anymore and don't iterate.
1315 * (which was done before).
1316 *
1317 */
1318 if (NULL != prMSeqProfile) {
1319 if (AlignProfiles(prMSeq, prMSeqProfile, rHhalignPara)) {
1320 Log(&rLog, LOG_ERROR, "An error occured during the profile/profile alignment");
1321 return -1;
1322 }
1323 }
1324
1325
1326 if (NULL != piOrderLR) {
1327 CKFREE(piOrderLR);
1328 }
1329 if (NULL != pdSeqWeights) {
1330 CKFREE(pdSeqWeights);
1331 }
1332 if (0 < prOpts->iHMMInputFiles) {
1333 for (i=0; i<prOpts->iHMMInputFiles; i++) {
1334 FreeHMMstruct(&prHMMs[i]);
1335 }
1336 CKFREE(prHMMs);
1337 }
1338
1339 return 0;
1340 }
1341 /* end of Align() */
1342
1343
1344
1345
1346 /**
1347 * @brief Align two profiles, ie two sets of prealigned sequences. Already
1348 * aligned columns won't be changed.
1349 *
1350 * @param[out] prMSeqProfile1
1351 * First profile/aligned set of sequences. Merged alignment will be found in
1352 * here.
1353 * @param[in] prMSeqProfile2
1354 * First profile/aligned set of sequences
1355 *
1356 * @return 0 on success, -1 on failure
1357 *
1358 */
1359 int
1360 AlignProfiles(mseq_t *prMSeqProfile1,
1361 mseq_t *prMSeqProfile2, hhalign_para rHhalignPara) {
1362
1363 double dAlnScore;
1364
1365 /* number of seqs in first half of joined profile */
1366 int iProfProfSeparator = prMSeqProfile1->nseqs;
1367
1368 assert(TRUE == prMSeqProfile1->aligned);
1369 assert(TRUE == prMSeqProfile2->aligned);
1370
1371 Log(&rLog, LOG_INFO, "Performing profile/profile alignment");
1372
1373 /* Combine the available mseqs into prMSeq
1374 * which will be aligned afterwards.
1375 */
1376 JoinMSeqs(&prMSeqProfile1, prMSeqProfile2);
1377
1378
1379 /* set alignment flag explicitly to FALSE */
1380 prMSeqProfile1->aligned = FALSE;
1381
1382 dAlnScore = HHalignWrapper(prMSeqProfile1,
1383 NULL, /* no order */
1384 NULL, /* no weights */
1385 3, /* nodes: root+2profiles */
1386 NULL, 0 /* no bg-hmms */,
1387 iProfProfSeparator, rHhalignPara);
1388
1389 Log(&rLog, LOG_VERBOSE, "Alignment score is = %f", dAlnScore);
1390
1391 return 0;
1392 }
1393 /* end of AlignProfiles() */
1394
1395