1
|
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
|
|
2
|
|
3 /*********************************************************************
|
|
4 * Clustal Omega - Multiple sequence alignment
|
|
5 *
|
|
6 * Copyright (C) 2010 University College Dublin
|
|
7 *
|
|
8 * Clustal-Omega is free software; you can redistribute it and/or
|
|
9 * modify it under the terms of the GNU General Public License as
|
|
10 * published by the Free Software Foundation; either version 2 of the
|
|
11 * License, or (at your option) any later version.
|
|
12 *
|
|
13 * This file is part of Clustal-Omega.
|
|
14 *
|
|
15 ********************************************************************/
|
|
16
|
|
17 /*
|
|
18 * RCS $Id: clustal-omega.c 254 2011-06-21 13:07:50Z andreas $
|
|
19 */
|
|
20
|
|
21 #ifdef HAVE_CONFIG_H
|
|
22 #include "config.h"
|
|
23 #endif
|
|
24
|
|
25 #include <assert.h>
|
|
26
|
|
27 #include "clustal-omega.h"
|
|
28 #include "hhalign/general.h"
|
|
29
|
|
30 /* The following comment block contains the frontpage/mainpage of the doxygen
|
|
31 * documentation. Please add some more info. FIXME add more
|
|
32 */
|
|
33
|
|
34 /**
|
|
35 *
|
|
36 * @mainpage Clustal-Omega Documentation
|
|
37 *
|
|
38 * @section intro_sec Introduction
|
|
39 *
|
|
40 * For more information see http://www.clustal.org/
|
|
41 *
|
|
42 * @section api_section API
|
|
43 *
|
|
44 * @subsection example_prog_subsection An Example Program
|
|
45 *
|
|
46 * To use libclustalo you will have to include the clustal-omega.h header and
|
|
47 * link against libclustalo. For linking against libclustalo you will have to
|
|
48 * use a C++ compiler, no matter if your program was written in C or C++.
|
|
49 *
|
|
50 * First compile (no linking) your source (for an example see section "\ref
|
|
51 * example_src_subsubsec"):
|
|
52 *
|
|
53 * @code
|
|
54 * $ gcc -c -ansi -Wall clustalo-api-test.c
|
|
55 * @endcode
|
|
56 *
|
|
57 * Then link against libclustalo (we recommend the use of pkg-config as
|
|
58 * explained in \ref pkgconfig_subsubsec). Assuming Clustal Omega was installed
|
|
59 * in system-wide default directory (e.g. /usr) just type:
|
|
60 *
|
|
61 * @code
|
|
62 * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o -lclustalo
|
|
63 * @endcode
|
|
64 *
|
|
65 * Voila! Now you have your own alignment program which can be run with
|
|
66 *
|
|
67 * @code
|
|
68 * $ ./clustalo-api-test <your-sequence-input>
|
|
69 * @endcode
|
|
70 *
|
|
71 * It's best to use the same compiler that you used for compiling libclustal.
|
|
72 * If libclustal was compiled with OpenMP support, you will have to use OpenMP
|
|
73 * flags for you program as well.
|
|
74 *
|
|
75 *
|
|
76 * @subsubsection pkgconfig_subsubsec Using pkg-config / Figuring out compiler flags
|
|
77 *
|
|
78 * Clustal Omega comes with support for <a
|
|
79 * href="http://pkg-config.freedesktop.org">pkg-config</a>, which means you
|
|
80 * can run
|
|
81 *
|
|
82 * @code
|
|
83 * $ pkg-config --cflags --libs clustalo
|
|
84 * @endcode
|
|
85 *
|
|
86 * to figure out cflags and library flags needed to compile and link against
|
|
87 * libclustalo. This is especially handy if Clustal Omega was installed to a
|
|
88 * non-standard directory.
|
|
89 *
|
|
90 * You might have to change PKG_CONFIG_PATH. For example, if you used the prefix $HOME/local/ for
|
|
91 * installation then you will first need to set PKG_CONFIG_PATH:
|
|
92 *
|
|
93 * @code
|
|
94 * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig
|
|
95 * $ pkg-config --cflags --libs clustalo
|
|
96 * @endcode
|
|
97 *
|
|
98 *
|
|
99 * To compile your source use:
|
|
100 *
|
|
101 * @code
|
|
102 * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig
|
|
103 * $ gcc -c -ansi -Wall clustalo-api-test.c $(pkg-config --cflags clustalo)
|
|
104 * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o $(pkg-config --libs clustalo)
|
|
105 * @endcode
|
|
106 *
|
|
107 *
|
|
108 * @subsubsection example_src_subsubsec Example Source Code
|
|
109 *
|
|
110 * @include "clustalo-api-test.c"
|
|
111 *
|
|
112 *
|
|
113 */
|
|
114
|
|
115
|
|
116
|
|
117
|
|
118 /* FIXME: doc */
|
|
119 /* the following are temporary flags while the code is still under construction;
|
|
120 had problems internalising hhmake, so as temporary crutch
|
|
121 write alignment to file and get external hmmer/hhmake via system call
|
|
122 to read alignment and convert into HMM
|
|
123 All this will go, once hhmake is properly internalised */
|
|
124 #define INDIRECT_HMM 0 /* temp flag: (1) write aln to file, use system(hmmer/hhmake), (0) internal hhmake */
|
|
125 #define USEHMMER 1 /* temp flag: use system(hmmer) to build HMM */
|
|
126 #define USEHHMAKE (!USEHMMER) /* temp flag: use system(hhmake) to build HMM */
|
|
127
|
|
128
|
|
129 /* shuffle order of input sequences */
|
|
130 #define SHUFFLE_INPUT_SEQ_ORDER 0
|
|
131
|
|
132 /* sort input sequences by length */
|
|
133 #define SORT_INPUT_SEQS 0
|
|
134
|
|
135
|
|
136 int iNumberOfThreads;
|
|
137
|
|
138 /* broken, unused and lonely */
|
|
139 static const int ITERATION_SCORE_IMPROVEMENT_THRESHOLD = 0.01;
|
|
140
|
|
141
|
|
142 /**
|
|
143 * @brief Print Long version information to pre-allocated char.
|
|
144 *
|
|
145 * @note short version
|
|
146 * information is equivalent to PACKAGE_VERSION
|
|
147 *
|
|
148 * @param[out] pcStr
|
|
149 * char pointer to write to preallocated to hold iSize chars.
|
|
150 * @param[in] iSize
|
|
151 * size of pcStr
|
|
152 */
|
|
153 void
|
|
154 PrintLongVersion(char *pcStr, int iSize)
|
|
155 {
|
|
156 snprintf(pcStr, iSize, "version %s; code-name '%s'; build date %s",
|
|
157 PACKAGE_VERSION, PACKAGE_CODENAME, __DATE__);
|
|
158 }
|
|
159 /* end of PrintLongVersion() */
|
|
160
|
|
161
|
|
162
|
|
163 /**
|
|
164 * @brief free aln opts members
|
|
165 *
|
|
166 */
|
|
167 void
|
|
168 FreeAlnOpts(opts_t *prAlnOpts) {
|
|
169 if (NULL != prAlnOpts->pcGuidetreeInfile) {
|
|
170 CKFREE(prAlnOpts->pcGuidetreeInfile);
|
|
171 }
|
|
172 if (NULL != prAlnOpts->pcGuidetreeOutfile) {
|
|
173 CKFREE(prAlnOpts->pcGuidetreeOutfile);
|
|
174 }
|
|
175 if (NULL != prAlnOpts->pcDistmatOutfile) {
|
|
176 CKFREE(prAlnOpts->pcDistmatOutfile);
|
|
177 }
|
|
178 if (NULL != prAlnOpts->pcDistmatInfile) {
|
|
179 CKFREE(prAlnOpts->pcDistmatInfile);
|
|
180 }
|
|
181 }
|
|
182 /* end of FreeAlnOpts() */
|
|
183
|
|
184
|
|
185
|
|
186 /**
|
|
187 * @brief Sets members of given user opts struct to default values
|
|
188 *
|
|
189 * @param[out] prOpts
|
|
190 * User opt struct to initialise
|
|
191 *
|
|
192 */
|
|
193 void
|
|
194 SetDefaultAlnOpts(opts_t *prOpts) {
|
|
195 prOpts->bAutoOptions = FALSE;
|
|
196
|
|
197 prOpts->pcDistmatInfile = NULL;
|
|
198 prOpts->pcDistmatOutfile = NULL;
|
|
199
|
|
200 prOpts->iClusteringType = CLUSTERING_UPGMA;
|
|
201 prOpts->iPairDistType = PAIRDIST_KTUPLE;
|
|
202 prOpts->bUseMbed = TRUE; /* FS, r250 -> */
|
|
203 prOpts->bUseMbedForIteration = TRUE; /* FS, r250 -> */
|
|
204 prOpts->pcGuidetreeOutfile = NULL;
|
|
205 prOpts->pcGuidetreeInfile = NULL;
|
|
206
|
|
207 prOpts->ppcHMMInput = NULL;
|
|
208 prOpts->iHMMInputFiles = 0;
|
|
209
|
|
210 prOpts->iNumIterations = 0;
|
|
211 prOpts->bIterationsAuto = FALSE;
|
|
212 prOpts->iMaxGuidetreeIterations = INT_MAX;
|
|
213 prOpts->iMaxHMMIterations = INT_MAX;
|
|
214 prOpts->iMacRam = 2048; /* give 2GB to MAC algorithm. FS, r240 -> r241 */
|
|
215 }
|
|
216 /* end of SetDefaultAlnOpts() */
|
|
217
|
|
218
|
|
219
|
|
220 /**
|
|
221 * @brief Check logic of parsed user options. Will exit (call Log(&rLog, LOG_FATAL, ))
|
|
222 * on Fatal logic error
|
|
223 *
|
|
224 * @param[in] prOpts
|
|
225 * Already parsed user options
|
|
226 *
|
|
227 */
|
|
228 void
|
|
229 AlnOptsLogicCheck(opts_t *prOpts)
|
|
230 {
|
|
231 /* guide-tree & distmat
|
|
232 *
|
|
233 */
|
|
234 if (prOpts->pcDistmatInfile && prOpts->pcGuidetreeInfile) {
|
|
235 Log(&rLog, LOG_FATAL, "Read distances *and* guide-tree from file doesn't make sense.");
|
|
236 }
|
|
237
|
|
238 if (prOpts->pcDistmatOutfile && prOpts->pcGuidetreeInfile) {
|
|
239 Log(&rLog, LOG_FATAL, "Won't be able to save distances to file, because I got a guide-tree as input.");
|
|
240 }
|
|
241
|
|
242 /* combination of options that don't make sense when not iterating
|
|
243 */
|
|
244 if (prOpts->iNumIterations==0 && prOpts->bIterationsAuto != TRUE) {
|
|
245
|
|
246 if (prOpts->pcGuidetreeInfile && prOpts->pcGuidetreeOutfile) {
|
|
247 Log(&rLog, LOG_FATAL, "Got a guide-tree as input and output which doesn't make sense when not iterating.");
|
|
248 }
|
|
249 /*
|
|
250 if (prOpts->pcGuidetreeInfile && prOpts->bUseMbed > 0) {
|
|
251 Log(&rLog, LOG_FATAL, "Got a guide-tree as input and was requested to cluster with mBed, which doesn't make sense when not iterating.");
|
|
252 }
|
|
253 */
|
|
254 /*
|
|
255 AW: bUseMbedForIteration default since at least R252
|
|
256 if (prOpts->bUseMbedForIteration > 0) {
|
|
257 Log(&rLog, LOG_FATAL, "No iteration requested, but mbed for iteration was set. Paranoia exit.");
|
|
258 }
|
|
259 */
|
|
260 }
|
|
261
|
|
262 if (prOpts->iMacRam < 512) {
|
|
263
|
|
264 Log(&rLog, LOG_INFO, "Memory for MAC Algorithm quite low, Viterbi Algorithm may be triggered.");
|
|
265
|
|
266 if (prOpts->iMacRam < 1) {
|
|
267
|
|
268 Log(&rLog, LOG_WARN, "Viterbi Algorithm always turned on, increase MAC-RAM to turn on MAC.");
|
|
269 }
|
|
270 }
|
|
271
|
|
272 return;
|
|
273 }
|
|
274 /* end of AlnOptsLogicCheck() */
|
|
275
|
|
276
|
|
277 /**
|
|
278 * @brief FIXME doc
|
|
279 */
|
|
280 void
|
|
281 PrintAlnOpts(FILE *prFile, opts_t *prOpts)
|
|
282 {
|
|
283 int iAux;
|
|
284
|
|
285
|
|
286 /* keep in same order as struct */
|
|
287 fprintf(prFile, "option: auto-options = %d\n", prOpts->bAutoOptions);
|
|
288 fprintf(prFile, "option: distmat-infile = %s\n",
|
|
289 NULL != prOpts->pcDistmatInfile? prOpts->pcDistmatInfile: "(null)");
|
|
290 fprintf(prFile, "option: distmat-outfile = %s\n",
|
|
291 NULL != prOpts->pcDistmatOutfile? prOpts->pcDistmatOutfile: "(null)");
|
|
292 fprintf(prFile, "option: clustering-type = %d\n", prOpts->iClusteringType);
|
|
293 fprintf(prFile, "option: pair-dist-type = %d\n", prOpts->iPairDistType);
|
|
294 fprintf(prFile, "option: use-mbed = %d\n", prOpts->bUseMbed);
|
|
295 fprintf(prFile, "option: use-mbed-for-iteration = %d\n", prOpts->bUseMbedForIteration);
|
|
296 fprintf(prFile, "option: guidetree-outfile = %s\n",
|
|
297 NULL != prOpts->pcGuidetreeOutfile? prOpts->pcGuidetreeOutfile: "(null)");
|
|
298 fprintf(prFile, "option: guidetree-infile = %s\n",
|
|
299 NULL != prOpts->pcGuidetreeInfile? prOpts->pcGuidetreeInfile: "(null)");
|
|
300 for (iAux=0; iAux<prOpts->iHMMInputFiles; iAux++) {
|
|
301 fprintf(prFile, "option: hmm-input no %d = %s\n", iAux, prOpts->ppcHMMInput[iAux]);
|
|
302 }
|
|
303 fprintf(prFile, "option: hmm-input-files = %d\n", prOpts->iHMMInputFiles);
|
|
304 fprintf(prFile, "option: num-iterations = %d\n", prOpts->iNumIterations);
|
|
305 fprintf(prFile, "option: iterations-auto = %d\n", prOpts->bIterationsAuto);
|
|
306 fprintf(prFile, "option: max-hmm-iterations = %d\n", prOpts->iMaxHMMIterations);
|
|
307 fprintf(prFile, "option: max-guidetree-iterations = %d\n", prOpts->iMaxGuidetreeIterations);
|
|
308 }
|
|
309 /* end of PrintAlnOpts() */
|
|
310
|
|
311
|
|
312
|
|
313 /**
|
|
314 * @brief Returns major version of HMMER. Whichever hmmbuild version
|
|
315 * is found first in your PATH will be used
|
|
316 *
|
|
317 * @return -1 on error, major hmmer version otherwise
|
|
318 *
|
|
319 */
|
|
320 int
|
|
321 HmmerVersion()
|
|
322 {
|
|
323 char zcHmmerTestCall[] = "hmmbuild -h";
|
|
324 FILE *fp = NULL;
|
|
325 int iMajorVersion = 0;
|
|
326 char zcLine[16384];
|
|
327
|
|
328 if (NULL == (fp = popen(zcHmmerTestCall, "r"))) {
|
|
329 Log(&rLog, LOG_ERROR, "Couldn't exec %s", zcHmmerTestCall);
|
|
330 return -1;
|
|
331 }
|
|
332 while (fgets(zcLine, sizeof(zcLine), fp)) {
|
|
333 char *pcLocate;
|
|
334 if ((pcLocate = strstr(zcLine, "HMMER "))) {
|
|
335 iMajorVersion = atoi(&pcLocate[6]);
|
|
336 break;
|
|
337 }
|
|
338 }
|
|
339 pclose(fp);
|
|
340
|
|
341 return iMajorVersion;
|
|
342 }
|
|
343 /* end of HmmerVersion() */
|
|
344
|
|
345
|
|
346
|
|
347 /**
|
|
348 * @brief Create a HHM file from aligned sequences
|
|
349 *
|
|
350 * @warning Should be eliminated in the future
|
|
351 * as building routine should not create intermediate files
|
|
352 *
|
|
353 * @param[in] prMSeq
|
|
354 * Aligned mseq_t
|
|
355 * @param[in] pcHMMOut
|
|
356 * HMM output file name
|
|
357 *
|
|
358 * @return Non-zero on error
|
|
359 *
|
|
360 */
|
|
361 int
|
|
362 AlnToHHMFile(mseq_t *prMSeq, char *pcHMMOut)
|
|
363 {
|
|
364 char *tmp_aln = NULL;
|
|
365 int retcode = OK;
|
|
366
|
|
367 assert(NULL!=prMSeq);
|
|
368 assert(NULL!=pcHMMOut);
|
|
369
|
|
370 if (FALSE == prMSeq->aligned) {
|
|
371 Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM");
|
|
372 return FAILURE;
|
|
373 }
|
|
374
|
|
375 /* Convert alignment to a2m, and call hhmake
|
|
376 *
|
|
377 * can't be static templates, or mktemp fails (at least on os x
|
|
378 * (with a bus error))
|
|
379 *
|
|
380 * gcc says we should use mkstemp to avoid race conditions,
|
|
381 * but that returns a file descriptor, which is of no use to
|
|
382 * us
|
|
383 */
|
|
384 /* NOTE: the following won't work on windows: missing /tmp/ */
|
|
385 tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX");
|
|
386 if (NULL == mktemp(tmp_aln)) {
|
|
387 Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename");
|
|
388 retcode = FAILURE;
|
|
389 goto cleanup_and_return;
|
|
390 }
|
|
391 if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_A2M)) {
|
|
392 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln);
|
|
393 retcode = FAILURE;
|
|
394 goto cleanup_and_return;
|
|
395 }
|
|
396
|
|
397 if (HHMake_Wrapper(tmp_aln, pcHMMOut)){
|
|
398 Log(&rLog, LOG_ERROR, "Could not convert alignment %s into HHM", tmp_aln);
|
|
399 retcode = FAILURE;
|
|
400 goto cleanup_and_return;
|
|
401 }
|
|
402
|
|
403
|
|
404 cleanup_and_return:
|
|
405
|
|
406 if (NULL != tmp_aln) {
|
|
407 if (FileExists(tmp_aln)) {
|
|
408 if (remove(tmp_aln)) {
|
|
409 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln);
|
|
410 }
|
|
411 }
|
|
412 CKFREE(tmp_aln);
|
|
413 }
|
|
414
|
|
415 return retcode;
|
|
416
|
|
417 } /* end of AlnToHHMFile() */
|
|
418
|
|
419
|
|
420
|
|
421 /**
|
|
422 * @brief Create a HMM file from aligned sequences
|
|
423 *
|
|
424 * @warning Should be replaced in the future by some internal HMM
|
|
425 * building routine that does not call external programs
|
|
426 *
|
|
427 * @param[in] prMSeq
|
|
428 * Aligned mseq_t
|
|
429 * @param[in] pcHMMOut
|
|
430 * HMM output file name
|
|
431 *
|
|
432 * @return Non-zero on error
|
|
433 *
|
|
434
|
|
435 */
|
|
436 int
|
|
437 AlnToHMMFile(mseq_t *prMSeq, const char *pcHMMOut)
|
|
438 {
|
|
439 char *tmp_aln = NULL;
|
|
440 char *tmp_hmm = NULL; /* only needed for hmmer3 to hmmer2 conversion */
|
|
441 char cmdbuf[16384];
|
|
442 int iHmmerVersion = 0;
|
|
443 int retcode = OK;
|
|
444
|
|
445 assert(NULL!=prMSeq);
|
|
446 assert(NULL!=pcHMMOut);
|
|
447
|
|
448 if (FALSE == prMSeq->aligned) {
|
|
449 Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM");
|
|
450 return FAILURE;
|
|
451 }
|
|
452
|
|
453 iHmmerVersion = HmmerVersion();
|
|
454 if (2 != iHmmerVersion && 3 != iHmmerVersion) {
|
|
455 Log(&rLog, LOG_ERROR, "Could not find suitable HMMER binaries");
|
|
456 return FAILURE;
|
|
457 }
|
|
458
|
|
459 /* Convert alignment to stockholm, call hmmbuild and then
|
|
460 * either hmmconvert (hmmer3) or hmmcalibrate (hmmer2)
|
|
461 *
|
|
462 * can't be static templates, or mktemp fails (at least on os x
|
|
463 * (with a bus error))
|
|
464 *
|
|
465 * gcc says we should use mkstemp to avoid race conditions,
|
|
466 * but that returns a file descriptor, which is of no use to
|
|
467 * us
|
|
468 */
|
|
469 /* NOTE: the following won't work on windows: missing /tmp/ */
|
|
470 tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX");
|
|
471 if (NULL == mktemp(tmp_aln)) {
|
|
472 Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename");
|
|
473 retcode = FAILURE;
|
|
474 goto cleanup_and_return;
|
|
475 }
|
|
476 if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_STOCKHOLM)) {
|
|
477 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln);
|
|
478 retcode = FAILURE;
|
|
479 goto cleanup_and_return;
|
|
480 }
|
|
481
|
|
482 if (2 == iHmmerVersion) {
|
|
483 sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmcalibrate %s >/dev/null",
|
|
484 pcHMMOut, tmp_aln, pcHMMOut);
|
|
485 if (system(cmdbuf)) {
|
|
486 Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf);
|
|
487 retcode = FAILURE;
|
|
488 goto cleanup_and_return;
|
|
489 }
|
|
490 } else if (3 == iHmmerVersion) {
|
|
491 /* NOTE: the following won't work on windows: missing /tmp/ */
|
|
492 tmp_hmm = CkStrdup("/tmp/clustalo_tmphmm2_XXXXXX");
|
|
493 if (NULL == mktemp(tmp_hmm)) {
|
|
494 Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename");
|
|
495 retcode = FAILURE;
|
|
496 goto cleanup_and_return;
|
|
497 }
|
|
498 sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmconvert -2 %s > %s",
|
|
499 tmp_hmm, tmp_aln, tmp_hmm, pcHMMOut);
|
|
500 if (system(cmdbuf)) {
|
|
501 Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf);
|
|
502 retcode = FAILURE;
|
|
503 goto cleanup_and_return;
|
|
504 }
|
|
505 } else {
|
|
506 CKFREE(tmp_aln);
|
|
507 Log(&rLog, LOG_FATAL, "Internal error: Unknown Hmmer version %d", iHmmerVersion);
|
|
508 }
|
|
509
|
|
510
|
|
511 cleanup_and_return:
|
|
512
|
|
513 if (NULL != tmp_aln) {
|
|
514 if (FileExists(tmp_aln)) {
|
|
515 if (remove(tmp_aln)) {
|
|
516 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln);
|
|
517 }
|
|
518 }
|
|
519 CKFREE(tmp_aln);
|
|
520 }
|
|
521 if (NULL != tmp_hmm) {
|
|
522 if (FileExists(tmp_hmm)) {
|
|
523 if (remove(tmp_hmm)) {
|
|
524 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_hmm);
|
|
525 }
|
|
526 }
|
|
527 CKFREE(tmp_hmm);
|
|
528 }
|
|
529
|
|
530 return retcode;
|
|
531 }
|
|
532 /* end of AlnToHMMFile() */
|
|
533
|
|
534
|
|
535
|
|
536 /**
|
|
537 * @brief Convert a multiple sequence structure into a HMM
|
|
538 *
|
|
539 * @param[out] prHMM
|
|
540 * Pointer to preallocted HMM which will be set here
|
|
541 * @param[in] prMSeq
|
|
542 * Pointer to an alignment
|
|
543 *
|
|
544 * @return 0 on error, non-0 otherwise
|
|
545 *
|
|
546 * @see AlnToHMMFile()
|
|
547 *
|
|
548 */
|
|
549 int
|
|
550 AlnToHMM(hmm_light *prHMM, mseq_t *prMSeq)
|
|
551 {
|
|
552 char *pcHMM; /* temp hmm file */
|
|
553
|
|
554 Log(&rLog, LOG_INFO,
|
|
555 "Using HMMER version %d to calculate a new HMM.",
|
|
556 HmmerVersion());
|
|
557 /* FIXME replace all this with internal HMM computation (HHmake) */
|
|
558
|
|
559 /**
|
|
560 * @warning the following probably won't work on windows: missing
|
|
561 * /tmp/. Should be ok on Cygwin though
|
|
562 */
|
|
563 pcHMM = CkStrdup("/tmp/clustalo-hmm-iter_XXXXXX");
|
|
564 if (NULL == mktemp(pcHMM)) {
|
|
565 Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename");
|
|
566 CKFREE(pcHMM);
|
|
567 return FAILURE;
|
|
568 }
|
|
569
|
|
570 /* Create a HMM representing the current alignment
|
|
571 */
|
|
572 #if USEHMMER
|
|
573 if (AlnToHMMFile(prMSeq, pcHMM)) {
|
|
574 Log(&rLog, LOG_ERROR, "AlnToHMMFile() on %s failed.", pcHMM);
|
|
575 CKFREE(pcHMM);
|
|
576 return FAILURE;
|
|
577 }
|
|
578 #elif USEHHMAKE
|
|
579 if (AlnToHHMFile(prMSeq, pcHMM)) {
|
|
580 Log(&rLog, LOG_ERROR, "AlnToHHMFile() on %s failed.", pcHMM);
|
|
581 CKFREE(pcHMM);
|
|
582 return FAILURE;
|
|
583 }
|
|
584 /* Log(&rLog, LOG_FATAL, "Method to create HHM (HMM using hhmake) not installed yet"); */
|
|
585 #else
|
|
586 Log(&rLog, LOG_FATAL, "Unknown method to create temporary HMM");
|
|
587 #endif
|
|
588
|
|
589 /* Read HMM information
|
|
590 */
|
|
591 if (OK != readHMMWrapper(prHMM, pcHMM)){
|
|
592 Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMM);
|
|
593 CKFREE(pcHMM);
|
|
594 return FAILURE;
|
|
595 }
|
|
596
|
|
597 if (remove(pcHMM)) {
|
|
598 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", pcHMM);
|
|
599 }
|
|
600 CKFREE(pcHMM);
|
|
601
|
|
602 return OK;
|
|
603 }
|
|
604 /* end of AlnToHMM() */
|
|
605
|
|
606
|
|
607
|
|
608 /**
|
|
609 * @brief FIXME
|
|
610 *
|
|
611 */
|
|
612 void
|
|
613 InitClustalOmega(int iNumThreadsRequested)
|
|
614 {
|
|
615
|
|
616 #ifdef HAVE_OPENMP
|
|
617 iNumberOfThreads = iNumThreadsRequested;
|
|
618 omp_set_num_threads(iNumberOfThreads);
|
|
619 #else
|
|
620 if (iNumThreadsRequested>1) {
|
|
621 Log(&rLog, LOG_FATAL, "Cannot change number of threads to %d. %s was build without OpenMP support.",
|
|
622 iNumThreadsRequested, PACKAGE_NAME);
|
|
623 }
|
|
624 iNumberOfThreads = 1; /* need to set this, even if build without support */
|
|
625 #endif
|
|
626
|
|
627 Log(&rLog, LOG_INFO, "Using %d threads",
|
|
628 iNumberOfThreads);
|
|
629
|
|
630 }
|
|
631 /* end of InitClustalOmega() */
|
|
632
|
|
633
|
|
634
|
|
635 /**
|
|
636 * @brief Defines an alignment order, which adds sequences
|
|
637 * sequentially, i.e. one at a time starting with seq 1 & 2
|
|
638 *
|
|
639 * @param[out] piOrderLR_p
|
|
640 * order in which nodes/profiles are to be merged/aligned
|
|
641 * @param[in] iNumSeq
|
|
642 * Number of sequences
|
|
643 *
|
|
644 * @see TraverseTree()
|
|
645 *
|
|
646 */
|
|
647 void
|
|
648 SequentialAlignmentOrder(int **piOrderLR_p, int iNumSeq)
|
|
649 {
|
|
650 unsigned int uNodes = iNumSeq*2-1;
|
|
651 unsigned int uNodeCounter = 0;
|
|
652 unsigned int uSeqCounter = 0;
|
|
653
|
|
654 Log(&rLog, LOG_FATAL, "FIXME: Untested...");
|
|
655
|
|
656 (*piOrderLR_p) = (int *)CKCALLOC(DIFF_NODE * uNodes, sizeof(int));
|
|
657 /* loop over merge nodes, which have per definition even indices
|
|
658 * and set up children which have odd indices
|
|
659 */
|
|
660 uSeqCounter = 0;
|
|
661 for (uNodeCounter=iNumSeq; uNodeCounter<uNodes; uNodeCounter+=1) {
|
|
662 unsigned int uLeftChildNodeIndex = uNodeCounter-1;
|
|
663 unsigned int uRightChildNodeIndex = uNodeCounter-iNumSeq+1;
|
|
664 unsigned int uParentNodeIndex = uNodeCounter+1;
|
|
665
|
|
666 /* merge node setup */
|
|
667 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = uLeftChildNodeIndex;
|
|
668 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE] = uRightChildNodeIndex;
|
|
669 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE] = uParentNodeIndex;
|
|
670 /* only setup left child if at first merge node, all other left childs
|
|
671 * should be merge nodes that are already set up. also correct
|
|
672 * left node number here.
|
|
673 */
|
|
674 if (uNodeCounter==iNumSeq) {
|
|
675 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = 0;
|
|
676
|
|
677 (*piOrderLR_p)[0+LEFT_NODE] = 0;
|
|
678 (*piOrderLR_p)[0+RGHT_NODE] = 0;
|
|
679 (*piOrderLR_p)[0+PRNT_NODE] = uNodeCounter;
|
|
680 uSeqCounter++;
|
|
681
|
|
682 Log(&rLog, LOG_FORCED_DEBUG, "Set up first leaf with node counter %d: left=%d right=%d parent=%d",
|
|
683 0,
|
|
684 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+LEFT_NODE],
|
|
685 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+RGHT_NODE],
|
|
686 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+PRNT_NODE]);
|
|
687 }
|
|
688 Log(&rLog, LOG_FORCED_DEBUG, "Set up merge node with node counter %d: left=%d right=%d parent=%d",
|
|
689 uNodeCounter, (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE],
|
|
690 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE],
|
|
691 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE]);
|
|
692
|
|
693 /* right child */
|
|
694 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE] = uSeqCounter;
|
|
695 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE] = uSeqCounter;
|
|
696 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE] = uNodeCounter;
|
|
697 uSeqCounter++;
|
|
698
|
|
699 Log(&rLog, LOG_FORCED_DEBUG, "Set up leaf with node counter %d: left=%d right=%d parent=%d",
|
|
700 uRightChildNodeIndex, (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE],
|
|
701 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE],
|
|
702 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE]);
|
|
703 }
|
|
704 }
|
|
705 /* end of SequentialAlignmentOrder() */
|
|
706
|
|
707
|
|
708
|
|
709 /**
|
|
710 * @brief Defines the alignment order by calculating a guide tree. In
|
|
711 * a first-step pairwise distances will be calculated (or read from a
|
|
712 * file). In a second step those distances will be clustered and a
|
|
713 * guide-tree created. Steps 1 and 2 will be skipped if a guide-tree
|
|
714 * file was given, in which case the guide-tree will be just read from
|
|
715 * the file.
|
|
716 *
|
|
717 * @param[out] piOrderLR_p
|
|
718 * order in which nodes/profiles are to be merged/aligned
|
|
719 * @param[out] pdSeqWeights_p
|
|
720 * Sequence weights
|
|
721 * @param[out] pdSeqWeights_p
|
|
722 * Sequence weights
|
|
723 * @param[in] prMSeq
|
|
724 * The sequences from which the alignment order is to be calculated
|
|
725 * @param[in] iPairDistType
|
|
726 * Method of pairwise distance comparison
|
|
727 * @param[in] pcDistmatInfile
|
|
728 * If not NULL distances will be read from this file instead of being
|
|
729 * calculated
|
|
730 * @param[in] pcDistmatOutfile
|
|
731 * If not NULL computed pairwise distances will be written to this file
|
|
732 * @param[in] iClusteringType
|
|
733 * Clustering method to be used to cluster the pairwise distances
|
|
734 * @param[in] pcGuidetreeInfile
|
|
735 * If not NULL guidetree will be read from this file. Skips pairwise
|
|
736 * distance and guidetree computation
|
|
737 * @param[in] pcGuidetreeOutfile
|
|
738 * If not NULL computed guidetree will be written to this file
|
|
739 * @param[in] bUseMbed
|
|
740 * If TRUE, fast mBed guidetree computation will be employed
|
|
741 *
|
|
742 * @return Non-zero on error
|
|
743 *
|
|
744 */
|
|
745 int
|
|
746 AlignmentOrder(int **piOrderLR_p, double **pdSeqWeights_p, mseq_t *prMSeq,
|
|
747 int iPairDistType, char *pcDistmatInfile, char *pcDistmatOutfile,
|
|
748 int iClusteringType, char *pcGuidetreeInfile, char *pcGuidetreeOutfile,
|
|
749 bool bUseMbed)
|
|
750 {
|
|
751 /* pairwise distance matrix (tmat in 1.83) */
|
|
752 symmatrix_t *distmat = NULL;
|
|
753 /* guide tree */
|
|
754 tree_t *prTree = NULL;
|
|
755 int i = 0;
|
|
756
|
|
757
|
|
758 /* Shortcut for only two sequences: Do not compute k-tuple
|
|
759 * distances. Use the same logic as in TraverseTree() to setup
|
|
760 * piOrderLR_p. Changes there will have to be reflected here as
|
|
761 * well. */
|
|
762 if (2==prMSeq->nseqs) {
|
|
763 Log(&rLog, LOG_VERBOSE,
|
|
764 "Have only two sequences: No need to compute pairwise score and compute a tree.");
|
|
765
|
|
766 (*piOrderLR_p) = (int*) CKMALLOC(DIFF_NODE * 3 * sizeof(int));
|
|
767 (*piOrderLR_p)[DIFF_NODE*0+LEFT_NODE] = 0;
|
|
768 (*piOrderLR_p)[DIFF_NODE*0+RGHT_NODE] = 0;
|
|
769 (*piOrderLR_p)[DIFF_NODE*0+PRNT_NODE] = 0;
|
|
770
|
|
771 (*piOrderLR_p)[DIFF_NODE*1+LEFT_NODE] = 1;
|
|
772 (*piOrderLR_p)[DIFF_NODE*1+RGHT_NODE] = 1;
|
|
773 (*piOrderLR_p)[DIFF_NODE*1+PRNT_NODE] = 1;
|
|
774
|
|
775 /* root */
|
|
776 (*piOrderLR_p)[DIFF_NODE*2+LEFT_NODE] = 0;
|
|
777 (*piOrderLR_p)[DIFF_NODE*2+RGHT_NODE] = 1;
|
|
778 (*piOrderLR_p)[DIFF_NODE*2+PRNT_NODE] = 2;
|
|
779
|
|
780 /* Same logic as CalcClustalWeights(). Changes there will
|
|
781 have to be reflected here as well. */
|
|
782 #if USE_WEIGHTS
|
|
783 (*pdWeights_p) = (double *) CKMALLOC(uNodeCount * sizeof(double));
|
|
784 (*pdWeights_p)[0] = 0.5;
|
|
785 (*pdWeights_p)[1] = 0.5;
|
|
786 #endif
|
|
787
|
|
788 return OK;
|
|
789 }
|
|
790
|
|
791
|
|
792 /* compute distance & guide tree, alternatively read distances or
|
|
793 * guide tree from file
|
|
794 *
|
|
795 */
|
|
796 if (NULL != pcGuidetreeInfile) {
|
|
797 Log(&rLog, LOG_INFO, "Reading guide-tree from %s", pcGuidetreeInfile);
|
|
798 if (GuideTreeFromFile(&prTree, prMSeq, pcGuidetreeInfile)) {
|
|
799 Log(&rLog, LOG_ERROR, "Reading of guide tree %s failed.", pcGuidetreeInfile);
|
|
800 return FAILURE;
|
|
801 }
|
|
802
|
|
803 } else {
|
|
804
|
|
805 if (bUseMbed) {
|
|
806 if (Mbed(&prTree, prMSeq, iPairDistType, pcGuidetreeOutfile)) {
|
|
807 Log(&rLog, LOG_ERROR, "mbed execution failed.");
|
|
808 return FAILURE;
|
|
809 }
|
|
810 Log(&rLog, LOG_INFO, "Guide-tree computation (mBed) done.");
|
|
811 if (NULL != pcDistmatOutfile) {
|
|
812 Log(&rLog, LOG_INFO,
|
|
813 "Ignoring request to write distance matrix (am in mBed mode)");
|
|
814 }
|
|
815 } else {
|
|
816
|
|
817 if (PairDistances(&distmat, prMSeq, iPairDistType,
|
|
818 0, prMSeq->nseqs, 0, prMSeq->nseqs,
|
|
819 pcDistmatInfile, pcDistmatOutfile)) {
|
|
820 Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
|
|
821 return FAILURE;
|
|
822 }
|
|
823
|
|
824 /* clustering of distances to get guide tree
|
|
825 */
|
|
826 if (CLUSTERING_UPGMA == iClusteringType) {
|
|
827 char **labels;
|
|
828 labels = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char*));
|
|
829 for (i=0; i<prMSeq->nseqs; i++) {
|
|
830 labels[i] = prMSeq->sqinfo[i].name;
|
|
831 }
|
|
832
|
|
833 GuideTreeUpgma(&prTree, labels, distmat, pcGuidetreeOutfile);
|
|
834 Log(&rLog, LOG_INFO, "Guide-tree computation done.");
|
|
835
|
|
836 CKFREE(labels);
|
|
837 } else {
|
|
838 Log(&rLog, LOG_FATAL, "INTERNAL ERROR %s",
|
|
839 "clustering method should have been checked before");
|
|
840 }
|
|
841 }
|
|
842 }
|
|
843
|
|
844 #if USE_WEIGHTS
|
|
845 /* derive sequence weights from tree
|
|
846 *
|
|
847 */
|
|
848 Log(&rLog, LOG_INFO, "Calculating sequence weights");
|
|
849 CalcClustalWeights(pdSeqWeights_p, prTree);
|
|
850 for (i = 0; i < GetLeafCount(prTree); i++) {
|
|
851 Log(&rLog, LOG_VERBOSE,
|
|
852 "Weight for seq no %d: %s = %f",
|
|
853 i, prMSeq->sqinfo[i].name, (*pdSeqWeights_p)[i]);
|
|
854 }
|
|
855 #else
|
|
856 Log(&rLog, LOG_DEBUG, "Not using weights");
|
|
857 #endif
|
|
858
|
|
859
|
|
860 /* define traversing order of tree
|
|
861 *
|
|
862 */
|
|
863 TraverseTree(piOrderLR_p, prTree, prMSeq);
|
|
864 if (rLog.iLogLevelEnabled <= LOG_DEBUG) {
|
|
865 /* FIXME: debug only, FS */
|
|
866 uint uNodeIndex;
|
|
867 FILE *fp = LogGetFP(&rLog, LOG_INFO);
|
|
868 Log(&rLog, LOG_DEBUG, "left/right order after tree traversal");
|
|
869 for (uNodeIndex = 0; uNodeIndex < GetNodeCount(prTree); uNodeIndex++) {
|
|
870 fprintf(fp, "%3d:\t%2d/%2d -> %d\n", i,
|
|
871 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+LEFT_NODE],
|
|
872 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+RGHT_NODE],
|
|
873 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+PRNT_NODE]);
|
|
874 }
|
|
875 }
|
|
876
|
|
877 FreeMuscleTree(prTree);
|
|
878 FreeSymMatrix(&distmat);
|
|
879
|
|
880 #if 0
|
|
881 Log(&rLog, LOG_FATAL, "DEBUG EXIT before leaving %s", __FUNCTION__);
|
|
882 #endif
|
|
883 return OK;
|
|
884 }
|
|
885 /* end of AlignmentOrder() */
|
|
886
|
|
887
|
|
888
|
|
889 /**
|
|
890 * @brief Set some options automatically based on number of sequences. Might
|
|
891 * overwrite some user-set options.
|
|
892 *
|
|
893 * @param[out] prOpts
|
|
894 * Pointer to alignment options structure
|
|
895 * @param[in] iNumSeq
|
|
896 * Number of sequences to align
|
|
897 */
|
|
898 void
|
|
899 SetAutoOptions(opts_t *prOpts, int iNumSeq) {
|
|
900
|
|
901 Log(&rLog, LOG_INFO,
|
|
902 "Setting options automatically based on input sequence characteristics (might overwrite some of your options).");
|
|
903
|
|
904 /* AW: new version of mbed is always good (uses subclusters) */
|
|
905 if (FALSE == prOpts->bUseMbed) {
|
|
906 Log(&rLog, LOG_INFO, "Auto settings: Enabling mBed.");
|
|
907 prOpts->bUseMbed = TRUE;
|
|
908 }
|
|
909
|
|
910 if (iNumSeq >= 1000) {
|
|
911 if (0 != prOpts->iNumIterations) {
|
|
912 Log(&rLog, LOG_INFO, "Auto settings: Disabling iterations.");
|
|
913 prOpts->iNumIterations = 0;
|
|
914 }
|
|
915
|
|
916 } else if (iNumSeq < 1000) {
|
|
917 if (1 != prOpts->iNumIterations) {
|
|
918 Log(&rLog, LOG_INFO, "Auto settings: Setting iteration to 1.");
|
|
919 prOpts->iNumIterations = 1;
|
|
920 }
|
|
921 }
|
|
922 }
|
|
923 /* end of */
|
|
924
|
|
925
|
|
926
|
|
927 /**
|
|
928 * @brief The main alignment function which wraps everything else.
|
|
929 *
|
|
930 * @param[out] prMSeq
|
|
931 * *the* multiple sequences structure
|
|
932 * @param[in] prMSeqProfile
|
|
933 * optional profile to align against
|
|
934 * @param[in] prOpts
|
|
935 * alignmemnt options to use
|
|
936 *
|
|
937 * @return 0 on success, -1 on failure
|
|
938 *
|
|
939 */
|
|
940 int
|
|
941 Align(mseq_t *prMSeq,
|
|
942 mseq_t *prMSeqProfile,
|
|
943 opts_t *prOpts,
|
|
944 hhalign_para rHhalignPara) {
|
|
945
|
|
946 /* HMM
|
|
947 */
|
|
948 /* structs with pseudocounts etc; one for each HMM infile, i.e.
|
|
949 * index range: 0..iHMMInputFiles */
|
|
950 hmm_light *prHMMs = NULL;
|
|
951
|
|
952 /* MSA order in which nodes/profiles are to be merged/aligned
|
|
953 (order of nodes in guide tree (left/right)*/
|
|
954 int *piOrderLR = NULL;
|
|
955
|
|
956 /* weights per sequence */
|
|
957 double *pdSeqWeights = NULL;
|
|
958
|
|
959 /* Iteration
|
|
960 */
|
|
961 int iIterationCounter = 0;
|
|
962 double dAlnScore;
|
|
963 /* last dAlnScore for iteration */
|
|
964 double dLastAlnScore = -666.666;
|
|
965
|
|
966 int i, j; /* aux */
|
|
967
|
|
968 assert(NULL != prMSeq);
|
|
969 if (NULL != prMSeqProfile) {
|
|
970 assert(TRUE == prMSeqProfile->aligned);
|
|
971 }
|
|
972
|
|
973
|
|
974 /* automatic setting of options
|
|
975 *
|
|
976 */
|
|
977 if (prOpts->bAutoOptions) {
|
|
978 SetAutoOptions(prOpts, prMSeq->nseqs);
|
|
979 }
|
|
980
|
|
981
|
|
982 #if SHUFFLE_INPUT_SEQ_ORDER
|
|
983 /*
|
|
984 * shuffle input: only useful for testing/debugging
|
|
985 */
|
|
986 Log(&rLog, LOG_WARN, "Shuffling input sequences! (Will also change output order)");
|
|
987 ShuffleMSeq(prMSeq);
|
|
988 #endif
|
|
989
|
|
990
|
|
991 #if SORT_INPUT_SEQS
|
|
992 /*
|
|
993 * sort input:
|
|
994 *
|
|
995 * would ensure we *always* (unless we get into the mbed k-means stage)
|
|
996 * get the same answer. usually you don't, because most pairwise alignment
|
|
997 * scores are in theory not symmetric, therefore sequence ordering might
|
|
998 * have an effect on the guide-tree. Sorting by length should get rid of
|
|
999 * this (and takes no time even for 100k seqs). Benchmark results on
|
|
1000 * Balibase show almost no difference after sorting.
|
|
1001 */
|
|
1002 Log(&rLog, LOG_WARN, "Sorting input seq by length! This will also change the output order");
|
|
1003 SortMSeqByLength(prMSeq, 'd');
|
|
1004
|
|
1005 #endif
|
|
1006
|
|
1007
|
|
1008 /* Read backgrounds HMMs and store in prHMMs
|
|
1009 *
|
|
1010 */
|
|
1011 if (0 < prOpts->iHMMInputFiles) {
|
|
1012 int iHMMInfileIndex;
|
|
1013
|
|
1014 /**
|
|
1015 * @warning old structure used to be initialised like this:
|
|
1016 * hmm_light rHMM = {0};
|
|
1017 */
|
|
1018 prHMMs = (hmm_light *) CKMALLOC(prOpts->iHMMInputFiles * sizeof(hmm_light));
|
|
1019
|
|
1020 for (iHMMInfileIndex=0; iHMMInfileIndex<prOpts->iHMMInputFiles; iHMMInfileIndex++) {
|
|
1021 char *pcHMMInput = prOpts->ppcHMMInput[iHMMInfileIndex];
|
|
1022 if (OK != readHMMWrapper(&prHMMs[iHMMInfileIndex], pcHMMInput)){
|
|
1023 Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMMInput);
|
|
1024 return -1;
|
|
1025 }
|
|
1026
|
|
1027 #if 0
|
|
1028 Log(&rLog, LOG_FORCED_DEBUG, "HMM length is %d", prHMMs[iHMMInfileIndex].L);
|
|
1029 Log(&rLog, LOG_FORCED_DEBUG, "n-display is %d", prHMMs[iHMMInfileIndex].n_display);
|
|
1030 for (i = 0; NULL != prHMMs[prOpts->iHMMInputFiles].seq[i]; i++){
|
|
1031 printf("seq[%d]: %s\n", i, prHMMs[iHMMInfileIndex].seq[i]);
|
|
1032 }
|
|
1033 Log(&rLog, LOG_FORCED_DEBUG, "Neff_HMM is %f", prHMMs[iHMMInfileIndex].Neff_HMM);
|
|
1034 #endif
|
|
1035 if (rLog.iLogLevelEnabled <= LOG_DEBUG){
|
|
1036 Log(&rLog, LOG_DEBUG, "print frequencies");
|
|
1037 for (i = 0; i < prHMMs[iHMMInfileIndex].L; i++){
|
|
1038 #define PRINT_TAIL 5
|
|
1039 if ( (PRINT_TAIL+1 == i) && (prHMMs[iHMMInfileIndex].L-PRINT_TAIL != i) ){
|
|
1040 printf("....\n");
|
|
1041 }
|
|
1042 if ( (i > PRINT_TAIL) && (i < prHMMs[iHMMInfileIndex].L-PRINT_TAIL) ){
|
|
1043 continue;
|
|
1044 }
|
|
1045 printf("%3d:", i);
|
|
1046 for (j = 0; j < 20; j++){
|
|
1047 printf("\t%1.3f", prHMMs[iHMMInfileIndex].f[i][j]);
|
|
1048 }
|
|
1049 printf("\n");
|
|
1050 }
|
|
1051 } /* debug print block */
|
|
1052
|
|
1053 CKFREE(prOpts->ppcHMMInput[iHMMInfileIndex]);
|
|
1054 } /* for each background HMM file */
|
|
1055 CKFREE(prOpts->ppcHMMInput);
|
|
1056 } /* there were background HMM files */
|
|
1057
|
|
1058
|
|
1059
|
|
1060 /* If the input ("non-profile") sequences are aligned, then turn
|
|
1061 * the alignment into a HMM and add to the list of background HMMs
|
|
1062 *
|
|
1063 */
|
|
1064 if (TRUE == prMSeq->aligned) {
|
|
1065 /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */
|
|
1066 hmm_light rHMMLocal = {0};
|
|
1067
|
|
1068 Log(&rLog, LOG_INFO,
|
|
1069 "Input sequences are aligned. Will turn alignment into HMM and add it to the user provided background HMMs.");
|
|
1070 if (OK !=
|
|
1071 #if INDIRECT_HMM
|
|
1072 AlnToHMM(&rHMMLocal, prMSeq)
|
|
1073 #else
|
|
1074 AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs)
|
|
1075 #endif
|
|
1076 ) {
|
|
1077 Log(&rLog, LOG_ERROR, "Couldn't convert aligned input sequences to HMM. Will try to continue");
|
|
1078 } else {
|
|
1079 prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light)));
|
|
1080 memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light));
|
|
1081 prOpts->iHMMInputFiles++;
|
|
1082 }
|
|
1083 }
|
|
1084
|
|
1085
|
|
1086 /* If we have a profile turn it into a HMM and add to
|
|
1087 * the list of background HMMs.
|
|
1088 *
|
|
1089 */
|
|
1090 if (NULL != prMSeqProfile) {
|
|
1091 /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */
|
|
1092 hmm_light rHMMLocal = {0};
|
|
1093 Log(&rLog, LOG_INFO,
|
|
1094 "Turning profile1 into HMM and will use it during progressive alignment.");
|
|
1095 if (OK !=
|
|
1096 #if INDIRECT_HMM
|
|
1097 AlnToHMM(&rHMMLocal, prMSeqProfile)
|
|
1098 #else
|
|
1099 AlnToHMM2(&rHMMLocal, prMSeqProfile->seq, prMSeqProfile->nseqs)
|
|
1100 #endif
|
|
1101 ) {
|
|
1102 Log(&rLog, LOG_ERROR, "Couldn't convert profile1 to HMM. Will try to continue");
|
|
1103 } else {
|
|
1104 prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light)));
|
|
1105 memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light));
|
|
1106 prOpts->iHMMInputFiles++;
|
|
1107 }
|
|
1108 }
|
|
1109
|
|
1110
|
|
1111 /* Now do a first alignment of the input sequences (prMSeq) adding
|
|
1112 * all collected background HMMs
|
|
1113 *
|
|
1114 */
|
|
1115 /* Determine progressive alignment order
|
|
1116 */
|
|
1117 if (TRUE == prMSeq->aligned) {
|
|
1118 Log(&rLog, LOG_INFO, "%s %s",
|
|
1119 "Input sequences are aligned.",
|
|
1120 "Will use Kimura distances of aligned sequences.");
|
|
1121 prOpts->iPairDistType = PAIRDIST_SQUIDID_KIMURA;
|
|
1122 }
|
|
1123
|
|
1124 #if 0
|
|
1125 Log(&rLog, LOG_WARN, "Using a sequential alignment order.");
|
|
1126 SequentialAlignmentOrder(&piOrderLR, prMSeq->nseqs);
|
|
1127 #else
|
|
1128 if (OK != AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq,
|
|
1129 prOpts->iPairDistType,
|
|
1130 prOpts->pcDistmatInfile, prOpts->pcDistmatOutfile,
|
|
1131 prOpts->iClusteringType,
|
|
1132 prOpts->pcGuidetreeInfile, prOpts->pcGuidetreeOutfile,
|
|
1133 prOpts->bUseMbed)) {
|
|
1134 Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue");
|
|
1135 return -1;
|
|
1136 }
|
|
1137 #endif
|
|
1138
|
|
1139 /* Progressive alignment of input sequences. Order defined by
|
|
1140 * branching of guide tree (piOrderLR). Use optional
|
|
1141 * background HMM information (prHMMs[0..prOpts->iHMMInputFiles-1])
|
|
1142 *
|
|
1143 */
|
|
1144 dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights,
|
|
1145 2*prMSeq->nseqs -1/* nodes */,
|
|
1146 prHMMs, prOpts->iHMMInputFiles, -1, rHhalignPara);
|
|
1147 dLastAlnScore = dAlnScore;
|
|
1148 Log(&rLog, LOG_VERBOSE,
|
|
1149 "Alignment score for first alignment = %f", dAlnScore);
|
|
1150
|
|
1151
|
|
1152
|
|
1153
|
|
1154 /* ------------------------------------------------------------
|
|
1155 *
|
|
1156 * prMSeq is aligned now. Now start iterations if requested and save the
|
|
1157 * alignment at the very end.
|
|
1158 *
|
|
1159 * @note We discard the background HMM information at this point,
|
|
1160 * because it was already used. Could consider to make this choice
|
|
1161 * optional. FIXME
|
|
1162 *
|
|
1163 * ------------------------------------------------------------ */
|
|
1164
|
|
1165
|
|
1166 /* iteration after first alignment was computed (if not profile-profile
|
|
1167 * alignment)
|
|
1168 *
|
|
1169 */
|
|
1170 for (iIterationCounter=0;
|
|
1171 (iIterationCounter < prOpts->iNumIterations || prOpts->bIterationsAuto);
|
|
1172 iIterationCounter++) {
|
|
1173
|
|
1174 hmm_light rHMMLocal = {0};
|
|
1175 /* FIXME Keep copy of old alignment in case new one sucks? */
|
|
1176
|
|
1177
|
|
1178 if (iIterationCounter >= prOpts->iMaxHMMIterations
|
|
1179 &&
|
|
1180 iIterationCounter >= prOpts->iMaxGuidetreeIterations) {
|
|
1181 Log(&rLog, LOG_VERBOSE, "Reached maximum number of HMM and guide-tree iterations");
|
|
1182 break;
|
|
1183 }
|
|
1184
|
|
1185 if (! prOpts->bIterationsAuto) {
|
|
1186 Log(&rLog, LOG_INFO, "Iteration step %d out of %d",
|
|
1187 iIterationCounter+1, prOpts->iNumIterations);
|
|
1188 } else {
|
|
1189 Log(&rLog, LOG_INFO, "Iteration step %d out of <auto>",
|
|
1190 iIterationCounter+1);
|
|
1191 }
|
|
1192 #if 0
|
|
1193 if (rLog.iLogLevelEnabled <= LOG_VERBOSE) {
|
|
1194 char zcIntermediate[1000] = {0};
|
|
1195 char *pcFormat = "fasta";
|
|
1196 sprintf(zcIntermediate, "clustalo-aln-iter~%d~", iIterationCounter);
|
|
1197 if (WriteAlignment(prMSeq, zcIntermediate, MSAFILE_A2M)) {
|
|
1198 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", zcIntermediate);
|
|
1199 return -1;
|
|
1200 }
|
|
1201 }
|
|
1202 #endif
|
|
1203
|
|
1204
|
|
1205 /* new guide-tree
|
|
1206 *
|
|
1207 */
|
|
1208 if (iIterationCounter < prOpts->iMaxGuidetreeIterations) {
|
|
1209 /* determine progressive alignment order
|
|
1210 *
|
|
1211 * few things are different now when calling AlignmentOrder:
|
|
1212 * - we have to ignore prOpts->pcDistmatInfile and pcGuidetreeInfile
|
|
1213 * as they were used before
|
|
1214 * - the corresponding outfiles are still valid though
|
|
1215 */
|
|
1216 /* Free stuff that has already been allocated by or further
|
|
1217 * downstream of AlignmentOrder()
|
|
1218 */
|
|
1219 if (NULL != piOrderLR)
|
|
1220 CKFREE(piOrderLR);
|
|
1221 if (NULL != pdSeqWeights)
|
|
1222 CKFREE(pdSeqWeights);
|
|
1223 if (AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq,
|
|
1224 PAIRDIST_SQUIDID_KIMURA /* override */, NULL, prOpts->pcDistmatOutfile,
|
|
1225 prOpts->iClusteringType, NULL, prOpts->pcGuidetreeOutfile,
|
|
1226 prOpts->bUseMbedForIteration)) {
|
|
1227 Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue");
|
|
1228 return -1;
|
|
1229 }
|
|
1230 } else {
|
|
1231 Log(&rLog, LOG_INFO, "Skipping guide-tree iteration at iteration step %d (reached maximum)",
|
|
1232 iIterationCounter);
|
|
1233 }
|
|
1234
|
|
1235
|
|
1236 /* new local hmm iteration
|
|
1237 *
|
|
1238 */
|
|
1239 if (iIterationCounter < prOpts->iMaxHMMIterations) {
|
|
1240 if (OK !=
|
|
1241 #if INDIRECT_HMM
|
|
1242 AlnToHMM(&rHMMLocal, prMSeq)
|
|
1243 #else
|
|
1244 AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs)
|
|
1245 #endif
|
|
1246 ) {
|
|
1247 Log(&rLog, LOG_ERROR, "Couldn't convert alignment to HMM. Will stop iterating now...");
|
|
1248 break;
|
|
1249 }
|
|
1250 } else {
|
|
1251 Log(&rLog, LOG_INFO, "Skipping HMM iteration at iteration step %d (reached maximum)",
|
|
1252 iIterationCounter);
|
|
1253 }
|
|
1254
|
|
1255
|
|
1256 /* align the sequences (again)
|
|
1257 */
|
|
1258 dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights,
|
|
1259 2*prMSeq->nseqs -1/* nodes */, &rHMMLocal, 1, -1, rHhalignPara);
|
|
1260 Log(&rLog, LOG_VERBOSE,
|
|
1261 "Alignment score for alignmnent in hmm-iteration no %d = %f (last score = %f)",
|
|
1262 iIterationCounter+1, dAlnScore, dLastAlnScore);
|
|
1263
|
|
1264
|
|
1265 FreeHMMstruct(&rHMMLocal);
|
|
1266
|
|
1267 #if 0
|
|
1268 /* FIXME: need a better score for automatic iteration */
|
|
1269 if (prOpts->bIterationsAuto) {
|
|
1270 /* automatic iteration: break if score improvement was not
|
|
1271 * big enough
|
|
1272 */
|
|
1273 double dScoreImprovement = (dAlnScore-dLastAlnScore)/dLastAlnScore;
|
|
1274 if (dScoreImprovement < ITERATION_SCORE_IMPROVEMENT_THRESHOLD) {
|
|
1275 Log(&rLog, LOG_INFO,
|
|
1276 "Stopping after %d guide-tree iterations. No further alignment score improvement achieved.",
|
|
1277 iIterationCounter+1);
|
|
1278 /* use previous alignment */
|
|
1279 FreeMSeq(&prMSeq);
|
|
1280 Log(&rLog, LOG_FORCED_DEBUG, "FIXME: %s", "CopyMSeq breaks things in this context");
|
|
1281 CopyMSeq(&prMSeq, prMSeqCopy);
|
|
1282 /* FIXME: prOpts->pcDistmatOutfile and pcGuidetreeOutfile
|
|
1283 * might have been updated, but then discarded here?
|
|
1284 */
|
|
1285 break;
|
|
1286 } else {
|
|
1287 Log(&rLog, LOG_INFO,
|
|
1288 "Got a %d%% better score in iteration step %d",
|
|
1289 (int)dScoreImprovement*100, iIterationCounter+1);
|
|
1290 FreeMSeq(&prMSeqCopy);
|
|
1291 }
|
|
1292 }
|
|
1293 dLastAlnScore = dAlnScore;
|
|
1294 #endif
|
|
1295
|
|
1296 }
|
|
1297 /* end of iterations */
|
|
1298
|
|
1299
|
|
1300
|
|
1301 /* Last step: if a profile was also provided then align now-aligned mseq
|
|
1302 * with this profile
|
|
1303 *
|
|
1304 * Don't use the backgrounds HMMs anymore and don't iterate.
|
|
1305 * (which was done before).
|
|
1306 *
|
|
1307 */
|
|
1308 if (NULL != prMSeqProfile) {
|
|
1309 if (AlignProfiles(prMSeq, prMSeqProfile, rHhalignPara)) {
|
|
1310 Log(&rLog, LOG_ERROR, "An error occured during the profile/profile alignment");
|
|
1311 return -1;
|
|
1312 }
|
|
1313 }
|
|
1314
|
|
1315
|
|
1316 if (NULL != piOrderLR) {
|
|
1317 CKFREE(piOrderLR);
|
|
1318 }
|
|
1319 if (NULL != pdSeqWeights) {
|
|
1320 CKFREE(pdSeqWeights);
|
|
1321 }
|
|
1322 if (0 < prOpts->iHMMInputFiles) {
|
|
1323 for (i=0; i<prOpts->iHMMInputFiles; i++) {
|
|
1324 FreeHMMstruct(&prHMMs[i]);
|
|
1325 }
|
|
1326 CKFREE(prHMMs);
|
|
1327 }
|
|
1328
|
|
1329 return 0;
|
|
1330 }
|
|
1331 /* end of Align() */
|
|
1332
|
|
1333
|
|
1334
|
|
1335
|
|
1336 /**
|
|
1337 * @brief Align two profiles, ie two sets of prealigned sequences. Already
|
|
1338 * aligned columns won't be changed.
|
|
1339 *
|
|
1340 * @param[out] prMSeqProfile1
|
|
1341 * First profile/aligned set of sequences. Merged alignment will be found in
|
|
1342 * here.
|
|
1343 * @param[in] prMSeqProfile2
|
|
1344 * First profile/aligned set of sequences
|
|
1345 *
|
|
1346 * @return 0 on success, -1 on failure
|
|
1347 *
|
|
1348 */
|
|
1349 int
|
|
1350 AlignProfiles(mseq_t *prMSeqProfile1,
|
|
1351 mseq_t *prMSeqProfile2, hhalign_para rHhalignPara) {
|
|
1352
|
|
1353 double dAlnScore;
|
|
1354
|
|
1355 /* number of seqs in first half of joined profile */
|
|
1356 int iProfProfSeparator = prMSeqProfile1->nseqs;
|
|
1357
|
|
1358 assert(TRUE == prMSeqProfile1->aligned);
|
|
1359 assert(TRUE == prMSeqProfile2->aligned);
|
|
1360
|
|
1361 Log(&rLog, LOG_INFO, "Performing profile/profile alignment");
|
|
1362
|
|
1363 /* Combine the available mseqs into prMSeq
|
|
1364 * which will be aligned afterwards.
|
|
1365 */
|
|
1366 JoinMSeqs(&prMSeqProfile1, prMSeqProfile2);
|
|
1367
|
|
1368
|
|
1369 /* set alignment flag explicitly to FALSE */
|
|
1370 prMSeqProfile1->aligned = FALSE;
|
|
1371
|
|
1372 dAlnScore = HHalignWrapper(prMSeqProfile1,
|
|
1373 NULL, /* no order */
|
|
1374 NULL, /* no weights */
|
|
1375 3, /* nodes: root+2profiles */
|
|
1376 NULL, 0 /* no bg-hmms */,
|
|
1377 iProfProfSeparator, rHhalignPara);
|
|
1378
|
|
1379 Log(&rLog, LOG_VERBOSE, "Alignment score is = %f", dAlnScore);
|
|
1380
|
|
1381 return 0;
|
|
1382 }
|
|
1383 /* end of AlignProfiles() */
|
|
1384
|
|
1385
|