Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-0.2.0/src/clustal-omega.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ff1768533a07 |
---|---|
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ | |
2 | |
3 /********************************************************************* | |
4 * Clustal Omega - Multiple sequence alignment | |
5 * | |
6 * Copyright (C) 2010 University College Dublin | |
7 * | |
8 * Clustal-Omega is free software; you can redistribute it and/or | |
9 * modify it under the terms of the GNU General Public License as | |
10 * published by the Free Software Foundation; either version 2 of the | |
11 * License, or (at your option) any later version. | |
12 * | |
13 * This file is part of Clustal-Omega. | |
14 * | |
15 ********************************************************************/ | |
16 | |
17 /* | |
18 * RCS $Id: clustal-omega.c 241 2011-05-04 14:37:17Z fabian $ | |
19 */ | |
20 | |
21 #ifdef HAVE_CONFIG_H | |
22 #include "config.h" | |
23 #endif | |
24 | |
25 #include <assert.h> | |
26 | |
27 #include "clustal-omega.h" | |
28 #include "hhalign/general.h" | |
29 | |
30 /* The following comment block contains the frontpage/mainpage of the doxygen | |
31 * documentation. Please add some more info. FIXME add more | |
32 */ | |
33 | |
34 /** | |
35 * | |
36 * @mainpage Clustal-Omega Documentation | |
37 * | |
38 * @section intro_sec Introduction | |
39 * | |
40 * For more information see http://www.clustal.org/ | |
41 * | |
42 * @section api_section API | |
43 * | |
44 * @subsection example_prog_subsection An Example Program | |
45 * | |
46 * To use libclustalo you will have to include the clustal-omega.h header and | |
47 * link against libclustalo. For linking against libclustalo you will have to | |
48 * use a C++ compiler, no matter if your program was written in C or C++. | |
49 * | |
50 * First compile (no linking) your source (for an example see section "\ref | |
51 * example_src_subsubsec"): | |
52 * | |
53 * @code | |
54 * $ gcc -c -ansi -Wall clustalo-api-test.c | |
55 * @endcode | |
56 * | |
57 * Then link against libclustalo (we recommend the use of pkg-config as | |
58 * explained in \ref pkgconfig_subsubsec). Assuming Clustal Omega was installed | |
59 * in system-wide default directory (e.g. /usr) just type: | |
60 * | |
61 * @code | |
62 * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o -lclustalo | |
63 * @endcode | |
64 * | |
65 * Voila! Now you have your own alignment program which can be run with | |
66 * | |
67 * @code | |
68 * $ ./clustalo-api-test <your-sequence-input> | |
69 * @endcode | |
70 * | |
71 * It's best to use the same compiler that you used for compiling libclustal. | |
72 * If libclustal was compiled with OpenMP support, you will have to use OpenMP | |
73 * flags for you program as well. | |
74 * | |
75 * | |
76 * @subsubsection pkgconfig_subsubsec Using pkg-config / Figuring out compiler flags | |
77 * | |
78 * Clustal Omega comes with support for <a | |
79 * href="http://pkg-config.freedesktop.org">pkg-config</a>, which means you | |
80 * can run | |
81 * | |
82 * @code | |
83 * $ pkg-config --cflags --libs clustalo | |
84 * @endcode | |
85 * | |
86 * to figure out cflags and library flags needed to compile and link against | |
87 * libclustalo. This is especially handy if Clustal Omega was installed to a | |
88 * non-standard directory. | |
89 * | |
90 * You might have to change PKG_CONFIG_PATH. For example, if you used the prefix $HOME/local/ for | |
91 * installation then you will first need to set PKG_CONFIG_PATH: | |
92 * | |
93 * @code | |
94 * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig | |
95 * $ pkg-config --cflags --libs clustalo | |
96 * @endcode | |
97 * | |
98 * | |
99 * To compile your source use: | |
100 * | |
101 * @code | |
102 * $ export PKG_CONFIG_PATH=$HOME/local/lib/pkgconfig | |
103 * $ gcc -c -ansi -Wall clustalo-api-test.c $(pkg-config --cflags clustalo) | |
104 * $ g++ -ansi -Wall -o clustalo-api-test clustalo-api-test.o $(pkg-config --libs clustalo) | |
105 * @endcode | |
106 * | |
107 * | |
108 * @subsubsection example_src_subsubsec Example Source Code | |
109 * | |
110 * @include "clustalo-api-test.c" | |
111 * | |
112 * | |
113 */ | |
114 | |
115 | |
116 | |
117 | |
118 /* FIXME: doc */ | |
119 /* the following are temporary flags while the code is still under construction; | |
120 had problems internalising hhmake, so as temporary crutch | |
121 write alignment to file and get external hmmer/hhmake via system call | |
122 to read alignment and convert into HMM | |
123 All this will go, once hhmake is properly internalised */ | |
124 #define INDIRECT_HMM 0 /* temp flag: (1) write aln to file, use system(hmmer/hhmake), (0) internal hhmake */ | |
125 #define USEHMMER 1 /* temp flag: use system(hmmer) to build HMM */ | |
126 #define USEHHMAKE (!USEHMMER) /* temp flag: use system(hhmake) to build HMM */ | |
127 | |
128 | |
129 /* shuffle order of input sequences */ | |
130 #define SHUFFLE_INPUT_SEQ_ORDER 0 | |
131 | |
132 /* sort input sequences by length */ | |
133 #define SORT_INPUT_SEQS 0 | |
134 | |
135 | |
136 int iNumberOfThreads; | |
137 | |
138 /* broken, unused and lonely */ | |
139 static const int ITERATION_SCORE_IMPROVEMENT_THRESHOLD = 0.01; | |
140 | |
141 | |
142 /** | |
143 * @brief Print Long version information to pre-allocated char. | |
144 * | |
145 * @note short version | |
146 * information is equivalent to PACKAGE_VERSION | |
147 * | |
148 * @param[out] pcStr | |
149 * char pointer to write to preallocated to hold iSize chars. | |
150 * @param[in] iSize | |
151 * size of pcStr | |
152 */ | |
153 void | |
154 PrintLongVersion(char *pcStr, int iSize) | |
155 { | |
156 snprintf(pcStr, iSize, "version %s; code-name '%s'; build date %s", | |
157 PACKAGE_VERSION, PACKAGE_CODENAME, __DATE__); | |
158 } | |
159 /* end of PrintLongVersion() */ | |
160 | |
161 | |
162 | |
163 /** | |
164 * @brief free aln opts members | |
165 * | |
166 */ | |
167 void | |
168 FreeAlnOpts(opts_t *prAlnOpts) { | |
169 if (NULL != prAlnOpts->pcGuidetreeInfile) { | |
170 CKFREE(prAlnOpts->pcGuidetreeInfile); | |
171 } | |
172 if (NULL != prAlnOpts->pcGuidetreeOutfile) { | |
173 CKFREE(prAlnOpts->pcGuidetreeOutfile); | |
174 } | |
175 if (NULL != prAlnOpts->pcDistmatOutfile) { | |
176 CKFREE(prAlnOpts->pcDistmatOutfile); | |
177 } | |
178 if (NULL != prAlnOpts->pcDistmatInfile) { | |
179 CKFREE(prAlnOpts->pcDistmatInfile); | |
180 } | |
181 } | |
182 /* end of FreeAlnOpts() */ | |
183 | |
184 | |
185 | |
186 /** | |
187 * @brief Sets members of given user opts struct to default values | |
188 * | |
189 * @param[out] prOpts | |
190 * User opt struct to initialise | |
191 * | |
192 */ | |
193 void | |
194 SetDefaultAlnOpts(opts_t *prOpts) { | |
195 prOpts->bAutoOptions = FALSE; | |
196 | |
197 prOpts->pcDistmatInfile = NULL; | |
198 prOpts->pcDistmatOutfile = NULL; | |
199 | |
200 prOpts->iClusteringType = CLUSTERING_UPGMA; | |
201 prOpts->iPairDistType = PAIRDIST_KTUPLE; | |
202 prOpts->bUseMbed = FALSE; | |
203 prOpts->bUseMbedForIteration = FALSE; | |
204 prOpts->pcGuidetreeOutfile = NULL; | |
205 prOpts->pcGuidetreeInfile = NULL; | |
206 | |
207 prOpts->ppcHMMInput = NULL; | |
208 prOpts->iHMMInputFiles = 0; | |
209 | |
210 prOpts->iNumIterations = 0; | |
211 prOpts->bIterationsAuto = FALSE; | |
212 prOpts->iMaxGuidetreeIterations = INT_MAX; | |
213 prOpts->iMaxHMMIterations = INT_MAX; | |
214 prOpts->iMacRam = 2048; /* give 2GB to MAC algorithm. FS, r240 -> r241 */ | |
215 } | |
216 /* end of SetDefaultAlnOpts() */ | |
217 | |
218 | |
219 | |
220 /** | |
221 * @brief Check logic of parsed user options. Will exit (call Log(&rLog, LOG_FATAL, )) | |
222 * on Fatal logic error | |
223 * | |
224 * @param[in] prOpts | |
225 * Already parsed user options | |
226 * | |
227 */ | |
228 void | |
229 AlnOptsLogicCheck(opts_t *prOpts) | |
230 { | |
231 /* guide-tree & distmat | |
232 * | |
233 */ | |
234 if (prOpts->pcDistmatInfile && prOpts->pcGuidetreeInfile) { | |
235 Log(&rLog, LOG_FATAL, "Read distances *and* guide-tree from file doesn't make sense."); | |
236 } | |
237 | |
238 if (prOpts->pcDistmatOutfile && prOpts->pcGuidetreeInfile) { | |
239 Log(&rLog, LOG_FATAL, "Won't be able to save distances to file, because I got a guide-tree as input."); | |
240 } | |
241 | |
242 /* combination of options that don't make sense when not iterating | |
243 */ | |
244 if (prOpts->iNumIterations==0 && prOpts->bIterationsAuto != TRUE) { | |
245 | |
246 if (prOpts->pcGuidetreeInfile && prOpts->pcGuidetreeOutfile) { | |
247 Log(&rLog, LOG_FATAL, "Got a guide-tree as input and output which doesn't make sense when not iterating."); | |
248 } | |
249 /* | |
250 if (prOpts->pcGuidetreeInfile && prOpts->bUseMbed > 0) { | |
251 Log(&rLog, LOG_FATAL, "Got a guide-tree as input and was requested to cluster with mBed, which doesn't make sense when not iterating."); | |
252 } | |
253 */ | |
254 if (prOpts->bUseMbedForIteration > 0) { | |
255 Log(&rLog, LOG_FATAL, "No iteration requested, but mbed for iteration was set. Paranoia exit."); | |
256 } | |
257 } | |
258 | |
259 if (prOpts->iMacRam < 512) { | |
260 | |
261 Log(&rLog, LOG_INFO, "Memory for MAC Algorithm quite low, Viterbi Algorithm may be triggered."); | |
262 | |
263 if (prOpts->iMacRam < 1) { | |
264 | |
265 Log(&rLog, LOG_WARN, "Viterbi Algorithm always turned on, increase MAC-RAM to turn on MAC."); | |
266 } | |
267 } | |
268 | |
269 return; | |
270 } | |
271 /* end of AlnOptsLogicCheck() */ | |
272 | |
273 | |
274 /** | |
275 * @brief FIXME doc | |
276 */ | |
277 void | |
278 PrintAlnOpts(FILE *prFile, opts_t *prOpts) | |
279 { | |
280 int iAux; | |
281 | |
282 | |
283 /* keep in same order as struct */ | |
284 fprintf(prFile, "option: auto-options = %d\n", prOpts->bAutoOptions); | |
285 fprintf(prFile, "option: distmat-infile = %s\n", | |
286 NULL != prOpts->pcDistmatInfile? prOpts->pcDistmatInfile: "(null)"); | |
287 fprintf(prFile, "option: distmat-outfile = %s\n", | |
288 NULL != prOpts->pcDistmatOutfile? prOpts->pcDistmatOutfile: "(null)"); | |
289 fprintf(prFile, "option: clustering-type = %d\n", prOpts->iClusteringType); | |
290 fprintf(prFile, "option: pair-dist-type = %d\n", prOpts->iPairDistType); | |
291 fprintf(prFile, "option: use-mbed = %d\n", prOpts->bUseMbed); | |
292 fprintf(prFile, "option: use-mbed-for-iteration = %d\n", prOpts->bUseMbedForIteration); | |
293 fprintf(prFile, "option: guidetree-outfile = %s\n", | |
294 NULL != prOpts->pcGuidetreeOutfile? prOpts->pcGuidetreeOutfile: "(null)"); | |
295 fprintf(prFile, "option: guidetree-infile = %s\n", | |
296 NULL != prOpts->pcGuidetreeInfile? prOpts->pcGuidetreeInfile: "(null)"); | |
297 for (iAux=0; iAux<prOpts->iHMMInputFiles; iAux++) { | |
298 fprintf(prFile, "option: hmm-input no %d = %s\n", iAux, prOpts->ppcHMMInput[iAux]); | |
299 } | |
300 fprintf(prFile, "option: hmm-input-files = %d\n", prOpts->iHMMInputFiles); | |
301 fprintf(prFile, "option: num-iterations = %d\n", prOpts->iNumIterations); | |
302 fprintf(prFile, "option: iterations-auto = %d\n", prOpts->bIterationsAuto); | |
303 fprintf(prFile, "option: max-hmm-iterations = %d\n", prOpts->iMaxHMMIterations); | |
304 fprintf(prFile, "option: max-guidetree-iterations = %d\n", prOpts->iMaxGuidetreeIterations); | |
305 } | |
306 /* end of PrintAlnOpts() */ | |
307 | |
308 | |
309 | |
310 /** | |
311 * @brief Returns major version of HMMER. Whichever hmmbuild version | |
312 * is found first in your PATH will be used | |
313 * | |
314 * @return -1 on error, major hmmer version otherwise | |
315 * | |
316 */ | |
317 int | |
318 HmmerVersion() | |
319 { | |
320 char zcHmmerTestCall[] = "hmmbuild -h"; | |
321 FILE *fp = NULL; | |
322 int iMajorVersion = 0; | |
323 char zcLine[16384]; | |
324 | |
325 if (NULL == (fp = popen(zcHmmerTestCall, "r"))) { | |
326 Log(&rLog, LOG_ERROR, "Couldn't exec %s", zcHmmerTestCall); | |
327 return -1; | |
328 } | |
329 while (fgets(zcLine, sizeof(zcLine), fp)) { | |
330 char *pcLocate; | |
331 if ((pcLocate = strstr(zcLine, "HMMER "))) { | |
332 iMajorVersion = atoi(&pcLocate[6]); | |
333 break; | |
334 } | |
335 } | |
336 pclose(fp); | |
337 | |
338 return iMajorVersion; | |
339 } | |
340 /* end of HmmerVersion() */ | |
341 | |
342 | |
343 | |
344 /** | |
345 * @brief Create a HHM file from aligned sequences | |
346 * | |
347 * @warning Should be eliminated in the future | |
348 * as building routine should not create intermediate files | |
349 * | |
350 * @param[in] prMSeq | |
351 * Aligned mseq_t | |
352 * @param[in] pcHMMOut | |
353 * HMM output file name | |
354 * | |
355 * @return Non-zero on error | |
356 * | |
357 */ | |
358 int | |
359 AlnToHHMFile(mseq_t *prMSeq, char *pcHMMOut) | |
360 { | |
361 char *tmp_aln = NULL; | |
362 int retcode = OK; | |
363 | |
364 assert(NULL!=prMSeq); | |
365 assert(NULL!=pcHMMOut); | |
366 | |
367 if (FALSE == prMSeq->aligned) { | |
368 Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM"); | |
369 return FAILURE; | |
370 } | |
371 | |
372 /* Convert alignment to a2m, and call hhmake | |
373 * | |
374 * can't be static templates, or mktemp fails (at least on os x | |
375 * (with a bus error)) | |
376 * | |
377 * gcc says we should use mkstemp to avoid race conditions, | |
378 * but that returns a file descriptor, which is of no use to | |
379 * us | |
380 */ | |
381 /* NOTE: the following won't work on windows: missing /tmp/ */ | |
382 tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX"); | |
383 if (NULL == mktemp(tmp_aln)) { | |
384 Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename"); | |
385 retcode = FAILURE; | |
386 goto cleanup_and_return; | |
387 } | |
388 if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_A2M)) { | |
389 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln); | |
390 retcode = FAILURE; | |
391 goto cleanup_and_return; | |
392 } | |
393 | |
394 if (HHMake_Wrapper(tmp_aln, pcHMMOut)){ | |
395 Log(&rLog, LOG_ERROR, "Could not convert alignment %s into HHM", tmp_aln); | |
396 retcode = FAILURE; | |
397 goto cleanup_and_return; | |
398 } | |
399 | |
400 | |
401 cleanup_and_return: | |
402 | |
403 if (NULL != tmp_aln) { | |
404 if (FileExists(tmp_aln)) { | |
405 if (remove(tmp_aln)) { | |
406 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln); | |
407 } | |
408 } | |
409 CKFREE(tmp_aln); | |
410 } | |
411 | |
412 return retcode; | |
413 | |
414 } /* end of AlnToHHMFile() */ | |
415 | |
416 | |
417 | |
418 /** | |
419 * @brief Create a HMM file from aligned sequences | |
420 * | |
421 * @warning Should be replaced in the future by some internal HMM | |
422 * building routine that does not call external programs | |
423 * | |
424 * @param[in] prMSeq | |
425 * Aligned mseq_t | |
426 * @param[in] pcHMMOut | |
427 * HMM output file name | |
428 * | |
429 * @return Non-zero on error | |
430 * | |
431 | |
432 */ | |
433 int | |
434 AlnToHMMFile(mseq_t *prMSeq, const char *pcHMMOut) | |
435 { | |
436 char *tmp_aln = NULL; | |
437 char *tmp_hmm = NULL; /* only needed for hmmer3 to hmmer2 conversion */ | |
438 char cmdbuf[16384]; | |
439 int iHmmerVersion = 0; | |
440 int retcode = OK; | |
441 | |
442 assert(NULL!=prMSeq); | |
443 assert(NULL!=pcHMMOut); | |
444 | |
445 if (FALSE == prMSeq->aligned) { | |
446 Log(&rLog, LOG_ERROR, "Sequences need to be aligned to create an HMM"); | |
447 return FAILURE; | |
448 } | |
449 | |
450 iHmmerVersion = HmmerVersion(); | |
451 if (2 != iHmmerVersion && 3 != iHmmerVersion) { | |
452 Log(&rLog, LOG_ERROR, "Could not find suitable HMMER binaries"); | |
453 return FAILURE; | |
454 } | |
455 | |
456 /* Convert alignment to stockholm, call hmmbuild and then | |
457 * either hmmconvert (hmmer3) or hmmcalibrate (hmmer2) | |
458 * | |
459 * can't be static templates, or mktemp fails (at least on os x | |
460 * (with a bus error)) | |
461 * | |
462 * gcc says we should use mkstemp to avoid race conditions, | |
463 * but that returns a file descriptor, which is of no use to | |
464 * us | |
465 */ | |
466 /* NOTE: the following won't work on windows: missing /tmp/ */ | |
467 tmp_aln = CkStrdup("/tmp/clustalo_tmpaln_XXXXXX"); | |
468 if (NULL == mktemp(tmp_aln)) { | |
469 Log(&rLog, LOG_ERROR, "Could not create temporary alignment filename"); | |
470 retcode = FAILURE; | |
471 goto cleanup_and_return; | |
472 } | |
473 if (WriteAlignment(prMSeq, tmp_aln, MSAFILE_STOCKHOLM)) { | |
474 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", tmp_aln); | |
475 retcode = FAILURE; | |
476 goto cleanup_and_return; | |
477 } | |
478 | |
479 if (2 == iHmmerVersion) { | |
480 sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmcalibrate %s >/dev/null", | |
481 pcHMMOut, tmp_aln, pcHMMOut); | |
482 if (system(cmdbuf)) { | |
483 Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf); | |
484 retcode = FAILURE; | |
485 goto cleanup_and_return; | |
486 } | |
487 } else if (3 == iHmmerVersion) { | |
488 /* NOTE: the following won't work on windows: missing /tmp/ */ | |
489 tmp_hmm = CkStrdup("/tmp/clustalo_tmphmm2_XXXXXX"); | |
490 if (NULL == mktemp(tmp_hmm)) { | |
491 Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename"); | |
492 retcode = FAILURE; | |
493 goto cleanup_and_return; | |
494 } | |
495 sprintf(cmdbuf, "hmmbuild %s %s >/dev/null && hmmconvert -2 %s > %s", | |
496 tmp_hmm, tmp_aln, tmp_hmm, pcHMMOut); | |
497 if (system(cmdbuf)) { | |
498 Log(&rLog, LOG_ERROR, "Command '%s' failed", cmdbuf); | |
499 retcode = FAILURE; | |
500 goto cleanup_and_return; | |
501 } | |
502 } else { | |
503 CKFREE(tmp_aln); | |
504 Log(&rLog, LOG_FATAL, "Internal error: Unknown Hmmer version %d", iHmmerVersion); | |
505 } | |
506 | |
507 | |
508 cleanup_and_return: | |
509 | |
510 if (NULL != tmp_aln) { | |
511 if (FileExists(tmp_aln)) { | |
512 if (remove(tmp_aln)) { | |
513 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_aln); | |
514 } | |
515 } | |
516 CKFREE(tmp_aln); | |
517 } | |
518 if (NULL != tmp_hmm) { | |
519 if (FileExists(tmp_hmm)) { | |
520 if (remove(tmp_hmm)) { | |
521 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", tmp_hmm); | |
522 } | |
523 } | |
524 CKFREE(tmp_hmm); | |
525 } | |
526 | |
527 return retcode; | |
528 } | |
529 /* end of AlnToHMMFile() */ | |
530 | |
531 | |
532 | |
533 /** | |
534 * @brief Convert a multiple sequence structure into a HMM | |
535 * | |
536 * @param[out] prHMM | |
537 * Pointer to preallocted HMM which will be set here | |
538 * @param[in] prMSeq | |
539 * Pointer to an alignment | |
540 * | |
541 * @return 0 on error, non-0 otherwise | |
542 * | |
543 * @see AlnToHMMFile() | |
544 * | |
545 */ | |
546 int | |
547 AlnToHMM(hmm_light *prHMM, mseq_t *prMSeq) | |
548 { | |
549 char *pcHMM; /* temp hmm file */ | |
550 | |
551 Log(&rLog, LOG_INFO, | |
552 "Using HMMER version %d to calculate a new HMM.", | |
553 HmmerVersion()); | |
554 /* FIXME replace all this with internal HMM computation (HHmake) */ | |
555 | |
556 /** | |
557 * @warning the following probably won't work on windows: missing | |
558 * /tmp/. Should be ok on Cygwin though | |
559 */ | |
560 pcHMM = CkStrdup("/tmp/clustalo-hmm-iter_XXXXXX"); | |
561 if (NULL == mktemp(pcHMM)) { | |
562 Log(&rLog, LOG_ERROR, "Could not create temporary hmm filename"); | |
563 CKFREE(pcHMM); | |
564 return FAILURE; | |
565 } | |
566 | |
567 /* Create a HMM representing the current alignment | |
568 */ | |
569 #if USEHMMER | |
570 if (AlnToHMMFile(prMSeq, pcHMM)) { | |
571 Log(&rLog, LOG_ERROR, "AlnToHMMFile() on %s failed.", pcHMM); | |
572 CKFREE(pcHMM); | |
573 return FAILURE; | |
574 } | |
575 #elif USEHHMAKE | |
576 if (AlnToHHMFile(prMSeq, pcHMM)) { | |
577 Log(&rLog, LOG_ERROR, "AlnToHHMFile() on %s failed.", pcHMM); | |
578 CKFREE(pcHMM); | |
579 return FAILURE; | |
580 } | |
581 /* Log(&rLog, LOG_FATAL, "Method to create HHM (HMM using hhmake) not installed yet"); */ | |
582 #else | |
583 Log(&rLog, LOG_FATAL, "Unknown method to create temporary HMM"); | |
584 #endif | |
585 | |
586 /* Read HMM information | |
587 */ | |
588 if (OK != readHMMWrapper(prHMM, pcHMM)){ | |
589 Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMM); | |
590 CKFREE(pcHMM); | |
591 return FAILURE; | |
592 } | |
593 | |
594 if (remove(pcHMM)) { | |
595 Log(&rLog, LOG_WARN, "Removing %s failed. Continuing anyway", pcHMM); | |
596 } | |
597 CKFREE(pcHMM); | |
598 | |
599 return OK; | |
600 } | |
601 /* end of AlnToHMM() */ | |
602 | |
603 | |
604 | |
605 /** | |
606 * @brief FIXME | |
607 * | |
608 */ | |
609 void | |
610 InitClustalOmega(int iNumThreadsRequested) | |
611 { | |
612 | |
613 #ifdef HAVE_OPENMP | |
614 iNumberOfThreads = iNumThreadsRequested; | |
615 omp_set_num_threads(iNumberOfThreads); | |
616 #else | |
617 if (iNumThreadsRequested>1) { | |
618 Log(&rLog, LOG_FATAL, "Cannot change number of threads to %d. %s was build without OpenMP support.", | |
619 iNumThreadsRequested, PACKAGE_NAME); | |
620 } | |
621 iNumberOfThreads = 1; /* need to set this, even if build without support */ | |
622 #endif | |
623 | |
624 Log(&rLog, LOG_INFO, "Using %d threads", | |
625 iNumberOfThreads); | |
626 | |
627 } | |
628 /* end of InitClustalOmega() */ | |
629 | |
630 | |
631 | |
632 /** | |
633 * @brief Defines an alignment order, which adds sequences | |
634 * sequentially, i.e. one at a time starting with seq 1 & 2 | |
635 * | |
636 * @param[out] piOrderLR_p | |
637 * order in which nodes/profiles are to be merged/aligned | |
638 * @param[in] iNumSeq | |
639 * Number of sequences | |
640 * | |
641 * @see TraverseTree() | |
642 * | |
643 */ | |
644 void | |
645 SequentialAlignmentOrder(int **piOrderLR_p, int iNumSeq) | |
646 { | |
647 unsigned int uNodes = iNumSeq*2-1; | |
648 unsigned int uNodeCounter = 0; | |
649 unsigned int uSeqCounter = 0; | |
650 | |
651 Log(&rLog, LOG_FATAL, "FIXME: Untested..."); | |
652 | |
653 (*piOrderLR_p) = (int *)CKCALLOC(DIFF_NODE * uNodes, sizeof(int)); | |
654 /* loop over merge nodes, which have per definition even indices | |
655 * and set up children which have odd indices | |
656 */ | |
657 uSeqCounter = 0; | |
658 for (uNodeCounter=iNumSeq; uNodeCounter<uNodes; uNodeCounter+=1) { | |
659 unsigned int uLeftChildNodeIndex = uNodeCounter-1; | |
660 unsigned int uRightChildNodeIndex = uNodeCounter-iNumSeq+1; | |
661 unsigned int uParentNodeIndex = uNodeCounter+1; | |
662 | |
663 /* merge node setup */ | |
664 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = uLeftChildNodeIndex; | |
665 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE] = uRightChildNodeIndex; | |
666 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE] = uParentNodeIndex; | |
667 /* only setup left child if at first merge node, all other left childs | |
668 * should be merge nodes that are already set up. also correct | |
669 * left node number here. | |
670 */ | |
671 if (uNodeCounter==iNumSeq) { | |
672 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE] = 0; | |
673 | |
674 (*piOrderLR_p)[0+LEFT_NODE] = 0; | |
675 (*piOrderLR_p)[0+RGHT_NODE] = 0; | |
676 (*piOrderLR_p)[0+PRNT_NODE] = uNodeCounter; | |
677 uSeqCounter++; | |
678 | |
679 Log(&rLog, LOG_FORCED_DEBUG, "Set up first leaf with node counter %d: left=%d right=%d parent=%d", | |
680 0, | |
681 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+LEFT_NODE], | |
682 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+RGHT_NODE], | |
683 (*piOrderLR_p)[DIFF_NODE*uLeftChildNodeIndex+PRNT_NODE]); | |
684 } | |
685 Log(&rLog, LOG_FORCED_DEBUG, "Set up merge node with node counter %d: left=%d right=%d parent=%d", | |
686 uNodeCounter, (*piOrderLR_p)[DIFF_NODE*uNodeCounter+LEFT_NODE], | |
687 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+RGHT_NODE], | |
688 (*piOrderLR_p)[DIFF_NODE*uNodeCounter+PRNT_NODE]); | |
689 | |
690 /* right child */ | |
691 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE] = uSeqCounter; | |
692 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE] = uSeqCounter; | |
693 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE] = uNodeCounter; | |
694 uSeqCounter++; | |
695 | |
696 Log(&rLog, LOG_FORCED_DEBUG, "Set up leaf with node counter %d: left=%d right=%d parent=%d", | |
697 uRightChildNodeIndex, (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+LEFT_NODE], | |
698 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+RGHT_NODE], | |
699 (*piOrderLR_p)[DIFF_NODE*uRightChildNodeIndex+PRNT_NODE]); | |
700 } | |
701 } | |
702 /* end of SequentialAlignmentOrder() */ | |
703 | |
704 | |
705 | |
706 /** | |
707 * @brief Defines the alignment order by calculating a guide tree. In | |
708 * a first-step pairwise distances will be calculated (or read from a | |
709 * file). In a second step those distances will be clustered and a | |
710 * guide-tree created. Steps 1 and 2 will be skipped if a guide-tree | |
711 * file was given, in which case the guide-tree will be just read from | |
712 * the file. | |
713 * | |
714 * @param[out] piOrderLR_p | |
715 * order in which nodes/profiles are to be merged/aligned | |
716 * @param[out] pdSeqWeights_p | |
717 * Sequence weights | |
718 * @param[out] pdSeqWeights_p | |
719 * Sequence weights | |
720 * @param[in] prMSeq | |
721 * The sequences from which the alignment order is to be calculated | |
722 * @param[in] iPairDistType | |
723 * Method of pairwise distance comparison | |
724 * @param[in] pcDistmatInfile | |
725 * If not NULL distances will be read from this file instead of being | |
726 * calculated | |
727 * @param[in] pcDistmatOutfile | |
728 * If not NULL computed pairwise distances will be written to this file | |
729 * @param[in] iClusteringType | |
730 * Clustering method to be used to cluster the pairwise distances | |
731 * @param[in] pcGuidetreeInfile | |
732 * If not NULL guidetree will be read from this file. Skips pairwise | |
733 * distance and guidetree computation | |
734 * @param[in] pcGuidetreeOutfile | |
735 * If not NULL computed guidetree will be written to this file | |
736 * @param[in] bUseMbed | |
737 * If TRUE, fast mBed guidetree computation will be employed | |
738 * | |
739 * @return Non-zero on error | |
740 * | |
741 */ | |
742 int | |
743 AlignmentOrder(int **piOrderLR_p, double **pdSeqWeights_p, mseq_t *prMSeq, | |
744 int iPairDistType, char *pcDistmatInfile, char *pcDistmatOutfile, | |
745 int iClusteringType, char *pcGuidetreeInfile, char *pcGuidetreeOutfile, | |
746 bool bUseMbed) | |
747 { | |
748 /* pairwise distance matrix (tmat in 1.83) */ | |
749 symmatrix_t *distmat = NULL; | |
750 /* guide tree */ | |
751 tree_t *prTree = NULL; | |
752 int i = 0; | |
753 | |
754 | |
755 /* Shortcut for only two sequences: Do not compute k-tuple | |
756 * distances. Use the same logic as in TraverseTree() to setup | |
757 * piOrderLR_p. Changes there will have to be reflected here as | |
758 * well. */ | |
759 if (2==prMSeq->nseqs) { | |
760 Log(&rLog, LOG_VERBOSE, | |
761 "Have only two sequences: No need to compute pairwise score and compute a tree."); | |
762 | |
763 (*piOrderLR_p) = (int*) CKMALLOC(DIFF_NODE * 3 * sizeof(int)); | |
764 (*piOrderLR_p)[DIFF_NODE*0+LEFT_NODE] = 0; | |
765 (*piOrderLR_p)[DIFF_NODE*0+RGHT_NODE] = 0; | |
766 (*piOrderLR_p)[DIFF_NODE*0+PRNT_NODE] = 0; | |
767 | |
768 (*piOrderLR_p)[DIFF_NODE*1+LEFT_NODE] = 1; | |
769 (*piOrderLR_p)[DIFF_NODE*1+RGHT_NODE] = 1; | |
770 (*piOrderLR_p)[DIFF_NODE*1+PRNT_NODE] = 1; | |
771 | |
772 /* root */ | |
773 (*piOrderLR_p)[DIFF_NODE*2+LEFT_NODE] = 0; | |
774 (*piOrderLR_p)[DIFF_NODE*2+RGHT_NODE] = 1; | |
775 (*piOrderLR_p)[DIFF_NODE*2+PRNT_NODE] = 2; | |
776 | |
777 /* Same logic as CalcClustalWeights(). Changes there will | |
778 have to be reflected here as well. */ | |
779 #if USE_WEIGHTS | |
780 (*pdWeights_p) = (double *) CKMALLOC(uNodeCount * sizeof(double)); | |
781 (*pdWeights_p)[0] = 0.5; | |
782 (*pdWeights_p)[1] = 0.5; | |
783 #endif | |
784 | |
785 return OK; | |
786 } | |
787 | |
788 | |
789 /* compute distance & guide tree, alternatively read distances or | |
790 * guide tree from file | |
791 * | |
792 */ | |
793 if (NULL != pcGuidetreeInfile) { | |
794 Log(&rLog, LOG_INFO, "Reading guide-tree from %s", pcGuidetreeInfile); | |
795 if (GuideTreeFromFile(&prTree, prMSeq, pcGuidetreeInfile)) { | |
796 Log(&rLog, LOG_ERROR, "Reading of guide tree %s failed.", pcGuidetreeInfile); | |
797 return FAILURE; | |
798 } | |
799 | |
800 } else { | |
801 | |
802 if (bUseMbed) { | |
803 if (Mbed(&prTree, prMSeq, iPairDistType, pcGuidetreeOutfile)) { | |
804 Log(&rLog, LOG_ERROR, "mbed execution failed."); | |
805 return FAILURE; | |
806 } | |
807 Log(&rLog, LOG_INFO, "Guide-tree computation (mBed) done."); | |
808 if (NULL != pcDistmatOutfile) { | |
809 Log(&rLog, LOG_INFO, | |
810 "Ignoring request to write distance matrix (am in mBed mode)"); | |
811 } | |
812 } else { | |
813 | |
814 if (PairDistances(&distmat, prMSeq, iPairDistType, | |
815 0, prMSeq->nseqs, 0, prMSeq->nseqs, | |
816 pcDistmatInfile, pcDistmatOutfile)) { | |
817 Log(&rLog, LOG_ERROR, "Couldn't compute pair distances"); | |
818 return FAILURE; | |
819 } | |
820 | |
821 /* clustering of distances to get guide tree | |
822 */ | |
823 if (CLUSTERING_UPGMA == iClusteringType) { | |
824 char **labels; | |
825 labels = (char**) CKMALLOC(prMSeq->nseqs * sizeof(char*)); | |
826 for (i=0; i<prMSeq->nseqs; i++) { | |
827 labels[i] = prMSeq->sqinfo[i].name; | |
828 } | |
829 | |
830 GuideTreeUpgma(&prTree, labels, distmat, pcGuidetreeOutfile); | |
831 Log(&rLog, LOG_INFO, "Guide-tree computation done."); | |
832 | |
833 CKFREE(labels); | |
834 } else { | |
835 Log(&rLog, LOG_FATAL, "INTERNAL ERROR %s", | |
836 "clustering method should have been checked before"); | |
837 } | |
838 } | |
839 } | |
840 | |
841 #if USE_WEIGHTS | |
842 /* derive sequence weights from tree | |
843 * | |
844 */ | |
845 Log(&rLog, LOG_INFO, "Calculating sequence weights"); | |
846 CalcClustalWeights(pdSeqWeights_p, prTree); | |
847 for (i = 0; i < GetLeafCount(prTree); i++) { | |
848 Log(&rLog, LOG_VERBOSE, | |
849 "Weight for seq no %d: %s = %f", | |
850 i, prMSeq->sqinfo[i].name, (*pdSeqWeights_p)[i]); | |
851 } | |
852 #else | |
853 Log(&rLog, LOG_DEBUG, "Not using weights"); | |
854 #endif | |
855 | |
856 | |
857 /* define traversing order of tree | |
858 * | |
859 */ | |
860 TraverseTree(piOrderLR_p, prTree, prMSeq); | |
861 if (rLog.iLogLevelEnabled <= LOG_DEBUG) { | |
862 /* FIXME: debug only, FS */ | |
863 uint uNodeIndex; | |
864 FILE *fp = LogGetFP(&rLog, LOG_INFO); | |
865 Log(&rLog, LOG_DEBUG, "left/right order after tree traversal"); | |
866 for (uNodeIndex = 0; uNodeIndex < GetNodeCount(prTree); uNodeIndex++) { | |
867 fprintf(fp, "%3d:\t%2d/%2d -> %d\n", i, | |
868 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+LEFT_NODE], | |
869 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+RGHT_NODE], | |
870 (*piOrderLR_p)[DIFF_NODE*uNodeIndex+PRNT_NODE]); | |
871 } | |
872 } | |
873 | |
874 FreeMuscleTree(prTree); | |
875 FreeSymMatrix(&distmat); | |
876 | |
877 #if 0 | |
878 Log(&rLog, LOG_FATAL, "DEBUG EXIT before leaving %s", __FUNCTION__); | |
879 #endif | |
880 return OK; | |
881 } | |
882 /* end of AlignmentOrder() */ | |
883 | |
884 | |
885 | |
886 /** | |
887 * @brief Set some options automatically based on number of sequences. Might | |
888 * overwrite some user-set options. | |
889 * | |
890 * @param[out] prOpts | |
891 * Pointer to alignment options structure | |
892 * @param[in] iNumSeq | |
893 * Number of sequences to align | |
894 */ | |
895 void | |
896 SetAutoOptions(opts_t *prOpts, int iNumSeq) { | |
897 | |
898 Log(&rLog, LOG_INFO, | |
899 "Setting options automatically based on input sequence characteristics (might overwrite some of your options)."); | |
900 | |
901 if (iNumSeq >= 10000) { | |
902 if (0 != prOpts->iNumIterations) { | |
903 Log(&rLog, LOG_INFO, "Auto settings: Disabling iterations."); | |
904 prOpts->iNumIterations = 0; | |
905 } | |
906 if (FALSE == prOpts->bUseMbed) { | |
907 Log(&rLog, LOG_INFO, "Auto settings: Enabling mBed."); | |
908 prOpts->bUseMbed = TRUE; | |
909 } | |
910 | |
911 } else if (iNumSeq >= 1000) { | |
912 if (1 != prOpts->iNumIterations) { | |
913 Log(&rLog, LOG_INFO, "Auto settings: Setting iteration to 1."); | |
914 prOpts->iNumIterations = 1; | |
915 } | |
916 if (FALSE == prOpts->bUseMbed) { | |
917 Log(&rLog, LOG_INFO, "Auto settings: Enabling mBed."); | |
918 prOpts->bUseMbed = TRUE; | |
919 } | |
920 | |
921 | |
922 } else { | |
923 if (0 != prOpts->iNumIterations) { | |
924 Log(&rLog, LOG_INFO, "Auto settings: Disabling iterations."); | |
925 prOpts->iNumIterations = 0; | |
926 } | |
927 if (TRUE == prOpts->bUseMbed) { | |
928 Log(&rLog, LOG_INFO, "Auto settings: Disabling mBed."); | |
929 prOpts->bUseMbed = FALSE; | |
930 } | |
931 } | |
932 } | |
933 /* end of */ | |
934 | |
935 | |
936 | |
937 /** | |
938 * @brief The main alignment function which wraps everything else. | |
939 * | |
940 * @param[out] prMSeq | |
941 * *the* multiple sequences structure | |
942 * @param[in] prMSeqProfile | |
943 * optional profile to align against | |
944 * @param[in] prOpts | |
945 * alignmemnt options to use | |
946 * | |
947 * @return 0 on success, -1 on failure | |
948 * | |
949 */ | |
950 int | |
951 Align(mseq_t *prMSeq, | |
952 mseq_t *prMSeqProfile, | |
953 opts_t *prOpts, | |
954 hhalign_para rHhalignPara) { | |
955 | |
956 /* HMM | |
957 */ | |
958 /* structs with pseudocounts etc; one for each HMM infile, i.e. | |
959 * index range: 0..iHMMInputFiles */ | |
960 hmm_light *prHMMs = NULL; | |
961 | |
962 /* MSA order in which nodes/profiles are to be merged/aligned | |
963 (order of nodes in guide tree (left/right)*/ | |
964 int *piOrderLR = NULL; | |
965 | |
966 /* weights per sequence */ | |
967 double *pdSeqWeights = NULL; | |
968 | |
969 /* Iteration | |
970 */ | |
971 int iIterationCounter = 0; | |
972 double dAlnScore; | |
973 /* last dAlnScore for iteration */ | |
974 double dLastAlnScore = -666.666; | |
975 | |
976 int i, j; /* aux */ | |
977 | |
978 assert(NULL != prMSeq); | |
979 if (NULL != prMSeqProfile) { | |
980 assert(TRUE == prMSeqProfile->aligned); | |
981 } | |
982 | |
983 | |
984 /* automatic setting of options | |
985 * | |
986 */ | |
987 if (prOpts->bAutoOptions) { | |
988 SetAutoOptions(prOpts, prMSeq->nseqs); | |
989 } | |
990 | |
991 | |
992 #if SHUFFLE_INPUT_SEQ_ORDER | |
993 /* | |
994 * shuffle input: only useful for testing/debugging | |
995 */ | |
996 Log(&rLog, LOG_WARN, "Shuffling input sequences! (Will also change output order)"); | |
997 ShuffleMSeq(prMSeq); | |
998 #endif | |
999 | |
1000 | |
1001 #if SORT_INPUT_SEQS | |
1002 /* | |
1003 * sort input: | |
1004 * | |
1005 * would ensure we *always* (unless we get into the mbed k-means stage) | |
1006 * get the same answer. usually you don't, because most pairwise alignment | |
1007 * scores are in theory not symmetric, therefore sequence ordering might | |
1008 * have an effect on the guide-tree. Sorting by length should get rid of | |
1009 * this (and takes no time even for 100k seqs). Benchmark results on | |
1010 * Balibase show almost no difference after sorting. | |
1011 */ | |
1012 Log(&rLog, LOG_WARN, "Sorting input seq by length! This will also change the output order"); | |
1013 SortMSeqByLength(prMSeq, 'd'); | |
1014 | |
1015 #endif | |
1016 | |
1017 | |
1018 /* Read backgrounds HMMs and store in prHMMs | |
1019 * | |
1020 */ | |
1021 if (0 < prOpts->iHMMInputFiles) { | |
1022 int iHMMInfileIndex; | |
1023 | |
1024 /** | |
1025 * @warning old structure used to be initialised like this: | |
1026 * hmm_light rHMM = {0}; | |
1027 */ | |
1028 prHMMs = (hmm_light *) CKMALLOC(prOpts->iHMMInputFiles * sizeof(hmm_light)); | |
1029 | |
1030 for (iHMMInfileIndex=0; iHMMInfileIndex<prOpts->iHMMInputFiles; iHMMInfileIndex++) { | |
1031 char *pcHMMInput = prOpts->ppcHMMInput[iHMMInfileIndex]; | |
1032 if (OK != readHMMWrapper(&prHMMs[iHMMInfileIndex], pcHMMInput)){ | |
1033 Log(&rLog, LOG_ERROR, "Processing of HMM file %s failed", pcHMMInput); | |
1034 return -1; | |
1035 } | |
1036 | |
1037 #if 0 | |
1038 Log(&rLog, LOG_FORCED_DEBUG, "HMM length is %d", prHMMs[iHMMInfileIndex].L); | |
1039 Log(&rLog, LOG_FORCED_DEBUG, "n-display is %d", prHMMs[iHMMInfileIndex].n_display); | |
1040 for (i = 0; NULL != prHMMs[prOpts->iHMMInputFiles].seq[i]; i++){ | |
1041 printf("seq[%d]: %s\n", i, prHMMs[iHMMInfileIndex].seq[i]); | |
1042 } | |
1043 Log(&rLog, LOG_FORCED_DEBUG, "Neff_HMM is %f", prHMMs[iHMMInfileIndex].Neff_HMM); | |
1044 #endif | |
1045 if (rLog.iLogLevelEnabled <= LOG_DEBUG){ | |
1046 Log(&rLog, LOG_DEBUG, "print frequencies"); | |
1047 for (i = 0; i < prHMMs[iHMMInfileIndex].L; i++){ | |
1048 #define PRINT_TAIL 5 | |
1049 if ( (PRINT_TAIL+1 == i) && (prHMMs[iHMMInfileIndex].L-PRINT_TAIL != i) ){ | |
1050 printf("....\n"); | |
1051 } | |
1052 if ( (i > PRINT_TAIL) && (i < prHMMs[iHMMInfileIndex].L-PRINT_TAIL) ){ | |
1053 continue; | |
1054 } | |
1055 printf("%3d:", i); | |
1056 for (j = 0; j < 20; j++){ | |
1057 printf("\t%1.3f", prHMMs[iHMMInfileIndex].f[i][j]); | |
1058 } | |
1059 printf("\n"); | |
1060 } | |
1061 } /* debug print block */ | |
1062 | |
1063 CKFREE(prOpts->ppcHMMInput[iHMMInfileIndex]); | |
1064 } /* for each background HMM file */ | |
1065 CKFREE(prOpts->ppcHMMInput); | |
1066 } /* there were background HMM files */ | |
1067 | |
1068 | |
1069 | |
1070 /* If the input ("non-profile") sequences are aligned, then turn | |
1071 * the alignment into a HMM and add to the list of background HMMs | |
1072 * | |
1073 */ | |
1074 if (TRUE == prMSeq->aligned) { | |
1075 /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */ | |
1076 hmm_light rHMMLocal = {0}; | |
1077 | |
1078 Log(&rLog, LOG_INFO, | |
1079 "Input sequences are aligned. Will turn alignment into HMM and add it to the user provided background HMMs."); | |
1080 if (OK != | |
1081 #if INDIRECT_HMM | |
1082 AlnToHMM(&rHMMLocal, prMSeq) | |
1083 #else | |
1084 AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs) | |
1085 #endif | |
1086 ) { | |
1087 Log(&rLog, LOG_ERROR, "Couldn't convert aligned input sequences to HMM. Will try to continue"); | |
1088 } else { | |
1089 prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light))); | |
1090 memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light)); | |
1091 prOpts->iHMMInputFiles++; | |
1092 } | |
1093 } | |
1094 | |
1095 | |
1096 /* If we have a profile turn it into a HMM and add to | |
1097 * the list of background HMMs. | |
1098 * | |
1099 */ | |
1100 if (NULL != prMSeqProfile) { | |
1101 /* FIXME: gcc warns about missing initialiser here (-Wall -Wextra -pedantic) */ | |
1102 hmm_light rHMMLocal = {0}; | |
1103 Log(&rLog, LOG_INFO, | |
1104 "Turning profile1 into HMM and will use it during progressive alignment."); | |
1105 if (OK != | |
1106 #if INDIRECT_HMM | |
1107 AlnToHMM(&rHMMLocal, prMSeqProfile) | |
1108 #else | |
1109 AlnToHMM2(&rHMMLocal, prMSeqProfile->seq, prMSeqProfile->nseqs) | |
1110 #endif | |
1111 ) { | |
1112 Log(&rLog, LOG_ERROR, "Couldn't convert profile1 to HMM. Will try to continue"); | |
1113 } else { | |
1114 prHMMs = (hmm_light *) CKREALLOC(prHMMs, ((prOpts->iHMMInputFiles+1) * sizeof(hmm_light))); | |
1115 memcpy(&(prHMMs[prOpts->iHMMInputFiles]), &rHMMLocal, sizeof(hmm_light)); | |
1116 prOpts->iHMMInputFiles++; | |
1117 } | |
1118 } | |
1119 | |
1120 | |
1121 /* Now do a first alignment of the input sequences (prMSeq) adding | |
1122 * all collected background HMMs | |
1123 * | |
1124 */ | |
1125 /* Determine progressive alignment order | |
1126 */ | |
1127 if (TRUE == prMSeq->aligned) { | |
1128 Log(&rLog, LOG_INFO, "%s %s", | |
1129 "Input sequences are aligned.", | |
1130 "Will use Kimura distances of aligned sequences."); | |
1131 prOpts->iPairDistType = PAIRDIST_SQUIDID_KIMURA; | |
1132 } | |
1133 | |
1134 #if 0 | |
1135 Log(&rLog, LOG_WARN, "Using a sequential alignment order."); | |
1136 SequentialAlignmentOrder(&piOrderLR, prMSeq->nseqs); | |
1137 #else | |
1138 if (OK != AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq, | |
1139 prOpts->iPairDistType, | |
1140 prOpts->pcDistmatInfile, prOpts->pcDistmatOutfile, | |
1141 prOpts->iClusteringType, | |
1142 prOpts->pcGuidetreeInfile, prOpts->pcGuidetreeOutfile, | |
1143 prOpts->bUseMbed)) { | |
1144 Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue"); | |
1145 return -1; | |
1146 } | |
1147 #endif | |
1148 | |
1149 /* Progressive alignment of input sequences. Order defined by | |
1150 * branching of guide tree (piOrderLR). Use optional | |
1151 * background HMM information (prHMMs[0..prOpts->iHMMInputFiles-1]) | |
1152 * | |
1153 */ | |
1154 dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights, | |
1155 2*prMSeq->nseqs -1/* nodes */, | |
1156 prHMMs, prOpts->iHMMInputFiles, -1, rHhalignPara); | |
1157 dLastAlnScore = dAlnScore; | |
1158 Log(&rLog, LOG_VERBOSE, | |
1159 "Alignment score for first alignment = %f", dAlnScore); | |
1160 | |
1161 | |
1162 | |
1163 | |
1164 /* ------------------------------------------------------------ | |
1165 * | |
1166 * prMSeq is aligned now. Now start iterations if requested and save the | |
1167 * alignment at the very end. | |
1168 * | |
1169 * @note We discard the background HMM information at this point, | |
1170 * because it was already used. Could consider to make this choice | |
1171 * optional. FIXME | |
1172 * | |
1173 * ------------------------------------------------------------ */ | |
1174 | |
1175 | |
1176 /* iteration after first alignment was computed (if not profile-profile | |
1177 * alignment) | |
1178 * | |
1179 */ | |
1180 for (iIterationCounter=0; | |
1181 (iIterationCounter < prOpts->iNumIterations || prOpts->bIterationsAuto); | |
1182 iIterationCounter++) { | |
1183 | |
1184 hmm_light rHMMLocal = {0}; | |
1185 /* FIXME Keep copy of old alignment in case new one sucks? */ | |
1186 | |
1187 | |
1188 if (iIterationCounter >= prOpts->iMaxHMMIterations | |
1189 && | |
1190 iIterationCounter >= prOpts->iMaxGuidetreeIterations) { | |
1191 Log(&rLog, LOG_VERBOSE, "Reached maximum number of HMM and guide-tree iterations"); | |
1192 break; | |
1193 } | |
1194 | |
1195 if (! prOpts->bIterationsAuto) { | |
1196 Log(&rLog, LOG_INFO, "Iteration step %d out of %d", | |
1197 iIterationCounter+1, prOpts->iNumIterations); | |
1198 } else { | |
1199 Log(&rLog, LOG_INFO, "Iteration step %d out of <auto>", | |
1200 iIterationCounter+1); | |
1201 } | |
1202 #if 0 | |
1203 if (rLog.iLogLevelEnabled <= LOG_VERBOSE) { | |
1204 char zcIntermediate[1000] = {0}; | |
1205 char *pcFormat = "fasta"; | |
1206 sprintf(zcIntermediate, "clustalo-aln-iter~%d~", iIterationCounter); | |
1207 if (WriteAlignment(prMSeq, zcIntermediate, MSAFILE_A2M)) { | |
1208 Log(&rLog, LOG_ERROR, "Could not save alignment to %s", zcIntermediate); | |
1209 return -1; | |
1210 } | |
1211 } | |
1212 #endif | |
1213 | |
1214 | |
1215 /* new guide-tree | |
1216 * | |
1217 */ | |
1218 if (iIterationCounter < prOpts->iMaxGuidetreeIterations) { | |
1219 /* determine progressive alignment order | |
1220 * | |
1221 * few things are different now when calling AlignmentOrder: | |
1222 * - we have to ignore prOpts->pcDistmatInfile and pcGuidetreeInfile | |
1223 * as they were used before | |
1224 * - the corresponding outfiles are still valid though | |
1225 */ | |
1226 /* Free stuff that has already been allocated by or further | |
1227 * downstream of AlignmentOrder() | |
1228 */ | |
1229 if (NULL != piOrderLR) | |
1230 CKFREE(piOrderLR); | |
1231 if (NULL != pdSeqWeights) | |
1232 CKFREE(pdSeqWeights); | |
1233 if (AlignmentOrder(&piOrderLR, &pdSeqWeights, prMSeq, | |
1234 PAIRDIST_SQUIDID_KIMURA /* override */, NULL, prOpts->pcDistmatOutfile, | |
1235 prOpts->iClusteringType, NULL, prOpts->pcGuidetreeOutfile, | |
1236 prOpts->bUseMbedForIteration)) { | |
1237 Log(&rLog, LOG_ERROR, "AlignmentOrder() failed. Cannot continue"); | |
1238 return -1; | |
1239 } | |
1240 } else { | |
1241 Log(&rLog, LOG_INFO, "Skipping guide-tree iteration at iteration step %d (reached maximum)", | |
1242 iIterationCounter); | |
1243 } | |
1244 | |
1245 | |
1246 /* new local hmm iteration | |
1247 * | |
1248 */ | |
1249 if (iIterationCounter < prOpts->iMaxHMMIterations) { | |
1250 if (OK != | |
1251 #if INDIRECT_HMM | |
1252 AlnToHMM(&rHMMLocal, prMSeq) | |
1253 #else | |
1254 AlnToHMM2(&rHMMLocal, prMSeq->seq, prMSeq->nseqs) | |
1255 #endif | |
1256 ) { | |
1257 Log(&rLog, LOG_ERROR, "Couldn't convert alignment to HMM. Will stop iterating now..."); | |
1258 break; | |
1259 } | |
1260 } else { | |
1261 Log(&rLog, LOG_INFO, "Skipping HMM iteration at iteration step %d (reached maximum)", | |
1262 iIterationCounter); | |
1263 } | |
1264 | |
1265 | |
1266 /* align the sequences (again) | |
1267 */ | |
1268 dAlnScore = HHalignWrapper(prMSeq, piOrderLR, pdSeqWeights, | |
1269 2*prMSeq->nseqs -1/* nodes */, &rHMMLocal, 1, -1, rHhalignPara); | |
1270 Log(&rLog, LOG_VERBOSE, | |
1271 "Alignment score for alignmnent in hmm-iteration no %d = %f (last score = %f)", | |
1272 iIterationCounter+1, dAlnScore, dLastAlnScore); | |
1273 | |
1274 | |
1275 FreeHMMstruct(&rHMMLocal); | |
1276 | |
1277 #if 0 | |
1278 /* FIXME: need a better score for automatic iteration */ | |
1279 if (prOpts->bIterationsAuto) { | |
1280 /* automatic iteration: break if score improvement was not | |
1281 * big enough | |
1282 */ | |
1283 double dScoreImprovement = (dAlnScore-dLastAlnScore)/dLastAlnScore; | |
1284 if (dScoreImprovement < ITERATION_SCORE_IMPROVEMENT_THRESHOLD) { | |
1285 Log(&rLog, LOG_INFO, | |
1286 "Stopping after %d guide-tree iterations. No further alignment score improvement achieved.", | |
1287 iIterationCounter+1); | |
1288 /* use previous alignment */ | |
1289 FreeMSeq(&prMSeq); | |
1290 Log(&rLog, LOG_FORCED_DEBUG, "FIXME: %s", "CopyMSeq breaks things in this context"); | |
1291 CopyMSeq(&prMSeq, prMSeqCopy); | |
1292 /* FIXME: prOpts->pcDistmatOutfile and pcGuidetreeOutfile | |
1293 * might have been updated, but then discarded here? | |
1294 */ | |
1295 break; | |
1296 } else { | |
1297 Log(&rLog, LOG_INFO, | |
1298 "Got a %d%% better score in iteration step %d", | |
1299 (int)dScoreImprovement*100, iIterationCounter+1); | |
1300 FreeMSeq(&prMSeqCopy); | |
1301 } | |
1302 } | |
1303 dLastAlnScore = dAlnScore; | |
1304 #endif | |
1305 | |
1306 } | |
1307 /* end of iterations */ | |
1308 | |
1309 | |
1310 | |
1311 /* Last step: if a profile was also provided then align now-aligned mseq | |
1312 * with this profile | |
1313 * | |
1314 * Don't use the backgrounds HMMs anymore and don't iterate. | |
1315 * (which was done before). | |
1316 * | |
1317 */ | |
1318 if (NULL != prMSeqProfile) { | |
1319 if (AlignProfiles(prMSeq, prMSeqProfile, rHhalignPara)) { | |
1320 Log(&rLog, LOG_ERROR, "An error occured during the profile/profile alignment"); | |
1321 return -1; | |
1322 } | |
1323 } | |
1324 | |
1325 | |
1326 if (NULL != piOrderLR) { | |
1327 CKFREE(piOrderLR); | |
1328 } | |
1329 if (NULL != pdSeqWeights) { | |
1330 CKFREE(pdSeqWeights); | |
1331 } | |
1332 if (0 < prOpts->iHMMInputFiles) { | |
1333 for (i=0; i<prOpts->iHMMInputFiles; i++) { | |
1334 FreeHMMstruct(&prHMMs[i]); | |
1335 } | |
1336 CKFREE(prHMMs); | |
1337 } | |
1338 | |
1339 return 0; | |
1340 } | |
1341 /* end of Align() */ | |
1342 | |
1343 | |
1344 | |
1345 | |
1346 /** | |
1347 * @brief Align two profiles, ie two sets of prealigned sequences. Already | |
1348 * aligned columns won't be changed. | |
1349 * | |
1350 * @param[out] prMSeqProfile1 | |
1351 * First profile/aligned set of sequences. Merged alignment will be found in | |
1352 * here. | |
1353 * @param[in] prMSeqProfile2 | |
1354 * First profile/aligned set of sequences | |
1355 * | |
1356 * @return 0 on success, -1 on failure | |
1357 * | |
1358 */ | |
1359 int | |
1360 AlignProfiles(mseq_t *prMSeqProfile1, | |
1361 mseq_t *prMSeqProfile2, hhalign_para rHhalignPara) { | |
1362 | |
1363 double dAlnScore; | |
1364 | |
1365 /* number of seqs in first half of joined profile */ | |
1366 int iProfProfSeparator = prMSeqProfile1->nseqs; | |
1367 | |
1368 assert(TRUE == prMSeqProfile1->aligned); | |
1369 assert(TRUE == prMSeqProfile2->aligned); | |
1370 | |
1371 Log(&rLog, LOG_INFO, "Performing profile/profile alignment"); | |
1372 | |
1373 /* Combine the available mseqs into prMSeq | |
1374 * which will be aligned afterwards. | |
1375 */ | |
1376 JoinMSeqs(&prMSeqProfile1, prMSeqProfile2); | |
1377 | |
1378 | |
1379 /* set alignment flag explicitly to FALSE */ | |
1380 prMSeqProfile1->aligned = FALSE; | |
1381 | |
1382 dAlnScore = HHalignWrapper(prMSeqProfile1, | |
1383 NULL, /* no order */ | |
1384 NULL, /* no weights */ | |
1385 3, /* nodes: root+2profiles */ | |
1386 NULL, 0 /* no bg-hmms */, | |
1387 iProfProfSeparator, rHhalignPara); | |
1388 | |
1389 Log(&rLog, LOG_VERBOSE, "Alignment score is = %f", dAlnScore); | |
1390 | |
1391 return 0; | |
1392 } | |
1393 /* end of AlignProfiles() */ | |
1394 | |
1395 |