Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-0.2.0/src/clustal/mbed.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ff1768533a07 |
---|---|
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ | |
2 | |
3 /********************************************************************* | |
4 * Clustal Omega - Multiple sequence alignment | |
5 * | |
6 * Copyright (C) 2010 University College Dublin | |
7 * | |
8 * Clustal-Omega is free software; you can redistribute it and/or | |
9 * modify it under the terms of the GNU General Public License as | |
10 * published by the Free Software Foundation; either version 2 of the | |
11 * License, or (at your option) any later version. | |
12 * | |
13 * This file is part of Clustal-Omega. | |
14 * | |
15 ********************************************************************/ | |
16 | |
17 /* | |
18 * RCS $Id: mbed.c 242 2011-05-27 14:04:21Z andreas $ | |
19 * | |
20 * | |
21 * Reimplementation from scratch of mBed: | |
22 * Blackshields et al. (2010); PMID 20470396 | |
23 * | |
24 * | |
25 */ | |
26 | |
27 #ifdef HAVE_CONFIG_H | |
28 #include "config.h" | |
29 #endif | |
30 | |
31 #include <assert.h> | |
32 #include <float.h> | |
33 | |
34 #include "mbed.h" | |
35 #include "pair_dist.h" | |
36 #include "symmatrix.h" | |
37 #include "ktuple_pair.h" | |
38 #include "tree.h" | |
39 #include "util.h" | |
40 #include "progress.h" | |
41 #include "queue.h" | |
42 | |
43 #include "log.h" | |
44 #include "kmpp/KMeans.h" | |
45 #include "mbed.h" | |
46 | |
47 #define TIMING 0 | |
48 #if TIMING | |
49 #include "squid/stopwatch.h" | |
50 #endif | |
51 | |
52 | |
53 /* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each | |
54 * bisecting kmeans subcluster are not estimated using the vectors, | |
55 * but calculated normally (using ktuple or kimura). Surprisingly this | |
56 * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000 | |
57 * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS | |
58 * log2 instead of log2^2). And of course it slows things down. | |
59 */ | |
60 #define FULL_WITHIN_CLUSTER_DISTANCES 1 | |
61 | |
62 | |
63 /* Cluster size limits. Maximum is a soft limit, which might be | |
64 * exceeded if a K-Means split was unsuccesful, where unsuccesful | |
65 * might also mean that the minimum required number seqs. was not | |
66 * reached */ | |
67 #if FULL_WITHIN_CLUSTER_DISTANCES | |
68 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100; | |
69 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1; | |
70 #else | |
71 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000; | |
72 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100; | |
73 #endif | |
74 | |
75 /* How many restarts per bisecting kmeans split. 10 give 0.5% increase | |
76 * in quality on original HOMFAM over just 1. It also increases kmeans | |
77 * time by a factor of ~3, but overall time is insignificant | |
78 * compared to pairdist/progressive alignment part. | |
79 */ | |
80 static const int RESTARTS_PER_SPLIT = 10; | |
81 | |
82 | |
83 /* Use standard kmeans (lloyds) or kmeans++. Both are almost | |
84 * indistinguishable here, although kmeans++ is slightly ahead the | |
85 * fewer seeds you pick (and it should be better in theory) | |
86 */ | |
87 #define USE_KMEANS_LLOYDS 0 | |
88 | |
89 | |
90 #ifndef HAVE_LOG2 | |
91 #define log2(x) (log(x) / 0.69314718055994530942) | |
92 #endif | |
93 #define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2) | |
94 | |
95 | |
96 /* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed | |
97 * approach: Sample iSeeds sequence with constant stride from length-sorted X. | |
98 * It might be better to pick the seeds randomly, because length sorting might | |
99 * be more prone to including outliers (e.g. very long and very short seqs). | |
100 * However, we could never observer such a thing. So just stick to the | |
101 * original version as this also removes the random element. | |
102 */ | |
103 enum SEED_SELECTION_TYPE { | |
104 SELECT_SEEDS_RANDOMLY, | |
105 SELECT_SEEDS_BY_LENGTH | |
106 }; | |
107 #define SEED_SELECTION SELECT_SEEDS_BY_LENGTH | |
108 | |
109 | |
110 /* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is | |
111 * no difference between mbed-trees created from cosine or euclidean | |
112 * distances (simple version, just using disparities). | |
113 */ | |
114 #define USE_EUCLIDEAN_DISTANCE 1 | |
115 | |
116 | |
117 /* print some mbed pre-cluster usage to screen */ | |
118 #define PRINT_CLUSTER_DISTRIBUTION 0 | |
119 | |
120 | |
121 #define TRACE 0 | |
122 | |
123 | |
124 typedef struct { | |
125 /* Number of final clusters | |
126 */ | |
127 int iNClusters; | |
128 | |
129 /* Coordinates (columns) for each cluster (rows) | |
130 * valid indices: [0...iNClusters][0...dim-1] | |
131 */ | |
132 double **ppdClusterCenters; | |
133 | |
134 /* Dimensionality of input data and cluster center coordinates | |
135 */ | |
136 int iDim; | |
137 | |
138 /* Number of objects per cluster | |
139 */ | |
140 int *piNObjsPerCluster; | |
141 | |
142 /* Objects (indices) for each cluster. [i][j] will point to (index | |
143 * of) object j in cluster i | |
144 */ | |
145 int **ppiObjIndicesPerCluster; | |
146 } bisecting_kmeans_result_t; | |
147 | |
148 | |
149 | |
150 /** | |
151 * @brief Free KMeans result structure. | |
152 * | |
153 * @param[out] prResult_p | |
154 * K-Means result to free | |
155 * | |
156 * @see NewKMeansResult() | |
157 */ | |
158 void | |
159 FreeKMeansResult(bisecting_kmeans_result_t **prResult_p) | |
160 { | |
161 int iAux; | |
162 CKFREE((*prResult_p)->piNObjsPerCluster); | |
163 for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) { | |
164 CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]); | |
165 CKFREE((*prResult_p)->ppdClusterCenters[iAux]); | |
166 } | |
167 CKFREE((*prResult_p)->ppiObjIndicesPerCluster); | |
168 CKFREE((*prResult_p)->ppdClusterCenters); | |
169 (*prResult_p)->iNClusters = 0; | |
170 (*prResult_p)->iDim = 0; | |
171 CKFREE(*prResult_p); | |
172 } | |
173 /*** end: FreeKMeansResult() ***/ | |
174 | |
175 | |
176 | |
177 /** | |
178 * @brief Allocate new KMeans result | |
179 * | |
180 * @param[out] prKMeansResult_p | |
181 * K-Means result to free | |
182 * | |
183 * @see NewKMeansResult() | |
184 */ | |
185 void | |
186 NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p) | |
187 { | |
188 (*prKMeansResult_p) = (bisecting_kmeans_result_t *) | |
189 CKCALLOC(1, sizeof(bisecting_kmeans_result_t)); | |
190 (*prKMeansResult_p)->iNClusters = 0; | |
191 (*prKMeansResult_p)->iDim = 0; | |
192 (*prKMeansResult_p)->ppdClusterCenters = NULL; | |
193 (*prKMeansResult_p)->piNObjsPerCluster = NULL; | |
194 (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL; | |
195 | |
196 } | |
197 /*** end: NewKMeansResult() ***/ | |
198 | |
199 | |
200 | |
201 /** | |
202 * @brief Calculate the euclidean distance between two vectors | |
203 * | |
204 * @param[in] v1 | |
205 * First vector with dim dimensions | |
206 * @param[in] v2 | |
207 * Second vector with dim dimensions | |
208 * @param[in] dim | |
209 * Dimensionality of v1 and v2 | |
210 * | |
211 * @return euclidean distance as double | |
212 * | |
213 * @note Could probably be inlined. Also perfect for SSE | |
214 */ | |
215 double | |
216 EuclDist(const double *v1, const double *v2, const int dim) | |
217 { | |
218 int i; /* aux */ | |
219 double dist; /* returned distance */ | |
220 | |
221 assert(v1!=NULL); | |
222 assert(v2!=NULL); | |
223 | |
224 dist = 0.0; | |
225 for (i=0; i<dim; i++) { | |
226 dist += pow(v1[i]-v2[i], 2.0); | |
227 } | |
228 | |
229 return sqrt(dist); | |
230 } | |
231 /*** end: EuclDist() ***/ | |
232 | |
233 | |
234 | |
235 /** | |
236 * @brief Calculate the cosine distance between two vectors | |
237 * | |
238 * @param[in] v1 | |
239 * First vector with dim dimensions | |
240 * @param[in] v2 | |
241 * Second vector with dim dimensions | |
242 * @param[in] dim | |
243 * Dimensionality of v1 and v2 | |
244 * | |
245 * @return cosine distance as double | |
246 * | |
247 * @note Could probably be inlined. Also perfect for SSE | |
248 */ | |
249 double | |
250 CosDist(const double *v1, const double *v2, const int dim) | |
251 { | |
252 int i; /* aux */ | |
253 double dist; /* returned distance */ | |
254 double s, sq1, sq2; | |
255 | |
256 assert(v1!=NULL); | |
257 assert(v2!=NULL); | |
258 | |
259 s = 0.0; | |
260 sq1 = 0.0; | |
261 sq2 = 0.0; | |
262 for (i=0; i<dim; i++) { | |
263 s += (v1[i]*v2[i]); | |
264 sq1 += pow(v1[i], 2.0); | |
265 sq2 += pow(v2[i], 2.0); | |
266 } | |
267 sq1 = sqrt(sq1); | |
268 sq2 = sqrt(sq2); | |
269 | |
270 if ((sq1 * sq2) < DBL_EPSILON) { | |
271 dist = 1 - s / (sq1 * sq2); | |
272 } else { | |
273 /* if the denominator gets small, the fraction gets big, hence dist | |
274 gets small: */ | |
275 dist = 0.0; | |
276 } | |
277 return dist; | |
278 } | |
279 /*** end: CosDist() ***/ | |
280 | |
281 | |
282 | |
283 /** | |
284 * @brief convert sequences into mbed-like (distance) vector | |
285 * representation. Seeds (prMSeq sequence indices) have to be picked before | |
286 * | |
287 * @param[out] ppdSeqVec | |
288 * Pointer to preallocated matrix of size nseqs x iSeeds | |
289 * @param[in] prMSeq | |
290 * Sequences which are to be converted | |
291 * @param[in] piSeeds | |
292 * Array of sequences in prMSeq which are to be used as seeds. | |
293 * @param[in] iNumSeeds | |
294 * Number of seeds/elements in piSeeds | |
295 * @param[in] iPairDistType | |
296 * Type of pairwise distance comparison | |
297 * | |
298 */ | |
299 int | |
300 SeqToVec(double **ppdSeqVec, mseq_t *prMSeq, | |
301 int *piSeeds, const int iNumSeeds, | |
302 const int iPairDistType) | |
303 { | |
304 symmatrix_t *prDistmat = NULL; | |
305 int iSeqIndex; | |
306 int iSeedIndex; | |
307 /* indices for restoring order */ | |
308 int *restore; | |
309 /* sorted copy of piSeeds */ | |
310 int *piSortedSeeds; | |
311 | |
312 #if TIMING | |
313 Stopwatch_t *stopwatch = StopwatchCreate(); | |
314 StopwatchZero(stopwatch); | |
315 StopwatchStart(stopwatch); | |
316 #endif | |
317 | |
318 assert(NULL != ppdSeqVec); | |
319 assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs); | |
320 | |
321 piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int)); | |
322 memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int)); | |
323 | |
324 /* need them sorted, otherwise the swapping approach below might | |
325 * break | |
326 */ | |
327 qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp); | |
328 | |
329 | |
330 /* rearrange mseq order so that we can use ktuple_pairdist code as | |
331 * is. That is, swap seeds and non-seeds such that the seeds end | |
332 * up in front of mseq. This way we can use the KTuplePairDist() | |
333 * code, without making a copy of mseq. Also, keep an array of | |
334 * indices which serves to restore the original sequence order | |
335 * after putting the seeds in front | |
336 * | |
337 */ | |
338 restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int)); | |
339 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
340 restore[iSeqIndex] = iSeqIndex; | |
341 } | |
342 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { | |
343 #if TRACE | |
344 Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex); | |
345 #endif | |
346 if (piSortedSeeds[iSeedIndex]!=iSeedIndex) { | |
347 int swap; | |
348 SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex); | |
349 | |
350 swap = restore[iSeedIndex]; | |
351 restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]]; | |
352 restore[piSortedSeeds[iSeedIndex]] = swap; | |
353 } | |
354 } | |
355 #if TRACE | |
356 printf("DEBUG(%s|%s():%d): restore indices =", | |
357 __FILE__, __FUNCTION__, __LINE__); | |
358 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
359 printf(" %u:%u", iSeqIndex, restore[iSeqIndex]); | |
360 } | |
361 printf("\n"); | |
362 | |
363 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
364 Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now: seq = %s", | |
365 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name); | |
366 } | |
367 #endif | |
368 | |
369 | |
370 /* convert sequences into vectors of distances by calling pairwise | |
371 * distance function for each seq/seed pair | |
372 * | |
373 */ | |
374 if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType, | |
375 0, iNumSeeds, 0, prMSeq->nseqs, | |
376 NULL, NULL)) { | |
377 Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed."); | |
378 FreeSymMatrix(&prDistmat); | |
379 CKFREE(piSortedSeeds); | |
380 CKFREE(restore); | |
381 return -1; | |
382 } | |
383 #if TRACE | |
384 { | |
385 char **labels; | |
386 labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *)); | |
387 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
388 labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name; | |
389 } | |
390 SymMatrixPrint(prDistmat, labels, NULL); | |
391 CKFREE(labels); | |
392 } | |
393 #endif | |
394 | |
395 | |
396 /* update ppdSeqVec according to prDistmat. keep in mind that we | |
397 * changed mseq order | |
398 * | |
399 */ | |
400 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
401 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { | |
402 ppdSeqVec[restore[iSeqIndex]][iSeedIndex] = | |
403 SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex); | |
404 } | |
405 } | |
406 | |
407 | |
408 /* restore mseq order by reverse swapping | |
409 * | |
410 * need reverse order now, so start from top index | |
411 */ | |
412 iSeedIndex=iNumSeeds-1; | |
413 do { | |
414 #if TRACE | |
415 Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex); | |
416 #endif | |
417 if (piSortedSeeds[iSeedIndex]!=iSeedIndex) { | |
418 int swap; | |
419 | |
420 SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex); | |
421 | |
422 swap = restore[iSeedIndex]; | |
423 restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]]; | |
424 restore[piSortedSeeds[iSeedIndex]] = swap; | |
425 } | |
426 } while (0 != iSeedIndex--); | |
427 #if TRACE | |
428 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
429 Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d: seq = %s %s", | |
430 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]); | |
431 } | |
432 #endif | |
433 | |
434 #if TRACE | |
435 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
436 printf("DEBUG: seq %-4u as distance vector =", iSeqIndex); | |
437 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { | |
438 printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]); | |
439 } | |
440 printf("\n"); | |
441 } | |
442 #endif | |
443 | |
444 | |
445 FreeSymMatrix(&prDistmat); | |
446 CKFREE(restore); | |
447 #if TIMING | |
448 StopwatchStop(stopwatch); | |
449 StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch); | |
450 StopwatchFree(stopwatch); | |
451 #endif | |
452 | |
453 | |
454 return 0; | |
455 } | |
456 /*** end: SeqToVec() ***/ | |
457 | |
458 | |
459 | |
460 /** | |
461 * @brief Select seeds to be used from an prMSeq | |
462 * | |
463 * @param[out] piSeeds | |
464 * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated. | |
465 * @param[in] iNumSeeds | |
466 * Number of seeds to be picked | |
467 * @param[in] iSelectionMethod | |
468 * Seed selection method to be used | |
469 * @param[in] prMSeq | |
470 * The prMSeq structure to pick sequences from | |
471 * | |
472 * @return: Non-zero on failure | |
473 * | |
474 */ | |
475 int | |
476 SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod, mseq_t *prMSeq) | |
477 { | |
478 /* seed iterator */ | |
479 int iSeedIndex; | |
480 /* sequence iterator */ | |
481 int iSeqIndex; | |
482 | |
483 if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) { | |
484 int *piPermArray; | |
485 | |
486 Log(&rLog, LOG_INFO, | |
487 "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)", | |
488 iNumSeeds, prMSeq->nseqs); | |
489 | |
490 PermutationArray(&piPermArray, iNumSeeds); | |
491 /* copy to piSeeds */ | |
492 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { | |
493 piSeeds[iSeedIndex] = piPermArray[iSeedIndex]; | |
494 } | |
495 CKFREE(piPermArray); | |
496 | |
497 } else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) { | |
498 | |
499 /* step size for picking with constant stride */ | |
500 int iStep; | |
501 int *piSeqLen = CKMALLOC(prMSeq->nseqs * sizeof(int)); | |
502 int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int)); | |
503 | |
504 Log(&rLog, LOG_INFO, | |
505 "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)", | |
506 iNumSeeds, prMSeq->nseqs); | |
507 | |
508 iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big | |
509 * due to rounding */ | |
510 /* first get an array of seq indices order according to | |
511 corresponding sequence length: piOrder */ | |
512 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
513 piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len; | |
514 } | |
515 QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE); | |
516 #if 0 | |
517 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
518 Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)", | |
519 iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]); | |
520 } | |
521 #endif | |
522 CKFREE(piSeqLen); | |
523 iSeedIndex = 0; | |
524 for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) { | |
525 piSeeds[iSeedIndex++] = piOrder[iSeqIndex]; | |
526 } | |
527 CKFREE(piOrder); | |
528 | |
529 } else { | |
530 | |
531 Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type"); | |
532 return -1; | |
533 } | |
534 | |
535 | |
536 #ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS | |
537 /* do initial mbedding and kmeans. then pick seeds from each cluster and | |
538 do full mbed with these seeds. idea is that those seeds represent the | |
539 sequence space better than random seeds. however, this reduces the | |
540 quality slightly. random is almost always better. | |
541 */ | |
542 | |
543 if (1) { | |
544 /* seqs represented as distance vectors */ | |
545 double **ppdSeqVec; | |
546 double dCost = -1.0; | |
547 /* assignments for each seq to the K newly created clusters */ | |
548 int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int)); | |
549 double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double)); | |
550 /* create a copy of ppdSeqVec suitable for KMeans */ | |
551 double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double)); | |
552 bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool)); | |
553 | |
554 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds"); | |
555 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters"); | |
556 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round"); | |
557 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE"); | |
558 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good"); | |
559 | |
560 /* | |
561 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds"); | |
562 iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2); | |
563 */ | |
564 | |
565 ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *)); | |
566 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
567 ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double)); | |
568 } | |
569 if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) { | |
570 Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed"); | |
571 return -1; | |
572 } | |
573 | |
574 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
575 int iDim; | |
576 for (iDim=0; iDim<iNumSeeds; ++iDim) { | |
577 pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim]; | |
578 } | |
579 } | |
580 | |
581 | |
582 Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT"); | |
583 dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds, | |
584 pdKMeansVectors, | |
585 RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS, | |
586 pdKMeansClusterCenters, piKMeansClusterAssignments); | |
587 Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost); | |
588 | |
589 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers"); | |
590 | |
591 #if TRACE | |
592 Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:"); | |
593 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
594 Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d", | |
595 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, | |
596 piKMeansClusterAssignments[iSeqIndex]); | |
597 } | |
598 #endif | |
599 | |
600 Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds"); | |
601 iSeedIndex = 0; | |
602 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { | |
603 int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex]; | |
604 if (pbClusterUsed[iAssignedCluster]) { | |
605 continue; | |
606 } else { | |
607 /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/ | |
608 piSeeds[iSeedIndex++] = iSeqIndex; | |
609 pbClusterUsed[iAssignedCluster] = TRUE; | |
610 } | |
611 } | |
612 CKFREE(pbClusterUsed); | |
613 | |
614 CKFREE(pdKMeansVectors); | |
615 CKFREE(pdKMeansClusterCenters); | |
616 CKFREE(piKMeansClusterAssignments); | |
617 | |
618 } | |
619 /* if (1) */ | |
620 #endif | |
621 | |
622 | |
623 if (LOG_DEBUG <= rLog.iLogLevelEnabled) { | |
624 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { | |
625 Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d", | |
626 piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex); | |
627 } | |
628 } | |
629 | |
630 return 0; | |
631 } | |
632 /* end of SeedSelection() */ | |
633 | |
634 | |
635 | |
636 | |
637 /** | |
638 * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with | |
639 * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster. | |
640 * | |
641 * @param[out] prKMeansResult_p | |
642 * Result of Bisecting KMeans. Will be allocated here. | |
643 * Caller has to free. See @see FreeKMeansResult() | |
644 * @param[in] iNObjs | |
645 * Number of objects/sequences to cluster | |
646 * @param[in] iDim | |
647 * Dimensionality of input data | |
648 * @param[in] ppdVectors | |
649 * each row holds iDim points for this object's coordinates | |
650 * @param[in] iMinRequiredObjsPerCluster | |
651 * Minimum number of objects per Cluster (inclusive)/ | |
652 * @param[in] iMaxAllowedObjsPerCluster | |
653 * Maximum number of objects per Cluster (inclusive). Function returns once no | |
654 * cluster contains more then this number of objects. Soft limit! | |
655 * | |
656 * @note Convoluted code. Could use some restructuring. My apologies. | |
657 * AW | |
658 * | |
659 */ | |
660 void | |
661 BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p, | |
662 const int iNObjs, const int iDim, double **ppdVectors, | |
663 const int iMinRequiredObjsPerCluster, | |
664 const int iMaxAllowedObjsPerCluster) | |
665 { | |
666 int iN, iD; | |
667 /* cluster centers for each cluster created at each split */ | |
668 double *pdKClusterCenters; | |
669 /* keep track of updated object indices per newly created | |
670 * cluster */ | |
671 int *piCurObjToUpdate; | |
672 /* number of times we called 2-means */ | |
673 int iNRounds; | |
674 /* flag for unsuccessful cluster split */ | |
675 bool bNaNDetected = FALSE; | |
676 /* flag for detected small cluster after split */ | |
677 bool bSmallClusterDetected; | |
678 /* queue of clusters which are to be split */ | |
679 int_queue_t rClusterSplitQueue; | |
680 | |
681 #if TIMING | |
682 Stopwatch_t *stopwatch = StopwatchCreate(); | |
683 #endif | |
684 | |
685 | |
686 piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int)); | |
687 | |
688 NewKMeansResult(prKMeansResult_p); | |
689 | |
690 /* new cluster centers created at each split/KMeans run | |
691 */ | |
692 pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double)); | |
693 | |
694 /* init results by setting a first cluster that contains all objects | |
695 */ | |
696 (*prKMeansResult_p)->iNClusters = 1; | |
697 (*prKMeansResult_p)->iDim = iDim; | |
698 /* fake center coordinates of first cluster */ | |
699 (*prKMeansResult_p)->ppdClusterCenters = | |
700 (double **) CKMALLOC(1 * sizeof(double *)); | |
701 (*prKMeansResult_p)->ppdClusterCenters[0] = | |
702 (double *) CKMALLOC(iDim * sizeof(double)); | |
703 /* objects per cluster */ | |
704 (*prKMeansResult_p)->piNObjsPerCluster = | |
705 (int *) CKMALLOC(1 * sizeof(int)); | |
706 (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs; | |
707 /* object indices per cluster */ | |
708 (*prKMeansResult_p)->ppiObjIndicesPerCluster = | |
709 (int **) CKMALLOC(1 * sizeof(int *)); | |
710 (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *) | |
711 CKMALLOC(iNObjs * sizeof(int)); | |
712 for (iN=0; iN<iNObjs; iN++) { | |
713 (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN; | |
714 } | |
715 | |
716 | |
717 | |
718 /* Starting with the first cluster that now contains all the | |
719 * sequences keep splitting until no cluster contains more than | |
720 * iMaxAllowedObjsPerCluster | |
721 * | |
722 * Keep a queue of clusters (rClusterSplitQueue) to split | |
723 * | |
724 * At each split values/memory of the just split cluster will be | |
725 * reused and exactly one new only allocated. | |
726 * | |
727 * Example: | |
728 * Given the following cluster assignments | |
729 * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4 | |
730 * and a K-Means split in cluster 3 at |: | |
731 * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4 | |
732 * The result should be this: | |
733 * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4 | |
734 * | |
735 * | |
736 */ | |
737 INT_QUEUE_INIT(&rClusterSplitQueue); | |
738 if (iNObjs>iMaxAllowedObjsPerCluster) { | |
739 /* pish fake first cluster index */ | |
740 INT_QUEUE_PUSH(&rClusterSplitQueue, 0); | |
741 } | |
742 iNRounds = 0; | |
743 while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) { | |
744 /* assignments for each seq to the K newly created clusters */ | |
745 int *piKClusterAssignments; | |
746 /* number of objects in cluster that is to be split */ | |
747 int iNObjsInClusterToSplit; | |
748 /* coordinates of points in cluster that is to be split | |
749 * array of size n*d where [d*i + j] gives coordinate j of point i | |
750 */ | |
751 double *pdVectorsInClusterToSplit; | |
752 /* copy of object indices in split cluster */ | |
753 int *piObjIndicesOfSplitCluster; | |
754 /* best cost of kmeans rounds */ | |
755 double dCost = -1.0; | |
756 | |
757 /* indices for the two created clusters | |
758 */ | |
759 /* index of cluster to split */ | |
760 int iClusterToSplot; | |
761 /* index of newly created cluster at each round. data for the | |
762 other created cluster goes to the one just split, | |
763 i.e. (*piClusterToSplot) */ | |
764 int iNewClusterIdx; | |
765 | |
766 #if TIMING | |
767 StopwatchZero(stopwatch); | |
768 StopwatchStart(stopwatch); | |
769 #endif | |
770 | |
771 INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot); | |
772 | |
773 iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot]; | |
774 piKClusterAssignments = (int *) | |
775 CKMALLOC(iNObjsInClusterToSplit * sizeof(int)); | |
776 | |
777 pdVectorsInClusterToSplit = (double *) | |
778 CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double)); | |
779 for (iN=0; iN<iNObjsInClusterToSplit; iN++) { | |
780 for (iD=0; iD<iDim; ++iD) { | |
781 int iThisObjIdx = | |
782 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]; | |
783 pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD]; | |
784 } | |
785 } | |
786 | |
787 #if TRACE | |
788 Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects", | |
789 iNRounds, iClusterToSplot, iNObjsInClusterToSplit); | |
790 fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:", | |
791 __FILE__, __FUNCTION__, __LINE__); | |
792 for (iN=0; iN<iNObjsInClusterToSplit; iN++) { | |
793 fprintf(stderr, " %u", | |
794 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]); | |
795 } | |
796 fprintf(stderr, "\n"); | |
797 (void) fflush(stderr); | |
798 #endif | |
799 | |
800 #if TRACE | |
801 for (iN=0; iN<iNObjsInClusterToSplit; iN++) { | |
802 fprintf(stderr, | |
803 "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:", | |
804 __FILE__, __FUNCTION__, __LINE__, iN, | |
805 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]); | |
806 for (iD=0; iD<iDim; iD++) { | |
807 fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]); | |
808 } | |
809 fprintf(stderr, "\n"); | |
810 (void) fflush(stderr); | |
811 } | |
812 #endif | |
813 | |
814 /* KMeans(1 "The number of points in the data set", | |
815 * 2 "The number of clusters to look for", | |
816 * 3 "The number of dimensions that the data set lives in", | |
817 * 4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i", | |
818 * 5 "attempts: The number of times to independently run k-means", | |
819 * 6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method", | |
820 * 7 "centers: This can either be null or an array of size k*d. | |
821 * In the latter case, centers[d*i + j] will give coordinate j of center i. | |
822 * If the cluster is unused, it will contain NaN instead.", | |
823 * 8 "assignments: This can either be null or an array of size n. | |
824 * In the latter case, it will be filled with the cluster that each point is assigned to | |
825 * (an integer between 0 and k-1 inclusive)."); | |
826 */ | |
827 dCost = KMeans(iNObjsInClusterToSplit, 2, iDim, | |
828 pdVectorsInClusterToSplit, | |
829 RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS, | |
830 pdKClusterCenters, piKClusterAssignments); | |
831 | |
832 #if TRACE | |
833 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:"); | |
834 for (iN=0; iN<2; iN++) { | |
835 fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =", | |
836 __FILE__, __FUNCTION__, __LINE__, iN); | |
837 for (iD=0; iD<iDim; iD++) { | |
838 fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]); | |
839 } | |
840 fprintf(stderr, "\n"); | |
841 (void) fflush(stderr); | |
842 } | |
843 for (iN=0; iN<iNObjsInClusterToSplit; iN++) { | |
844 Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)", | |
845 iN, piKClusterAssignments[iN], 2); | |
846 } | |
847 #endif | |
848 | |
849 /* real index of one of the newly created clusters. the other | |
850 * one is iClusterToSplot */ | |
851 iNewClusterIdx = (*prKMeansResult_p)->iNClusters; | |
852 | |
853 | |
854 /* We don't want Clusters which are too small. Check here if a | |
855 * split created a small cluster and if yes, discard the | |
856 * solution. Because the cluster has already been removed from | |
857 * the queue, we can just continue. | |
858 */ | |
859 bSmallClusterDetected = FALSE; | |
860 if (iMinRequiredObjsPerCluster>1) { | |
861 int iNObjsCluster[2]; | |
862 iNObjsCluster[0] = 0; /* first cluster */ | |
863 iNObjsCluster[1] = 0; /* second cluster */ | |
864 for (iN=0; iN<iNObjsInClusterToSplit; iN++) { | |
865 iNObjsCluster[piKClusterAssignments[iN]]+=1; | |
866 } | |
867 | |
868 if (iNObjsCluster[0]<iMinRequiredObjsPerCluster | |
869 || | |
870 iNObjsCluster[1]<iMinRequiredObjsPerCluster) { | |
871 bSmallClusterDetected = TRUE; | |
872 Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d", | |
873 iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster); | |
874 } | |
875 } | |
876 | |
877 /* Same logic as for small clusters applies if KMeans couldn't | |
878 * split the cluster. In this case one of its center | |
879 * coordinates will be NaN, which we check for in the | |
880 * following. | |
881 */ | |
882 if (! bSmallClusterDetected) { | |
883 for (iN=0; iN<2; iN++) { | |
884 bNaNDetected = FALSE; | |
885 for (iD=0; iD<iDim; iD++) { | |
886 if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) { | |
887 /* Got NaN as coordinate after splitting */ | |
888 Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s", | |
889 __FUNCTION__, | |
890 iClusterToSplot, iNObjsInClusterToSplit, | |
891 "Hope it's not too big and doesn't slow things down." | |
892 ); | |
893 bNaNDetected = TRUE; | |
894 break; | |
895 } | |
896 } | |
897 if (bNaNDetected) { | |
898 break; | |
899 } | |
900 } | |
901 } | |
902 | |
903 /* Discarding split of this cluster. It has been removed from the | |
904 * queue already. so just continue | |
905 */ | |
906 if (bNaNDetected || bSmallClusterDetected) { | |
907 CKFREE(piKClusterAssignments); | |
908 CKFREE(pdVectorsInClusterToSplit); | |
909 continue; | |
910 } | |
911 | |
912 | |
913 /* update cluster centers: pdClusterCenters | |
914 * | |
915 */ | |
916 /* reuse memory of existing/old/split cluster | |
917 */ | |
918 for (iN=0; iN<iDim; iN++) { | |
919 double dCoord = pdKClusterCenters[0*iDim+iN]; | |
920 (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord; | |
921 } | |
922 /* realloc and set new one | |
923 */ | |
924 (*prKMeansResult_p)->ppdClusterCenters = (double **) | |
925 CKREALLOC((*prKMeansResult_p)->ppdClusterCenters, | |
926 ((*prKMeansResult_p)->iNClusters+1) * sizeof(double *)); | |
927 (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *) | |
928 CKMALLOC(iDim * sizeof(double)); | |
929 for (iD=0; iD<iDim; iD++) { | |
930 double dCoord = pdKClusterCenters[1*iDim+iD]; | |
931 (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord; | |
932 } | |
933 #if TRACE | |
934 Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:"); | |
935 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) { | |
936 fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =", | |
937 __FILE__, __FUNCTION__, __LINE__, iN); | |
938 for (iD=0; iD<iDim; iD++) { | |
939 fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]); | |
940 } | |
941 fprintf(stderr, "\n"); | |
942 (void) fflush(stderr); | |
943 } | |
944 #endif | |
945 | |
946 | |
947 /* update #seqs per cluster: piNObjsPerCluster | |
948 * | |
949 */ | |
950 (*prKMeansResult_p)->piNObjsPerCluster = (int *) | |
951 CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster, | |
952 ((*prKMeansResult_p)->iNClusters+1) * sizeof(int)); | |
953 /* init new and old one to zero */ | |
954 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0; | |
955 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0; | |
956 /* now update values */ | |
957 for (iN=0; iN<iNObjsInClusterToSplit; iN++) { | |
958 if (0 == piKClusterAssignments[iN]) { | |
959 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1; | |
960 } else if (1 == piKClusterAssignments[iN]) { | |
961 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1; | |
962 } else { | |
963 /* there used to be code for iK>=2 in r101 */ | |
964 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)", | |
965 piKClusterAssignments[iN]); | |
966 } | |
967 } | |
968 #if TRACE | |
969 Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:"); | |
970 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) { | |
971 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences", | |
972 iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]); | |
973 } | |
974 #endif | |
975 /* queue clusters if still they are still too big | |
976 */ | |
977 if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) { | |
978 INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot); | |
979 } | |
980 if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) { | |
981 INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx); | |
982 } | |
983 | |
984 | |
985 | |
986 /* update which objects belong to which cluster: | |
987 * | |
988 * note: piNObjsPerCluster needs to be already updated | |
989 * | |
990 */ | |
991 /* create a copy of the object indices in the split cluster | |
992 * (original will be overwritten) | |
993 */ | |
994 piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int)); | |
995 memcpy(piObjIndicesOfSplitCluster, | |
996 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot], | |
997 iNObjsInClusterToSplit * sizeof(int)); | |
998 | |
999 (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **) | |
1000 CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster, | |
1001 ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *)); | |
1002 | |
1003 | |
1004 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] = | |
1005 (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot], | |
1006 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int)); | |
1007 | |
1008 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] = | |
1009 (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int)); | |
1010 | |
1011 | |
1012 /* now reassign the object indices to the assigned cluster | |
1013 */ | |
1014 piCurObjToUpdate[0] = 0; | |
1015 piCurObjToUpdate[1] = 0; | |
1016 for (iN=0; iN<iNObjsInClusterToSplit; iN++) { | |
1017 int iThisObjIdx = piObjIndicesOfSplitCluster[iN]; | |
1018 int iThisClusterAssignment = piKClusterAssignments[iN]; | |
1019 int iThisOffset = piCurObjToUpdate[iThisClusterAssignment]; | |
1020 int iThisClusterIndex = 0; | |
1021 | |
1022 if (0 == iThisClusterAssignment) { | |
1023 iThisClusterIndex = iClusterToSplot; | |
1024 } else if (1 == iThisClusterAssignment) { | |
1025 iThisClusterIndex = iNewClusterIdx; | |
1026 } else { | |
1027 /* there used to be code for iK>=2 in r101 */ | |
1028 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)", | |
1029 piKClusterAssignments[iN]); | |
1030 } | |
1031 #if 0 | |
1032 Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d", | |
1033 iThisClusterIndex, iThisOffset, iThisObjIdx); | |
1034 #endif | |
1035 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx; | |
1036 piCurObjToUpdate[iThisClusterAssignment]+=1; | |
1037 } | |
1038 CKFREE(piObjIndicesOfSplitCluster); | |
1039 #if TRACE | |
1040 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) { | |
1041 int iObj; | |
1042 fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ", | |
1043 __FILE__, __FUNCTION__, __LINE__, iN); | |
1044 for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) { | |
1045 fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]); | |
1046 } | |
1047 fprintf(stderr, "\n"); | |
1048 (void) fflush(stderr); | |
1049 } | |
1050 #endif | |
1051 | |
1052 | |
1053 #if TIMING | |
1054 StopwatchStop(stopwatch); | |
1055 StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch); | |
1056 #endif | |
1057 | |
1058 | |
1059 /* finally: increase number of clusters | |
1060 */ | |
1061 (*prKMeansResult_p)->iNClusters += 1; | |
1062 iNRounds += 1; | |
1063 CKFREE(piKClusterAssignments); | |
1064 CKFREE(pdVectorsInClusterToSplit); | |
1065 | |
1066 } /* while */ | |
1067 INT_QUEUE_DESTROY(&rClusterSplitQueue); | |
1068 | |
1069 | |
1070 Log(&rLog, LOG_DEBUG, | |
1071 "Bisecting K-means finished after %d rounds (no more clusters to split)", | |
1072 iNRounds); | |
1073 | |
1074 #if TIMING | |
1075 StopwatchFree(stopwatch); | |
1076 #endif | |
1077 | |
1078 /* @note could use progress/timer */ | |
1079 | |
1080 CKFREE(pdKClusterCenters); | |
1081 CKFREE(piCurObjToUpdate); | |
1082 | |
1083 return; | |
1084 } | |
1085 /*** end: BisectingKmeans() ***/ | |
1086 | |
1087 | |
1088 | |
1089 /** | |
1090 * | |
1091 * @brief From scratch reimplementation of mBed: Blackshields et al. | |
1092 * (2010); PMID 20470396. | |
1093 * | |
1094 * Idea is a follows: | |
1095 * - convert sequences into vectors of distances | |
1096 * - cluster the vectors using k-means | |
1097 * - cluster each of the k clusters using upgma (used cached distances | |
1098 * from above?) | |
1099 * - join the sub-clusters to create on tree (use UPGMA on k-means | |
1100 * medoids) | |
1101 * | |
1102 * | |
1103 * @param[out] prMbedTree_p | |
1104 * Created upgma tree. will be allocated here. use FreeMuscleTree() | |
1105 * to free | |
1106 * @param[in] prMSeq | |
1107 * Multiple sequences | |
1108 * @param[in] iPairDistType | |
1109 * Distance measure for pairwise alignments | |
1110 * @param[in] pcGuidetreeOut | |
1111 * Passed down to GuideTreeUpgma() | |
1112 * | |
1113 * @return Zero on success, non-zero on error | |
1114 * | |
1115 */ | |
1116 int | |
1117 Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, | |
1118 const char *pcGuidetreeOut) | |
1119 { | |
1120 /* number of seeds */ | |
1121 int iNumSeeds; | |
1122 /* seed indices matching prMSeq */ | |
1123 int *piSeeds; | |
1124 /* seqs represented as distance vectors */ | |
1125 double **ppdSeqVec; | |
1126 /* kmeans result */ | |
1127 bisecting_kmeans_result_t *prKMeansResult = NULL; | |
1128 /* distance matrix of kmeans (pre-)cluster centers */ | |
1129 symmatrix_t *prPreClusterDistmat = NULL; | |
1130 /* auxiliary for symmetric matrix output; tree routines etc */ | |
1131 char **ppcLabels; | |
1132 int iNodeIndex; | |
1133 /* mapping of cluster-center tree node indices to corresponding | |
1134 cluster */ | |
1135 int *piClusterToTreeNode; | |
1136 int iClusterIndex; | |
1137 int iI, uJ; | |
1138 FILE *pfOut; | |
1139 progress_t *prSubClusterDistanceProgress; | |
1140 bool bPrintCR = (LOG_VERBOSE<=rLog.iLogLevelEnabled) ? FALSE : TRUE; | |
1141 | |
1142 #if FULL_WITHIN_CLUSTER_DISTANCES | |
1143 Log(&rLog, LOG_WARN, | |
1144 "FULL_WITHIN_CLUSTER_DISTANCES (using min %d / max %d seqs per subcluster) is a quick-hack. Brace yourself...", | |
1145 MIN_REQUIRED_SEQ_PER_PRECLUSTER, MAX_ALLOWED_SEQ_PER_PRECLUSTER); | |
1146 #endif | |
1147 | |
1148 #if MBED_TIMING | |
1149 Stopwatch_t *stopwatch = StopwatchCreate(); | |
1150 #endif | |
1151 | |
1152 assert(NULL != prMbedTree_p); | |
1153 assert(NULL != prMSeq); | |
1154 | |
1155 | |
1156 iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs); | |
1157 if (iNumSeeds >= prMSeq->nseqs) { | |
1158 /* -1 is condition for RandomUniqueIntArray */ | |
1159 iNumSeeds = prMSeq->nseqs-1; | |
1160 Log(&rLog, LOG_DEBUG, | |
1161 "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d", | |
1162 iNumSeeds); | |
1163 } | |
1164 | |
1165 | |
1166 /* Turn sequences into vectors of distances to the seeds | |
1167 * | |
1168 */ | |
1169 piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int)); | |
1170 if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) { | |
1171 Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed"); | |
1172 return -1; | |
1173 } | |
1174 ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *)); | |
1175 for (iI=0; iI<prMSeq->nseqs; iI++) { | |
1176 ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double)); | |
1177 } | |
1178 if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) { | |
1179 Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed"); | |
1180 return -1; | |
1181 } | |
1182 CKFREE(piSeeds); | |
1183 | |
1184 | |
1185 /* Calculate (pre-)clusters of sequence vectors by applying | |
1186 * bisecting kmeans | |
1187 * | |
1188 */ | |
1189 #if MBED_TIMING | |
1190 /* start clock only here, to make sure we don't include pairwise | |
1191 * distance computation */ | |
1192 StopwatchZero(stopwatch); | |
1193 StopwatchStart(stopwatch); | |
1194 #endif | |
1195 | |
1196 BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec, | |
1197 MIN_REQUIRED_SEQ_PER_PRECLUSTER, | |
1198 MAX_ALLOWED_SEQ_PER_PRECLUSTER); | |
1199 Log(&rLog, LOG_INFO, | |
1200 "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)", | |
1201 prKMeansResult->iNClusters, | |
1202 MIN_REQUIRED_SEQ_PER_PRECLUSTER, | |
1203 MAX_ALLOWED_SEQ_PER_PRECLUSTER); | |
1204 | |
1205 | |
1206 #if PRINT_CLUSTER_DISTRIBUTION | |
1207 Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters); | |
1208 for (iI=0; iI<prKMeansResult->iNClusters; iI++) { | |
1209 #if TRACE | |
1210 int iD; | |
1211 Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI); | |
1212 fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =", | |
1213 __FILE__, __FUNCTION__, __LINE__); | |
1214 for (iD=0; iD<iNumSeeds; iD++) { | |
1215 fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]); | |
1216 } | |
1217 fprintf(stderr, "\n"); | |
1218 fflush(stderr); | |
1219 #endif | |
1220 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned", | |
1221 iI, prKMeansResult->piNObjsPerCluster[iI]); | |
1222 #if TRACE | |
1223 for (uJ=0; uJ<prKMeansResult->piNObjsPerCluster[iI]; uJ++) { | |
1224 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][uJ]; | |
1225 | |
1226 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)", | |
1227 iI, uJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name); | |
1228 } | |
1229 #endif | |
1230 } | |
1231 #endif | |
1232 | |
1233 | |
1234 /* Cluster pre-clusters produced by k-means. | |
1235 * | |
1236 * Do this by calculating the vector distances of the cluster | |
1237 * centers and applying UPGMA. | |
1238 * | |
1239 * @note could try to force-balance the tree here | |
1240 * | |
1241 */ | |
1242 if (0 != NewSymMatrix(&prPreClusterDistmat, | |
1243 prKMeansResult->iNClusters, prKMeansResult->iNClusters)) { | |
1244 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed"); | |
1245 } | |
1246 for (iI=0; iI<prKMeansResult->iNClusters; iI++) { | |
1247 for (uJ=iI+1; uJ<prKMeansResult->iNClusters; uJ++) { | |
1248 double dDist; | |
1249 dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI], | |
1250 prKMeansResult->ppdClusterCenters[uJ], | |
1251 iNumSeeds); | |
1252 SymMatrixSetValue(prPreClusterDistmat, iI, uJ, dDist); | |
1253 /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f", | |
1254 iI, uJ, dDist); */ | |
1255 } | |
1256 } | |
1257 | |
1258 | |
1259 /* labels needed for the guide tree building routine only */ | |
1260 ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*)); | |
1261 for (iI=0; iI<prKMeansResult->iNClusters; iI++) { | |
1262 ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char)); | |
1263 (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI); | |
1264 } | |
1265 #if TRACE | |
1266 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:"); | |
1267 SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL); | |
1268 #endif | |
1269 | |
1270 GuideTreeUpgma(prMbedTree_p, | |
1271 ppcLabels, prPreClusterDistmat, NULL); | |
1272 | |
1273 for (iI=0; iI<prKMeansResult->iNClusters; iI++) { | |
1274 CKFREE(ppcLabels[iI]); | |
1275 } | |
1276 CKFREE(ppcLabels); | |
1277 #if TRACE | |
1278 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:"); | |
1279 LogTree(*prMbedTree_p); | |
1280 #endif | |
1281 | |
1282 | |
1283 | |
1284 /* Now replace each leaf in the pre-cluster-center tree | |
1285 * appropriately, i.e. with the corresponding sub-cluster. | |
1286 * | |
1287 * For each leaf/sub-cluster, create a distance matrix for the | |
1288 * corresponding sequences. Use distances between vectors as | |
1289 * approximated distances between sequences. Then create a | |
1290 * guide-tree by applying UPGMA. | |
1291 * | |
1292 */ | |
1293 | |
1294 /* Get a mapping of (pre)cluster number and leaf-node indices in | |
1295 * the cluster-center tree. We can add trees to prMbedTree_p | |
1296 * because AppendTrees() guarantees that no other than the node to | |
1297 * append to changes. | |
1298 */ | |
1299 piClusterToTreeNode = (int*) | |
1300 CKMALLOC(prKMeansResult->iNClusters * sizeof(int)); | |
1301 iNodeIndex = FirstDepthFirstNode(*prMbedTree_p); | |
1302 do { | |
1303 if (IsLeaf(iNodeIndex, *prMbedTree_p)) { | |
1304 int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p); | |
1305 piClusterToTreeNode[iLeafId] = iNodeIndex; | |
1306 } | |
1307 iNodeIndex = NextDepthFirstNode(iNodeIndex, *prMbedTree_p); | |
1308 } while (NULL_NEIGHBOR != iNodeIndex); | |
1309 | |
1310 | |
1311 | |
1312 | |
1313 /* Now step through all the leafs and replace them with the | |
1314 * corresponding sub-trees | |
1315 */ | |
1316 NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO), | |
1317 "Distance calculation within sub-clusters", bPrintCR); | |
1318 /* for each cluster */ | |
1319 for (iClusterIndex=0; | |
1320 iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) { | |
1321 /* distance matrix for the sub-cluster */ | |
1322 symmatrix_t *prWithinClusterDistances = NULL; | |
1323 int iNSeqInCluster; | |
1324 tree_t *prSubClusterTree = NULL; | |
1325 | |
1326 ProgressLog(prSubClusterDistanceProgress, | |
1327 iClusterIndex, prKMeansResult->iNClusters, FALSE); | |
1328 | |
1329 #if FULL_WITHIN_CLUSTER_DISTANCES | |
1330 mseq_t *prSubClusterMSeq; | |
1331 int iPairDistType; | |
1332 int iSeqIndex; | |
1333 | |
1334 Log(&rLog, LOG_DEBUG, | |
1335 "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER"); | |
1336 | |
1337 if (TRUE == prMSeq->aligned) { | |
1338 iPairDistType = PAIRDIST_SQUIDID_KIMURA; | |
1339 } else { | |
1340 iPairDistType = PAIRDIST_KTUPLE; | |
1341 } | |
1342 #endif | |
1343 | |
1344 iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex]; | |
1345 #if TRACE | |
1346 Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d", | |
1347 iClusterIndex, iNSeqInCluster); | |
1348 #endif | |
1349 | |
1350 #if FULL_WITHIN_CLUSTER_DISTANCES | |
1351 /* create an mseq structure for sequences in this cluster | |
1352 * don't need most of the members (e.g. orig_seq) but copy | |
1353 * them anyway for the sake of completeness | |
1354 */ | |
1355 NewMSeq(&prSubClusterMSeq); | |
1356 | |
1357 prSubClusterMSeq->nseqs = iNSeqInCluster; | |
1358 prSubClusterMSeq->seqtype = prMSeq->seqtype; | |
1359 if (NULL!=prMSeq->filename) { | |
1360 prSubClusterMSeq->filename = CkStrdup(prMSeq->filename); | |
1361 } | |
1362 prSubClusterMSeq->seq = (char **) | |
1363 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *)); | |
1364 prSubClusterMSeq->orig_seq = (char **) | |
1365 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *)); | |
1366 prSubClusterMSeq->sqinfo = (SQINFO *) | |
1367 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO)); | |
1368 | |
1369 for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) { | |
1370 int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex]; | |
1371 prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]); | |
1372 prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]); | |
1373 SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]); | |
1374 #if TRACE | |
1375 Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)", | |
1376 iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name, | |
1377 iRealSeqIndex); | |
1378 #endif | |
1379 } | |
1380 #endif | |
1381 | |
1382 | |
1383 /* Create a distance matrix for this sub-cluster | |
1384 * (prWithinClusterDistances) by using the vector distances or | |
1385 * ktuple distances. | |
1386 * | |
1387 * Then apply UPGMA to get a subcluster tree | |
1388 * (prSubClusterTree) and append created tree to the | |
1389 * pre-cluster-tree (prMbedTree_p) | |
1390 * | |
1391 */ | |
1392 #if FULL_WITHIN_CLUSTER_DISTANCES | |
1393 | |
1394 /* compute distances, but be quiet */ | |
1395 if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType, | |
1396 0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs, | |
1397 NULL, NULL)) { | |
1398 Log(&rLog, LOG_ERROR, "Couldn't compute pair distances"); | |
1399 return -1; | |
1400 } | |
1401 #else | |
1402 if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) { | |
1403 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed"); | |
1404 } | |
1405 for (iI=0; iI<iNSeqInCluster; iI++) { | |
1406 int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI]; | |
1407 | |
1408 for (uJ=iI+1; uJ<iNSeqInCluster; uJ++) { | |
1409 int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][uJ]; | |
1410 double dist; | |
1411 | |
1412 /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s", | |
1413 iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name, | |
1414 uJ, prMSeq->sqinfo[iRealIndexJ].name); */ | |
1415 | |
1416 if (1 == USE_EUCLIDEAN_DISTANCE) { | |
1417 dist = EuclDist(ppdSeqVec[iRealIndexI], | |
1418 ppdSeqVec[iRealIndexJ], iNumSeeds); | |
1419 } else { | |
1420 dist = CosDist(ppdSeqVec[iRealIndexI], | |
1421 ppdSeqVec[iRealIndexJ], iNumSeeds); | |
1422 } | |
1423 SymMatrixSetValue(prWithinClusterDistances, iI, uJ, dist); | |
1424 } | |
1425 } | |
1426 #endif | |
1427 | |
1428 /* labels needed for the guide tree building routine only */ | |
1429 ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*)); | |
1430 for (iI=0; iI<iNSeqInCluster; iI++) { | |
1431 #if FULL_WITHIN_CLUSTER_DISTANCES | |
1432 ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name; | |
1433 #else | |
1434 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI]; | |
1435 ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name; | |
1436 #endif | |
1437 } | |
1438 #if TRACE | |
1439 Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d", | |
1440 iClusterIndex, prKMeansResult->iNClusters); | |
1441 SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL); | |
1442 #endif | |
1443 | |
1444 | |
1445 GuideTreeUpgma(&prSubClusterTree, ppcLabels, | |
1446 prWithinClusterDistances, NULL); | |
1447 | |
1448 CKFREE(ppcLabels); /* don't free members, they just point */ | |
1449 #if 0 | |
1450 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex); | |
1451 LogTree(prSubClusterTree); | |
1452 #endif | |
1453 | |
1454 | |
1455 /* The guide tree id's (that point to the sequences) now start | |
1456 * from 0, i.e. the association with the prMSeq numbering is | |
1457 * broken and fixed in the following | |
1458 */ | |
1459 for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) { | |
1460 if (IsLeaf(iNodeIndex, prSubClusterTree)) { | |
1461 int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree); | |
1462 int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId]; | |
1463 #if 0 | |
1464 Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)", | |
1465 iNodeIndex, iLeafId, | |
1466 GetLeafName(iNodeIndex, prSubClusterTree), | |
1467 iRealId, prMSeq->sqinfo[iRealId].name); | |
1468 #endif | |
1469 SetLeafId(prSubClusterTree, iNodeIndex, iRealId); | |
1470 } | |
1471 } | |
1472 | |
1473 | |
1474 /* Append the newly created tree (prSubClusterTree) to the | |
1475 * corresponding node index of prMbedTree_p. | |
1476 */ | |
1477 #if TRACE | |
1478 Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s", | |
1479 piClusterToTreeNode[iClusterIndex], | |
1480 GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p)); | |
1481 #endif | |
1482 | |
1483 AppendTree(*prMbedTree_p, | |
1484 piClusterToTreeNode[iClusterIndex], | |
1485 prSubClusterTree); | |
1486 /* Note: piClusterToTreeNode is still valid, because | |
1487 * AppendTrees() guarantees that no other than the node to | |
1488 * append to changes. */ | |
1489 | |
1490 #if 0 | |
1491 Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex); | |
1492 LogTree(*prMbedTree_p); | |
1493 | |
1494 if (0) { | |
1495 char fname[] = "mbed-joined-tree.dnd"; | |
1496 FILE *pfOut; | |
1497 if (NULL == (pfOut = fopen(fname, "w"))) { | |
1498 Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname); | |
1499 } | |
1500 MuscleTreeToFile(pfOut, *prMbedTree_p); | |
1501 Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname); | |
1502 fclose(pfOut); | |
1503 Log(&rLog, LOG_FATAL, "DEBUG EXIT"); | |
1504 } | |
1505 #endif | |
1506 | |
1507 /* cleanup | |
1508 */ | |
1509 FreeMuscleTree(prSubClusterTree); | |
1510 FreeSymMatrix(&prWithinClusterDistances); | |
1511 #if FULL_WITHIN_CLUSTER_DISTANCES | |
1512 FreeMSeq(&prSubClusterMSeq); | |
1513 #endif | |
1514 } /* end for each cluster */ | |
1515 ProgressDone(prSubClusterDistanceProgress); | |
1516 FreeProgress(&prSubClusterDistanceProgress); | |
1517 | |
1518 | |
1519 if (NULL != pcGuidetreeOut) { | |
1520 if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) { | |
1521 Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut); | |
1522 } else { | |
1523 MuscleTreeToFile(pfOut, *prMbedTree_p); | |
1524 Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut); | |
1525 (void) fclose(pfOut); | |
1526 } | |
1527 } | |
1528 | |
1529 | |
1530 /* cleanup | |
1531 * | |
1532 */ | |
1533 #if MBED_TIMING | |
1534 StopwatchStop(stopwatch); | |
1535 StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch); | |
1536 StopwatchFree(stopwatch); | |
1537 #endif | |
1538 | |
1539 FreeKMeansResult(&prKMeansResult); | |
1540 FreeSymMatrix(&prPreClusterDistmat); | |
1541 for (iI=0; iI<prMSeq->nseqs; iI++) { | |
1542 CKFREE(ppdSeqVec[iI]); | |
1543 } | |
1544 CKFREE(ppdSeqVec); | |
1545 CKFREE(piClusterToTreeNode); | |
1546 | |
1547 #ifndef NDEBUG | |
1548 TreeValidate(*prMbedTree_p); | |
1549 #endif | |
1550 | |
1551 return 0; | |
1552 } | |
1553 /*** end: Mbed() ***/ |