comparison clustalomega/clustal-omega-0.2.0/src/clustal/mbed.c @ 0:ff1768533a07

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 17:04:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ff1768533a07
1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
5 *
6 * Copyright (C) 2010 University College Dublin
7 *
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This file is part of Clustal-Omega.
14 *
15 ********************************************************************/
16
17 /*
18 * RCS $Id: mbed.c 242 2011-05-27 14:04:21Z andreas $
19 *
20 *
21 * Reimplementation from scratch of mBed:
22 * Blackshields et al. (2010); PMID 20470396
23 *
24 *
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif
30
31 #include <assert.h>
32 #include <float.h>
33
34 #include "mbed.h"
35 #include "pair_dist.h"
36 #include "symmatrix.h"
37 #include "ktuple_pair.h"
38 #include "tree.h"
39 #include "util.h"
40 #include "progress.h"
41 #include "queue.h"
42
43 #include "log.h"
44 #include "kmpp/KMeans.h"
45 #include "mbed.h"
46
47 #define TIMING 0
48 #if TIMING
49 #include "squid/stopwatch.h"
50 #endif
51
52
53 /* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each
54 * bisecting kmeans subcluster are not estimated using the vectors,
55 * but calculated normally (using ktuple or kimura). Surprisingly this
56 * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000
57 * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS
58 * log2 instead of log2^2). And of course it slows things down.
59 */
60 #define FULL_WITHIN_CLUSTER_DISTANCES 1
61
62
63 /* Cluster size limits. Maximum is a soft limit, which might be
64 * exceeded if a K-Means split was unsuccesful, where unsuccesful
65 * might also mean that the minimum required number seqs. was not
66 * reached */
67 #if FULL_WITHIN_CLUSTER_DISTANCES
68 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100;
69 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1;
70 #else
71 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000;
72 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100;
73 #endif
74
75 /* How many restarts per bisecting kmeans split. 10 give 0.5% increase
76 * in quality on original HOMFAM over just 1. It also increases kmeans
77 * time by a factor of ~3, but overall time is insignificant
78 * compared to pairdist/progressive alignment part.
79 */
80 static const int RESTARTS_PER_SPLIT = 10;
81
82
83 /* Use standard kmeans (lloyds) or kmeans++. Both are almost
84 * indistinguishable here, although kmeans++ is slightly ahead the
85 * fewer seeds you pick (and it should be better in theory)
86 */
87 #define USE_KMEANS_LLOYDS 0
88
89
90 #ifndef HAVE_LOG2
91 #define log2(x) (log(x) / 0.69314718055994530942)
92 #endif
93 #define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
94
95
96 /* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed
97 * approach: Sample iSeeds sequence with constant stride from length-sorted X.
98 * It might be better to pick the seeds randomly, because length sorting might
99 * be more prone to including outliers (e.g. very long and very short seqs).
100 * However, we could never observer such a thing. So just stick to the
101 * original version as this also removes the random element.
102 */
103 enum SEED_SELECTION_TYPE {
104 SELECT_SEEDS_RANDOMLY,
105 SELECT_SEEDS_BY_LENGTH
106 };
107 #define SEED_SELECTION SELECT_SEEDS_BY_LENGTH
108
109
110 /* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is
111 * no difference between mbed-trees created from cosine or euclidean
112 * distances (simple version, just using disparities).
113 */
114 #define USE_EUCLIDEAN_DISTANCE 1
115
116
117 /* print some mbed pre-cluster usage to screen */
118 #define PRINT_CLUSTER_DISTRIBUTION 0
119
120
121 #define TRACE 0
122
123
124 typedef struct {
125 /* Number of final clusters
126 */
127 int iNClusters;
128
129 /* Coordinates (columns) for each cluster (rows)
130 * valid indices: [0...iNClusters][0...dim-1]
131 */
132 double **ppdClusterCenters;
133
134 /* Dimensionality of input data and cluster center coordinates
135 */
136 int iDim;
137
138 /* Number of objects per cluster
139 */
140 int *piNObjsPerCluster;
141
142 /* Objects (indices) for each cluster. [i][j] will point to (index
143 * of) object j in cluster i
144 */
145 int **ppiObjIndicesPerCluster;
146 } bisecting_kmeans_result_t;
147
148
149
150 /**
151 * @brief Free KMeans result structure.
152 *
153 * @param[out] prResult_p
154 * K-Means result to free
155 *
156 * @see NewKMeansResult()
157 */
158 void
159 FreeKMeansResult(bisecting_kmeans_result_t **prResult_p)
160 {
161 int iAux;
162 CKFREE((*prResult_p)->piNObjsPerCluster);
163 for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) {
164 CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]);
165 CKFREE((*prResult_p)->ppdClusterCenters[iAux]);
166 }
167 CKFREE((*prResult_p)->ppiObjIndicesPerCluster);
168 CKFREE((*prResult_p)->ppdClusterCenters);
169 (*prResult_p)->iNClusters = 0;
170 (*prResult_p)->iDim = 0;
171 CKFREE(*prResult_p);
172 }
173 /*** end: FreeKMeansResult() ***/
174
175
176
177 /**
178 * @brief Allocate new KMeans result
179 *
180 * @param[out] prKMeansResult_p
181 * K-Means result to free
182 *
183 * @see NewKMeansResult()
184 */
185 void
186 NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p)
187 {
188 (*prKMeansResult_p) = (bisecting_kmeans_result_t *)
189 CKCALLOC(1, sizeof(bisecting_kmeans_result_t));
190 (*prKMeansResult_p)->iNClusters = 0;
191 (*prKMeansResult_p)->iDim = 0;
192 (*prKMeansResult_p)->ppdClusterCenters = NULL;
193 (*prKMeansResult_p)->piNObjsPerCluster = NULL;
194 (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL;
195
196 }
197 /*** end: NewKMeansResult() ***/
198
199
200
201 /**
202 * @brief Calculate the euclidean distance between two vectors
203 *
204 * @param[in] v1
205 * First vector with dim dimensions
206 * @param[in] v2
207 * Second vector with dim dimensions
208 * @param[in] dim
209 * Dimensionality of v1 and v2
210 *
211 * @return euclidean distance as double
212 *
213 * @note Could probably be inlined. Also perfect for SSE
214 */
215 double
216 EuclDist(const double *v1, const double *v2, const int dim)
217 {
218 int i; /* aux */
219 double dist; /* returned distance */
220
221 assert(v1!=NULL);
222 assert(v2!=NULL);
223
224 dist = 0.0;
225 for (i=0; i<dim; i++) {
226 dist += pow(v1[i]-v2[i], 2.0);
227 }
228
229 return sqrt(dist);
230 }
231 /*** end: EuclDist() ***/
232
233
234
235 /**
236 * @brief Calculate the cosine distance between two vectors
237 *
238 * @param[in] v1
239 * First vector with dim dimensions
240 * @param[in] v2
241 * Second vector with dim dimensions
242 * @param[in] dim
243 * Dimensionality of v1 and v2
244 *
245 * @return cosine distance as double
246 *
247 * @note Could probably be inlined. Also perfect for SSE
248 */
249 double
250 CosDist(const double *v1, const double *v2, const int dim)
251 {
252 int i; /* aux */
253 double dist; /* returned distance */
254 double s, sq1, sq2;
255
256 assert(v1!=NULL);
257 assert(v2!=NULL);
258
259 s = 0.0;
260 sq1 = 0.0;
261 sq2 = 0.0;
262 for (i=0; i<dim; i++) {
263 s += (v1[i]*v2[i]);
264 sq1 += pow(v1[i], 2.0);
265 sq2 += pow(v2[i], 2.0);
266 }
267 sq1 = sqrt(sq1);
268 sq2 = sqrt(sq2);
269
270 if ((sq1 * sq2) < DBL_EPSILON) {
271 dist = 1 - s / (sq1 * sq2);
272 } else {
273 /* if the denominator gets small, the fraction gets big, hence dist
274 gets small: */
275 dist = 0.0;
276 }
277 return dist;
278 }
279 /*** end: CosDist() ***/
280
281
282
283 /**
284 * @brief convert sequences into mbed-like (distance) vector
285 * representation. Seeds (prMSeq sequence indices) have to be picked before
286 *
287 * @param[out] ppdSeqVec
288 * Pointer to preallocated matrix of size nseqs x iSeeds
289 * @param[in] prMSeq
290 * Sequences which are to be converted
291 * @param[in] piSeeds
292 * Array of sequences in prMSeq which are to be used as seeds.
293 * @param[in] iNumSeeds
294 * Number of seeds/elements in piSeeds
295 * @param[in] iPairDistType
296 * Type of pairwise distance comparison
297 *
298 */
299 int
300 SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
301 int *piSeeds, const int iNumSeeds,
302 const int iPairDistType)
303 {
304 symmatrix_t *prDistmat = NULL;
305 int iSeqIndex;
306 int iSeedIndex;
307 /* indices for restoring order */
308 int *restore;
309 /* sorted copy of piSeeds */
310 int *piSortedSeeds;
311
312 #if TIMING
313 Stopwatch_t *stopwatch = StopwatchCreate();
314 StopwatchZero(stopwatch);
315 StopwatchStart(stopwatch);
316 #endif
317
318 assert(NULL != ppdSeqVec);
319 assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs);
320
321 piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int));
322 memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int));
323
324 /* need them sorted, otherwise the swapping approach below might
325 * break
326 */
327 qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp);
328
329
330 /* rearrange mseq order so that we can use ktuple_pairdist code as
331 * is. That is, swap seeds and non-seeds such that the seeds end
332 * up in front of mseq. This way we can use the KTuplePairDist()
333 * code, without making a copy of mseq. Also, keep an array of
334 * indices which serves to restore the original sequence order
335 * after putting the seeds in front
336 *
337 */
338 restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
339 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
340 restore[iSeqIndex] = iSeqIndex;
341 }
342 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
343 #if TRACE
344 Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex);
345 #endif
346 if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
347 int swap;
348 SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
349
350 swap = restore[iSeedIndex];
351 restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
352 restore[piSortedSeeds[iSeedIndex]] = swap;
353 }
354 }
355 #if TRACE
356 printf("DEBUG(%s|%s():%d): restore indices =",
357 __FILE__, __FUNCTION__, __LINE__);
358 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
359 printf(" %u:%u", iSeqIndex, restore[iSeqIndex]);
360 }
361 printf("\n");
362
363 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
364 Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now: seq = %s",
365 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name);
366 }
367 #endif
368
369
370 /* convert sequences into vectors of distances by calling pairwise
371 * distance function for each seq/seed pair
372 *
373 */
374 if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType,
375 0, iNumSeeds, 0, prMSeq->nseqs,
376 NULL, NULL)) {
377 Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
378 FreeSymMatrix(&prDistmat);
379 CKFREE(piSortedSeeds);
380 CKFREE(restore);
381 return -1;
382 }
383 #if TRACE
384 {
385 char **labels;
386 labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *));
387 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
388 labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
389 }
390 SymMatrixPrint(prDistmat, labels, NULL);
391 CKFREE(labels);
392 }
393 #endif
394
395
396 /* update ppdSeqVec according to prDistmat. keep in mind that we
397 * changed mseq order
398 *
399 */
400 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
401 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
402 ppdSeqVec[restore[iSeqIndex]][iSeedIndex] =
403 SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex);
404 }
405 }
406
407
408 /* restore mseq order by reverse swapping
409 *
410 * need reverse order now, so start from top index
411 */
412 iSeedIndex=iNumSeeds-1;
413 do {
414 #if TRACE
415 Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex);
416 #endif
417 if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
418 int swap;
419
420 SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
421
422 swap = restore[iSeedIndex];
423 restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
424 restore[piSortedSeeds[iSeedIndex]] = swap;
425 }
426 } while (0 != iSeedIndex--);
427 #if TRACE
428 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
429 Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d: seq = %s %s",
430 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]);
431 }
432 #endif
433
434 #if TRACE
435 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
436 printf("DEBUG: seq %-4u as distance vector =", iSeqIndex);
437 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
438 printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]);
439 }
440 printf("\n");
441 }
442 #endif
443
444
445 FreeSymMatrix(&prDistmat);
446 CKFREE(restore);
447 #if TIMING
448 StopwatchStop(stopwatch);
449 StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
450 StopwatchFree(stopwatch);
451 #endif
452
453
454 return 0;
455 }
456 /*** end: SeqToVec() ***/
457
458
459
460 /**
461 * @brief Select seeds to be used from an prMSeq
462 *
463 * @param[out] piSeeds
464 * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated.
465 * @param[in] iNumSeeds
466 * Number of seeds to be picked
467 * @param[in] iSelectionMethod
468 * Seed selection method to be used
469 * @param[in] prMSeq
470 * The prMSeq structure to pick sequences from
471 *
472 * @return: Non-zero on failure
473 *
474 */
475 int
476 SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod, mseq_t *prMSeq)
477 {
478 /* seed iterator */
479 int iSeedIndex;
480 /* sequence iterator */
481 int iSeqIndex;
482
483 if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) {
484 int *piPermArray;
485
486 Log(&rLog, LOG_INFO,
487 "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)",
488 iNumSeeds, prMSeq->nseqs);
489
490 PermutationArray(&piPermArray, iNumSeeds);
491 /* copy to piSeeds */
492 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
493 piSeeds[iSeedIndex] = piPermArray[iSeedIndex];
494 }
495 CKFREE(piPermArray);
496
497 } else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) {
498
499 /* step size for picking with constant stride */
500 int iStep;
501 int *piSeqLen = CKMALLOC(prMSeq->nseqs * sizeof(int));
502 int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int));
503
504 Log(&rLog, LOG_INFO,
505 "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)",
506 iNumSeeds, prMSeq->nseqs);
507
508 iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big
509 * due to rounding */
510 /* first get an array of seq indices order according to
511 corresponding sequence length: piOrder */
512 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
513 piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
514 }
515 QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE);
516 #if 0
517 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
518 Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)",
519 iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]);
520 }
521 #endif
522 CKFREE(piSeqLen);
523 iSeedIndex = 0;
524 for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) {
525 piSeeds[iSeedIndex++] = piOrder[iSeqIndex];
526 }
527 CKFREE(piOrder);
528
529 } else {
530
531 Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type");
532 return -1;
533 }
534
535
536 #ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS
537 /* do initial mbedding and kmeans. then pick seeds from each cluster and
538 do full mbed with these seeds. idea is that those seeds represent the
539 sequence space better than random seeds. however, this reduces the
540 quality slightly. random is almost always better.
541 */
542
543 if (1) {
544 /* seqs represented as distance vectors */
545 double **ppdSeqVec;
546 double dCost = -1.0;
547 /* assignments for each seq to the K newly created clusters */
548 int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int));
549 double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double));
550 /* create a copy of ppdSeqVec suitable for KMeans */
551 double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double));
552 bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool));
553
554 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds");
555 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters");
556 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round");
557 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE");
558 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good");
559
560 /*
561 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds");
562 iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2);
563 */
564
565 ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
566 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
567 ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
568 }
569 if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) {
570 Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
571 return -1;
572 }
573
574 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
575 int iDim;
576 for (iDim=0; iDim<iNumSeeds; ++iDim) {
577 pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim];
578 }
579 }
580
581
582 Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT");
583 dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds,
584 pdKMeansVectors,
585 RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
586 pdKMeansClusterCenters, piKMeansClusterAssignments);
587 Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost);
588
589 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers");
590
591 #if TRACE
592 Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:");
593 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
594 Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d",
595 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name,
596 piKMeansClusterAssignments[iSeqIndex]);
597 }
598 #endif
599
600 Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds");
601 iSeedIndex = 0;
602 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
603 int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex];
604 if (pbClusterUsed[iAssignedCluster]) {
605 continue;
606 } else {
607 /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/
608 piSeeds[iSeedIndex++] = iSeqIndex;
609 pbClusterUsed[iAssignedCluster] = TRUE;
610 }
611 }
612 CKFREE(pbClusterUsed);
613
614 CKFREE(pdKMeansVectors);
615 CKFREE(pdKMeansClusterCenters);
616 CKFREE(piKMeansClusterAssignments);
617
618 }
619 /* if (1) */
620 #endif
621
622
623 if (LOG_DEBUG <= rLog.iLogLevelEnabled) {
624 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
625 Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d",
626 piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex);
627 }
628 }
629
630 return 0;
631 }
632 /* end of SeedSelection() */
633
634
635
636
637 /**
638 * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with
639 * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
640 *
641 * @param[out] prKMeansResult_p
642 * Result of Bisecting KMeans. Will be allocated here.
643 * Caller has to free. See @see FreeKMeansResult()
644 * @param[in] iNObjs
645 * Number of objects/sequences to cluster
646 * @param[in] iDim
647 * Dimensionality of input data
648 * @param[in] ppdVectors
649 * each row holds iDim points for this object's coordinates
650 * @param[in] iMinRequiredObjsPerCluster
651 * Minimum number of objects per Cluster (inclusive)/
652 * @param[in] iMaxAllowedObjsPerCluster
653 * Maximum number of objects per Cluster (inclusive). Function returns once no
654 * cluster contains more then this number of objects. Soft limit!
655 *
656 * @note Convoluted code. Could use some restructuring. My apologies.
657 * AW
658 *
659 */
660 void
661 BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
662 const int iNObjs, const int iDim, double **ppdVectors,
663 const int iMinRequiredObjsPerCluster,
664 const int iMaxAllowedObjsPerCluster)
665 {
666 int iN, iD;
667 /* cluster centers for each cluster created at each split */
668 double *pdKClusterCenters;
669 /* keep track of updated object indices per newly created
670 * cluster */
671 int *piCurObjToUpdate;
672 /* number of times we called 2-means */
673 int iNRounds;
674 /* flag for unsuccessful cluster split */
675 bool bNaNDetected = FALSE;
676 /* flag for detected small cluster after split */
677 bool bSmallClusterDetected;
678 /* queue of clusters which are to be split */
679 int_queue_t rClusterSplitQueue;
680
681 #if TIMING
682 Stopwatch_t *stopwatch = StopwatchCreate();
683 #endif
684
685
686 piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int));
687
688 NewKMeansResult(prKMeansResult_p);
689
690 /* new cluster centers created at each split/KMeans run
691 */
692 pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double));
693
694 /* init results by setting a first cluster that contains all objects
695 */
696 (*prKMeansResult_p)->iNClusters = 1;
697 (*prKMeansResult_p)->iDim = iDim;
698 /* fake center coordinates of first cluster */
699 (*prKMeansResult_p)->ppdClusterCenters =
700 (double **) CKMALLOC(1 * sizeof(double *));
701 (*prKMeansResult_p)->ppdClusterCenters[0] =
702 (double *) CKMALLOC(iDim * sizeof(double));
703 /* objects per cluster */
704 (*prKMeansResult_p)->piNObjsPerCluster =
705 (int *) CKMALLOC(1 * sizeof(int));
706 (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs;
707 /* object indices per cluster */
708 (*prKMeansResult_p)->ppiObjIndicesPerCluster =
709 (int **) CKMALLOC(1 * sizeof(int *));
710 (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *)
711 CKMALLOC(iNObjs * sizeof(int));
712 for (iN=0; iN<iNObjs; iN++) {
713 (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN;
714 }
715
716
717
718 /* Starting with the first cluster that now contains all the
719 * sequences keep splitting until no cluster contains more than
720 * iMaxAllowedObjsPerCluster
721 *
722 * Keep a queue of clusters (rClusterSplitQueue) to split
723 *
724 * At each split values/memory of the just split cluster will be
725 * reused and exactly one new only allocated.
726 *
727 * Example:
728 * Given the following cluster assignments
729 * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4
730 * and a K-Means split in cluster 3 at |:
731 * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4
732 * The result should be this:
733 * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4
734 *
735 *
736 */
737 INT_QUEUE_INIT(&rClusterSplitQueue);
738 if (iNObjs>iMaxAllowedObjsPerCluster) {
739 /* pish fake first cluster index */
740 INT_QUEUE_PUSH(&rClusterSplitQueue, 0);
741 }
742 iNRounds = 0;
743 while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) {
744 /* assignments for each seq to the K newly created clusters */
745 int *piKClusterAssignments;
746 /* number of objects in cluster that is to be split */
747 int iNObjsInClusterToSplit;
748 /* coordinates of points in cluster that is to be split
749 * array of size n*d where [d*i + j] gives coordinate j of point i
750 */
751 double *pdVectorsInClusterToSplit;
752 /* copy of object indices in split cluster */
753 int *piObjIndicesOfSplitCluster;
754 /* best cost of kmeans rounds */
755 double dCost = -1.0;
756
757 /* indices for the two created clusters
758 */
759 /* index of cluster to split */
760 int iClusterToSplot;
761 /* index of newly created cluster at each round. data for the
762 other created cluster goes to the one just split,
763 i.e. (*piClusterToSplot) */
764 int iNewClusterIdx;
765
766 #if TIMING
767 StopwatchZero(stopwatch);
768 StopwatchStart(stopwatch);
769 #endif
770
771 INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot);
772
773 iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot];
774 piKClusterAssignments = (int *)
775 CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
776
777 pdVectorsInClusterToSplit = (double *)
778 CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double));
779 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
780 for (iD=0; iD<iDim; ++iD) {
781 int iThisObjIdx =
782 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN];
783 pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD];
784 }
785 }
786
787 #if TRACE
788 Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
789 iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
790 fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
791 __FILE__, __FUNCTION__, __LINE__);
792 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
793 fprintf(stderr, " %u",
794 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
795 }
796 fprintf(stderr, "\n");
797 (void) fflush(stderr);
798 #endif
799
800 #if TRACE
801 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
802 fprintf(stderr,
803 "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:",
804 __FILE__, __FUNCTION__, __LINE__, iN,
805 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
806 for (iD=0; iD<iDim; iD++) {
807 fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]);
808 }
809 fprintf(stderr, "\n");
810 (void) fflush(stderr);
811 }
812 #endif
813
814 /* KMeans(1 "The number of points in the data set",
815 * 2 "The number of clusters to look for",
816 * 3 "The number of dimensions that the data set lives in",
817 * 4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i",
818 * 5 "attempts: The number of times to independently run k-means",
819 * 6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method",
820 * 7 "centers: This can either be null or an array of size k*d.
821 * In the latter case, centers[d*i + j] will give coordinate j of center i.
822 * If the cluster is unused, it will contain NaN instead.",
823 * 8 "assignments: This can either be null or an array of size n.
824 * In the latter case, it will be filled with the cluster that each point is assigned to
825 * (an integer between 0 and k-1 inclusive).");
826 */
827 dCost = KMeans(iNObjsInClusterToSplit, 2, iDim,
828 pdVectorsInClusterToSplit,
829 RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
830 pdKClusterCenters, piKClusterAssignments);
831
832 #if TRACE
833 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:");
834 for (iN=0; iN<2; iN++) {
835 fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =",
836 __FILE__, __FUNCTION__, __LINE__, iN);
837 for (iD=0; iD<iDim; iD++) {
838 fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]);
839 }
840 fprintf(stderr, "\n");
841 (void) fflush(stderr);
842 }
843 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
844 Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)",
845 iN, piKClusterAssignments[iN], 2);
846 }
847 #endif
848
849 /* real index of one of the newly created clusters. the other
850 * one is iClusterToSplot */
851 iNewClusterIdx = (*prKMeansResult_p)->iNClusters;
852
853
854 /* We don't want Clusters which are too small. Check here if a
855 * split created a small cluster and if yes, discard the
856 * solution. Because the cluster has already been removed from
857 * the queue, we can just continue.
858 */
859 bSmallClusterDetected = FALSE;
860 if (iMinRequiredObjsPerCluster>1) {
861 int iNObjsCluster[2];
862 iNObjsCluster[0] = 0; /* first cluster */
863 iNObjsCluster[1] = 0; /* second cluster */
864 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
865 iNObjsCluster[piKClusterAssignments[iN]]+=1;
866 }
867
868 if (iNObjsCluster[0]<iMinRequiredObjsPerCluster
869 ||
870 iNObjsCluster[1]<iMinRequiredObjsPerCluster) {
871 bSmallClusterDetected = TRUE;
872 Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d",
873 iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster);
874 }
875 }
876
877 /* Same logic as for small clusters applies if KMeans couldn't
878 * split the cluster. In this case one of its center
879 * coordinates will be NaN, which we check for in the
880 * following.
881 */
882 if (! bSmallClusterDetected) {
883 for (iN=0; iN<2; iN++) {
884 bNaNDetected = FALSE;
885 for (iD=0; iD<iDim; iD++) {
886 if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) {
887 /* Got NaN as coordinate after splitting */
888 Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
889 __FUNCTION__,
890 iClusterToSplot, iNObjsInClusterToSplit,
891 "Hope it's not too big and doesn't slow things down."
892 );
893 bNaNDetected = TRUE;
894 break;
895 }
896 }
897 if (bNaNDetected) {
898 break;
899 }
900 }
901 }
902
903 /* Discarding split of this cluster. It has been removed from the
904 * queue already. so just continue
905 */
906 if (bNaNDetected || bSmallClusterDetected) {
907 CKFREE(piKClusterAssignments);
908 CKFREE(pdVectorsInClusterToSplit);
909 continue;
910 }
911
912
913 /* update cluster centers: pdClusterCenters
914 *
915 */
916 /* reuse memory of existing/old/split cluster
917 */
918 for (iN=0; iN<iDim; iN++) {
919 double dCoord = pdKClusterCenters[0*iDim+iN];
920 (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord;
921 }
922 /* realloc and set new one
923 */
924 (*prKMeansResult_p)->ppdClusterCenters = (double **)
925 CKREALLOC((*prKMeansResult_p)->ppdClusterCenters,
926 ((*prKMeansResult_p)->iNClusters+1) * sizeof(double *));
927 (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *)
928 CKMALLOC(iDim * sizeof(double));
929 for (iD=0; iD<iDim; iD++) {
930 double dCoord = pdKClusterCenters[1*iDim+iD];
931 (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord;
932 }
933 #if TRACE
934 Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:");
935 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
936 fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =",
937 __FILE__, __FUNCTION__, __LINE__, iN);
938 for (iD=0; iD<iDim; iD++) {
939 fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]);
940 }
941 fprintf(stderr, "\n");
942 (void) fflush(stderr);
943 }
944 #endif
945
946
947 /* update #seqs per cluster: piNObjsPerCluster
948 *
949 */
950 (*prKMeansResult_p)->piNObjsPerCluster = (int *)
951 CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster,
952 ((*prKMeansResult_p)->iNClusters+1) * sizeof(int));
953 /* init new and old one to zero */
954 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0;
955 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0;
956 /* now update values */
957 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
958 if (0 == piKClusterAssignments[iN]) {
959 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1;
960 } else if (1 == piKClusterAssignments[iN]) {
961 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1;
962 } else {
963 /* there used to be code for iK>=2 in r101 */
964 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
965 piKClusterAssignments[iN]);
966 }
967 }
968 #if TRACE
969 Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:");
970 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
971 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences",
972 iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]);
973 }
974 #endif
975 /* queue clusters if still they are still too big
976 */
977 if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) {
978 INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot);
979 }
980 if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) {
981 INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx);
982 }
983
984
985
986 /* update which objects belong to which cluster:
987 *
988 * note: piNObjsPerCluster needs to be already updated
989 *
990 */
991 /* create a copy of the object indices in the split cluster
992 * (original will be overwritten)
993 */
994 piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
995 memcpy(piObjIndicesOfSplitCluster,
996 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
997 iNObjsInClusterToSplit * sizeof(int));
998
999 (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **)
1000 CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster,
1001 ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *));
1002
1003
1004 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] =
1005 (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
1006 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int));
1007
1008 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] =
1009 (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int));
1010
1011
1012 /* now reassign the object indices to the assigned cluster
1013 */
1014 piCurObjToUpdate[0] = 0;
1015 piCurObjToUpdate[1] = 0;
1016 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
1017 int iThisObjIdx = piObjIndicesOfSplitCluster[iN];
1018 int iThisClusterAssignment = piKClusterAssignments[iN];
1019 int iThisOffset = piCurObjToUpdate[iThisClusterAssignment];
1020 int iThisClusterIndex = 0;
1021
1022 if (0 == iThisClusterAssignment) {
1023 iThisClusterIndex = iClusterToSplot;
1024 } else if (1 == iThisClusterAssignment) {
1025 iThisClusterIndex = iNewClusterIdx;
1026 } else {
1027 /* there used to be code for iK>=2 in r101 */
1028 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
1029 piKClusterAssignments[iN]);
1030 }
1031 #if 0
1032 Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d",
1033 iThisClusterIndex, iThisOffset, iThisObjIdx);
1034 #endif
1035 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx;
1036 piCurObjToUpdate[iThisClusterAssignment]+=1;
1037 }
1038 CKFREE(piObjIndicesOfSplitCluster);
1039 #if TRACE
1040 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
1041 int iObj;
1042 fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ",
1043 __FILE__, __FUNCTION__, __LINE__, iN);
1044 for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) {
1045 fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]);
1046 }
1047 fprintf(stderr, "\n");
1048 (void) fflush(stderr);
1049 }
1050 #endif
1051
1052
1053 #if TIMING
1054 StopwatchStop(stopwatch);
1055 StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch);
1056 #endif
1057
1058
1059 /* finally: increase number of clusters
1060 */
1061 (*prKMeansResult_p)->iNClusters += 1;
1062 iNRounds += 1;
1063 CKFREE(piKClusterAssignments);
1064 CKFREE(pdVectorsInClusterToSplit);
1065
1066 } /* while */
1067 INT_QUEUE_DESTROY(&rClusterSplitQueue);
1068
1069
1070 Log(&rLog, LOG_DEBUG,
1071 "Bisecting K-means finished after %d rounds (no more clusters to split)",
1072 iNRounds);
1073
1074 #if TIMING
1075 StopwatchFree(stopwatch);
1076 #endif
1077
1078 /* @note could use progress/timer */
1079
1080 CKFREE(pdKClusterCenters);
1081 CKFREE(piCurObjToUpdate);
1082
1083 return;
1084 }
1085 /*** end: BisectingKmeans() ***/
1086
1087
1088
1089 /**
1090 *
1091 * @brief From scratch reimplementation of mBed: Blackshields et al.
1092 * (2010); PMID 20470396.
1093 *
1094 * Idea is a follows:
1095 * - convert sequences into vectors of distances
1096 * - cluster the vectors using k-means
1097 * - cluster each of the k clusters using upgma (used cached distances
1098 * from above?)
1099 * - join the sub-clusters to create on tree (use UPGMA on k-means
1100 * medoids)
1101 *
1102 *
1103 * @param[out] prMbedTree_p
1104 * Created upgma tree. will be allocated here. use FreeMuscleTree()
1105 * to free
1106 * @param[in] prMSeq
1107 * Multiple sequences
1108 * @param[in] iPairDistType
1109 * Distance measure for pairwise alignments
1110 * @param[in] pcGuidetreeOut
1111 * Passed down to GuideTreeUpgma()
1112 *
1113 * @return Zero on success, non-zero on error
1114 *
1115 */
1116 int
1117 Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
1118 const char *pcGuidetreeOut)
1119 {
1120 /* number of seeds */
1121 int iNumSeeds;
1122 /* seed indices matching prMSeq */
1123 int *piSeeds;
1124 /* seqs represented as distance vectors */
1125 double **ppdSeqVec;
1126 /* kmeans result */
1127 bisecting_kmeans_result_t *prKMeansResult = NULL;
1128 /* distance matrix of kmeans (pre-)cluster centers */
1129 symmatrix_t *prPreClusterDistmat = NULL;
1130 /* auxiliary for symmetric matrix output; tree routines etc */
1131 char **ppcLabels;
1132 int iNodeIndex;
1133 /* mapping of cluster-center tree node indices to corresponding
1134 cluster */
1135 int *piClusterToTreeNode;
1136 int iClusterIndex;
1137 int iI, uJ;
1138 FILE *pfOut;
1139 progress_t *prSubClusterDistanceProgress;
1140 bool bPrintCR = (LOG_VERBOSE<=rLog.iLogLevelEnabled) ? FALSE : TRUE;
1141
1142 #if FULL_WITHIN_CLUSTER_DISTANCES
1143 Log(&rLog, LOG_WARN,
1144 "FULL_WITHIN_CLUSTER_DISTANCES (using min %d / max %d seqs per subcluster) is a quick-hack. Brace yourself...",
1145 MIN_REQUIRED_SEQ_PER_PRECLUSTER, MAX_ALLOWED_SEQ_PER_PRECLUSTER);
1146 #endif
1147
1148 #if MBED_TIMING
1149 Stopwatch_t *stopwatch = StopwatchCreate();
1150 #endif
1151
1152 assert(NULL != prMbedTree_p);
1153 assert(NULL != prMSeq);
1154
1155
1156 iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs);
1157 if (iNumSeeds >= prMSeq->nseqs) {
1158 /* -1 is condition for RandomUniqueIntArray */
1159 iNumSeeds = prMSeq->nseqs-1;
1160 Log(&rLog, LOG_DEBUG,
1161 "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d",
1162 iNumSeeds);
1163 }
1164
1165
1166 /* Turn sequences into vectors of distances to the seeds
1167 *
1168 */
1169 piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int));
1170 if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) {
1171 Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed");
1172 return -1;
1173 }
1174 ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
1175 for (iI=0; iI<prMSeq->nseqs; iI++) {
1176 ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
1177 }
1178 if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) {
1179 Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
1180 return -1;
1181 }
1182 CKFREE(piSeeds);
1183
1184
1185 /* Calculate (pre-)clusters of sequence vectors by applying
1186 * bisecting kmeans
1187 *
1188 */
1189 #if MBED_TIMING
1190 /* start clock only here, to make sure we don't include pairwise
1191 * distance computation */
1192 StopwatchZero(stopwatch);
1193 StopwatchStart(stopwatch);
1194 #endif
1195
1196 BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
1197 MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1198 MAX_ALLOWED_SEQ_PER_PRECLUSTER);
1199 Log(&rLog, LOG_INFO,
1200 "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
1201 prKMeansResult->iNClusters,
1202 MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1203 MAX_ALLOWED_SEQ_PER_PRECLUSTER);
1204
1205
1206 #if PRINT_CLUSTER_DISTRIBUTION
1207 Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
1208 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1209 #if TRACE
1210 int iD;
1211 Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI);
1212 fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =",
1213 __FILE__, __FUNCTION__, __LINE__);
1214 for (iD=0; iD<iNumSeeds; iD++) {
1215 fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]);
1216 }
1217 fprintf(stderr, "\n");
1218 fflush(stderr);
1219 #endif
1220 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned",
1221 iI, prKMeansResult->piNObjsPerCluster[iI]);
1222 #if TRACE
1223 for (uJ=0; uJ<prKMeansResult->piNObjsPerCluster[iI]; uJ++) {
1224 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][uJ];
1225
1226 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)",
1227 iI, uJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name);
1228 }
1229 #endif
1230 }
1231 #endif
1232
1233
1234 /* Cluster pre-clusters produced by k-means.
1235 *
1236 * Do this by calculating the vector distances of the cluster
1237 * centers and applying UPGMA.
1238 *
1239 * @note could try to force-balance the tree here
1240 *
1241 */
1242 if (0 != NewSymMatrix(&prPreClusterDistmat,
1243 prKMeansResult->iNClusters, prKMeansResult->iNClusters)) {
1244 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed");
1245 }
1246 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1247 for (uJ=iI+1; uJ<prKMeansResult->iNClusters; uJ++) {
1248 double dDist;
1249 dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI],
1250 prKMeansResult->ppdClusterCenters[uJ],
1251 iNumSeeds);
1252 SymMatrixSetValue(prPreClusterDistmat, iI, uJ, dDist);
1253 /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f",
1254 iI, uJ, dDist); */
1255 }
1256 }
1257
1258
1259 /* labels needed for the guide tree building routine only */
1260 ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*));
1261 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1262 ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char));
1263 (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI);
1264 }
1265 #if TRACE
1266 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
1267 SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL);
1268 #endif
1269
1270 GuideTreeUpgma(prMbedTree_p,
1271 ppcLabels, prPreClusterDistmat, NULL);
1272
1273 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1274 CKFREE(ppcLabels[iI]);
1275 }
1276 CKFREE(ppcLabels);
1277 #if TRACE
1278 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
1279 LogTree(*prMbedTree_p);
1280 #endif
1281
1282
1283
1284 /* Now replace each leaf in the pre-cluster-center tree
1285 * appropriately, i.e. with the corresponding sub-cluster.
1286 *
1287 * For each leaf/sub-cluster, create a distance matrix for the
1288 * corresponding sequences. Use distances between vectors as
1289 * approximated distances between sequences. Then create a
1290 * guide-tree by applying UPGMA.
1291 *
1292 */
1293
1294 /* Get a mapping of (pre)cluster number and leaf-node indices in
1295 * the cluster-center tree. We can add trees to prMbedTree_p
1296 * because AppendTrees() guarantees that no other than the node to
1297 * append to changes.
1298 */
1299 piClusterToTreeNode = (int*)
1300 CKMALLOC(prKMeansResult->iNClusters * sizeof(int));
1301 iNodeIndex = FirstDepthFirstNode(*prMbedTree_p);
1302 do {
1303 if (IsLeaf(iNodeIndex, *prMbedTree_p)) {
1304 int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p);
1305 piClusterToTreeNode[iLeafId] = iNodeIndex;
1306 }
1307 iNodeIndex = NextDepthFirstNode(iNodeIndex, *prMbedTree_p);
1308 } while (NULL_NEIGHBOR != iNodeIndex);
1309
1310
1311
1312
1313 /* Now step through all the leafs and replace them with the
1314 * corresponding sub-trees
1315 */
1316 NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO),
1317 "Distance calculation within sub-clusters", bPrintCR);
1318 /* for each cluster */
1319 for (iClusterIndex=0;
1320 iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) {
1321 /* distance matrix for the sub-cluster */
1322 symmatrix_t *prWithinClusterDistances = NULL;
1323 int iNSeqInCluster;
1324 tree_t *prSubClusterTree = NULL;
1325
1326 ProgressLog(prSubClusterDistanceProgress,
1327 iClusterIndex, prKMeansResult->iNClusters, FALSE);
1328
1329 #if FULL_WITHIN_CLUSTER_DISTANCES
1330 mseq_t *prSubClusterMSeq;
1331 int iPairDistType;
1332 int iSeqIndex;
1333
1334 Log(&rLog, LOG_DEBUG,
1335 "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
1336
1337 if (TRUE == prMSeq->aligned) {
1338 iPairDistType = PAIRDIST_SQUIDID_KIMURA;
1339 } else {
1340 iPairDistType = PAIRDIST_KTUPLE;
1341 }
1342 #endif
1343
1344 iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex];
1345 #if TRACE
1346 Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d",
1347 iClusterIndex, iNSeqInCluster);
1348 #endif
1349
1350 #if FULL_WITHIN_CLUSTER_DISTANCES
1351 /* create an mseq structure for sequences in this cluster
1352 * don't need most of the members (e.g. orig_seq) but copy
1353 * them anyway for the sake of completeness
1354 */
1355 NewMSeq(&prSubClusterMSeq);
1356
1357 prSubClusterMSeq->nseqs = iNSeqInCluster;
1358 prSubClusterMSeq->seqtype = prMSeq->seqtype;
1359 if (NULL!=prMSeq->filename) {
1360 prSubClusterMSeq->filename = CkStrdup(prMSeq->filename);
1361 }
1362 prSubClusterMSeq->seq = (char **)
1363 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1364 prSubClusterMSeq->orig_seq = (char **)
1365 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1366 prSubClusterMSeq->sqinfo = (SQINFO *)
1367 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO));
1368
1369 for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) {
1370 int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex];
1371 prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]);
1372 prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]);
1373 SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]);
1374 #if TRACE
1375 Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)",
1376 iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name,
1377 iRealSeqIndex);
1378 #endif
1379 }
1380 #endif
1381
1382
1383 /* Create a distance matrix for this sub-cluster
1384 * (prWithinClusterDistances) by using the vector distances or
1385 * ktuple distances.
1386 *
1387 * Then apply UPGMA to get a subcluster tree
1388 * (prSubClusterTree) and append created tree to the
1389 * pre-cluster-tree (prMbedTree_p)
1390 *
1391 */
1392 #if FULL_WITHIN_CLUSTER_DISTANCES
1393
1394 /* compute distances, but be quiet */
1395 if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType,
1396 0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
1397 NULL, NULL)) {
1398 Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
1399 return -1;
1400 }
1401 #else
1402 if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) {
1403 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed");
1404 }
1405 for (iI=0; iI<iNSeqInCluster; iI++) {
1406 int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1407
1408 for (uJ=iI+1; uJ<iNSeqInCluster; uJ++) {
1409 int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][uJ];
1410 double dist;
1411
1412 /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s",
1413 iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name,
1414 uJ, prMSeq->sqinfo[iRealIndexJ].name); */
1415
1416 if (1 == USE_EUCLIDEAN_DISTANCE) {
1417 dist = EuclDist(ppdSeqVec[iRealIndexI],
1418 ppdSeqVec[iRealIndexJ], iNumSeeds);
1419 } else {
1420 dist = CosDist(ppdSeqVec[iRealIndexI],
1421 ppdSeqVec[iRealIndexJ], iNumSeeds);
1422 }
1423 SymMatrixSetValue(prWithinClusterDistances, iI, uJ, dist);
1424 }
1425 }
1426 #endif
1427
1428 /* labels needed for the guide tree building routine only */
1429 ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*));
1430 for (iI=0; iI<iNSeqInCluster; iI++) {
1431 #if FULL_WITHIN_CLUSTER_DISTANCES
1432 ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name;
1433 #else
1434 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1435 ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name;
1436 #endif
1437 }
1438 #if TRACE
1439 Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
1440 iClusterIndex, prKMeansResult->iNClusters);
1441 SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL);
1442 #endif
1443
1444
1445 GuideTreeUpgma(&prSubClusterTree, ppcLabels,
1446 prWithinClusterDistances, NULL);
1447
1448 CKFREE(ppcLabels); /* don't free members, they just point */
1449 #if 0
1450 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex);
1451 LogTree(prSubClusterTree);
1452 #endif
1453
1454
1455 /* The guide tree id's (that point to the sequences) now start
1456 * from 0, i.e. the association with the prMSeq numbering is
1457 * broken and fixed in the following
1458 */
1459 for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) {
1460 if (IsLeaf(iNodeIndex, prSubClusterTree)) {
1461 int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree);
1462 int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId];
1463 #if 0
1464 Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)",
1465 iNodeIndex, iLeafId,
1466 GetLeafName(iNodeIndex, prSubClusterTree),
1467 iRealId, prMSeq->sqinfo[iRealId].name);
1468 #endif
1469 SetLeafId(prSubClusterTree, iNodeIndex, iRealId);
1470 }
1471 }
1472
1473
1474 /* Append the newly created tree (prSubClusterTree) to the
1475 * corresponding node index of prMbedTree_p.
1476 */
1477 #if TRACE
1478 Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s",
1479 piClusterToTreeNode[iClusterIndex],
1480 GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p));
1481 #endif
1482
1483 AppendTree(*prMbedTree_p,
1484 piClusterToTreeNode[iClusterIndex],
1485 prSubClusterTree);
1486 /* Note: piClusterToTreeNode is still valid, because
1487 * AppendTrees() guarantees that no other than the node to
1488 * append to changes. */
1489
1490 #if 0
1491 Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex);
1492 LogTree(*prMbedTree_p);
1493
1494 if (0) {
1495 char fname[] = "mbed-joined-tree.dnd";
1496 FILE *pfOut;
1497 if (NULL == (pfOut = fopen(fname, "w"))) {
1498 Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname);
1499 }
1500 MuscleTreeToFile(pfOut, *prMbedTree_p);
1501 Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname);
1502 fclose(pfOut);
1503 Log(&rLog, LOG_FATAL, "DEBUG EXIT");
1504 }
1505 #endif
1506
1507 /* cleanup
1508 */
1509 FreeMuscleTree(prSubClusterTree);
1510 FreeSymMatrix(&prWithinClusterDistances);
1511 #if FULL_WITHIN_CLUSTER_DISTANCES
1512 FreeMSeq(&prSubClusterMSeq);
1513 #endif
1514 } /* end for each cluster */
1515 ProgressDone(prSubClusterDistanceProgress);
1516 FreeProgress(&prSubClusterDistanceProgress);
1517
1518
1519 if (NULL != pcGuidetreeOut) {
1520 if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) {
1521 Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut);
1522 } else {
1523 MuscleTreeToFile(pfOut, *prMbedTree_p);
1524 Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut);
1525 (void) fclose(pfOut);
1526 }
1527 }
1528
1529
1530 /* cleanup
1531 *
1532 */
1533 #if MBED_TIMING
1534 StopwatchStop(stopwatch);
1535 StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch);
1536 StopwatchFree(stopwatch);
1537 #endif
1538
1539 FreeKMeansResult(&prKMeansResult);
1540 FreeSymMatrix(&prPreClusterDistmat);
1541 for (iI=0; iI<prMSeq->nseqs; iI++) {
1542 CKFREE(ppdSeqVec[iI]);
1543 }
1544 CKFREE(ppdSeqVec);
1545 CKFREE(piClusterToTreeNode);
1546
1547 #ifndef NDEBUG
1548 TreeValidate(*prMbedTree_p);
1549 #endif
1550
1551 return 0;
1552 }
1553 /*** end: Mbed() ***/