Mercurial > repos > clustalomega > clustalomega
diff clustalomega/clustal-omega-0.2.0/src/clustal/mbed.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author | clustalomega |
---|---|
date | Tue, 07 Jun 2011 17:04:25 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clustalomega/clustal-omega-0.2.0/src/clustal/mbed.c Tue Jun 07 17:04:25 2011 -0400 @@ -0,0 +1,1553 @@ +/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ + +/********************************************************************* + * Clustal Omega - Multiple sequence alignment + * + * Copyright (C) 2010 University College Dublin + * + * Clustal-Omega is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is part of Clustal-Omega. + * + ********************************************************************/ + +/* + * RCS $Id: mbed.c 242 2011-05-27 14:04:21Z andreas $ + * + * + * Reimplementation from scratch of mBed: + * Blackshields et al. (2010); PMID 20470396 + * + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <assert.h> +#include <float.h> + +#include "mbed.h" +#include "pair_dist.h" +#include "symmatrix.h" +#include "ktuple_pair.h" +#include "tree.h" +#include "util.h" +#include "progress.h" +#include "queue.h" + +#include "log.h" +#include "kmpp/KMeans.h" +#include "mbed.h" + +#define TIMING 0 +#if TIMING +#include "squid/stopwatch.h" +#endif + + +/* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each + * bisecting kmeans subcluster are not estimated using the vectors, + * but calculated normally (using ktuple or kimura). Surprisingly this + * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000 + * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS + * log2 instead of log2^2). And of course it slows things down. + */ +#define FULL_WITHIN_CLUSTER_DISTANCES 1 + + +/* Cluster size limits. Maximum is a soft limit, which might be + * exceeded if a K-Means split was unsuccesful, where unsuccesful + * might also mean that the minimum required number seqs. was not + * reached */ +#if FULL_WITHIN_CLUSTER_DISTANCES +static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100; +static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1; +#else +static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000; +static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100; +#endif + +/* How many restarts per bisecting kmeans split. 10 give 0.5% increase + * in quality on original HOMFAM over just 1. It also increases kmeans + * time by a factor of ~3, but overall time is insignificant + * compared to pairdist/progressive alignment part. + */ +static const int RESTARTS_PER_SPLIT = 10; + + +/* Use standard kmeans (lloyds) or kmeans++. Both are almost + * indistinguishable here, although kmeans++ is slightly ahead the + * fewer seeds you pick (and it should be better in theory) + */ +#define USE_KMEANS_LLOYDS 0 + + +#ifndef HAVE_LOG2 +#define log2(x) (log(x) / 0.69314718055994530942) +#endif +#define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2) + + +/* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed + * approach: Sample iSeeds sequence with constant stride from length-sorted X. + * It might be better to pick the seeds randomly, because length sorting might + * be more prone to including outliers (e.g. very long and very short seqs). + * However, we could never observer such a thing. So just stick to the + * original version as this also removes the random element. + */ +enum SEED_SELECTION_TYPE { + SELECT_SEEDS_RANDOMLY, + SELECT_SEEDS_BY_LENGTH +}; +#define SEED_SELECTION SELECT_SEEDS_BY_LENGTH + + +/* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is + * no difference between mbed-trees created from cosine or euclidean + * distances (simple version, just using disparities). + */ +#define USE_EUCLIDEAN_DISTANCE 1 + + +/* print some mbed pre-cluster usage to screen */ +#define PRINT_CLUSTER_DISTRIBUTION 0 + + +#define TRACE 0 + + +typedef struct { + /* Number of final clusters + */ + int iNClusters; + + /* Coordinates (columns) for each cluster (rows) + * valid indices: [0...iNClusters][0...dim-1] + */ + double **ppdClusterCenters; + + /* Dimensionality of input data and cluster center coordinates + */ + int iDim; + + /* Number of objects per cluster + */ + int *piNObjsPerCluster; + + /* Objects (indices) for each cluster. [i][j] will point to (index + * of) object j in cluster i + */ + int **ppiObjIndicesPerCluster; +} bisecting_kmeans_result_t; + + + +/** + * @brief Free KMeans result structure. + * + * @param[out] prResult_p + * K-Means result to free + * + * @see NewKMeansResult() + */ +void +FreeKMeansResult(bisecting_kmeans_result_t **prResult_p) +{ + int iAux; + CKFREE((*prResult_p)->piNObjsPerCluster); + for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) { + CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]); + CKFREE((*prResult_p)->ppdClusterCenters[iAux]); + } + CKFREE((*prResult_p)->ppiObjIndicesPerCluster); + CKFREE((*prResult_p)->ppdClusterCenters); + (*prResult_p)->iNClusters = 0; + (*prResult_p)->iDim = 0; + CKFREE(*prResult_p); +} +/*** end: FreeKMeansResult() ***/ + + + +/** + * @brief Allocate new KMeans result + * + * @param[out] prKMeansResult_p + * K-Means result to free + * + * @see NewKMeansResult() + */ +void +NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p) +{ + (*prKMeansResult_p) = (bisecting_kmeans_result_t *) + CKCALLOC(1, sizeof(bisecting_kmeans_result_t)); + (*prKMeansResult_p)->iNClusters = 0; + (*prKMeansResult_p)->iDim = 0; + (*prKMeansResult_p)->ppdClusterCenters = NULL; + (*prKMeansResult_p)->piNObjsPerCluster = NULL; + (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL; + +} +/*** end: NewKMeansResult() ***/ + + + +/** + * @brief Calculate the euclidean distance between two vectors + * + * @param[in] v1 + * First vector with dim dimensions + * @param[in] v2 + * Second vector with dim dimensions + * @param[in] dim + * Dimensionality of v1 and v2 + * + * @return euclidean distance as double + * + * @note Could probably be inlined. Also perfect for SSE + */ +double +EuclDist(const double *v1, const double *v2, const int dim) +{ + int i; /* aux */ + double dist; /* returned distance */ + + assert(v1!=NULL); + assert(v2!=NULL); + + dist = 0.0; + for (i=0; i<dim; i++) { + dist += pow(v1[i]-v2[i], 2.0); + } + + return sqrt(dist); +} +/*** end: EuclDist() ***/ + + + +/** + * @brief Calculate the cosine distance between two vectors + * + * @param[in] v1 + * First vector with dim dimensions + * @param[in] v2 + * Second vector with dim dimensions + * @param[in] dim + * Dimensionality of v1 and v2 + * + * @return cosine distance as double + * + * @note Could probably be inlined. Also perfect for SSE + */ +double +CosDist(const double *v1, const double *v2, const int dim) +{ + int i; /* aux */ + double dist; /* returned distance */ + double s, sq1, sq2; + + assert(v1!=NULL); + assert(v2!=NULL); + + s = 0.0; + sq1 = 0.0; + sq2 = 0.0; + for (i=0; i<dim; i++) { + s += (v1[i]*v2[i]); + sq1 += pow(v1[i], 2.0); + sq2 += pow(v2[i], 2.0); + } + sq1 = sqrt(sq1); + sq2 = sqrt(sq2); + + if ((sq1 * sq2) < DBL_EPSILON) { + dist = 1 - s / (sq1 * sq2); + } else { + /* if the denominator gets small, the fraction gets big, hence dist + gets small: */ + dist = 0.0; + } + return dist; +} +/*** end: CosDist() ***/ + + + +/** + * @brief convert sequences into mbed-like (distance) vector + * representation. Seeds (prMSeq sequence indices) have to be picked before + * + * @param[out] ppdSeqVec + * Pointer to preallocated matrix of size nseqs x iSeeds + * @param[in] prMSeq + * Sequences which are to be converted + * @param[in] piSeeds + * Array of sequences in prMSeq which are to be used as seeds. + * @param[in] iNumSeeds + * Number of seeds/elements in piSeeds + * @param[in] iPairDistType + * Type of pairwise distance comparison + * + */ +int +SeqToVec(double **ppdSeqVec, mseq_t *prMSeq, + int *piSeeds, const int iNumSeeds, + const int iPairDistType) +{ + symmatrix_t *prDistmat = NULL; + int iSeqIndex; + int iSeedIndex; + /* indices for restoring order */ + int *restore; + /* sorted copy of piSeeds */ + int *piSortedSeeds; + +#if TIMING + Stopwatch_t *stopwatch = StopwatchCreate(); + StopwatchZero(stopwatch); + StopwatchStart(stopwatch); +#endif + + assert(NULL != ppdSeqVec); + assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs); + + piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int)); + memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int)); + + /* need them sorted, otherwise the swapping approach below might + * break + */ + qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp); + + + /* rearrange mseq order so that we can use ktuple_pairdist code as + * is. That is, swap seeds and non-seeds such that the seeds end + * up in front of mseq. This way we can use the KTuplePairDist() + * code, without making a copy of mseq. Also, keep an array of + * indices which serves to restore the original sequence order + * after putting the seeds in front + * + */ + restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int)); + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + restore[iSeqIndex] = iSeqIndex; + } + for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex); +#endif + if (piSortedSeeds[iSeedIndex]!=iSeedIndex) { + int swap; + SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex); + + swap = restore[iSeedIndex]; + restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]]; + restore[piSortedSeeds[iSeedIndex]] = swap; + } + } +#if TRACE + printf("DEBUG(%s|%s():%d): restore indices =", + __FILE__, __FUNCTION__, __LINE__); + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + printf(" %u:%u", iSeqIndex, restore[iSeqIndex]); + } + printf("\n"); + + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now: seq = %s", + iSeqIndex, prMSeq->sqinfo[iSeqIndex].name); + } +#endif + + + /* convert sequences into vectors of distances by calling pairwise + * distance function for each seq/seed pair + * + */ + if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType, + 0, iNumSeeds, 0, prMSeq->nseqs, + NULL, NULL)) { + Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed."); + FreeSymMatrix(&prDistmat); + CKFREE(piSortedSeeds); + CKFREE(restore); + return -1; + } +#if TRACE + { + char **labels; + labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *)); + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name; + } + SymMatrixPrint(prDistmat, labels, NULL); + CKFREE(labels); + } +#endif + + + /* update ppdSeqVec according to prDistmat. keep in mind that we + * changed mseq order + * + */ + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { + ppdSeqVec[restore[iSeqIndex]][iSeedIndex] = + SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex); + } + } + + + /* restore mseq order by reverse swapping + * + * need reverse order now, so start from top index + */ + iSeedIndex=iNumSeeds-1; + do { +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex); +#endif + if (piSortedSeeds[iSeedIndex]!=iSeedIndex) { + int swap; + + SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex); + + swap = restore[iSeedIndex]; + restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]]; + restore[piSortedSeeds[iSeedIndex]] = swap; + } + } while (0 != iSeedIndex--); +#if TRACE + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d: seq = %s %s", + iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]); + } +#endif + +#if TRACE + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + printf("DEBUG: seq %-4u as distance vector =", iSeqIndex); + for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { + printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]); + } + printf("\n"); + } +#endif + + + FreeSymMatrix(&prDistmat); + CKFREE(restore); +#if TIMING + StopwatchStop(stopwatch); + StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch); + StopwatchFree(stopwatch); +#endif + + + return 0; +} +/*** end: SeqToVec() ***/ + + + +/** + * @brief Select seeds to be used from an prMSeq + * + * @param[out] piSeeds + * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated. + * @param[in] iNumSeeds + * Number of seeds to be picked + * @param[in] iSelectionMethod + * Seed selection method to be used + * @param[in] prMSeq + * The prMSeq structure to pick sequences from + * + * @return: Non-zero on failure + * + */ +int +SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod, mseq_t *prMSeq) +{ + /* seed iterator */ + int iSeedIndex; + /* sequence iterator */ + int iSeqIndex; + + if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) { + int *piPermArray; + + Log(&rLog, LOG_INFO, + "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)", + iNumSeeds, prMSeq->nseqs); + + PermutationArray(&piPermArray, iNumSeeds); + /* copy to piSeeds */ + for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { + piSeeds[iSeedIndex] = piPermArray[iSeedIndex]; + } + CKFREE(piPermArray); + + } else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) { + + /* step size for picking with constant stride */ + int iStep; + int *piSeqLen = CKMALLOC(prMSeq->nseqs * sizeof(int)); + int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int)); + + Log(&rLog, LOG_INFO, + "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)", + iNumSeeds, prMSeq->nseqs); + + iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big + * due to rounding */ + /* first get an array of seq indices order according to + corresponding sequence length: piOrder */ + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len; + } + QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE); +#if 0 + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)", + iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]); + } +#endif + CKFREE(piSeqLen); + iSeedIndex = 0; + for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) { + piSeeds[iSeedIndex++] = piOrder[iSeqIndex]; + } + CKFREE(piOrder); + + } else { + + Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type"); + return -1; + } + + +#ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS + /* do initial mbedding and kmeans. then pick seeds from each cluster and + do full mbed with these seeds. idea is that those seeds represent the + sequence space better than random seeds. however, this reduces the + quality slightly. random is almost always better. + */ + + if (1) { + /* seqs represented as distance vectors */ + double **ppdSeqVec; + double dCost = -1.0; + /* assignments for each seq to the K newly created clusters */ + int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int)); + double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double)); + /* create a copy of ppdSeqVec suitable for KMeans */ + double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double)); + bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool)); + + Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds"); + Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters"); + Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round"); + Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE"); + Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good"); + + /* + Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds"); + iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2); + */ + + ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *)); + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double)); + } + if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) { + Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed"); + return -1; + } + + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + int iDim; + for (iDim=0; iDim<iNumSeeds; ++iDim) { + pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim]; + } + } + + + Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT"); + dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds, + pdKMeansVectors, + RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS, + pdKMeansClusterCenters, piKMeansClusterAssignments); + Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost); + + Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers"); + +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:"); + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d", + iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, + piKMeansClusterAssignments[iSeqIndex]); + } +#endif + + Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds"); + iSeedIndex = 0; + for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) { + int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex]; + if (pbClusterUsed[iAssignedCluster]) { + continue; + } else { + /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/ + piSeeds[iSeedIndex++] = iSeqIndex; + pbClusterUsed[iAssignedCluster] = TRUE; + } + } + CKFREE(pbClusterUsed); + + CKFREE(pdKMeansVectors); + CKFREE(pdKMeansClusterCenters); + CKFREE(piKMeansClusterAssignments); + + } + /* if (1) */ +#endif + + + if (LOG_DEBUG <= rLog.iLogLevelEnabled) { + for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) { + Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d", + piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex); + } + } + + return 0; +} +/* end of SeedSelection() */ + + + + +/** + * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with + * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster. + * + * @param[out] prKMeansResult_p + * Result of Bisecting KMeans. Will be allocated here. + * Caller has to free. See @see FreeKMeansResult() + * @param[in] iNObjs + * Number of objects/sequences to cluster + * @param[in] iDim + * Dimensionality of input data + * @param[in] ppdVectors + * each row holds iDim points for this object's coordinates + * @param[in] iMinRequiredObjsPerCluster + * Minimum number of objects per Cluster (inclusive)/ + * @param[in] iMaxAllowedObjsPerCluster + * Maximum number of objects per Cluster (inclusive). Function returns once no + * cluster contains more then this number of objects. Soft limit! + * + * @note Convoluted code. Could use some restructuring. My apologies. + * AW + * + */ +void +BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p, + const int iNObjs, const int iDim, double **ppdVectors, + const int iMinRequiredObjsPerCluster, + const int iMaxAllowedObjsPerCluster) +{ + int iN, iD; + /* cluster centers for each cluster created at each split */ + double *pdKClusterCenters; + /* keep track of updated object indices per newly created + * cluster */ + int *piCurObjToUpdate; + /* number of times we called 2-means */ + int iNRounds; + /* flag for unsuccessful cluster split */ + bool bNaNDetected = FALSE; + /* flag for detected small cluster after split */ + bool bSmallClusterDetected; + /* queue of clusters which are to be split */ + int_queue_t rClusterSplitQueue; + +#if TIMING + Stopwatch_t *stopwatch = StopwatchCreate(); +#endif + + + piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int)); + + NewKMeansResult(prKMeansResult_p); + + /* new cluster centers created at each split/KMeans run + */ + pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double)); + + /* init results by setting a first cluster that contains all objects + */ + (*prKMeansResult_p)->iNClusters = 1; + (*prKMeansResult_p)->iDim = iDim; + /* fake center coordinates of first cluster */ + (*prKMeansResult_p)->ppdClusterCenters = + (double **) CKMALLOC(1 * sizeof(double *)); + (*prKMeansResult_p)->ppdClusterCenters[0] = + (double *) CKMALLOC(iDim * sizeof(double)); + /* objects per cluster */ + (*prKMeansResult_p)->piNObjsPerCluster = + (int *) CKMALLOC(1 * sizeof(int)); + (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs; + /* object indices per cluster */ + (*prKMeansResult_p)->ppiObjIndicesPerCluster = + (int **) CKMALLOC(1 * sizeof(int *)); + (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *) + CKMALLOC(iNObjs * sizeof(int)); + for (iN=0; iN<iNObjs; iN++) { + (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN; + } + + + + /* Starting with the first cluster that now contains all the + * sequences keep splitting until no cluster contains more than + * iMaxAllowedObjsPerCluster + * + * Keep a queue of clusters (rClusterSplitQueue) to split + * + * At each split values/memory of the just split cluster will be + * reused and exactly one new only allocated. + * + * Example: + * Given the following cluster assignments + * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4 + * and a K-Means split in cluster 3 at |: + * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4 + * The result should be this: + * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4 + * + * + */ + INT_QUEUE_INIT(&rClusterSplitQueue); + if (iNObjs>iMaxAllowedObjsPerCluster) { + /* pish fake first cluster index */ + INT_QUEUE_PUSH(&rClusterSplitQueue, 0); + } + iNRounds = 0; + while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) { + /* assignments for each seq to the K newly created clusters */ + int *piKClusterAssignments; + /* number of objects in cluster that is to be split */ + int iNObjsInClusterToSplit; + /* coordinates of points in cluster that is to be split + * array of size n*d where [d*i + j] gives coordinate j of point i + */ + double *pdVectorsInClusterToSplit; + /* copy of object indices in split cluster */ + int *piObjIndicesOfSplitCluster; + /* best cost of kmeans rounds */ + double dCost = -1.0; + + /* indices for the two created clusters + */ + /* index of cluster to split */ + int iClusterToSplot; + /* index of newly created cluster at each round. data for the + other created cluster goes to the one just split, + i.e. (*piClusterToSplot) */ + int iNewClusterIdx; + +#if TIMING + StopwatchZero(stopwatch); + StopwatchStart(stopwatch); +#endif + + INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot); + + iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot]; + piKClusterAssignments = (int *) + CKMALLOC(iNObjsInClusterToSplit * sizeof(int)); + + pdVectorsInClusterToSplit = (double *) + CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double)); + for (iN=0; iN<iNObjsInClusterToSplit; iN++) { + for (iD=0; iD<iDim; ++iD) { + int iThisObjIdx = + (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]; + pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD]; + } + } + +#if TRACE + Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects", + iNRounds, iClusterToSplot, iNObjsInClusterToSplit); + fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:", + __FILE__, __FUNCTION__, __LINE__); + for (iN=0; iN<iNObjsInClusterToSplit; iN++) { + fprintf(stderr, " %u", + (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]); + } + fprintf(stderr, "\n"); + (void) fflush(stderr); +#endif + +#if TRACE + for (iN=0; iN<iNObjsInClusterToSplit; iN++) { + fprintf(stderr, + "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:", + __FILE__, __FUNCTION__, __LINE__, iN, + (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]); + for (iD=0; iD<iDim; iD++) { + fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]); + } + fprintf(stderr, "\n"); + (void) fflush(stderr); + } +#endif + + /* KMeans(1 "The number of points in the data set", + * 2 "The number of clusters to look for", + * 3 "The number of dimensions that the data set lives in", + * 4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i", + * 5 "attempts: The number of times to independently run k-means", + * 6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method", + * 7 "centers: This can either be null or an array of size k*d. + * In the latter case, centers[d*i + j] will give coordinate j of center i. + * If the cluster is unused, it will contain NaN instead.", + * 8 "assignments: This can either be null or an array of size n. + * In the latter case, it will be filled with the cluster that each point is assigned to + * (an integer between 0 and k-1 inclusive)."); + */ + dCost = KMeans(iNObjsInClusterToSplit, 2, iDim, + pdVectorsInClusterToSplit, + RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS, + pdKClusterCenters, piKClusterAssignments); + +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:"); + for (iN=0; iN<2; iN++) { + fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =", + __FILE__, __FUNCTION__, __LINE__, iN); + for (iD=0; iD<iDim; iD++) { + fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]); + } + fprintf(stderr, "\n"); + (void) fflush(stderr); + } + for (iN=0; iN<iNObjsInClusterToSplit; iN++) { + Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)", + iN, piKClusterAssignments[iN], 2); + } +#endif + + /* real index of one of the newly created clusters. the other + * one is iClusterToSplot */ + iNewClusterIdx = (*prKMeansResult_p)->iNClusters; + + + /* We don't want Clusters which are too small. Check here if a + * split created a small cluster and if yes, discard the + * solution. Because the cluster has already been removed from + * the queue, we can just continue. + */ + bSmallClusterDetected = FALSE; + if (iMinRequiredObjsPerCluster>1) { + int iNObjsCluster[2]; + iNObjsCluster[0] = 0; /* first cluster */ + iNObjsCluster[1] = 0; /* second cluster */ + for (iN=0; iN<iNObjsInClusterToSplit; iN++) { + iNObjsCluster[piKClusterAssignments[iN]]+=1; + } + + if (iNObjsCluster[0]<iMinRequiredObjsPerCluster + || + iNObjsCluster[1]<iMinRequiredObjsPerCluster) { + bSmallClusterDetected = TRUE; + Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d", + iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster); + } + } + + /* Same logic as for small clusters applies if KMeans couldn't + * split the cluster. In this case one of its center + * coordinates will be NaN, which we check for in the + * following. + */ + if (! bSmallClusterDetected) { + for (iN=0; iN<2; iN++) { + bNaNDetected = FALSE; + for (iD=0; iD<iDim; iD++) { + if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) { + /* Got NaN as coordinate after splitting */ + Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s", + __FUNCTION__, + iClusterToSplot, iNObjsInClusterToSplit, + "Hope it's not too big and doesn't slow things down." + ); + bNaNDetected = TRUE; + break; + } + } + if (bNaNDetected) { + break; + } + } + } + + /* Discarding split of this cluster. It has been removed from the + * queue already. so just continue + */ + if (bNaNDetected || bSmallClusterDetected) { + CKFREE(piKClusterAssignments); + CKFREE(pdVectorsInClusterToSplit); + continue; + } + + + /* update cluster centers: pdClusterCenters + * + */ + /* reuse memory of existing/old/split cluster + */ + for (iN=0; iN<iDim; iN++) { + double dCoord = pdKClusterCenters[0*iDim+iN]; + (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord; + } + /* realloc and set new one + */ + (*prKMeansResult_p)->ppdClusterCenters = (double **) + CKREALLOC((*prKMeansResult_p)->ppdClusterCenters, + ((*prKMeansResult_p)->iNClusters+1) * sizeof(double *)); + (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *) + CKMALLOC(iDim * sizeof(double)); + for (iD=0; iD<iDim; iD++) { + double dCoord = pdKClusterCenters[1*iDim+iD]; + (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord; + } +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:"); + for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) { + fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =", + __FILE__, __FUNCTION__, __LINE__, iN); + for (iD=0; iD<iDim; iD++) { + fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]); + } + fprintf(stderr, "\n"); + (void) fflush(stderr); + } +#endif + + + /* update #seqs per cluster: piNObjsPerCluster + * + */ + (*prKMeansResult_p)->piNObjsPerCluster = (int *) + CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster, + ((*prKMeansResult_p)->iNClusters+1) * sizeof(int)); + /* init new and old one to zero */ + (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0; + (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0; + /* now update values */ + for (iN=0; iN<iNObjsInClusterToSplit; iN++) { + if (0 == piKClusterAssignments[iN]) { + (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1; + } else if (1 == piKClusterAssignments[iN]) { + (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1; + } else { + /* there used to be code for iK>=2 in r101 */ + Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)", + piKClusterAssignments[iN]); + } + } +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:"); + for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) { + Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences", + iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]); + } +#endif + /* queue clusters if still they are still too big + */ + if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) { + INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot); + } + if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) { + INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx); + } + + + + /* update which objects belong to which cluster: + * + * note: piNObjsPerCluster needs to be already updated + * + */ + /* create a copy of the object indices in the split cluster + * (original will be overwritten) + */ + piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int)); + memcpy(piObjIndicesOfSplitCluster, + (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot], + iNObjsInClusterToSplit * sizeof(int)); + + (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **) + CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster, + ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *)); + + + (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] = + (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot], + (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int)); + + (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] = + (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int)); + + + /* now reassign the object indices to the assigned cluster + */ + piCurObjToUpdate[0] = 0; + piCurObjToUpdate[1] = 0; + for (iN=0; iN<iNObjsInClusterToSplit; iN++) { + int iThisObjIdx = piObjIndicesOfSplitCluster[iN]; + int iThisClusterAssignment = piKClusterAssignments[iN]; + int iThisOffset = piCurObjToUpdate[iThisClusterAssignment]; + int iThisClusterIndex = 0; + + if (0 == iThisClusterAssignment) { + iThisClusterIndex = iClusterToSplot; + } else if (1 == iThisClusterAssignment) { + iThisClusterIndex = iNewClusterIdx; + } else { + /* there used to be code for iK>=2 in r101 */ + Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)", + piKClusterAssignments[iN]); + } +#if 0 + Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d", + iThisClusterIndex, iThisOffset, iThisObjIdx); +#endif + (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx; + piCurObjToUpdate[iThisClusterAssignment]+=1; + } + CKFREE(piObjIndicesOfSplitCluster); +#if TRACE + for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) { + int iObj; + fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ", + __FILE__, __FUNCTION__, __LINE__, iN); + for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) { + fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]); + } + fprintf(stderr, "\n"); + (void) fflush(stderr); + } +#endif + + +#if TIMING + StopwatchStop(stopwatch); + StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch); +#endif + + + /* finally: increase number of clusters + */ + (*prKMeansResult_p)->iNClusters += 1; + iNRounds += 1; + CKFREE(piKClusterAssignments); + CKFREE(pdVectorsInClusterToSplit); + + } /* while */ + INT_QUEUE_DESTROY(&rClusterSplitQueue); + + + Log(&rLog, LOG_DEBUG, + "Bisecting K-means finished after %d rounds (no more clusters to split)", + iNRounds); + +#if TIMING + StopwatchFree(stopwatch); +#endif + + /* @note could use progress/timer */ + + CKFREE(pdKClusterCenters); + CKFREE(piCurObjToUpdate); + + return; +} +/*** end: BisectingKmeans() ***/ + + + +/** + * + * @brief From scratch reimplementation of mBed: Blackshields et al. + * (2010); PMID 20470396. + * + * Idea is a follows: + * - convert sequences into vectors of distances + * - cluster the vectors using k-means + * - cluster each of the k clusters using upgma (used cached distances + * from above?) + * - join the sub-clusters to create on tree (use UPGMA on k-means + * medoids) + * + * + * @param[out] prMbedTree_p + * Created upgma tree. will be allocated here. use FreeMuscleTree() + * to free + * @param[in] prMSeq + * Multiple sequences + * @param[in] iPairDistType + * Distance measure for pairwise alignments + * @param[in] pcGuidetreeOut + * Passed down to GuideTreeUpgma() + * + * @return Zero on success, non-zero on error + * + */ +int +Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, + const char *pcGuidetreeOut) +{ + /* number of seeds */ + int iNumSeeds; + /* seed indices matching prMSeq */ + int *piSeeds; + /* seqs represented as distance vectors */ + double **ppdSeqVec; + /* kmeans result */ + bisecting_kmeans_result_t *prKMeansResult = NULL; + /* distance matrix of kmeans (pre-)cluster centers */ + symmatrix_t *prPreClusterDistmat = NULL; + /* auxiliary for symmetric matrix output; tree routines etc */ + char **ppcLabels; + int iNodeIndex; + /* mapping of cluster-center tree node indices to corresponding + cluster */ + int *piClusterToTreeNode; + int iClusterIndex; + int iI, uJ; + FILE *pfOut; + progress_t *prSubClusterDistanceProgress; + bool bPrintCR = (LOG_VERBOSE<=rLog.iLogLevelEnabled) ? FALSE : TRUE; + +#if FULL_WITHIN_CLUSTER_DISTANCES + Log(&rLog, LOG_WARN, + "FULL_WITHIN_CLUSTER_DISTANCES (using min %d / max %d seqs per subcluster) is a quick-hack. Brace yourself...", + MIN_REQUIRED_SEQ_PER_PRECLUSTER, MAX_ALLOWED_SEQ_PER_PRECLUSTER); +#endif + +#if MBED_TIMING + Stopwatch_t *stopwatch = StopwatchCreate(); +#endif + + assert(NULL != prMbedTree_p); + assert(NULL != prMSeq); + + + iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs); + if (iNumSeeds >= prMSeq->nseqs) { + /* -1 is condition for RandomUniqueIntArray */ + iNumSeeds = prMSeq->nseqs-1; + Log(&rLog, LOG_DEBUG, + "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d", + iNumSeeds); + } + + + /* Turn sequences into vectors of distances to the seeds + * + */ + piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int)); + if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) { + Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed"); + return -1; + } + ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *)); + for (iI=0; iI<prMSeq->nseqs; iI++) { + ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double)); + } + if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) { + Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed"); + return -1; + } + CKFREE(piSeeds); + + + /* Calculate (pre-)clusters of sequence vectors by applying + * bisecting kmeans + * + */ +#if MBED_TIMING + /* start clock only here, to make sure we don't include pairwise + * distance computation */ + StopwatchZero(stopwatch); + StopwatchStart(stopwatch); +#endif + + BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec, + MIN_REQUIRED_SEQ_PER_PRECLUSTER, + MAX_ALLOWED_SEQ_PER_PRECLUSTER); + Log(&rLog, LOG_INFO, + "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)", + prKMeansResult->iNClusters, + MIN_REQUIRED_SEQ_PER_PRECLUSTER, + MAX_ALLOWED_SEQ_PER_PRECLUSTER); + + +#if PRINT_CLUSTER_DISTRIBUTION + Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters); + for (iI=0; iI<prKMeansResult->iNClusters; iI++) { +#if TRACE + int iD; + Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI); + fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =", + __FILE__, __FUNCTION__, __LINE__); + for (iD=0; iD<iNumSeeds; iD++) { + fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]); + } + fprintf(stderr, "\n"); + fflush(stderr); +#endif + Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned", + iI, prKMeansResult->piNObjsPerCluster[iI]); +#if TRACE + for (uJ=0; uJ<prKMeansResult->piNObjsPerCluster[iI]; uJ++) { + int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][uJ]; + + Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)", + iI, uJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name); + } +#endif + } +#endif + + + /* Cluster pre-clusters produced by k-means. + * + * Do this by calculating the vector distances of the cluster + * centers and applying UPGMA. + * + * @note could try to force-balance the tree here + * + */ + if (0 != NewSymMatrix(&prPreClusterDistmat, + prKMeansResult->iNClusters, prKMeansResult->iNClusters)) { + Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed"); + } + for (iI=0; iI<prKMeansResult->iNClusters; iI++) { + for (uJ=iI+1; uJ<prKMeansResult->iNClusters; uJ++) { + double dDist; + dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI], + prKMeansResult->ppdClusterCenters[uJ], + iNumSeeds); + SymMatrixSetValue(prPreClusterDistmat, iI, uJ, dDist); + /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f", + iI, uJ, dDist); */ + } + } + + + /* labels needed for the guide tree building routine only */ + ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*)); + for (iI=0; iI<prKMeansResult->iNClusters; iI++) { + ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char)); + (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI); + } +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:"); + SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL); +#endif + + GuideTreeUpgma(prMbedTree_p, + ppcLabels, prPreClusterDistmat, NULL); + + for (iI=0; iI<prKMeansResult->iNClusters; iI++) { + CKFREE(ppcLabels[iI]); + } + CKFREE(ppcLabels); +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:"); + LogTree(*prMbedTree_p); +#endif + + + + /* Now replace each leaf in the pre-cluster-center tree + * appropriately, i.e. with the corresponding sub-cluster. + * + * For each leaf/sub-cluster, create a distance matrix for the + * corresponding sequences. Use distances between vectors as + * approximated distances between sequences. Then create a + * guide-tree by applying UPGMA. + * + */ + + /* Get a mapping of (pre)cluster number and leaf-node indices in + * the cluster-center tree. We can add trees to prMbedTree_p + * because AppendTrees() guarantees that no other than the node to + * append to changes. + */ + piClusterToTreeNode = (int*) + CKMALLOC(prKMeansResult->iNClusters * sizeof(int)); + iNodeIndex = FirstDepthFirstNode(*prMbedTree_p); + do { + if (IsLeaf(iNodeIndex, *prMbedTree_p)) { + int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p); + piClusterToTreeNode[iLeafId] = iNodeIndex; + } + iNodeIndex = NextDepthFirstNode(iNodeIndex, *prMbedTree_p); + } while (NULL_NEIGHBOR != iNodeIndex); + + + + + /* Now step through all the leafs and replace them with the + * corresponding sub-trees + */ + NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO), + "Distance calculation within sub-clusters", bPrintCR); + /* for each cluster */ + for (iClusterIndex=0; + iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) { + /* distance matrix for the sub-cluster */ + symmatrix_t *prWithinClusterDistances = NULL; + int iNSeqInCluster; + tree_t *prSubClusterTree = NULL; + + ProgressLog(prSubClusterDistanceProgress, + iClusterIndex, prKMeansResult->iNClusters, FALSE); + +#if FULL_WITHIN_CLUSTER_DISTANCES + mseq_t *prSubClusterMSeq; + int iPairDistType; + int iSeqIndex; + + Log(&rLog, LOG_DEBUG, + "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER"); + + if (TRUE == prMSeq->aligned) { + iPairDistType = PAIRDIST_SQUIDID_KIMURA; + } else { + iPairDistType = PAIRDIST_KTUPLE; + } +#endif + + iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex]; +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d", + iClusterIndex, iNSeqInCluster); +#endif + +#if FULL_WITHIN_CLUSTER_DISTANCES + /* create an mseq structure for sequences in this cluster + * don't need most of the members (e.g. orig_seq) but copy + * them anyway for the sake of completeness + */ + NewMSeq(&prSubClusterMSeq); + + prSubClusterMSeq->nseqs = iNSeqInCluster; + prSubClusterMSeq->seqtype = prMSeq->seqtype; + if (NULL!=prMSeq->filename) { + prSubClusterMSeq->filename = CkStrdup(prMSeq->filename); + } + prSubClusterMSeq->seq = (char **) + CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *)); + prSubClusterMSeq->orig_seq = (char **) + CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *)); + prSubClusterMSeq->sqinfo = (SQINFO *) + CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO)); + + for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) { + int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex]; + prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]); + prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]); + SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]); +#if TRACE + Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)", + iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name, + iRealSeqIndex); +#endif + } +#endif + + + /* Create a distance matrix for this sub-cluster + * (prWithinClusterDistances) by using the vector distances or + * ktuple distances. + * + * Then apply UPGMA to get a subcluster tree + * (prSubClusterTree) and append created tree to the + * pre-cluster-tree (prMbedTree_p) + * + */ +#if FULL_WITHIN_CLUSTER_DISTANCES + + /* compute distances, but be quiet */ + if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType, + 0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs, + NULL, NULL)) { + Log(&rLog, LOG_ERROR, "Couldn't compute pair distances"); + return -1; + } +#else + if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) { + Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed"); + } + for (iI=0; iI<iNSeqInCluster; iI++) { + int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI]; + + for (uJ=iI+1; uJ<iNSeqInCluster; uJ++) { + int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][uJ]; + double dist; + + /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s", + iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name, + uJ, prMSeq->sqinfo[iRealIndexJ].name); */ + + if (1 == USE_EUCLIDEAN_DISTANCE) { + dist = EuclDist(ppdSeqVec[iRealIndexI], + ppdSeqVec[iRealIndexJ], iNumSeeds); + } else { + dist = CosDist(ppdSeqVec[iRealIndexI], + ppdSeqVec[iRealIndexJ], iNumSeeds); + } + SymMatrixSetValue(prWithinClusterDistances, iI, uJ, dist); + } + } +#endif + + /* labels needed for the guide tree building routine only */ + ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*)); + for (iI=0; iI<iNSeqInCluster; iI++) { +#if FULL_WITHIN_CLUSTER_DISTANCES + ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name; +#else + int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI]; + ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name; +#endif + } +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d", + iClusterIndex, prKMeansResult->iNClusters); + SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL); +#endif + + + GuideTreeUpgma(&prSubClusterTree, ppcLabels, + prWithinClusterDistances, NULL); + + CKFREE(ppcLabels); /* don't free members, they just point */ +#if 0 + Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex); + LogTree(prSubClusterTree); +#endif + + + /* The guide tree id's (that point to the sequences) now start + * from 0, i.e. the association with the prMSeq numbering is + * broken and fixed in the following + */ + for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) { + if (IsLeaf(iNodeIndex, prSubClusterTree)) { + int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree); + int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId]; +#if 0 + Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)", + iNodeIndex, iLeafId, + GetLeafName(iNodeIndex, prSubClusterTree), + iRealId, prMSeq->sqinfo[iRealId].name); +#endif + SetLeafId(prSubClusterTree, iNodeIndex, iRealId); + } + } + + + /* Append the newly created tree (prSubClusterTree) to the + * corresponding node index of prMbedTree_p. + */ +#if TRACE + Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s", + piClusterToTreeNode[iClusterIndex], + GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p)); +#endif + + AppendTree(*prMbedTree_p, + piClusterToTreeNode[iClusterIndex], + prSubClusterTree); + /* Note: piClusterToTreeNode is still valid, because + * AppendTrees() guarantees that no other than the node to + * append to changes. */ + +#if 0 + Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex); + LogTree(*prMbedTree_p); + + if (0) { + char fname[] = "mbed-joined-tree.dnd"; + FILE *pfOut; + if (NULL == (pfOut = fopen(fname, "w"))) { + Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname); + } + MuscleTreeToFile(pfOut, *prMbedTree_p); + Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname); + fclose(pfOut); + Log(&rLog, LOG_FATAL, "DEBUG EXIT"); + } +#endif + + /* cleanup + */ + FreeMuscleTree(prSubClusterTree); + FreeSymMatrix(&prWithinClusterDistances); +#if FULL_WITHIN_CLUSTER_DISTANCES + FreeMSeq(&prSubClusterMSeq); +#endif + } /* end for each cluster */ + ProgressDone(prSubClusterDistanceProgress); + FreeProgress(&prSubClusterDistanceProgress); + + + if (NULL != pcGuidetreeOut) { + if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) { + Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut); + } else { + MuscleTreeToFile(pfOut, *prMbedTree_p); + Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut); + (void) fclose(pfOut); + } + } + + + /* cleanup + * + */ +#if MBED_TIMING + StopwatchStop(stopwatch); + StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch); + StopwatchFree(stopwatch); +#endif + + FreeKMeansResult(&prKMeansResult); + FreeSymMatrix(&prPreClusterDistmat); + for (iI=0; iI<prMSeq->nseqs; iI++) { + CKFREE(ppdSeqVec[iI]); + } + CKFREE(ppdSeqVec); + CKFREE(piClusterToTreeNode); + +#ifndef NDEBUG + TreeValidate(*prMbedTree_p); +#endif + + return 0; +} +/*** end: Mbed() ***/