Mercurial > repos > clustalomega > clustalomega

diff clustalomega/clustal-omega-0.2.0/src/clustal/mbed.c @ 0:ff1768533a07
Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author: clustalomega
date: Tue, 07 Jun 2011 17:04:25 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/clustalomega/clustal-omega-0.2.0/src/clustal/mbed.c	Tue Jun 07 17:04:25 2011 -0400
@@ -0,0 +1,1553 @@
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4;  indent-tabs-mode: nil -*- */
+
+/*********************************************************************
+ * Clustal Omega - Multiple sequence alignment
+ *
+ * Copyright (C) 2010 University College Dublin
+ *
+ * Clustal-Omega is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This file is part of Clustal-Omega.
+ *
+ ********************************************************************/
+
+/*
+ *  RCS $Id: mbed.c 242 2011-05-27 14:04:21Z andreas $
+ *
+ *
+ * Reimplementation from scratch of mBed:
+ * Blackshields et al. (2010); PMID 20470396
+ *
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <assert.h>
+#include <float.h>
+
+#include "mbed.h"
+#include "pair_dist.h"
+#include "symmatrix.h"
+#include "ktuple_pair.h"
+#include "tree.h"
+#include "util.h"
+#include "progress.h"
+#include "queue.h"
+
+#include "log.h"
+#include "kmpp/KMeans.h"
+#include "mbed.h"
+
+#define TIMING 0
+#if TIMING
+#include "squid/stopwatch.h"
+#endif
+
+
+/* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each
+ * bisecting kmeans subcluster are not estimated using the vectors,
+ * but calculated normally (using ktuple or kimura). Surprisingly this
+ * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000
+ * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS
+ * log2 instead of log2^2). And of course it slows things down.
+ */
+#define FULL_WITHIN_CLUSTER_DISTANCES 1
+
+
+/* Cluster size limits. Maximum is a soft limit, which might be
+ * exceeded if a K-Means split was unsuccesful, where unsuccesful
+ * might also mean that the minimum required number seqs. was not
+ * reached */
+#if FULL_WITHIN_CLUSTER_DISTANCES
+static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100;
+static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1;
+#else
+static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000;
+static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100;
+#endif
+
+/* How many restarts per bisecting kmeans split. 10 give 0.5% increase
+ * in quality on original HOMFAM over just 1. It also increases kmeans
+ * time by a factor of ~3, but overall time is insignificant
+ * compared to pairdist/progressive alignment part.
+ */
+static const int RESTARTS_PER_SPLIT = 10;
+
+
+/* Use standard kmeans (lloyds) or kmeans++. Both are almost
+ * indistinguishable here, although kmeans++ is slightly ahead the
+ * fewer seeds you pick (and it should be better in theory)
+ */
+#define USE_KMEANS_LLOYDS 0
+
+
+#ifndef HAVE_LOG2
+#define log2(x)  (log(x) / 0.69314718055994530942)
+#endif
+#define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
+
+
+/* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed
+ * approach: Sample iSeeds sequence with constant stride from length-sorted X.
+ * It might be better to pick the seeds randomly, because length sorting might
+ * be more prone to including outliers (e.g. very long and very short seqs).
+ * However, we could never observer such a thing. So just stick to the
+ * original version as this also removes the random element.
+ */
+enum SEED_SELECTION_TYPE {
+    SELECT_SEEDS_RANDOMLY,
+    SELECT_SEEDS_BY_LENGTH
+};
+#define SEED_SELECTION SELECT_SEEDS_BY_LENGTH
+
+
+/* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is
+ * no difference between mbed-trees created from cosine or euclidean
+ * distances (simple version, just using disparities).
+ */
+#define USE_EUCLIDEAN_DISTANCE 1
+
+
+/* print some mbed pre-cluster usage to screen */
+#define PRINT_CLUSTER_DISTRIBUTION 0
+
+
+#define TRACE 0
+
+
+typedef struct {
+    /* Number of final clusters
+     */
+    int iNClusters;
+
+    /* Coordinates (columns) for each cluster (rows)
+     * valid indices: [0...iNClusters][0...dim-1]
+     */
+    double **ppdClusterCenters;
+
+    /* Dimensionality of input data and cluster center coordinates
+     */
+    int iDim;
+
+    /* Number of objects per cluster
+     */
+    int *piNObjsPerCluster;
+
+    /* Objects (indices) for each cluster. [i][j] will point to (index
+     * of) object j in cluster i
+     */
+    int **ppiObjIndicesPerCluster;
+} bisecting_kmeans_result_t;
+
+
+
+/**
+ * @brief Free KMeans result structure.
+ *
+ * @param[out]  prResult_p
+ * K-Means result to free
+ *
+ * @see NewKMeansResult()
+ */
+void
+FreeKMeansResult(bisecting_kmeans_result_t **prResult_p)
+{
+    int iAux;
+    CKFREE((*prResult_p)->piNObjsPerCluster);
+    for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) {
+        CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]);
+        CKFREE((*prResult_p)->ppdClusterCenters[iAux]);
+    }
+    CKFREE((*prResult_p)->ppiObjIndicesPerCluster);
+    CKFREE((*prResult_p)->ppdClusterCenters);
+    (*prResult_p)->iNClusters = 0;
+    (*prResult_p)->iDim = 0;
+    CKFREE(*prResult_p);
+}
+/***   end: FreeKMeansResult()   ***/
+
+
+
+/**
+ * @brief Allocate new KMeans result
+ *
+ * @param[out] prKMeansResult_p
+ * K-Means result to free
+ *
+ * @see NewKMeansResult()
+ */
+void
+NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p)
+{
+    (*prKMeansResult_p) = (bisecting_kmeans_result_t *)
+        CKCALLOC(1, sizeof(bisecting_kmeans_result_t));
+    (*prKMeansResult_p)->iNClusters = 0;
+    (*prKMeansResult_p)->iDim = 0;
+    (*prKMeansResult_p)->ppdClusterCenters = NULL;
+    (*prKMeansResult_p)->piNObjsPerCluster = NULL;
+    (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL;
+
+}
+/***   end: NewKMeansResult()   ***/
+
+
+
+/**
+ * @brief Calculate the euclidean distance between two vectors
+ *
+ * @param[in] v1
+ * First vector with dim dimensions
+ * @param[in] v2
+ * Second vector with dim dimensions
+ * @param[in] dim
+ * Dimensionality of v1 and v2
+ *
+ * @return euclidean distance as double
+ *
+ * @note Could probably be inlined. Also perfect for SSE
+ */
+double
+EuclDist(const double *v1, const double *v2, const int dim)
+{
+    int i; /* aux */
+    double dist; /* returned distance */
+
+    assert(v1!=NULL);
+    assert(v2!=NULL);
+
+    dist = 0.0;
+    for (i=0; i<dim; i++) {
+        dist += pow(v1[i]-v2[i], 2.0);
+    }
+
+    return sqrt(dist);
+}
+/***   end: EuclDist()   ***/
+
+
+
+/**
+ * @brief Calculate the cosine distance between two vectors
+ *
+ * @param[in] v1
+ * First vector with dim dimensions
+ * @param[in] v2
+ * Second vector with dim dimensions
+ * @param[in] dim
+ * Dimensionality of v1 and v2
+ *
+ * @return cosine distance as double
+ *
+ * @note Could probably be inlined. Also perfect for SSE
+ */
+double
+CosDist(const double *v1, const double *v2, const int dim)
+{
+    int i; /* aux */
+    double dist; /* returned distance */
+    double s, sq1, sq2;
+
+    assert(v1!=NULL);
+    assert(v2!=NULL);
+
+    s = 0.0;
+    sq1 = 0.0;
+    sq2 = 0.0;
+    for (i=0; i<dim; i++) {
+        s += (v1[i]*v2[i]);
+        sq1 += pow(v1[i], 2.0);
+        sq2 += pow(v2[i], 2.0);
+    }
+    sq1 = sqrt(sq1);
+    sq2 = sqrt(sq2);
+
+    if ((sq1 * sq2) < DBL_EPSILON) {
+        dist = 1 - s / (sq1 * sq2);
+    } else {
+        /* if the denominator gets small, the fraction gets big, hence dist
+           gets small: */
+        dist = 0.0;
+    }
+    return dist;
+}
+/***   end: CosDist()   ***/
+
+
+
+/**
+ * @brief convert sequences into mbed-like (distance) vector
+ * representation. Seeds (prMSeq sequence indices) have to be picked before
+ *
+ * @param[out] ppdSeqVec
+ * Pointer to preallocated matrix of size nseqs x iSeeds
+ * @param[in] prMSeq
+ * Sequences which are to be converted
+ * @param[in] piSeeds
+ * Array of sequences in prMSeq which are to be used as seeds.
+ * @param[in] iNumSeeds
+ * Number of seeds/elements in piSeeds
+ * @param[in] iPairDistType
+ * Type of pairwise distance comparison
+ *
+ */
+int
+SeqToVec(double **ppdSeqVec, mseq_t *prMSeq, 
+         int *piSeeds, const int iNumSeeds,
+         const int iPairDistType)
+{
+    symmatrix_t *prDistmat = NULL;
+    int iSeqIndex;
+    int iSeedIndex;
+   /* indices for restoring order */
+    int *restore; 
+    /* sorted copy of piSeeds */
+    int *piSortedSeeds; 
+
+#if TIMING
+    Stopwatch_t *stopwatch = StopwatchCreate();
+    StopwatchZero(stopwatch);
+    StopwatchStart(stopwatch);
+#endif
+    
+    assert(NULL != ppdSeqVec);
+    assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs);
+
+    piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int));
+    memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int));
+
+    /* need them sorted, otherwise the swapping approach below might
+     * break 
+     */
+    qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp);
+
+
+    /* rearrange mseq order so that we can use ktuple_pairdist code as
+     * is. That is, swap seeds and non-seeds such that the seeds end
+     * up in front of mseq. This way we can use the KTuplePairDist()
+     * code, without making a copy of mseq. Also, keep an array of
+     * indices which serves to restore the original sequence order
+     * after putting the seeds in front
+     *
+     */
+    restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
+    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+        restore[iSeqIndex] = iSeqIndex;
+    }
+    for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+#if TRACE
+        Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex);
+#endif
+        if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
+            int swap;
+            SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
+
+            swap = restore[iSeedIndex];
+            restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
+            restore[piSortedSeeds[iSeedIndex]] = swap;
+        }
+    }
+#if TRACE
+    printf("DEBUG(%s|%s():%d): restore indices =",
+           __FILE__, __FUNCTION__, __LINE__);
+    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+        printf(" %u:%u", iSeqIndex, restore[iSeqIndex]);
+    }
+    printf("\n");
+
+    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+        Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now:  seq = %s",
+                  iSeqIndex, prMSeq->sqinfo[iSeqIndex].name);
+    }
+#endif
+
+
+    /* convert sequences into vectors of distances by calling pairwise
+     * distance function for each seq/seed pair
+     *
+     */
+    if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType,
+                           0, iNumSeeds, 0, prMSeq->nseqs,
+                           NULL, NULL)) {
+        Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
+        FreeSymMatrix(&prDistmat);
+        CKFREE(piSortedSeeds);
+        CKFREE(restore);
+        return -1;
+    }
+#if TRACE
+    {
+        char **labels;
+        labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *));
+        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+            labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
+        }
+        SymMatrixPrint(prDistmat, labels, NULL);
+        CKFREE(labels);
+    }
+#endif
+
+
+    /* update ppdSeqVec according to prDistmat. keep in mind that we
+     * changed mseq order
+     *
+     */
+    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+            ppdSeqVec[restore[iSeqIndex]][iSeedIndex] =
+                SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex);
+        }
+    }
+
+
+    /* restore mseq order by reverse swapping
+     *
+     * need reverse order now, so start from top index
+     */
+    iSeedIndex=iNumSeeds-1;
+    do {
+#if TRACE
+        Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex);
+#endif
+        if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
+            int swap;
+
+            SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
+
+            swap = restore[iSeedIndex];
+            restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
+            restore[piSortedSeeds[iSeedIndex]] = swap;
+        }
+    } while (0 != iSeedIndex--);
+#if TRACE
+     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+         Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d:  seq = %s %s",
+                   iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]);
+     }
+#endif
+
+#if TRACE
+    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+        printf("DEBUG: seq %-4u as distance vector =", iSeqIndex);
+        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+            printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]);
+        }
+        printf("\n");
+    }
+#endif
+
+    
+    FreeSymMatrix(&prDistmat);
+    CKFREE(restore);
+#if TIMING
+    StopwatchStop(stopwatch);
+    StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
+    StopwatchFree(stopwatch);
+#endif
+
+
+    return 0;
+}
+/***   end: SeqToVec()   ***/
+
+
+
+/**
+ * @brief Select seeds to be used from an prMSeq
+ *
+ * @param[out] piSeeds
+ * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated.
+ * @param[in] iNumSeeds
+ * Number of seeds to be picked
+ * @param[in] iSelectionMethod
+ * Seed selection method to be used
+ * @param[in] prMSeq
+ * The prMSeq structure to pick sequences from
+ *
+ * @return: Non-zero on failure
+ *
+ */    
+int
+SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod,  mseq_t *prMSeq)
+{
+    /* seed iterator */
+    int iSeedIndex;
+    /* sequence iterator */
+    int iSeqIndex;
+
+    if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) {
+        int *piPermArray;
+
+        Log(&rLog, LOG_INFO,
+             "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)",
+             iNumSeeds, prMSeq->nseqs);
+
+        PermutationArray(&piPermArray, iNumSeeds);
+        /* copy to piSeeds */
+        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+            piSeeds[iSeedIndex] = piPermArray[iSeedIndex];
+        }
+        CKFREE(piPermArray);
+        
+    }  else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) {
+
+        /* step size for picking with constant stride */
+        int iStep;
+        int *piSeqLen =  CKMALLOC(prMSeq->nseqs * sizeof(int));
+        int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int));
+
+        Log(&rLog, LOG_INFO,
+             "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)",
+             iNumSeeds, prMSeq->nseqs);
+
+        iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big
+                                            * due to rounding */
+        /* first get an array of seq indices order according to
+           corresponding sequence length: piOrder */
+        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+            piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
+        }
+        QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE);
+#if 0
+        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+            Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)",
+                      iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]);
+        }
+#endif
+        CKFREE(piSeqLen);
+        iSeedIndex = 0;
+        for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) {
+            piSeeds[iSeedIndex++] = piOrder[iSeqIndex];
+        }
+        CKFREE(piOrder);
+
+    } else {
+        
+        Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type");
+        return -1;
+    }
+
+
+#ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS
+    /* do initial mbedding and kmeans. then pick seeds from each cluster and
+       do full mbed with these seeds. idea is that those seeds represent the
+       sequence space better than random seeds. however, this reduces the
+       quality slightly. random is almost always better.
+    */
+
+    if (1) {
+        /* seqs represented as distance vectors */
+        double **ppdSeqVec;
+        double dCost = -1.0;
+        /* assignments for each seq to the K newly created clusters */
+        int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int));
+        double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double));
+        /* create a copy of ppdSeqVec suitable for KMeans */
+        double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double));
+        bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool));
+
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds");
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters");
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round");
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE");
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good");
+
+        /*
+          Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds");
+          iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2);
+        */
+
+        ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
+        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+            ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
+        }
+        if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) {
+            Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
+            return -1;
+        }
+
+        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+            int iDim;
+            for (iDim=0; iDim<iNumSeeds; ++iDim) {
+                pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim];
+            }
+        }
+
+
+        Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT");
+        dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds,
+                       pdKMeansVectors,
+                       RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
+                       pdKMeansClusterCenters, piKMeansClusterAssignments);
+        Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost);
+        
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers");
+
+#if TRACE
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:");
+        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+            Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d",
+                      iSeqIndex,  prMSeq->sqinfo[iSeqIndex].name,
+                      piKMeansClusterAssignments[iSeqIndex]);
+        }
+#endif
+
+        Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds");
+        iSeedIndex = 0;
+        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+            int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex];
+            if (pbClusterUsed[iAssignedCluster]) {
+                continue;
+            } else {
+                /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/
+                piSeeds[iSeedIndex++] = iSeqIndex;
+                pbClusterUsed[iAssignedCluster] = TRUE;
+            }
+        }
+        CKFREE(pbClusterUsed);
+
+        CKFREE(pdKMeansVectors);
+        CKFREE(pdKMeansClusterCenters);
+        CKFREE(piKMeansClusterAssignments);
+        
+    }
+    /* if (1) */
+#endif
+
+
+    if (LOG_DEBUG <= rLog.iLogLevelEnabled) {
+        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+            Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d",
+                 piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex);
+        }
+    }
+    
+    return 0; 
+}
+/* end of SeedSelection() */
+
+
+
+
+/**
+ * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with
+ * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
+ *
+ * @param[out] prKMeansResult_p
+ * Result of Bisecting KMeans. Will be allocated here.
+ * Caller has to free. See @see FreeKMeansResult()
+ * @param[in] iNObjs
+ * Number of objects/sequences to cluster
+ * @param[in] iDim
+ * Dimensionality of input data
+ * @param[in] ppdVectors
+ * each row holds iDim points for this object's coordinates
+ * @param[in] iMinRequiredObjsPerCluster
+ * Minimum number of objects per Cluster (inclusive)/
+ * @param[in] iMaxAllowedObjsPerCluster
+ * Maximum number of objects per Cluster (inclusive). Function returns once no
+ * cluster contains more then this number of objects. Soft limit!
+ *
+ * @note Convoluted code. Could use some restructuring. My apologies.
+ * AW
+ *
+ */
+void
+BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
+                const int iNObjs, const int iDim, double **ppdVectors,
+                const int iMinRequiredObjsPerCluster,
+                const int iMaxAllowedObjsPerCluster)
+{
+    int iN, iD;
+    /* cluster centers for each cluster created at each split */
+    double *pdKClusterCenters;
+    /* keep track of updated object indices per newly created
+     * cluster */
+    int *piCurObjToUpdate;
+    /* number of times we called 2-means */
+    int iNRounds;
+    /* flag for unsuccessful cluster split */
+    bool bNaNDetected = FALSE;
+    /* flag for detected small cluster after split  */
+    bool bSmallClusterDetected;
+    /* queue of clusters which are to be split */
+    int_queue_t rClusterSplitQueue;
+    
+#if TIMING
+    Stopwatch_t *stopwatch = StopwatchCreate();
+#endif
+
+
+    piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int));
+
+    NewKMeansResult(prKMeansResult_p);
+
+    /* new cluster centers created at each split/KMeans run
+     */
+    pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double));
+
+    /* init results by setting a first cluster that contains all objects
+     */
+    (*prKMeansResult_p)->iNClusters = 1;
+    (*prKMeansResult_p)->iDim = iDim;
+    /* fake center coordinates of first cluster */
+    (*prKMeansResult_p)->ppdClusterCenters =
+        (double **) CKMALLOC(1 * sizeof(double *));
+    (*prKMeansResult_p)->ppdClusterCenters[0] =
+        (double *) CKMALLOC(iDim * sizeof(double));
+    /* objects per cluster */
+    (*prKMeansResult_p)->piNObjsPerCluster =
+        (int *) CKMALLOC(1 * sizeof(int));
+    (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs;
+    /* object indices per cluster */
+    (*prKMeansResult_p)->ppiObjIndicesPerCluster =
+        (int **) CKMALLOC(1 * sizeof(int *));
+    (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *)
+        CKMALLOC(iNObjs * sizeof(int));
+    for (iN=0; iN<iNObjs; iN++) {
+        (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN;
+    }
+
+
+
+    /* Starting with the first cluster that now contains all the
+     * sequences keep splitting until no cluster contains more than
+     * iMaxAllowedObjsPerCluster
+     *
+     * Keep a queue of clusters (rClusterSplitQueue) to split
+     *
+     * At each split values/memory of the just split cluster will be
+     * reused and exactly one new only allocated.
+     *
+     * Example:
+     * Given the following cluster assignments
+     * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4
+     * and a K-Means split in cluster 3 at |:
+     * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4
+     * The result should be this:
+     * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4
+     *
+     *
+     */
+    INT_QUEUE_INIT(&rClusterSplitQueue);
+    if (iNObjs>iMaxAllowedObjsPerCluster) {
+        /* pish fake first cluster index */
+        INT_QUEUE_PUSH(&rClusterSplitQueue, 0); 
+    }
+    iNRounds = 0;
+    while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) {
+        /* assignments for each seq to the K newly created clusters */
+        int *piKClusterAssignments;
+        /* number of objects in cluster that is to be split */
+        int iNObjsInClusterToSplit;
+        /* coordinates of points in cluster that is to be split
+         * array of size n*d where [d*i + j] gives coordinate j of point i
+         */
+        double *pdVectorsInClusterToSplit;
+        /* copy of object indices in split cluster */
+        int *piObjIndicesOfSplitCluster;
+        /* best cost of kmeans rounds */
+        double dCost = -1.0;
+
+        /*  indices for the two created clusters
+         */
+        /* index of cluster to split */
+        int iClusterToSplot;
+        /* index of newly created cluster at each round. data for the
+           other created cluster goes to the one just split,
+           i.e. (*piClusterToSplot) */
+        int iNewClusterIdx;
+
+#if TIMING
+        StopwatchZero(stopwatch);
+        StopwatchStart(stopwatch);
+#endif
+
+        INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot);
+        
+        iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot];
+        piKClusterAssignments = (int *)
+            CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
+
+        pdVectorsInClusterToSplit = (double *)
+            CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double));
+        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+            for (iD=0; iD<iDim; ++iD) {
+                int iThisObjIdx =
+                    (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN];
+                pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD];
+            }
+        }
+
+#if TRACE
+        Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
+                  iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
+        fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
+                __FILE__, __FUNCTION__, __LINE__);
+        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+            fprintf(stderr, " %u",
+                    (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
+        }
+        fprintf(stderr, "\n");
+        (void) fflush(stderr);
+#endif
+
+#if TRACE
+        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+            fprintf(stderr,
+                    "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:",
+                    __FILE__, __FUNCTION__, __LINE__, iN,
+                    (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
+            for (iD=0; iD<iDim; iD++) {
+                fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]);
+            }
+            fprintf(stderr, "\n");
+            (void) fflush(stderr);
+        }
+#endif
+        
+       /* KMeans(1 "The number of points in the data set",
+        *        2 "The number of clusters to look for",
+        *        3 "The number of dimensions that the data set lives in",
+        *        4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i",
+        *        5 "attempts: The number of times to independently run k-means",
+        *        6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method",
+        *        7 "centers: This can either be null or an array of size k*d.
+        *           In the latter case, centers[d*i + j] will give coordinate j of center i.
+        *           If the cluster is unused, it will contain NaN instead.",
+        *        8 "assignments: This can either be null or an array of size n.
+        *           In the latter case, it will be filled with the cluster that each point is assigned to
+        *           (an integer between 0 and k-1 inclusive).");
+        */
+        dCost = KMeans(iNObjsInClusterToSplit, 2, iDim,
+                       pdVectorsInClusterToSplit,
+                       RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
+                       pdKClusterCenters, piKClusterAssignments);
+
+#if TRACE
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:");
+        for (iN=0; iN<2; iN++) {
+            fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =",
+                    __FILE__, __FUNCTION__, __LINE__, iN);
+            for (iD=0; iD<iDim; iD++) {
+                fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]);
+            }
+            fprintf(stderr, "\n");
+            (void) fflush(stderr);
+        }
+        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+            Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)",
+                      iN,  piKClusterAssignments[iN], 2);
+        }
+#endif
+        
+        /* real index of one of the newly created clusters. the other
+         * one is iClusterToSplot */
+        iNewClusterIdx = (*prKMeansResult_p)->iNClusters;
+
+        
+        /* We don't want Clusters which are too small. Check here if a
+         * split created a small cluster and if yes, discard the
+         * solution. Because the cluster has already been removed from
+         * the queue, we can just continue.
+         */
+        bSmallClusterDetected = FALSE;
+        if (iMinRequiredObjsPerCluster>1) {
+            int iNObjsCluster[2];
+            iNObjsCluster[0] = 0; /* first cluster */
+            iNObjsCluster[1] = 0; /* second cluster */
+            for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+                iNObjsCluster[piKClusterAssignments[iN]]+=1;
+            }
+            
+            if (iNObjsCluster[0]<iMinRequiredObjsPerCluster
+                ||
+                iNObjsCluster[1]<iMinRequiredObjsPerCluster) {
+                bSmallClusterDetected = TRUE;
+                Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d",
+                          iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster);
+            }
+        }
+        
+        /* Same logic as for small clusters applies if KMeans couldn't
+         * split the cluster. In this case one of its center
+         * coordinates will be NaN, which we check for in the
+         * following.
+         */
+        if (! bSmallClusterDetected) {
+            for (iN=0; iN<2; iN++) {
+                bNaNDetected = FALSE;
+                for (iD=0; iD<iDim; iD++) {
+                    if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) {
+                        /* Got NaN as coordinate after splitting */
+                        Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
+                             __FUNCTION__,
+                             iClusterToSplot, iNObjsInClusterToSplit,
+                         "Hope it's not too big and doesn't slow things down."
+                            );
+                        bNaNDetected = TRUE;
+                        break;
+                    }
+                }
+                if (bNaNDetected) {
+                    break;
+                }
+            }
+        }
+        
+        /* Discarding split of this cluster. It has been removed from the
+         * queue already. so just continue
+         */
+        if (bNaNDetected || bSmallClusterDetected) {
+            CKFREE(piKClusterAssignments);
+            CKFREE(pdVectorsInClusterToSplit);
+            continue;
+        }
+
+
+        /* update cluster centers: pdClusterCenters
+         *
+         */
+        /* reuse memory of existing/old/split cluster
+         */
+        for (iN=0; iN<iDim; iN++) {
+            double dCoord = pdKClusterCenters[0*iDim+iN];
+            (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord;
+        }
+        /* realloc and set new one
+         */
+        (*prKMeansResult_p)->ppdClusterCenters = (double **)
+            CKREALLOC((*prKMeansResult_p)->ppdClusterCenters,
+                      ((*prKMeansResult_p)->iNClusters+1)  * sizeof(double *));
+        (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *)
+            CKMALLOC(iDim * sizeof(double));
+        for (iD=0; iD<iDim; iD++) {
+            double dCoord = pdKClusterCenters[1*iDim+iD];
+            (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord;
+        }
+#if TRACE
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:");
+        for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
+            fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =",
+                    __FILE__, __FUNCTION__, __LINE__, iN);
+            for (iD=0; iD<iDim; iD++) {
+                fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]);
+            }
+            fprintf(stderr, "\n");
+            (void) fflush(stderr);
+        }
+#endif
+
+
+        /* update #seqs per cluster: piNObjsPerCluster
+         *
+         */
+        (*prKMeansResult_p)->piNObjsPerCluster = (int *)
+            CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster,
+                      ((*prKMeansResult_p)->iNClusters+1) * sizeof(int));
+        /* init new and old one to zero */
+        (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0;
+        (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0;
+        /* now update values */
+        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+            if (0 == piKClusterAssignments[iN]) {
+                (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1;
+            } else if (1 == piKClusterAssignments[iN]) {
+                (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1;
+            } else {
+                /* there used to be code for iK>=2 in r101 */
+                Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
+                      piKClusterAssignments[iN]);
+            }
+        }
+#if TRACE
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:");
+        for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
+            Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences",
+                      iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]);
+        }
+#endif
+        /* queue clusters if still they are still too big
+         */
+        if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) {
+            INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot);
+        }
+        if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) {
+            INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx);
+        }
+        
+        
+
+        /* update which objects belong to which cluster:
+         *
+         * note:  piNObjsPerCluster needs to be already updated
+         *
+         */
+        /* create a copy of the object indices in the split cluster
+         * (original will be overwritten)
+         */
+        piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
+        memcpy(piObjIndicesOfSplitCluster,
+               (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
+               iNObjsInClusterToSplit * sizeof(int));
+
+        (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **)
+            CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster,
+                      ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *));
+
+
+        (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] =
+            (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
+                              (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int));
+
+        (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] =
+            (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int));
+
+
+        /* now reassign the object indices to the assigned cluster
+         */
+        piCurObjToUpdate[0] = 0;
+        piCurObjToUpdate[1] = 0;
+        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+            int iThisObjIdx = piObjIndicesOfSplitCluster[iN];
+            int iThisClusterAssignment =  piKClusterAssignments[iN];
+            int iThisOffset = piCurObjToUpdate[iThisClusterAssignment];
+            int iThisClusterIndex = 0;
+
+            if (0 == iThisClusterAssignment) {
+                iThisClusterIndex = iClusterToSplot;
+            } else if (1 == iThisClusterAssignment) {
+                iThisClusterIndex = iNewClusterIdx;
+            } else {
+                /* there used to be code for iK>=2 in r101 */
+                Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
+                      piKClusterAssignments[iN]);
+            }
+#if 0
+            Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d",
+                      iThisClusterIndex, iThisOffset, iThisObjIdx);
+#endif
+            (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx;
+            piCurObjToUpdate[iThisClusterAssignment]+=1;
+        }
+        CKFREE(piObjIndicesOfSplitCluster);
+#if TRACE
+        for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
+            int iObj;
+            fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ",
+                    __FILE__, __FUNCTION__, __LINE__, iN);
+            for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) {
+                fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]);
+            }
+            fprintf(stderr, "\n");
+            (void) fflush(stderr);
+        }
+#endif
+
+
+#if TIMING
+        StopwatchStop(stopwatch);
+        StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch);
+#endif
+
+
+        /* finally: increase number of clusters
+         */
+        (*prKMeansResult_p)->iNClusters += 1;
+        iNRounds += 1;
+        CKFREE(piKClusterAssignments);
+        CKFREE(pdVectorsInClusterToSplit);
+
+    } /* while */
+    INT_QUEUE_DESTROY(&rClusterSplitQueue);
+
+
+    Log(&rLog, LOG_DEBUG,
+         "Bisecting K-means finished after %d rounds (no more clusters to split)",
+         iNRounds);
+         
+#if TIMING
+    StopwatchFree(stopwatch);
+#endif
+
+    /* @note could use progress/timer */
+
+    CKFREE(pdKClusterCenters);
+    CKFREE(piCurObjToUpdate);
+
+    return;
+}
+/***   end: BisectingKmeans()   ***/
+
+
+
+/**
+ *
+ * @brief From scratch reimplementation of mBed: Blackshields et al.
+ * (2010); PMID 20470396.
+ *
+ * Idea is a follows:
+ * - convert sequences into vectors of distances
+ * - cluster the vectors using k-means
+ * - cluster each of the k clusters using upgma (used cached distances
+ *    from above?)
+ * - join the sub-clusters to create on tree (use UPGMA on k-means
+ *   medoids)
+ *
+ *
+ * @param[out] prMbedTree_p
+ * Created upgma tree. will be allocated here. use FreeMuscleTree()
+ * to free
+ * @param[in] prMSeq
+ * Multiple sequences
+ * @param[in] iPairDistType
+ * Distance measure for pairwise alignments
+ * @param[in] pcGuidetreeOut
+ * Passed down to GuideTreeUpgma()
+ *
+ * @return Zero on success, non-zero on error
+ *
+ */
+int
+Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
+     const char *pcGuidetreeOut)
+{
+    /* number of seeds */
+    int iNumSeeds;
+    /* seed indices matching prMSeq */
+    int *piSeeds; 
+    /* seqs represented as distance vectors */
+    double **ppdSeqVec;
+    /* kmeans result */
+    bisecting_kmeans_result_t *prKMeansResult = NULL;
+    /* distance matrix of kmeans (pre-)cluster centers */
+    symmatrix_t *prPreClusterDistmat = NULL;
+    /* auxiliary for symmetric matrix output; tree routines etc */
+    char **ppcLabels;
+    int iNodeIndex;
+    /* mapping of cluster-center tree node indices to corresponding
+       cluster */
+    int *piClusterToTreeNode;
+    int iClusterIndex;
+    int iI, uJ;
+    FILE *pfOut;
+    progress_t *prSubClusterDistanceProgress;
+    bool bPrintCR = (LOG_VERBOSE<=rLog.iLogLevelEnabled) ? FALSE : TRUE;
+
+#if FULL_WITHIN_CLUSTER_DISTANCES
+    Log(&rLog, LOG_WARN,
+        "FULL_WITHIN_CLUSTER_DISTANCES (using min %d / max %d seqs per subcluster) is a quick-hack. Brace yourself...",
+        MIN_REQUIRED_SEQ_PER_PRECLUSTER, MAX_ALLOWED_SEQ_PER_PRECLUSTER);
+#endif
+    
+#if MBED_TIMING
+    Stopwatch_t *stopwatch = StopwatchCreate();
+#endif
+
+    assert(NULL != prMbedTree_p);
+    assert(NULL != prMSeq);
+
+
+    iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs);
+    if (iNumSeeds >= prMSeq->nseqs) {
+        /* -1 is condition for RandomUniqueIntArray */
+        iNumSeeds = prMSeq->nseqs-1; 
+        Log(&rLog, LOG_DEBUG,
+             "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d",
+             iNumSeeds);
+    }
+
+       
+    /* Turn sequences into vectors of distances to the seeds
+     *
+     */
+    piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int));
+    if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) {
+        Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed");
+        return -1;
+    }
+    ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
+    for (iI=0; iI<prMSeq->nseqs; iI++) {
+        ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
+    }
+    if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) {
+        Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
+        return -1;
+    }
+    CKFREE(piSeeds);
+
+    
+    /* Calculate (pre-)clusters of sequence vectors by applying
+     * bisecting kmeans
+     *
+     */
+#if MBED_TIMING
+    /* start clock only here, to make sure we don't include pairwise
+     * distance computation */
+    StopwatchZero(stopwatch);
+    StopwatchStart(stopwatch);
+#endif
+
+    BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
+                    MIN_REQUIRED_SEQ_PER_PRECLUSTER,
+                    MAX_ALLOWED_SEQ_PER_PRECLUSTER);
+    Log(&rLog, LOG_INFO,
+         "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
+         prKMeansResult->iNClusters,
+         MIN_REQUIRED_SEQ_PER_PRECLUSTER,
+         MAX_ALLOWED_SEQ_PER_PRECLUSTER);
+
+
+#if PRINT_CLUSTER_DISTRIBUTION
+    Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
+    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
+#if TRACE
+        int iD;
+        Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI);
+        fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =",
+                __FILE__, __FUNCTION__, __LINE__);
+        for (iD=0; iD<iNumSeeds; iD++) {
+            fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]);
+        }
+        fprintf(stderr, "\n");
+        fflush(stderr);
+#endif
+        Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned",
+                  iI, prKMeansResult->piNObjsPerCluster[iI]);
+#if TRACE
+        for (uJ=0; uJ<prKMeansResult->piNObjsPerCluster[iI]; uJ++) {
+            int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][uJ];
+            
+            Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)",
+                      iI, uJ,  iRealIndex, prMSeq->sqinfo[iRealIndex].name);
+        }
+#endif
+    }
+#endif
+
+    
+    /* Cluster pre-clusters produced by k-means.
+     *
+     * Do this by calculating the vector distances of the cluster
+     * centers and applying UPGMA.
+     *
+     * @note could try to force-balance the tree here
+     *
+     */
+    if (0 != NewSymMatrix(&prPreClusterDistmat,
+                     prKMeansResult->iNClusters, prKMeansResult->iNClusters)) {
+        Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed");
+    }
+    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
+        for (uJ=iI+1; uJ<prKMeansResult->iNClusters; uJ++) {
+            double dDist;
+            dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI],
+                             prKMeansResult->ppdClusterCenters[uJ],
+                             iNumSeeds);
+            SymMatrixSetValue(prPreClusterDistmat, iI, uJ, dDist);
+            /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f",
+               iI, uJ, dDist); */
+        }
+    }
+    
+
+    /* labels needed for the guide tree building routine only */
+    ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*));
+    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
+        ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char));
+        (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI);
+    }
+#if TRACE
+    Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
+    SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL);
+#endif
+
+    GuideTreeUpgma(prMbedTree_p,
+                   ppcLabels, prPreClusterDistmat, NULL);
+
+    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
+        CKFREE(ppcLabels[iI]);
+    }
+    CKFREE(ppcLabels);
+#if TRACE
+    Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
+    LogTree(*prMbedTree_p);
+#endif
+
+
+
+    /* Now replace each leaf in the pre-cluster-center tree
+     * appropriately, i.e. with the corresponding sub-cluster.
+     *
+     * For each leaf/sub-cluster, create a distance matrix for the
+     * corresponding sequences. Use distances between vectors as
+     * approximated distances between sequences. Then create a
+     * guide-tree by applying UPGMA.
+     *
+     */
+
+    /* Get a mapping of (pre)cluster number and leaf-node indices in
+     * the cluster-center tree. We can add trees to prMbedTree_p
+     * because AppendTrees() guarantees that no other than the node to
+     * append to changes.
+     */
+    piClusterToTreeNode = (int*)
+        CKMALLOC(prKMeansResult->iNClusters * sizeof(int));
+    iNodeIndex = FirstDepthFirstNode(*prMbedTree_p);
+    do {
+        if (IsLeaf(iNodeIndex, *prMbedTree_p)) {
+            int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p);
+            piClusterToTreeNode[iLeafId] = iNodeIndex;
+        }
+        iNodeIndex =  NextDepthFirstNode(iNodeIndex, *prMbedTree_p);
+    } while (NULL_NEIGHBOR != iNodeIndex);
+
+
+
+
+    /* Now step through all the leafs and replace them with the
+     * corresponding sub-trees
+     */
+    NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO),
+                "Distance calculation within sub-clusters", bPrintCR);
+    /* for each cluster */
+    for (iClusterIndex=0;
+         iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) {
+        /* distance matrix for the sub-cluster */
+        symmatrix_t *prWithinClusterDistances = NULL;
+        int iNSeqInCluster;
+        tree_t *prSubClusterTree = NULL;
+        
+        ProgressLog(prSubClusterDistanceProgress,
+                    iClusterIndex, prKMeansResult->iNClusters, FALSE);
+
+#if FULL_WITHIN_CLUSTER_DISTANCES
+        mseq_t *prSubClusterMSeq;
+        int iPairDistType;
+        int iSeqIndex;
+
+        Log(&rLog, LOG_DEBUG, 
+            "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
+
+        if (TRUE == prMSeq->aligned)  {
+            iPairDistType = PAIRDIST_SQUIDID_KIMURA;
+        } else {
+            iPairDistType = PAIRDIST_KTUPLE;
+        }
+#endif
+            
+        iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex];
+#if TRACE
+        Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d",
+                  iClusterIndex, iNSeqInCluster);
+#endif
+
+#if FULL_WITHIN_CLUSTER_DISTANCES
+        /* create an mseq structure for sequences in this cluster
+         * don't need most of the members (e.g. orig_seq) but copy
+         * them anyway for the sake of completeness
+         */
+        NewMSeq(&prSubClusterMSeq);
+    
+        prSubClusterMSeq->nseqs = iNSeqInCluster;
+        prSubClusterMSeq->seqtype = prMSeq->seqtype;
+        if (NULL!=prMSeq->filename) {
+            prSubClusterMSeq->filename = CkStrdup(prMSeq->filename);
+        }
+        prSubClusterMSeq->seq =  (char **)
+            CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
+        prSubClusterMSeq->orig_seq =  (char **)
+            CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
+        prSubClusterMSeq->sqinfo =  (SQINFO *)
+            CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO));
+        
+        for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) {
+            int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex];
+            prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]);
+            prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]);
+            SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]);
+#if TRACE
+            Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)",
+                iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name,
+                iRealSeqIndex);
+#endif
+        }
+#endif
+
+        
+        /* Create a distance matrix for this sub-cluster
+         * (prWithinClusterDistances) by using the vector distances or
+         * ktuple distances.
+         *
+         * Then apply UPGMA to get a subcluster tree
+         * (prSubClusterTree) and append created tree to the
+         * pre-cluster-tree (prMbedTree_p)
+         *
+         */
+#if FULL_WITHIN_CLUSTER_DISTANCES
+
+        /* compute distances, but be quiet */
+        if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType,
+                          0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
+                          NULL, NULL)) {
+            Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
+            return -1;
+        }
+#else
+        if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) {
+            Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed");
+        }
+        for (iI=0; iI<iNSeqInCluster; iI++) {
+            int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
+
+            for (uJ=iI+1; uJ<iNSeqInCluster; uJ++) {
+                int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][uJ];
+                double dist;
+
+                /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s",
+                   iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name,
+                   uJ, prMSeq->sqinfo[iRealIndexJ].name); */
+
+                if (1 == USE_EUCLIDEAN_DISTANCE) {
+                    dist = EuclDist(ppdSeqVec[iRealIndexI],
+                                    ppdSeqVec[iRealIndexJ], iNumSeeds);
+                } else {
+                    dist = CosDist(ppdSeqVec[iRealIndexI],
+                                   ppdSeqVec[iRealIndexJ], iNumSeeds);
+                }
+                SymMatrixSetValue(prWithinClusterDistances, iI, uJ, dist);
+            }
+        }
+#endif
+
+        /* labels needed for the guide tree building routine only */
+        ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*));
+        for (iI=0; iI<iNSeqInCluster; iI++) {
+#if FULL_WITHIN_CLUSTER_DISTANCES
+            ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name;
+#else       
+            int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
+            ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name;
+#endif
+        }        
+#if TRACE
+        Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
+                  iClusterIndex, prKMeansResult->iNClusters);
+        SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL);
+#endif
+
+        
+        GuideTreeUpgma(&prSubClusterTree, ppcLabels,
+                       prWithinClusterDistances, NULL);
+
+        CKFREE(ppcLabels); /* don't free members, they just point */
+#if 0
+        Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex);
+        LogTree(prSubClusterTree);
+#endif
+
+        
+        /* The guide tree id's (that point to the sequences) now start
+         * from 0, i.e. the association with the prMSeq numbering is
+         * broken and fixed in the following
+         */
+        for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) {
+            if (IsLeaf(iNodeIndex, prSubClusterTree)) {
+                int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree);
+                int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId];
+#if 0
+                Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)",
+                          iNodeIndex, iLeafId,
+                          GetLeafName(iNodeIndex, prSubClusterTree),
+                          iRealId, prMSeq->sqinfo[iRealId].name);
+#endif
+                SetLeafId(prSubClusterTree, iNodeIndex, iRealId);
+            }
+        }
+
+
+        /* Append the newly created tree (prSubClusterTree) to the
+         * corresponding node index of prMbedTree_p.
+         */
+#if TRACE
+        Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s",
+                  piClusterToTreeNode[iClusterIndex],
+                  GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p));
+#endif
+
+        AppendTree(*prMbedTree_p,
+                   piClusterToTreeNode[iClusterIndex],
+                   prSubClusterTree);
+        /* Note: piClusterToTreeNode is still valid, because
+         * AppendTrees() guarantees that no other than the node to
+         * append to changes. */
+
+#if 0
+        Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex);
+        LogTree(*prMbedTree_p);
+
+        if (0) {
+            char fname[] = "mbed-joined-tree.dnd";
+            FILE *pfOut;
+            if (NULL == (pfOut = fopen(fname, "w"))) {
+                Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname);
+            }
+            MuscleTreeToFile(pfOut, *prMbedTree_p);
+            Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname);
+            fclose(pfOut);
+            Log(&rLog, LOG_FATAL, "DEBUG EXIT");
+        }
+#endif
+
+        /* cleanup
+         */
+        FreeMuscleTree(prSubClusterTree);
+        FreeSymMatrix(&prWithinClusterDistances);
+#if FULL_WITHIN_CLUSTER_DISTANCES
+        FreeMSeq(&prSubClusterMSeq);
+#endif
+    } /* end for each cluster */
+    ProgressDone(prSubClusterDistanceProgress);
+    FreeProgress(&prSubClusterDistanceProgress);
+
+    
+    if (NULL != pcGuidetreeOut) {
+        if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) {
+            Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut);
+        } else {
+            MuscleTreeToFile(pfOut, *prMbedTree_p);
+            Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut);
+            (void) fclose(pfOut);
+        }
+    }
+
+    
+    /* cleanup
+     *
+     */
+#if MBED_TIMING
+    StopwatchStop(stopwatch);
+    StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch);
+    StopwatchFree(stopwatch);
+#endif
+
+    FreeKMeansResult(&prKMeansResult);
+    FreeSymMatrix(&prPreClusterDistmat);
+    for (iI=0; iI<prMSeq->nseqs; iI++) {
+        CKFREE(ppdSeqVec[iI]);
+    }
+    CKFREE(ppdSeqVec);
+    CKFREE(piClusterToTreeNode);
+
+#ifndef NDEBUG
+    TreeValidate(*prMbedTree_p);
+#endif
+    
+    return 0;
+}
+/***   end: Mbed()   ***/
author	clustalomega
date	Tue, 07 Jun 2011 17:04:25 -0400
parents
children