Mercurial > repos > clustalomega > clustalomega
comparison clustalomega/clustal-omega-1.0.2/src/kmpp/KmTree.h @ 1:bc707542e5de
Uploaded
author | clustalomega |
---|---|
date | Thu, 21 Jul 2011 13:35:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:ff1768533a07 | 1:bc707542e5de |
---|---|
1 // BEWARE: BETA VERSION | |
2 // -------------------- | |
3 // | |
4 // A k-d tree that vastly speeds up an iteration of k-means (in any number of dimensions). The main | |
5 // idea for this data structure is from Kanungo/Mount. This is used internally by Kmeans.cpp, and | |
6 // will most likely not need to be used directly. | |
7 // | |
8 // The stucture works as follows: | |
9 // - All data points are placed into a tree where we choose child nodes by partitioning all data | |
10 // points along a plane parallel to the axis. | |
11 // - We maintain for each node, the bounding box of all data points stored at that node. | |
12 // - To do a k-means iteration, we need to assign points to clusters and calculate the sum and | |
13 // the number of points assigned to each cluster. For each node in the tree, we can rule out | |
14 // some cluster centers as being too far away from every single point in that bounding box. | |
15 // Once only one cluster is left, all points in the node can be assigned to that cluster in | |
16 // batch. | |
17 // | |
18 // Author: David Arthur (darthur@gmail.com), 2009 | |
19 | |
20 #ifndef KM_TREE_H__ | |
21 #define KM_TREE_H__ | |
22 | |
23 // Includes | |
24 #include "KmUtils.h" | |
25 | |
26 // KmTree class definition | |
27 class KmTree { | |
28 public: | |
29 // Constructs a tree out of the given n data points living in R^d. | |
30 KmTree(int n, int d, Scalar *points); | |
31 ~KmTree(); | |
32 | |
33 // Given k cluster centers, this runs a full k-means iterations, choosing the next set of | |
34 // centers and returning the cost function for this set of centers. If assignment is not null, | |
35 // it should be an array of size n that will be filled with the index of the cluster (0 - k-1) | |
36 // that each data point is assigned to. The new center values will overwrite the old ones. | |
37 Scalar DoKMeansStep(int k, Scalar *centers, int *assignment) const; | |
38 | |
39 // Choose k initial centers for k-means using the kmeans++ seeding procedure. The resulting | |
40 // centers are returned via the centers variable, which should be pre-allocated to size k*d. | |
41 // The cost of the initial clustering is returned. | |
42 Scalar SeedKMeansPlusPlus(int k, Scalar *centers) const; | |
43 | |
44 private: | |
45 struct Node { | |
46 int num_points; // Number of points stored in this node | |
47 int first_point_index; // The smallest point index stored in this node | |
48 Scalar *median, *radius; // Bounding box center and half side-lengths | |
49 Scalar *sum; // Sum of the points stored in this node | |
50 Scalar opt_cost; // Min cost for putting all points in this node in 1 cluster | |
51 Node *lower_node, *upper_node; // Child nodes | |
52 mutable int kmpp_cluster_index; // The cluster these points are assigned to or -1 if variable | |
53 }; | |
54 | |
55 // Helper functions for constructor | |
56 Node *BuildNodes(Scalar *points, int first_index, int last_index, char **next_node_data); | |
57 Scalar GetNodeCost(const Node *node, Scalar *center) const; | |
58 | |
59 // Helper functions for DoKMeans step | |
60 Scalar DoKMeansStepAtNode(const Node *node, int k, int *candidates, Scalar *centers, | |
61 Scalar *sums, int *counts, int *assignment) const; | |
62 bool ShouldBePruned(Scalar *box_median, Scalar *box_radius, Scalar *centers, int best_index, | |
63 int test_index) const; | |
64 | |
65 // Helper functions for SeedKMeansPlusPlus | |
66 void SeedKmppSetClusterIndex(const Node *node, int index) const; | |
67 Scalar SeedKmppUpdateAssignment(const Node *node, int new_cluster, Scalar *centers, | |
68 Scalar *dist_sq) const; | |
69 | |
70 int n_, d_; | |
71 Scalar *points_; | |
72 Node *top_node_; | |
73 char *node_data_; | |
74 int *point_indices_; | |
75 }; | |
76 | |
77 #endif |