added multiple trials runs to K-means++ clustering algorithm.

JIRA: MATH-548 git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@1137759 13f79535-47bb-0310-9956-ffa450edef68
2011-06-20 19:48:44 +00:00 · 2011-06-20 19:48:44 +00:00 · ff92629a3b
parent 234db70446
commit ff92629a3b
3 changed files with 58 additions and 1 deletions
--- a/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java
+++ b/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java
@ -84,6 +84,60 @@ public class KMeansPlusPlusClusterer<T extends Clusterable<T>> {
        this.emptyStrategy = emptyStrategy;
    }

+    /**
+     * Runs the K-means++ clustering algorithm.
+     *
+     * @param points the points to cluster
+     * @param k the number of clusters to split the data into
+     * @param maxIterations the maximum number of iterations to run the algorithm
+     *     for.  If negative, no maximum will be used
+     * @return a list of clusters containing the points
+     * @throws MathIllegalArgumentException if the data points are null or the number
+     *     of clusters is larger than the number of data points
+     */
+    public List<Cluster<T>> cluster(final Collection<T> points, final int k,
+                                    int numTrials, int maxIterationsPerTrial)
+        throws MathIllegalArgumentException {
+
+        // at first, we have not found any clusters list yet
+        List<Cluster<T>> best = null;
+        double bestVarianceSum = Double.POSITIVE_INFINITY;
+
+        // do several clustering trials
+        for (int i = 0; i < numTrials; ++i) {
+
+            // compute a clusters list
+            List<Cluster<T>> clusters = cluster(points, k, maxIterationsPerTrial);
+
+            // compute the variance of the current list
+            double varianceSum = 0.0;
+            for (final Cluster<T> cluster : clusters) {
+                if (!cluster.getPoints().isEmpty()) {
+
+                    // compute the distance variance of the current cluster
+                    final T center = cluster.getCenter();
+                    final Variance stat = new Variance();
+                    for (final T point : cluster.getPoints()) {
+                        stat.increment(point.distanceFrom(center));
+                    }
+                    varianceSum += stat.getResult();
+
+                }
+            }
+
+            if (varianceSum <= bestVarianceSum) {
+                // this one is the best we have found so far, remember it
+                best            = clusters;
+                bestVarianceSum = varianceSum;
+            }
+
+        }
+
+        // return the best clusters list found
+        return best;
+
+    }
+
    /**
     * Runs the K-means++ clustering algorithm.
     *
--- a/src/site/xdoc/changes.xml
+++ b/src/site/xdoc/changes.xml
@ -52,6 +52,9 @@ The <action> type attribute can be add,update,fix,remove.
    If the output is not quite correct, check for invisible trailing spaces!
     -->
    <release version="3.0" date="TBD" description="TBD">
+      <action dev="luc" type="add" issue="MATH-548">
+        K-means++ clustering can now run multiple trials
+      </action>
      <action dev="luc" type="add" issue="MATH-591">
        Added a way to compute sub-lines intersections, considering sub-lines either
        as open sets or closed sets
--- a/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java
+++ b/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java
@ -65,7 +65,7 @@ public class KMeansPlusPlusClustererTest {

        };
        List<Cluster<EuclideanIntegerPoint>> clusters =
-            transformer.cluster(Arrays.asList(points), 3, 10);
+            transformer.cluster(Arrays.asList(points), 3, 5, 10);

        Assert.assertEquals(3, clusters.size());
        boolean cluster1Found = false;