Added a consistency check for number of points with respect to the number of clusters in Kmeans++ clustering

JIRA: MATH-436 git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@1084577 13f79535-47bb-0310-9956-ffa450edef68
2025-02-09 19:45:52 +00:00 · 2011-03-23 13:19:23 +00:00 · 2011-03-23 13:19:23 +00:00 · c83a4bc0c4
commit c83a4bc0c4
parent 862103b3a8
3 changed files with 45 additions and 2 deletions
--- a/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java
+++ b/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java
@ -23,8 +23,12 @@ import java.util.List;
 import java.util.Random;
 import org.apache.commons.math.exception.ConvergenceException;
 import org.apache.commons.math.exception.MathIllegalArgumentException;
 import org.apache.commons.math.exception.NullArgumentException;
 import org.apache.commons.math.exception.NumberIsTooSmallException;
 import org.apache.commons.math.exception.util.LocalizedFormats;
 import org.apache.commons.math.stat.descriptive.moment.Variance;
 import org.apache.commons.math.util.MathUtils;
 /**
 * Clustering algorithm based on David Arthur and Sergei Vassilvitski k-means++ algorithm.
@ -88,9 +92,21 @@ public class KMeansPlusPlusClusterer<T extends Clusterable<T>> {
     * @param maxIterations the maximum number of iterations to run the algorithm
     *     for.  If negative, no maximum will be used
     * @return a list of clusters containing the points
     * @throws MathIllegalArgumentException if the data points are null or the number
     *     of clusters is larger than the number of data points
     */
-    public List<Cluster<T>> cluster(final Collection<T> points,
+    public List<Cluster<T>> cluster(final Collection<T> points, final int k,
-                                    final int k, final int maxIterations) {
+                                    final int maxIterations)
        throws MathIllegalArgumentException {
        // sanity checks
        MathUtils.checkNotNull(points);
        // number of clusters has to be smaller than or equal to the number of data points
        if (points.size() < k) {
            throw new NumberIsTooSmallException(points.size(), k, false);
        }
        // create the initial clusters
        List<Cluster<T>> clusters = chooseInitialCenters(points, k, random);
        assignPointsToClusters(clusters, points);
--- a/src/site/xdoc/changes.xml
+++ b/src/site/xdoc/changes.xml
@ -52,6 +52,10 @@ The <action> type attribute can be add,update,fix,remove.
    If the output is not quite correct, check for invisible trailing spaces!
     -->
    <release version="3.0" date="TBD" description="TBD">
      <action dev="luc" type="add" issue="MATH-436" due-to="Thomas Neidhart">
        Added a consistency check for number of points with respect to the number
        of clusters in Kmeans++ clustering
      </action>
      <action dev="mikl" type="add" issue="MATH-437">
        Added two sided Kolmogorov-Smirnov distribution using modified 
        Marsaglia et al. (2003) implementation and quick decisions for certain 
--- a/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java
+++ b/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java
@ -24,6 +24,7 @@ import java.util.Collection;
 import java.util.List;
 import java.util.Random;
 import org.apache.commons.math.exception.NumberIsTooSmallException;
 import org.junit.Assert;
 import org.junit.Test;
@ -246,4 +247,26 @@ public class KMeansPlusPlusClustererTest {
        }
        Assert.assertTrue(uniquePointIsCenter);
    }
    /**
     * Requesting more clusters than there are data points must be rejected:
     * 2 points cannot be clustered into 3 clusters. See issue MATH-436.
     */
    @Test(expected=NumberIsTooSmallException.class)
    public void testPerformClusterAnalysisToManyClusters() {
        // fixed seed keeps the clusterer's random source reproducible
        final Random seededRandom = new Random(1746432956321L);
        final KMeansPlusPlusClusterer<EuclideanIntegerPoint> clusterer =
            new KMeansPlusPlusClusterer<EuclideanIntegerPoint>(seededRandom);

        // only two data points are supplied...
        final List<EuclideanIntegerPoint> twoPoints = Arrays.asList(
            new EuclideanIntegerPoint(new int[] { 1959, 325100 }),
            new EuclideanIntegerPoint(new int[] { 1960, 373200 }));

        // ...but three clusters are requested, so the expected exception is thrown here
        clusterer.cluster(twoPoints, 3, 1);
    }
 }