From c83a4bc0c40e0aa4ee3607e779b96091bf3ba2f5 Mon Sep 17 00:00:00 2001 From: Luc Maisonobe Date: Wed, 23 Mar 2011 13:19:23 +0000 Subject: [PATCH] Added a consistency check for number of points with respect to the number of clusters in Kmeans++ clustering JIRA: MATH-436 git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@1084577 13f79535-47bb-0310-9956-ffa450edef68 --- .../clustering/KMeansPlusPlusClusterer.java | 20 ++++++++++++++-- src/site/xdoc/changes.xml | 4 ++++ .../KMeansPlusPlusClustererTest.java | 23 +++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java b/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java index e09bbc357..db304b45a 100644 --- a/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java +++ b/src/main/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClusterer.java @@ -23,8 +23,12 @@ import java.util.List; import java.util.Random; import org.apache.commons.math.exception.ConvergenceException; +import org.apache.commons.math.exception.MathIllegalArgumentException; +import org.apache.commons.math.exception.NullArgumentException; +import org.apache.commons.math.exception.NumberIsTooSmallException; import org.apache.commons.math.exception.util.LocalizedFormats; import org.apache.commons.math.stat.descriptive.moment.Variance; +import org.apache.commons.math.util.MathUtils; /** * Clustering algorithm based on David Arthur and Sergei Vassilvitski k-means++ algorithm. @@ -88,9 +92,21 @@ public class KMeansPlusPlusClusterer> { * @param maxIterations the maximum number of iterations to run the algorithm * for. 
If negative, no maximum will be used * @return a list of clusters containing the points + * @throws MathIllegalArgumentException if the data points are null or the number + * of clusters is larger than the number of data points */ - public List> cluster(final Collection points, - final int k, final int maxIterations) { + public List> cluster(final Collection points, final int k, + final int maxIterations) + throws MathIllegalArgumentException { + + // sanity checks + MathUtils.checkNotNull(points); + + // number of clusters has to be smaller than or equal to the number of data points + if (points.size() < k) { + throw new NumberIsTooSmallException(points.size(), k, false); + } + // create the initial clusters List> clusters = chooseInitialCenters(points, k, random); assignPointsToClusters(clusters, points); diff --git a/src/site/xdoc/changes.xml b/src/site/xdoc/changes.xml index f9e19b50a..c02efd495 100644 --- a/src/site/xdoc/changes.xml +++ b/src/site/xdoc/changes.xml @@ -52,6 +52,10 @@ The type attribute can be add,update,fix,remove. If the output is not quite correct, check for invisible trailing spaces! --> + + Added a consistency check for number of points with respect to the number + of clusters in Kmeans++ clustering + Added two sided Kolmogorov-Smirnov distribution using modified Marsaglia et al. 
(2003) implementation and quick decisions for certain diff --git a/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java b/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java index 1a4402610..f7c9b646d 100644 --- a/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java +++ b/src/test/java/org/apache/commons/math/stat/clustering/KMeansPlusPlusClustererTest.java @@ -24,6 +24,7 @@ import java.util.Collection; import java.util.List; import java.util.Random; +import org.apache.commons.math.exception.NumberIsTooSmallException; import org.junit.Assert; import org.junit.Test; @@ -246,4 +247,26 @@ public class KMeansPlusPlusClustererTest { } Assert.assertTrue(uniquePointIsCenter); } + + /** + * 2 points cannot be clustered into 3 clusters. See issue MATH-436. + */ + @Test(expected=NumberIsTooSmallException.class) + public void testPerformClusterAnalysisToManyClusters() { + KMeansPlusPlusClusterer transformer = + new KMeansPlusPlusClusterer( + new Random(1746432956321l)); + + EuclideanIntegerPoint[] points = new EuclideanIntegerPoint[] { + new EuclideanIntegerPoint(new int[] { + 1959, 325100 + }), new EuclideanIntegerPoint(new int[] { + 1960, 373200 + }) + }; + + transformer.cluster(Arrays.asList(points), 3, 1); + + } + }