mirror of
https://github.com/apache/commons-math.git
synced 2025-02-09 19:45:52 +00:00
Added a consistency check for number of points with respect to the number of clusters in Kmeans++ clustering
JIRA: MATH-436 git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@1084577 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
862103b3a8
commit
c83a4bc0c4
@ -23,8 +23,12 @@ import java.util.List;
|
|||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
|
||||||
import org.apache.commons.math.exception.ConvergenceException;
|
import org.apache.commons.math.exception.ConvergenceException;
|
||||||
|
import org.apache.commons.math.exception.MathIllegalArgumentException;
|
||||||
|
import org.apache.commons.math.exception.NullArgumentException;
|
||||||
|
import org.apache.commons.math.exception.NumberIsTooSmallException;
|
||||||
import org.apache.commons.math.exception.util.LocalizedFormats;
|
import org.apache.commons.math.exception.util.LocalizedFormats;
|
||||||
import org.apache.commons.math.stat.descriptive.moment.Variance;
|
import org.apache.commons.math.stat.descriptive.moment.Variance;
|
||||||
|
import org.apache.commons.math.util.MathUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clustering algorithm based on David Arthur and Sergei Vassilvitskii k-means++ algorithm.
|
* Clustering algorithm based on David Arthur and Sergei Vassilvitskii k-means++ algorithm.
|
||||||
@ -88,9 +92,21 @@ public class KMeansPlusPlusClusterer<T extends Clusterable<T>> {
|
|||||||
* @param maxIterations the maximum number of iterations to run the algorithm
|
* @param maxIterations the maximum number of iterations to run the algorithm
|
||||||
* for. If negative, no maximum will be used
|
* for. If negative, no maximum will be used
|
||||||
* @return a list of clusters containing the points
|
* @return a list of clusters containing the points
|
||||||
|
* @throws MathIllegalArgumentException if the data points are null or the number
|
||||||
|
* of clusters is larger than the number of data points
|
||||||
*/
|
*/
|
||||||
public List<Cluster<T>> cluster(final Collection<T> points,
|
public List<Cluster<T>> cluster(final Collection<T> points, final int k,
|
||||||
final int k, final int maxIterations) {
|
final int maxIterations)
|
||||||
|
throws MathIllegalArgumentException {
|
||||||
|
|
||||||
|
// sanity checks
|
||||||
|
MathUtils.checkNotNull(points);
|
||||||
|
|
||||||
|
// number of clusters has to be smaller than or equal to the number of data points
|
||||||
|
if (points.size() < k) {
|
||||||
|
throw new NumberIsTooSmallException(points.size(), k, false);
|
||||||
|
}
|
||||||
|
|
||||||
// create the initial clusters
|
// create the initial clusters
|
||||||
List<Cluster<T>> clusters = chooseInitialCenters(points, k, random);
|
List<Cluster<T>> clusters = chooseInitialCenters(points, k, random);
|
||||||
assignPointsToClusters(clusters, points);
|
assignPointsToClusters(clusters, points);
|
||||||
|
@ -52,6 +52,10 @@ The <action> type attribute can be add,update,fix,remove.
|
|||||||
If the output is not quite correct, check for invisible trailing spaces!
|
If the output is not quite correct, check for invisible trailing spaces!
|
||||||
-->
|
-->
|
||||||
<release version="3.0" date="TBD" description="TBD">
|
<release version="3.0" date="TBD" description="TBD">
|
||||||
|
<action dev="luc" type="add" issue="MATH-436" due-to="Thomas Neidhart">
|
||||||
|
Added a consistency check for number of points with respect to the number
|
||||||
|
of clusters in Kmeans++ clustering
|
||||||
|
</action>
|
||||||
<action dev="mikl" type="add" issue="MATH-437">
|
<action dev="mikl" type="add" issue="MATH-437">
|
||||||
Added two sided Kolmogorov-Smirnov distribution using modified
|
Added two sided Kolmogorov-Smirnov distribution using modified
|
||||||
Marsaglia et al. (2003) implementation and quick decisions for certain
|
Marsaglia et al. (2003) implementation and quick decisions for certain
|
||||||
|
@ -24,6 +24,7 @@ import java.util.Collection;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
|
||||||
|
import org.apache.commons.math.exception.NumberIsTooSmallException;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
@ -246,4 +247,26 @@ public class KMeansPlusPlusClustererTest {
|
|||||||
}
|
}
|
||||||
Assert.assertTrue(uniquePointIsCenter);
|
Assert.assertTrue(uniquePointIsCenter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 2 points cannot be clustered into 3 clusters. See issue MATH-436.
|
||||||
|
*/
|
||||||
|
@Test(expected=NumberIsTooSmallException.class)
|
||||||
|
public void testPerformClusterAnalysisToManyClusters() {
|
||||||
|
KMeansPlusPlusClusterer<EuclideanIntegerPoint> transformer =
|
||||||
|
new KMeansPlusPlusClusterer<EuclideanIntegerPoint>(
|
||||||
|
new Random(1746432956321l));
|
||||||
|
|
||||||
|
EuclideanIntegerPoint[] points = new EuclideanIntegerPoint[] {
|
||||||
|
new EuclideanIntegerPoint(new int[] {
|
||||||
|
1959, 325100
|
||||||
|
}), new EuclideanIntegerPoint(new int[] {
|
||||||
|
1960, 373200
|
||||||
|
})
|
||||||
|
};
|
||||||
|
|
||||||
|
transformer.cluster(Arrays.asList(points), 3, 1);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user