[MATH-1031] Added new ClusterEvaluator base class and refactored code in MultiKMeansPlusPlusClusterer.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@1542545 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
65646ba8bd
commit
3a45bc5b6d
|
@ -51,6 +51,11 @@ If the output is not quite correct, check for invisible trailing spaces!
|
||||||
</properties>
|
</properties>
|
||||||
<body>
|
<body>
|
||||||
<release version="3.3" date="TBD" description="TBD">
|
<release version="3.3" date="TBD" description="TBD">
|
||||||
|
<action dev="tn" type="update" issue="MATH-1031" due-to="Thorsten Schäfer">
|
||||||
|
Added new class "ClusterEvaluator" to evaluate the result of a clustering algorithm
|
||||||
|
and refactored existing evaluation code in "MultiKMeansPlusPlusClusterer"
|
||||||
|
into separate class "SumOfClusterVariances".
|
||||||
|
</action>
|
||||||
<action dev="psteitz" type="add" issue="MATH-1061">
|
<action dev="psteitz" type="add" issue="MATH-1061">
|
||||||
Added InsufficientDataException.
|
Added InsufficientDataException.
|
||||||
</action>
|
</action>
|
||||||
|
@ -96,7 +101,7 @@ If the output is not quite correct, check for invisible trailing spaces!
|
||||||
Added logDensity methods to AbstractReal/IntegerDistribution with naive default
|
Added logDensity methods to AbstractReal/IntegerDistribution with naive default
|
||||||
implementations and improved implementations for some current distributions.
|
implementations and improved implementations for some current distributions.
|
||||||
</action>
|
</action>
|
||||||
<action dev="psteitz" type="add" issue="MATH-1038" due-to="Thorsten Schaefer">
|
<action dev="psteitz" type="add" issue="MATH-1038" due-to="Thorsten Schäfer">
|
||||||
Added ConfidenceInterval class and BinomialConfidenceInterval providing several
|
Added ConfidenceInterval class and BinomialConfidenceInterval providing several
|
||||||
estimators for confidence intervals for binomial probabilities.
|
estimators for confidence intervals for binomial probabilities.
|
||||||
</action>
|
</action>
|
||||||
|
@ -127,7 +132,7 @@ If the output is not quite correct, check for invisible trailing spaces!
|
||||||
Fix a typo in the test class of "GeometricDistribution" and ensure that a meaningful
|
Fix a typo in the test class of "GeometricDistribution" and ensure that a meaningful
|
||||||
tolerance value is used when comparing test results with expected values.
|
tolerance value is used when comparing test results with expected values.
|
||||||
</action>
|
</action>
|
||||||
<action dev="psteitz" type="add" issue="MATH-1034" due-to="Thorsten Schaefer">
|
<action dev="psteitz" type="add" issue="MATH-1034" due-to="Thorsten Schäfer">
|
||||||
Added exact binomial test implementation.
|
Added exact binomial test implementation.
|
||||||
</action>
|
</action>
|
||||||
<action dev="tn" type="add" issue="MATH-1018" due-to="Ajo Fod">
|
<action dev="tn" type="add" issue="MATH-1018" due-to="Ajo Fod">
|
||||||
|
|
|
@ -22,7 +22,8 @@ import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.math3.exception.ConvergenceException;
|
import org.apache.commons.math3.exception.ConvergenceException;
|
||||||
import org.apache.commons.math3.exception.MathIllegalArgumentException;
|
import org.apache.commons.math3.exception.MathIllegalArgumentException;
|
||||||
import org.apache.commons.math3.stat.descriptive.moment.Variance;
|
import org.apache.commons.math3.ml.clustering.evaluation.ClusterEvaluator;
|
||||||
|
import org.apache.commons.math3.ml.clustering.evaluation.SumOfClusterVariances;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A wrapper around a k-means++ clustering algorithm which performs multiple trials
|
* A wrapper around a k-means++ clustering algorithm which performs multiple trials
|
||||||
|
@ -39,15 +40,31 @@ public class MultiKMeansPlusPlusClusterer<T extends Clusterable> extends Cluster
|
||||||
/** The number of trial runs. */
|
/** The number of trial runs. */
|
||||||
private final int numTrials;
|
private final int numTrials;
|
||||||
|
|
||||||
|
/** The cluster evaluator to use. */
|
||||||
|
private final ClusterEvaluator<T> evaluator;
|
||||||
|
|
||||||
/** Build a clusterer.
|
/** Build a clusterer.
|
||||||
* @param clusterer the k-means clusterer to use
|
* @param clusterer the k-means clusterer to use
|
||||||
* @param numTrials number of trial runs
|
* @param numTrials number of trial runs
|
||||||
*/
|
*/
|
||||||
public MultiKMeansPlusPlusClusterer(final KMeansPlusPlusClusterer<T> clusterer,
|
public MultiKMeansPlusPlusClusterer(final KMeansPlusPlusClusterer<T> clusterer,
|
||||||
final int numTrials) {
|
final int numTrials) {
|
||||||
|
this(clusterer, numTrials, new SumOfClusterVariances<T>(clusterer.getDistanceMeasure()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Build a clusterer.
|
||||||
|
* @param clusterer the k-means clusterer to use
|
||||||
|
* @param numTrials number of trial runs
|
||||||
|
* @param evaluator the cluster evaluator to use
|
||||||
|
* @since 3.3
|
||||||
|
*/
|
||||||
|
public MultiKMeansPlusPlusClusterer(final KMeansPlusPlusClusterer<T> clusterer,
|
||||||
|
final int numTrials,
|
||||||
|
final ClusterEvaluator<T> evaluator) {
|
||||||
super(clusterer.getDistanceMeasure());
|
super(clusterer.getDistanceMeasure());
|
||||||
this.clusterer = clusterer;
|
this.clusterer = clusterer;
|
||||||
this.numTrials = numTrials;
|
this.numTrials = numTrials;
|
||||||
|
this.evaluator = evaluator;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -66,6 +83,15 @@ public class MultiKMeansPlusPlusClusterer<T extends Clusterable> extends Cluster
|
||||||
return numTrials;
|
return numTrials;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the {@link ClusterEvaluator} used to determine the "best" clustering.
|
||||||
|
* @return the used {@link ClusterEvaluator}
|
||||||
|
* @since 3.3
|
||||||
|
*/
|
||||||
|
public ClusterEvaluator<T> getClusterEvaluator() {
|
||||||
|
return evaluator;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Runs the K-means++ clustering algorithm.
|
* Runs the K-means++ clustering algorithm.
|
||||||
*
|
*
|
||||||
|
@ -92,22 +118,9 @@ public class MultiKMeansPlusPlusClusterer<T extends Clusterable> extends Cluster
|
||||||
List<CentroidCluster<T>> clusters = clusterer.cluster(points);
|
List<CentroidCluster<T>> clusters = clusterer.cluster(points);
|
||||||
|
|
||||||
// compute the variance of the current list
|
// compute the variance of the current list
|
||||||
double varianceSum = 0.0;
|
final double varianceSum = evaluator.score(clusters);
|
||||||
for (final CentroidCluster<T> cluster : clusters) {
|
|
||||||
if (!cluster.getPoints().isEmpty()) {
|
|
||||||
|
|
||||||
// compute the distance variance of the current cluster
|
if (evaluator.isBetterScore(varianceSum, bestVarianceSum)) {
|
||||||
final Clusterable center = cluster.getCenter();
|
|
||||||
final Variance stat = new Variance();
|
|
||||||
for (final T point : cluster.getPoints()) {
|
|
||||||
stat.increment(distance(point, center));
|
|
||||||
}
|
|
||||||
varianceSum += stat.getResult();
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (varianceSum <= bestVarianceSum) {
|
|
||||||
// this one is the best we have found so far, remember it
|
// this one is the best we have found so far, remember it
|
||||||
best = clusters;
|
best = clusters;
|
||||||
bestVarianceSum = varianceSum;
|
bestVarianceSum = varianceSum;
|
||||||
|
|
|
@ -0,0 +1,123 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.commons.math3.ml.clustering.evaluation;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.math3.ml.clustering.CentroidCluster;
|
||||||
|
import org.apache.commons.math3.ml.clustering.Cluster;
|
||||||
|
import org.apache.commons.math3.ml.clustering.Clusterable;
|
||||||
|
import org.apache.commons.math3.ml.clustering.DoublePoint;
|
||||||
|
import org.apache.commons.math3.ml.distance.DistanceMeasure;
|
||||||
|
import org.apache.commons.math3.ml.distance.EuclideanDistance;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Base class for cluster evaluation methods.
|
||||||
|
*
|
||||||
|
* @param <T> type of the clustered points
|
||||||
|
* @version $Id$
|
||||||
|
* @since 3.3
|
||||||
|
*/
|
||||||
|
public abstract class ClusterEvaluator<T extends Clusterable> {
|
||||||
|
|
||||||
|
/** The distance measure to use when evaluating the cluster. */
|
||||||
|
private final DistanceMeasure measure;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new cluster evaluator with an {@link EuclideanDistance}
|
||||||
|
* as distance measure.
|
||||||
|
*/
|
||||||
|
public ClusterEvaluator() {
|
||||||
|
this(new EuclideanDistance());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new cluster evaluator with the given distance measure.
|
||||||
|
* @param measure the distance measure to use
|
||||||
|
*/
|
||||||
|
public ClusterEvaluator(final DistanceMeasure measure) {
|
||||||
|
this.measure = measure;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the evaluation score for the given list of clusters.
|
||||||
|
* @param clusters the clusters to evaluate
|
||||||
|
* @return the computed score
|
||||||
|
*/
|
||||||
|
public abstract double score(List<? extends Cluster<T>> clusters);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns whether the first evaluation score is considered to be better
|
||||||
|
* than the second one by this evaluator.
|
||||||
|
* <p>
|
||||||
|
* Specific implementations shall override this method if the returned scores
|
||||||
|
* do not follow the same ordering, i.e. smaller score is better.
|
||||||
|
*
|
||||||
|
* @param score1 the first score
|
||||||
|
* @param score2 the second score
|
||||||
|
* @return {@code true} if the first score is considered to be better, {@code false} otherwise
|
||||||
|
*/
|
||||||
|
public boolean isBetterScore(double score1, double score2) {
|
||||||
|
return score1 < score2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the distance between two {@link Clusterable} instances
|
||||||
|
* with the configured {@link DistanceMeasure}.
|
||||||
|
*
|
||||||
|
* @param p1 the first clusterable
|
||||||
|
* @param p2 the second clusterable
|
||||||
|
* @return the distance between the two clusterables
|
||||||
|
*/
|
||||||
|
protected double distance(final Clusterable p1, final Clusterable p2) {
|
||||||
|
return measure.compute(p1.getPoint(), p2.getPoint());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the centroid for a cluster.
|
||||||
|
*
|
||||||
|
* @param cluster the cluster
|
||||||
|
* @return the computed centroid for the cluster,
|
||||||
|
* or {@code null} if the cluster does not contain any points
|
||||||
|
*/
|
||||||
|
protected Clusterable centroidOf(final Cluster<T> cluster) {
|
||||||
|
final List<T> points = cluster.getPoints();
|
||||||
|
if (points.isEmpty()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// in case the cluster is of type CentroidCluster, no need to compute the centroid
|
||||||
|
if (cluster instanceof CentroidCluster) {
|
||||||
|
return ((CentroidCluster<T>) cluster).getCenter();
|
||||||
|
}
|
||||||
|
|
||||||
|
final int dimension = points.get(0).getPoint().length;
|
||||||
|
final double[] centroid = new double[dimension];
|
||||||
|
for (final T p : points) {
|
||||||
|
final double[] point = p.getPoint();
|
||||||
|
for (int i = 0; i < centroid.length; i++) {
|
||||||
|
centroid[i] += point[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < centroid.length; i++) {
|
||||||
|
centroid[i] /= points.size();
|
||||||
|
}
|
||||||
|
return new DoublePoint(centroid);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,69 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.commons.math3.ml.clustering.evaluation;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.math3.ml.clustering.Cluster;
|
||||||
|
import org.apache.commons.math3.ml.clustering.Clusterable;
|
||||||
|
import org.apache.commons.math3.ml.distance.DistanceMeasure;
|
||||||
|
import org.apache.commons.math3.stat.descriptive.moment.Variance;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes the sum of intra-cluster distance variances according to the formula:
|
||||||
|
* <pre>
|
||||||
|
* \( score = \sum\limits_{i=1}^n \sigma_i^2 \)
|
||||||
|
* </pre>
|
||||||
|
* where n is the number of clusters and \( \sigma_i^2 \) is the variance of
|
||||||
|
* intra-cluster distances of cluster \( c_i \).
|
||||||
|
*
|
||||||
|
* @param <T> the type of the clustered points
|
||||||
|
* @version $Id$
|
||||||
|
* @since 3.3
|
||||||
|
*/
|
||||||
|
public class SumOfClusterVariances<T extends Clusterable> extends ClusterEvaluator<T> {
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param measure the distance measure to use
|
||||||
|
*/
|
||||||
|
public SumOfClusterVariances(final DistanceMeasure measure) {
|
||||||
|
super(measure);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double score(final List<? extends Cluster<T>> clusters) {
|
||||||
|
double varianceSum = 0.0;
|
||||||
|
for (final Cluster<T> cluster : clusters) {
|
||||||
|
if (!cluster.getPoints().isEmpty()) {
|
||||||
|
|
||||||
|
final Clusterable center = centroidOf(cluster);
|
||||||
|
|
||||||
|
// compute the distance variance of the current cluster
|
||||||
|
final Variance stat = new Variance();
|
||||||
|
for (final T point : cluster.getPoints()) {
|
||||||
|
stat.increment(distance(point, center));
|
||||||
|
}
|
||||||
|
varianceSum += stat.getResult();
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return varianceSum;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,20 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
/**
|
||||||
|
* Cluster evaluation methods.
|
||||||
|
*/
|
||||||
|
package org.apache.commons.math3.ml.clustering.evaluation;
|
|
@ -0,0 +1,80 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.commons.math3.ml.clustering.evaluation;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.math3.ml.clustering.Cluster;
|
||||||
|
import org.apache.commons.math3.ml.clustering.DoublePoint;
|
||||||
|
import org.apache.commons.math3.ml.distance.EuclideanDistance;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class SumOfClusterVariancesTest {
|
||||||
|
|
||||||
|
private ClusterEvaluator<DoublePoint> evaluator;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp() {
|
||||||
|
evaluator = new SumOfClusterVariances<DoublePoint>(new EuclideanDistance());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testScore() {
|
||||||
|
final DoublePoint[] points1 = new DoublePoint[] {
|
||||||
|
new DoublePoint(new double[] { 1 }),
|
||||||
|
new DoublePoint(new double[] { 2 }),
|
||||||
|
new DoublePoint(new double[] { 3 })
|
||||||
|
};
|
||||||
|
|
||||||
|
final DoublePoint[] points2 = new DoublePoint[] {
|
||||||
|
new DoublePoint(new double[] { 1 }),
|
||||||
|
new DoublePoint(new double[] { 5 }),
|
||||||
|
new DoublePoint(new double[] { 10 })
|
||||||
|
};
|
||||||
|
|
||||||
|
final List<Cluster<DoublePoint>> clusters = new ArrayList<Cluster<DoublePoint>>();
|
||||||
|
|
||||||
|
final Cluster<DoublePoint> cluster1 = new Cluster<DoublePoint>();
|
||||||
|
for (DoublePoint p : points1) {
|
||||||
|
cluster1.addPoint(p);
|
||||||
|
}
|
||||||
|
clusters.add(cluster1);
|
||||||
|
|
||||||
|
assertEquals(1.0/3.0, evaluator.score(clusters), 1e-6);
|
||||||
|
|
||||||
|
final Cluster<DoublePoint> cluster2 = new Cluster<DoublePoint>();
|
||||||
|
for (DoublePoint p : points2) {
|
||||||
|
cluster2.addPoint(p);
|
||||||
|
}
|
||||||
|
clusters.add(cluster2);
|
||||||
|
|
||||||
|
assertEquals(6.148148148, evaluator.score(clusters), 1e-6);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOrdering() {
|
||||||
|
assertTrue(evaluator.isBetterScore(10, 20));
|
||||||
|
assertFalse(evaluator.isBetterScore(20, 1));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue