MATH-1371: Elkan's enhancement to "K-Means" algorithm.

The original implementation comes from an old PR provided by Artem Barger.
It was updated and changed to pass the code style checks.

I did not review how this algorithm departs from the other
implementations available in the same package, and whether
some other design could minimize duplicate code (but it
would have been a pity to drop a potential performance
improvement).
A thorough examination should be considered, when tackling
the refactoring of the "clustering" package.

Closes #35.
This commit is contained in:
Gilles Sadowski 2022-01-24 19:05:14 +01:00
parent 565b896873
commit 74a851b611
2 changed files with 546 additions and 0 deletions

View File

@ -0,0 +1,409 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math4.legacy.ml.clustering;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.commons.rng.UniformRandomProvider;
import org.apache.commons.math4.legacy.exception.NumberIsTooSmallException;
import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure;
import org.apache.commons.math4.legacy.stat.descriptive.moment.VectorialMean;
/**
 * Implementation of k-means++ algorithm.
 * It is based on
 * <blockquote>
 * Elkan, Charles.
 * "Using the triangle inequality to accelerate k-means."
 * ICML. Vol. 3. 2003.
 * </blockquote>
 *
 * <p>
 * Algorithm uses triangle inequality to speed up computation, by reducing
 * the number of distance calculations. Towards the last iterations of
 * the algorithm, points which are already assigned to some cluster are
 * unlikely to move to a new cluster; updates of cluster centers are also
 * usually relatively small.
 * Triangle inequality is thus used to determine the cases where distance
 * computation could be skipped since centers move only a little, without
 * affecting points partitioning.
 *
 * <p>
 * For initial centers seeding, we apply the algorithm described in
 * <blockquote>
 * Arthur, David, and Sergei Vassilvitskii.
 * "k-means++: The advantages of careful seeding."
 * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
 * Society for Industrial and Applied Mathematics, 2007.
 * </blockquote>
 *
 * @param <T> Type of the points to cluster.
 */
public class ElkanKMeansPlusPlusClusterer<T extends Clusterable>
    extends KMeansPlusPlusClusterer<T> {
    /**
     * @param k Clustering parameter.
     */
    public ElkanKMeansPlusPlusClusterer(int k) {
        super(k);
    }

    /**
     * @param k Clustering parameter.
     * @param maxIterations Allowed number of iterations.
     * @param measure Distance measure.
     * @param random Random generator.
     */
    public ElkanKMeansPlusPlusClusterer(int k,
                                        int maxIterations,
                                        DistanceMeasure measure,
                                        UniformRandomProvider random) {
        super(k, maxIterations, measure, random);
    }

    /**
     * @param k Clustering parameter.
     * @param maxIterations Allowed number of iterations.
     * @param measure Distance measure.
     * @param random Random generator.
     * @param emptyStrategy Strategy for handling empty clusters that
     * may appear during algorithm progress.
     */
    public ElkanKMeansPlusPlusClusterer(int k,
                                        int maxIterations,
                                        DistanceMeasure measure,
                                        UniformRandomProvider random,
                                        EmptyClusterStrategy emptyStrategy) {
        super(k, maxIterations, measure, random, emptyStrategy);
    }

    /** {@inheritDoc} */
    @Override
    public List<CentroidCluster<T>> cluster(final Collection<T> points) {
        final int k = getNumberOfClusters();

        // Number of clusters has to be smaller or equal the number of data points.
        if (points.size() < k) {
            throw new NumberIsTooSmallException(points.size(), k, false);
        }

        final List<T> pointsList = new ArrayList<>(points);
        final int n = points.size();
        final int dim = pointsList.get(0).getPoint().length;

        // Minimum intra-cluster distance: for a given cluster c, s[c] is half
        // the distance to the closest other center, s[c] = 1/2 * min_{c'!=c} dist(c', c).
        final double[] s = new double[k];
        Arrays.fill(s, Double.MAX_VALUE);
        // Matrix of half-distances between all cluster centers,
        // dcc[c1][c2] = 1/2 * distance(c1, c2).
        final double[][] dcc = new double[k][k];

        // For each point, the upper bound of the distance to its assigned cluster center.
        final double[] u = new double[n];
        Arrays.fill(u, Double.MAX_VALUE);
        // For each point and each cluster, the lower bound of the distance
        // between the point and the cluster center.
        final double[][] l = new double[n][k];

        // Seed initial set of cluster centers.
        final double[][] centers = seed(pointsList);

        // Points partitioning induced by cluster centers: partitions[xi] is the
        // index of the cluster center which is closest to xi,
        // partitions[xi] = argmin_{c} distance(xi, c).
        final int[] partitions = partitionPoints(pointsList, centers, u, l);

        // Distance moved by each cluster center during the last update (step IV).
        final double[] deltas = new double[k];
        // Incremental means used to recompute the cluster centers.
        final VectorialMean[] means = new VectorialMean[k];
        for (int it = 0, changes = 0, max = getMaxIterations();
             it < max;
             it++, changes = 0) {
            // Step I.
            // Compute inter-cluster distances.
            updateIntraCentersDistances(centers, dcc, s);

            for (int xi = 0; xi < n; xi++) {
                // Whether the exact distance to the assigned center still
                // needs to be (re)computed for this point.
                boolean r = true;

                // Step II: if the upper bound does not exceed half the distance
                // to the closest other center, the assignment cannot change.
                if (u[xi] <= s[partitions[xi]]) {
                    continue;
                }

                for (int c = 0; c < k; c++) {
                    // Check condition III.
                    if (isSkipNext(partitions, u, l, dcc, xi, c)) {
                        continue;
                    }

                    final double[] x = pointsList.get(xi).getPoint();

                    // III(a): tighten the upper bound to the exact distance
                    // (only once per point and iteration).
                    if (r) {
                        u[xi] = distance(x, centers[partitions[xi]]);
                        l[xi][partitions[xi]] = u[xi];
                        r = false;
                    }

                    // III(b): candidate center "c" may be closer; compute the
                    // exact distance and reassign if it is.
                    if (u[xi] > l[xi][c] || u[xi] > dcc[partitions[xi]][c]) {
                        l[xi][c] = distance(x, centers[c]);
                        if (l[xi][c] < u[xi]) {
                            partitions[xi] = c;
                            u[xi] = l[xi][c];
                            ++changes;
                        }
                    }
                }
            }

            // Stopping criterion.
            if (changes == 0 &&
                it != 0) { // First iteration needed (to update bounds).
                break;
            }

            // Step IV: recompute each center as the mean of its assigned points.
            Arrays.fill(means, null);
            for (int i = 0; i < n; i++) {
                if (means[partitions[i]] == null) {
                    means[partitions[i]] = new VectorialMean(dim);
                }
                means[partitions[i]].increment(pointsList.get(i).getPoint());
            }

            for (int i = 0; i < k; i++) {
                if (means[i] == null) {
                    // No point is currently assigned to cluster "i": keep its
                    // previous center unchanged (zero movement) instead of
                    // raising a NullPointerException.
                    deltas[i] = 0;
                } else {
                    deltas[i] = distance(centers[i], means[i].getResult());
                    centers[i] = means[i].getResult();
                }
            }

            updateBounds(partitions, u, l, deltas);
        }

        return buildResults(pointsList, partitions, centers);
    }

    /**
     * kmeans++ seeding which provides guarantee of resulting with log(k) approximation
     * for final clustering results
     * <p>
     * Arthur, David, and Sergei Vassilvitskii. "k-means++: The advantages of careful seeding."
     * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms.
     * Society for Industrial and Applied Mathematics, 2007.
     *
     * @param points input data points
     * @return an array of initial clusters centers
     */
    private double[][] seed(final List<T> points) {
        final int k = getNumberOfClusters();
        final UniformRandomProvider random = getRandomGenerator();

        final double[][] result = new double[k][];
        final int n = points.size();
        final int pointIndex = random.nextInt(n);

        final double[] minDistances = new double[n];

        int idx = 0;
        result[idx] = points.get(pointIndex).getPoint();

        double sumSqDist = 0;

        for (int i = 0; i < n; i++) {
            final double d = distance(result[idx], points.get(i).getPoint());
            minDistances[i] = d * d;
            sumSqDist += minDistances[i];
        }

        while (++idx < k) {
            // Sample the next center with probability proportional to the
            // squared distance to the closest already-chosen center.
            final double p = sumSqDist * random.nextDouble();

            int next = 0;
            for (double cdf = 0; cdf < p; next++) {
                cdf += minDistances[next];
            }

            result[idx] = points.get(next - 1).getPoint();
            for (int i = 0; i < n; i++) {
                final double d = distance(result[idx], points.get(i).getPoint());
                sumSqDist -= minDistances[i];
                minDistances[i] = Math.min(minDistances[i], d * d);
                sumSqDist += minDistances[i];
            }
        }

        return result;
    }

    /**
     * Once initial centers are chosen, we can actually go through data points and assign points to the
     * cluster based on the distance between initial centers and points.
     *
     * @param pointsList data points list
     * @param centers current clusters centers
     * @param u points upper bounds
     * @param l lower bounds for points to clusters centers
     *
     * @return initial assignment of points into clusters
     */
    private int[] partitionPoints(List<T> pointsList,
                                  double[][] centers,
                                  double[] u,
                                  double[][] l) {
        final int k = getNumberOfClusters();
        final int n = pointsList.size();
        // Points assignments vector.
        final int[] assignments = new int[n];
        Arrays.fill(assignments, -1);
        // Need to assign points to the clusters for the first time and initialize the lower bound l(x, c).
        for (int i = 0; i < n; i++) {
            final double[] x = pointsList.get(i).getPoint();
            for (int j = 0; j < k; j++) {
                l[i][j] = distance(x, centers[j]); // l(x, c) = d(x, c)
                if (u[i] > l[i][j]) {
                    u[i] = l[i][j]; // u(x) = min_c d(x, c)
                    assignments[i] = j; // c(x) = argmin_c d(x, c)
                }
            }
        }
        return assignments;
    }

    /**
     * Updates distances between clusters centers and for each cluster
     * picks the closest neighbour and keeps distance to it.
     *
     * @param centers cluster centers
     * @param dcc matrix of half-distances between clusters centers, e.g.
     * {@code dcc[i][j] = 0.5 * distance(centers[i], centers[j])}
     * @param s For a given cluster, {@code s[si]} holds distance value
     * to the closest cluster center.
     */
    private void updateIntraCentersDistances(double[][] centers,
                                             double[][] dcc,
                                             double[] s) {
        final int k = getNumberOfClusters();
        for (int i = 0; i < k; i++) {
            // Since distance(xi, xj) == distance(xj, xi), we need to update
            // only upper or lower triangle of the distances matrix and mirror
            // to the lower or upper triangle accordingly, trace has to be all
            // zeros, since distance(xi, xi) == 0.
            for (int j = i + 1; j < k; j++) {
                dcc[i][j] = 0.5 * distance(centers[i], centers[j]);
                dcc[j][i] = dcc[i][j];
                if (dcc[i][j] < s[i]) {
                    s[i] = dcc[i][j];
                }
                if (dcc[j][i] < s[j]) {
                    s[j] = dcc[j][i];
                }
            }
        }
    }

    /**
     * For given point and cluster, check condition (3) of Elkan algorithm.
     *
     * <ul>
     *  <li>c is not the cluster xi assigned to</li>
     *  <li>{@code u[xi] > l[xi][c]} upper bound for point xi is greater than
     *  lower bound between xi and some cluster c</li>
     *  <li>{@code u[xi] > 1/2 * d(c(xi), c)} upper bound is greater than
     *  half the distance between center of xi's cluster and c</li>
     * </ul>
     *
     * @param partitions current partition of points into clusters
     * @param u upper bounds for points
     * @param l lower bounds for distance between cluster centers and points
     * @param dcc matrix of half-distances between clusters centers
     * @param xi index of the point
     * @param c index of the cluster
     * @return true if conditions above satisfied false otherwise
     */
    private boolean isSkipNext(int[] partitions,
                               double[] u,
                               double[][] l,
                               double[][] dcc,
                               int xi,
                               int c) {
        return (c == partitions[xi]) ||
               (u[xi] <= l[xi][c]) ||
               (u[xi] <= dcc[partitions[xi]][c]);
    }

    /**
     * Once kmeans iterations have been converged and no more movements, we can build up the final
     * resulted list of cluster centroids ({@link CentroidCluster}) and assign input points based
     * on the converged partitioning.
     *
     * @param pointsList list of data points
     * @param partitions current partition of points into clusters
     * @param centers cluster centers
     * @return cluster partitioning
     */
    private List<CentroidCluster<T>> buildResults(List<T> pointsList,
                                                  int[] partitions,
                                                  double[][] centers) {
        final int k = getNumberOfClusters();
        final List<CentroidCluster<T>> result = new ArrayList<>();
        for (int i = 0; i < k; i++) {
            final CentroidCluster<T> cluster = new CentroidCluster<>(new DoublePoint(centers[i]));
            result.add(cluster);
        }
        for (int i = 0; i < pointsList.size(); i++) {
            result.get(partitions[i]).addPoint(pointsList.get(i));
        }
        return result;
    }

    /**
     * Based on the distance that each cluster center has moved we need to update our upper and lower bounds.
     * Worst case assumption: the center of the assigned cluster moves away from the point, while
     * centers of other clusters move closer.
     *
     * @param partitions current points assignments to the clusters
     * @param u points upper bounds
     * @param l lower bounds for distances between point and corresponding cluster
     * @param deltas the movement delta for each cluster center
     */
    private void updateBounds(int[] partitions,
                              double[] u,
                              double[][] l,
                              double[] deltas) {
        final int k = getNumberOfClusters();
        for (int i = 0; i < partitions.length; i++) {
            u[i] += deltas[partitions[i]];
            for (int j = 0; j < k; j++) {
                l[i][j] = Math.max(0, l[i][j] - deltas[j]);
            }
        }
    }

    /**
     * @param a Coordinates.
     * @param b Coordinates.
     * @return the distance between {@code a} and {@code b}.
     */
    private double distance(final double[] a,
                            final double[] b) {
        return getDistanceMeasure().compute(a, b);
    }
}

View File

@ -0,0 +1,137 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math4.legacy.ml.clustering;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
import org.apache.commons.rng.simple.RandomSource;
import org.apache.commons.rng.sampling.shape.BoxSampler;
import org.apache.commons.math4.legacy.exception.NotStrictlyPositiveException;
import org.apache.commons.math4.legacy.exception.NumberIsTooSmallException;
import org.apache.commons.math4.legacy.stat.descriptive.moment.VectorialMean;
import org.apache.commons.math4.legacy.core.MathArrays;
/**
 * Tests for {@link ElkanKMeansPlusPlusClusterer}.
 */
public class ElkanKMeansPlusPlusClustererTest {
    @Test
    public void validateOneDimensionSingleClusterZeroMean() {
        final List<DoublePoint> testPoints = Arrays.asList(new DoublePoint(new double[]{1}),
                                                           new DoublePoint(new double[]{2}),
                                                           new DoublePoint(new double[]{-3}));
        final ElkanKMeansPlusPlusClusterer<DoublePoint> clusterer = new ElkanKMeansPlusPlusClusterer<>(1);
        final List<CentroidCluster<DoublePoint>> clusters = clusterer.cluster(testPoints);
        Assert.assertEquals(1, clusters.size());
        // Mean of {1, 2, -3} is 0.
        Assert.assertTrue(MathArrays.equals(new double[]{0}, clusters.get(0).getCenter().getPoint()));
    }

    @Test(expected = NumberIsTooSmallException.class)
    public void illegalKParameter() {
        final int n = 20;
        final int d = 3;
        final int k = 100; // More clusters than data points.

        final List<DoublePoint> testPoints = generatePoints(n, d);
        final ElkanKMeansPlusPlusClusterer<DoublePoint> clusterer = new ElkanKMeansPlusPlusClusterer<>(k);

        clusterer.cluster(testPoints);
    }

    @Test
    public void numberOfClustersSameAsInputSize() {
        final int n = 3;
        final int d = 2;
        final int k = 3;

        final List<DoublePoint> testPoints = generatePoints(n, d);
        final ElkanKMeansPlusPlusClusterer<DoublePoint> clusterer = new ElkanKMeansPlusPlusClusterer<>(k);

        final List<CentroidCluster<DoublePoint>> clusters = clusterer.cluster(testPoints);
        Assert.assertEquals(k, clusters.size());
        // Each point must end up in its own cluster.
        Assert.assertEquals(1, clusters.get(0).getPoints().size());
        Assert.assertEquals(1, clusters.get(1).getPoints().size());
        Assert.assertEquals(1, clusters.get(2).getPoints().size());
    }

    @Test(expected = NullPointerException.class)
    public void illegalInputParameter() {
        final ElkanKMeansPlusPlusClusterer<DoublePoint> clusterer = new ElkanKMeansPlusPlusClusterer<>(10);
        clusterer.cluster(null);
    }

    @Test(expected = NumberIsTooSmallException.class)
    public void emptyInputPointsList() {
        final ElkanKMeansPlusPlusClusterer<DoublePoint> clusterer = new ElkanKMeansPlusPlusClusterer<>(10);
        clusterer.cluster(Collections.<DoublePoint>emptyList());
    }

    @Test(expected = NotStrictlyPositiveException.class)
    public void negativeKParameterValue() {
        new ElkanKMeansPlusPlusClusterer<>(-1);
    }

    @Test(expected = NotStrictlyPositiveException.class)
    public void kParameterEqualsZero() {
        new ElkanKMeansPlusPlusClusterer<>(0);
    }

    @Test
    public void oneClusterCenterShouldBeTheMean() {
        final int n = 100;
        final int d = 2;

        final List<DoublePoint> testPoints = generatePoints(n, d);
        // Exercise the class under test (previously this instantiated the
        // base "KMeansPlusPlusClusterer" by mistake).
        final ElkanKMeansPlusPlusClusterer<DoublePoint> clusterer = new ElkanKMeansPlusPlusClusterer<>(1);

        final List<CentroidCluster<DoublePoint>> clusters = clusterer.cluster(testPoints);

        final VectorialMean mean = new VectorialMean(d);
        for (DoublePoint each : testPoints) {
            mean.increment(each.getPoint());
        }

        Assert.assertEquals(1, clusters.size());
        Assert.assertArrayEquals(mean.getResult(), clusters.get(0).getCenter().getPoint(), 1e-6);
    }

    /**
     * Generates a list of random uncorrelated points to cluster.
     *
     * @param n number of points
     * @param d dimensionality
     * @return list of n generated random vectors of dimension d.
     */
    private static List<DoublePoint> generatePoints(int n, int d) {
        final List<DoublePoint> results = new ArrayList<>();
        final double[] lower = new double[d];
        final double[] upper = new double[d];
        Arrays.fill(upper, 1);
        final BoxSampler rnd = BoxSampler.of(RandomSource.KISS.create(),
                                             lower,
                                             upper);

        for (int i = 0; i < n; i++) {
            results.add(new DoublePoint(rnd.sample()));
        }

        return results;
    }
}