From 2c94388179fd38bc95e3b47af96666493027f57d Mon Sep 17 00:00:00 2001 From: tn Date: Thu, 19 Feb 2015 09:25:46 +0100 Subject: [PATCH] Remove deprecated stat.clustering package. --- findbugs-exclude-filter.xml | 20 +- .../math4/stat/clustering/Cluster.java | 76 --- .../math4/stat/clustering/Clusterable.java | 48 -- .../stat/clustering/DBSCANClusterer.java | 226 -------- .../stat/clustering/EuclideanDoublePoint.java | 100 ---- .../clustering/EuclideanIntegerPoint.java | 101 ---- .../clustering/KMeansPlusPlusClusterer.java | 514 ------------------ .../math4/stat/clustering/package-info.java | 29 - .../stat/clustering/DBSCANClustererTest.java | 195 ------- .../clustering/EuclideanDoublePointTest.java | 64 --- .../clustering/EuclideanIntegerPointTest.java | 66 --- .../KMeansPlusPlusClustererTest.java | 277 ---------- 12 files changed, 2 insertions(+), 1714 deletions(-) delete mode 100644 src/main/java/org/apache/commons/math4/stat/clustering/Cluster.java delete mode 100644 src/main/java/org/apache/commons/math4/stat/clustering/Clusterable.java delete mode 100644 src/main/java/org/apache/commons/math4/stat/clustering/DBSCANClusterer.java delete mode 100644 src/main/java/org/apache/commons/math4/stat/clustering/EuclideanDoublePoint.java delete mode 100644 src/main/java/org/apache/commons/math4/stat/clustering/EuclideanIntegerPoint.java delete mode 100644 src/main/java/org/apache/commons/math4/stat/clustering/KMeansPlusPlusClusterer.java delete mode 100644 src/main/java/org/apache/commons/math4/stat/clustering/package-info.java delete mode 100644 src/test/java/org/apache/commons/math4/stat/clustering/DBSCANClustererTest.java delete mode 100644 src/test/java/org/apache/commons/math4/stat/clustering/EuclideanDoublePointTest.java delete mode 100644 src/test/java/org/apache/commons/math4/stat/clustering/EuclideanIntegerPointTest.java delete mode 100644 src/test/java/org/apache/commons/math4/stat/clustering/KMeansPlusPlusClustererTest.java diff --git a/findbugs-exclude-filter.xml b/findbugs-exclude-filter.xml index 6659ad5db..db03c2758 100644 --- a/findbugs-exclude-filter.xml +++ b/findbugs-exclude-filter.xml @@ -312,28 +312,12 @@ - - - - - - - - - - - - - - + - - - - + diff --git a/src/main/java/org/apache/commons/math4/stat/clustering/Cluster.java b/src/main/java/org/apache/commons/math4/stat/clustering/Cluster.java deleted file mode 100644 index 3fbc11ba0..000000000 --- a/src/main/java/org/apache/commons/math4/stat/clustering/Cluster.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.math4.stat.clustering; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; - -/** - * Cluster holding a set of {@link Clusterable} points. - * @param the type of points that can be clustered - * @since 2.0 - * @deprecated As of 3.2 (to be removed in 4.0), - * use {@link org.apache.commons.math4.ml.clustering.Cluster} instead - */ -@Deprecated -public class Cluster> implements Serializable { - - /** Serializable version identifier. */ - private static final long serialVersionUID = -3442297081515880464L; - - /** The points contained in this cluster. */ - private final List points; - - /** Center of the cluster. */ - private final T center; - - /** - * Build a cluster centered at a specified point. - * @param center the point which is to be the center of this cluster - */ - public Cluster(final T center) { - this.center = center; - points = new ArrayList(); - } - - /** - * Add a point to this cluster. - * @param point point to add - */ - public void addPoint(final T point) { - points.add(point); - } - - /** - * Get the points contained in the cluster. - * @return points contained in the cluster - */ - public List getPoints() { - return points; - } - - /** - * Get the point chosen to be the center of this cluster. - * @return chosen cluster center - */ - public T getCenter() { - return center; - } - -} diff --git a/src/main/java/org/apache/commons/math4/stat/clustering/Clusterable.java b/src/main/java/org/apache/commons/math4/stat/clustering/Clusterable.java deleted file mode 100644 index f9f75b4a5..000000000 --- a/src/main/java/org/apache/commons/math4/stat/clustering/Clusterable.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.math4.stat.clustering; - -import java.util.Collection; - -/** - * Interface for points that can be clustered together. - * @param the type of point that can be clustered - * @since 2.0 - * @deprecated As of 3.2 (to be removed in 4.0), - * use {@link org.apache.commons.math4.ml.clustering.Clusterable} instead - */ -@Deprecated -public interface Clusterable { - - /** - * Returns the distance from the given point. - * - * @param p the point to compute the distance from - * @return the distance from the given point - */ - double distanceFrom(T p); - - /** - * Returns the centroid of the given Collection of points. - * - * @param p the Collection of points to compute the centroid of - * @return the centroid of the given Collection of Points - */ - T centroidOf(Collection p); - -} diff --git a/src/main/java/org/apache/commons/math4/stat/clustering/DBSCANClusterer.java b/src/main/java/org/apache/commons/math4/stat/clustering/DBSCANClusterer.java deleted file mode 100644 index 0122f4831..000000000 --- a/src/main/java/org/apache/commons/math4/stat/clustering/DBSCANClusterer.java +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.math4.stat.clustering; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.commons.math4.exception.NotPositiveException; -import org.apache.commons.math4.exception.NullArgumentException; -import org.apache.commons.math4.util.MathUtils; - -/** - * DBSCAN (density-based spatial clustering of applications with noise) algorithm. - *

- * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e. - * a point p is density connected to another point q, if there exists a chain of - * points pi, with i = 1 .. n and p1 = p and pn = q, - * such that each pair <pi, pi+1> is directly density-reachable. - * A point q is directly density-reachable from point p if it is in the ε-neighborhood - * of this point. - *

- * Any point that is not density-reachable from a formed cluster is treated as noise, and - * will thus not be present in the result. - *

- * The algorithm requires two parameters: - *

    - *
  • eps: the distance that defines the ε-neighborhood of a point - *
  • minPoints: the minimum number of density-connected points required to form a cluster - *
- *

- * Note: as DBSCAN is not a centroid-based clustering algorithm, the resulting - * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will - * return {@code null}. - * - * @param type of the points to cluster - * @see DBSCAN (wikipedia) - * @see - * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise - * @since 3.1 - * @deprecated As of 3.2 (to be removed in 4.0), - * use {@link org.apache.commons.math4.ml.clustering.DBSCANClusterer} instead - */ -@Deprecated -public class DBSCANClusterer> { - - /** Maximum radius of the neighborhood to be considered. */ - private final double eps; - - /** Minimum number of points needed for a cluster. */ - private final int minPts; - - /** Status of a point during the clustering process. */ - private enum PointStatus { - /** The point has is considered to be noise. */ - NOISE, - /** The point is already part of a cluster. */ - PART_OF_CLUSTER - } - - /** - * Creates a new instance of a DBSCANClusterer. - * - * @param eps maximum radius of the neighborhood to be considered - * @param minPts minimum number of points needed for a cluster - * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} - */ - public DBSCANClusterer(final double eps, final int minPts) - throws NotPositiveException { - if (eps < 0.0d) { - throw new NotPositiveException(eps); - } - if (minPts < 0) { - throw new NotPositiveException(minPts); - } - this.eps = eps; - this.minPts = minPts; - } - - /** - * Returns the maximum radius of the neighborhood to be considered. - * - * @return maximum radius of the neighborhood - */ - public double getEps() { - return eps; - } - - /** - * Returns the minimum number of points needed for a cluster. - * - * @return minimum number of points needed for a cluster - */ - public int getMinPts() { - return minPts; - } - - /** - * Performs DBSCAN cluster analysis. - *

- * Note: as DBSCAN is not a centroid-based clustering algorithm, the resulting - * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will - * return {@code null}. - * - * @param points the points to cluster - * @return the list of clusters - * @throws NullArgumentException if the data points are null - */ - public List> cluster(final Collection points) throws NullArgumentException { - - // sanity checks - MathUtils.checkNotNull(points); - - final List> clusters = new ArrayList>(); - final Map, PointStatus> visited = new HashMap, PointStatus>(); - - for (final T point : points) { - if (visited.get(point) != null) { - continue; - } - final List neighbors = getNeighbors(point, points); - if (neighbors.size() >= minPts) { - // DBSCAN does not care about center points - final Cluster cluster = new Cluster(null); - clusters.add(expandCluster(cluster, point, neighbors, points, visited)); - } else { - visited.put(point, PointStatus.NOISE); - } - } - - return clusters; - } - - /** - * Expands the cluster to include density-reachable items. - * - * @param cluster Cluster to expand - * @param point Point to add to cluster - * @param neighbors List of neighbors - * @param points the data set - * @param visited the set of already visited points - * @return the expanded cluster - */ - private Cluster expandCluster(final Cluster cluster, - final T point, - final List neighbors, - final Collection points, - final Map, PointStatus> visited) { - cluster.addPoint(point); - visited.put(point, PointStatus.PART_OF_CLUSTER); - - List seeds = new ArrayList(neighbors); - int index = 0; - while (index < seeds.size()) { - final T current = seeds.get(index); - PointStatus pStatus = visited.get(current); - // only check non-visited points - if (pStatus == null) { - final List currentNeighbors = getNeighbors(current, points); - if (currentNeighbors.size() >= minPts) { - seeds = merge(seeds, currentNeighbors); - } - } - - if (pStatus != PointStatus.PART_OF_CLUSTER) { - visited.put(current, PointStatus.PART_OF_CLUSTER); - cluster.addPoint(current); - } - - index++; - } - return cluster; - } - - /** - * Returns a list of density-reachable neighbors of a {@code point}. - * - * @param point the point to look for - * @param points possible neighbors - * @return the List of neighbors - */ - private List getNeighbors(final T point, final Collection points) { - final List neighbors = new ArrayList(); - for (final T neighbor : points) { - if (point != neighbor && neighbor.distanceFrom(point) <= eps) { - neighbors.add(neighbor); - } - } - return neighbors; - } - - /** - * Merges two lists together. - * - * @param one first list - * @param two second list - * @return merged lists - */ - private List merge(final List one, final List two) { - final Set oneSet = new HashSet(one); - for (T item : two) { - if (!oneSet.contains(item)) { - one.add(item); - } - } - return one; - } -} diff --git a/src/main/java/org/apache/commons/math4/stat/clustering/EuclideanDoublePoint.java b/src/main/java/org/apache/commons/math4/stat/clustering/EuclideanDoublePoint.java deleted file mode 100644 index 912b01d9b..000000000 --- a/src/main/java/org/apache/commons/math4/stat/clustering/EuclideanDoublePoint.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.math4.stat.clustering; - -import java.io.Serializable; -import java.util.Collection; -import java.util.Arrays; - -import org.apache.commons.math4.util.MathArrays; - -/** - * A simple implementation of {@link Clusterable} for points with double coordinates. - * @since 3.1 - * @deprecated As of 3.2 (to be removed in 4.0), - * use {@link org.apache.commons.math4.ml.clustering.DoublePoint} instead - */ -@Deprecated -public class EuclideanDoublePoint implements Clusterable, Serializable { - - /** Serializable version identifier. */ - private static final long serialVersionUID = 8026472786091227632L; - - /** Point coordinates. */ - private final double[] point; - - /** - * Build an instance wrapping an integer array. - *

- * The wrapped array is referenced, it is not copied. - * - * @param point the n-dimensional point in integer space - */ - public EuclideanDoublePoint(final double[] point) { - this.point = point; - } - - /** {@inheritDoc} */ - public EuclideanDoublePoint centroidOf(final Collection points) { - final double[] centroid = new double[getPoint().length]; - for (final EuclideanDoublePoint p : points) { - for (int i = 0; i < centroid.length; i++) { - centroid[i] += p.getPoint()[i]; - } - } - for (int i = 0; i < centroid.length; i++) { - centroid[i] /= points.size(); - } - return new EuclideanDoublePoint(centroid); - } - - /** {@inheritDoc} */ - public double distanceFrom(final EuclideanDoublePoint p) { - return MathArrays.distance(point, p.getPoint()); - } - - /** {@inheritDoc} */ - @Override - public boolean equals(final Object other) { - if (!(other instanceof EuclideanDoublePoint)) { - return false; - } - return Arrays.equals(point, ((EuclideanDoublePoint) other).point); - } - - /** - * Get the n-dimensional point in integer space. - * - * @return a reference (not a copy!) to the wrapped array - */ - public double[] getPoint() { - return point; - } - - /** {@inheritDoc} */ - @Override - public int hashCode() { - return Arrays.hashCode(point); - } - - /** {@inheritDoc} */ - @Override - public String toString() { - return Arrays.toString(point); - } - -} diff --git a/src/main/java/org/apache/commons/math4/stat/clustering/EuclideanIntegerPoint.java b/src/main/java/org/apache/commons/math4/stat/clustering/EuclideanIntegerPoint.java deleted file mode 100644 index bbc707e14..000000000 --- a/src/main/java/org/apache/commons/math4/stat/clustering/EuclideanIntegerPoint.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.math4.stat.clustering; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.Collection; - -import org.apache.commons.math4.util.MathArrays; - -/** - * A simple implementation of {@link Clusterable} for points with integer coordinates. - * @since 2.0 - * @deprecated As of 3.2 (to be removed in 4.0), - * use {@link org.apache.commons.math4.ml.clustering.DoublePoint} instead - */ -@Deprecated -public class EuclideanIntegerPoint implements Clusterable, Serializable { - - /** Serializable version identifier. */ - private static final long serialVersionUID = 3946024775784901369L; - - /** Point coordinates. */ - private final int[] point; - - /** - * Build an instance wrapping an integer array. - *

The wrapped array is referenced, it is not copied.

- * @param point the n-dimensional point in integer space - */ - public EuclideanIntegerPoint(final int[] point) { - this.point = point; - } - - /** - * Get the n-dimensional point in integer space. - * @return a reference (not a copy!) to the wrapped array - */ - public int[] getPoint() { - return point; - } - - /** {@inheritDoc} */ - public double distanceFrom(final EuclideanIntegerPoint p) { - return MathArrays.distance(point, p.getPoint()); - } - - /** {@inheritDoc} */ - public EuclideanIntegerPoint centroidOf(final Collection points) { - int[] centroid = new int[getPoint().length]; - for (EuclideanIntegerPoint p : points) { - for (int i = 0; i < centroid.length; i++) { - centroid[i] += p.getPoint()[i]; - } - } - for (int i = 0; i < centroid.length; i++) { - centroid[i] /= points.size(); - } - return new EuclideanIntegerPoint(centroid); - } - - /** {@inheritDoc} */ - @Override - public boolean equals(final Object other) { - if (!(other instanceof EuclideanIntegerPoint)) { - return false; - } - return Arrays.equals(point, ((EuclideanIntegerPoint) other).point); - } - - /** {@inheritDoc} */ - @Override - public int hashCode() { - return Arrays.hashCode(point); - } - - /** - * {@inheritDoc} - * @since 2.1 - */ - @Override - public String toString() { - return Arrays.toString(point); - } - -} diff --git a/src/main/java/org/apache/commons/math4/stat/clustering/KMeansPlusPlusClusterer.java b/src/main/java/org/apache/commons/math4/stat/clustering/KMeansPlusPlusClusterer.java deleted file mode 100644 index 6c93ed5d3..000000000 --- a/src/main/java/org/apache/commons/math4/stat/clustering/KMeansPlusPlusClusterer.java +++ /dev/null @@ -1,514 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.math4.stat.clustering; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Random; - -import org.apache.commons.math4.exception.ConvergenceException; -import org.apache.commons.math4.exception.MathIllegalArgumentException; -import org.apache.commons.math4.exception.NumberIsTooSmallException; -import org.apache.commons.math4.exception.util.LocalizedFormats; -import org.apache.commons.math4.stat.descriptive.moment.Variance; -import org.apache.commons.math4.util.MathUtils; - -/** - * Clustering algorithm based on David Arthur and Sergei Vassilvitski k-means++ algorithm. - * @param type of the points to cluster - * @see K-means++ (wikipedia) - * @since 2.0 - * @deprecated As of 3.2 (to be removed in 4.0), - * use {@link org.apache.commons.math4.ml.clustering.KMeansPlusPlusClusterer} instead - */ -@Deprecated -public class KMeansPlusPlusClusterer> { - - /** Strategies to use for replacing an empty cluster. */ - public static enum EmptyClusterStrategy { - - /** Split the cluster with largest distance variance. */ - LARGEST_VARIANCE, - - /** Split the cluster with largest number of points. */ - LARGEST_POINTS_NUMBER, - - /** Create a cluster around the point farthest from its centroid. */ - FARTHEST_POINT, - - /** Generate an error. */ - ERROR - - } - - /** Random generator for choosing initial centers. */ - private final Random random; - - /** Selected strategy for empty clusters. */ - private final EmptyClusterStrategy emptyStrategy; - - /** Build a clusterer. - *

- * The default strategy for handling empty clusters that may appear during - * algorithm iterations is to split the cluster with largest distance variance. - *

- * @param random random generator to use for choosing initial centers - */ - public KMeansPlusPlusClusterer(final Random random) { - this(random, EmptyClusterStrategy.LARGEST_VARIANCE); - } - - /** Build a clusterer. - * @param random random generator to use for choosing initial centers - * @param emptyStrategy strategy to use for handling empty clusters that - * may appear during algorithm iterations - * @since 2.2 - */ - public KMeansPlusPlusClusterer(final Random random, final EmptyClusterStrategy emptyStrategy) { - this.random = random; - this.emptyStrategy = emptyStrategy; - } - - /** - * Runs the K-means++ clustering algorithm. - * - * @param points the points to cluster - * @param k the number of clusters to split the data into - * @param numTrials number of trial runs - * @param maxIterationsPerTrial the maximum number of iterations to run the algorithm - * for at each trial run. If negative, no maximum will be used - * @return a list of clusters containing the points - * @throws MathIllegalArgumentException if the data points are null or the number - * of clusters is larger than the number of data points - * @throws ConvergenceException if an empty cluster is encountered and the - * {@link #emptyStrategy} is set to {@code ERROR} - */ - public List> cluster(final Collection points, final int k, - int numTrials, int maxIterationsPerTrial) - throws MathIllegalArgumentException, ConvergenceException { - - // at first, we have not found any clusters list yet - List> best = null; - double bestVarianceSum = Double.POSITIVE_INFINITY; - - // do several clustering trials - for (int i = 0; i < numTrials; ++i) { - - // compute a clusters list - List> clusters = cluster(points, k, maxIterationsPerTrial); - - // compute the variance of the current list - double varianceSum = 0.0; - for (final Cluster cluster : clusters) { - if (!cluster.getPoints().isEmpty()) { - - // compute the distance variance of the current cluster - final T center = cluster.getCenter(); - final Variance stat = new Variance(); - for (final T point : cluster.getPoints()) { - stat.increment(point.distanceFrom(center)); - } - varianceSum += stat.getResult(); - - } - } - - if (varianceSum <= bestVarianceSum) { - // this one is the best we have found so far, remember it - best = clusters; - bestVarianceSum = varianceSum; - } - - } - - // return the best clusters list found - return best; - - } - - /** - * Runs the K-means++ clustering algorithm. - * - * @param points the points to cluster - * @param k the number of clusters to split the data into - * @param maxIterations the maximum number of iterations to run the algorithm - * for. If negative, no maximum will be used - * @return a list of clusters containing the points - * @throws MathIllegalArgumentException if the data points are null or the number - * of clusters is larger than the number of data points - * @throws ConvergenceException if an empty cluster is encountered and the - * {@link #emptyStrategy} is set to {@code ERROR} - */ - public List> cluster(final Collection points, final int k, - final int maxIterations) - throws MathIllegalArgumentException, ConvergenceException { - - // sanity checks - MathUtils.checkNotNull(points); - - // number of clusters has to be smaller or equal the number of data points - if (points.size() < k) { - throw new NumberIsTooSmallException(points.size(), k, false); - } - - // create the initial clusters - List> clusters = chooseInitialCenters(points, k, random); - - // create an array containing the latest assignment of a point to a cluster - // no need to initialize the array, as it will be filled with the first assignment - int[] assignments = new int[points.size()]; - assignPointsToClusters(clusters, points, assignments); - - // iterate through updating the centers until we're done - final int max = (maxIterations < 0) ? Integer.MAX_VALUE : maxIterations; - for (int count = 0; count < max; count++) { - boolean emptyCluster = false; - List> newClusters = new ArrayList>(); - for (final Cluster cluster : clusters) { - final T newCenter; - if (cluster.getPoints().isEmpty()) { - switch (emptyStrategy) { - case LARGEST_VARIANCE : - newCenter = getPointFromLargestVarianceCluster(clusters); - break; - case LARGEST_POINTS_NUMBER : - newCenter = getPointFromLargestNumberCluster(clusters); - break; - case FARTHEST_POINT : - newCenter = getFarthestPoint(clusters); - break; - default : - throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); - } - emptyCluster = true; - } else { - newCenter = cluster.getCenter().centroidOf(cluster.getPoints()); - } - newClusters.add(new Cluster(newCenter)); - } - int changes = assignPointsToClusters(newClusters, points, assignments); - clusters = newClusters; - - // if there were no more changes in the point-to-cluster assignment - // and there are no empty clusters left, return the current clusters - if (changes == 0 && !emptyCluster) { - return clusters; - } - } - return clusters; - } - - /** - * Adds the given points to the closest {@link Cluster}. - * - * @param type of the points to cluster - * @param clusters the {@link Cluster}s to add the points to - * @param points the points to add to the given {@link Cluster}s - * @param assignments points assignments to clusters - * @return the number of points assigned to different clusters as the iteration before - */ - private static > int - assignPointsToClusters(final List> clusters, final Collection points, - final int[] assignments) { - int assignedDifferently = 0; - int pointIndex = 0; - for (final T p : points) { - int clusterIndex = getNearestCluster(clusters, p); - if (clusterIndex != assignments[pointIndex]) { - assignedDifferently++; - } - - Cluster cluster = clusters.get(clusterIndex); - cluster.addPoint(p); - assignments[pointIndex++] = clusterIndex; - } - - return assignedDifferently; - } - - /** - * Use K-means++ to choose the initial centers. - * - * @param type of the points to cluster - * @param points the points to choose the initial centers from - * @param k the number of centers to choose - * @param random random generator to use - * @return the initial centers - */ - private static > List> - chooseInitialCenters(final Collection points, final int k, final Random random) { - - // Convert to list for indexed access. Make it unmodifiable, since removal of items - // would screw up the logic of this method. - final List pointList = Collections.unmodifiableList(new ArrayList (points)); - - // The number of points in the list. - final int numPoints = pointList.size(); - - // Set the corresponding element in this array to indicate when - // elements of pointList are no longer available. - final boolean[] taken = new boolean[numPoints]; - - // The resulting list of initial centers. - final List> resultSet = new ArrayList>(); - - // Choose one center uniformly at random from among the data points. - final int firstPointIndex = random.nextInt(numPoints); - - final T firstPoint = pointList.get(firstPointIndex); - - resultSet.add(new Cluster(firstPoint)); - - // Must mark it as taken - taken[firstPointIndex] = true; - - // To keep track of the minimum distance squared of elements of - // pointList to elements of resultSet. - final double[] minDistSquared = new double[numPoints]; - - // Initialize the elements. Since the only point in resultSet is firstPoint, - // this is very easy. - for (int i = 0; i < numPoints; i++) { - if (i != firstPointIndex) { // That point isn't considered - double d = firstPoint.distanceFrom(pointList.get(i)); - minDistSquared[i] = d*d; - } - } - - while (resultSet.size() < k) { - - // Sum up the squared distances for the points in pointList not - // already taken. - double distSqSum = 0.0; - - for (int i = 0; i < numPoints; i++) { - if (!taken[i]) { - distSqSum += minDistSquared[i]; - } - } - - // Add one new data point as a center. Each point x is chosen with - // probability proportional to D(x)2 - final double r = random.nextDouble() * distSqSum; - - // The index of the next point to be added to the resultSet. - int nextPointIndex = -1; - - // Sum through the squared min distances again, stopping when - // sum >= r. - double sum = 0.0; - for (int i = 0; i < numPoints; i++) { - if (!taken[i]) { - sum += minDistSquared[i]; - if (sum >= r) { - nextPointIndex = i; - break; - } - } - } - - // If it's not set to >= 0, the point wasn't found in the previous - // for loop, probably because distances are extremely small. Just pick - // the last available point. - if (nextPointIndex == -1) { - for (int i = numPoints - 1; i >= 0; i--) { - if (!taken[i]) { - nextPointIndex = i; - break; - } - } - } - - // We found one. - if (nextPointIndex >= 0) { - - final T p = pointList.get(nextPointIndex); - - resultSet.add(new Cluster (p)); - - // Mark it as taken. - taken[nextPointIndex] = true; - - if (resultSet.size() < k) { - // Now update elements of minDistSquared. We only have to compute - // the distance to the new center to do this. - for (int j = 0; j < numPoints; j++) { - // Only have to worry about the points still not taken. - if (!taken[j]) { - double d = p.distanceFrom(pointList.get(j)); - double d2 = d * d; - if (d2 < minDistSquared[j]) { - minDistSquared[j] = d2; - } - } - } - } - - } else { - // None found -- - // Break from the while loop to prevent - // an infinite loop. - break; - } - } - - return resultSet; - } - - /** - * Get a random point from the {@link Cluster} with the largest distance variance. - * - * @param clusters the {@link Cluster}s to search - * @return a random point from the selected cluster - * @throws ConvergenceException if clusters are all empty - */ - private T getPointFromLargestVarianceCluster(final Collection> clusters) - throws ConvergenceException { - - double maxVariance = Double.NEGATIVE_INFINITY; - Cluster selected = null; - for (final Cluster cluster : clusters) { - if (!cluster.getPoints().isEmpty()) { - - // compute the distance variance of the current cluster - final T center = cluster.getCenter(); - final Variance stat = new Variance(); - for (final T point : cluster.getPoints()) { - stat.increment(point.distanceFrom(center)); - } - final double variance = stat.getResult(); - - // select the cluster with the largest variance - if (variance > maxVariance) { - maxVariance = variance; - selected = cluster; - } - - } - } - - // did we find at least one non-empty cluster ? - if (selected == null) { - throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); - } - - // extract a random point from the cluster - final List selectedPoints = selected.getPoints(); - return selectedPoints.remove(random.nextInt(selectedPoints.size())); - - } - - /** - * Get a random point from the {@link Cluster} with the largest number of points - * - * @param clusters the {@link Cluster}s to search - * @return a random point from the selected cluster - * @throws ConvergenceException if clusters are all empty - */ - private T getPointFromLargestNumberCluster(final Collection> clusters) throws ConvergenceException { - - int maxNumber = 0; - Cluster selected = null; - for (final Cluster cluster : clusters) { - - // get the number of points of the current cluster - final int number = cluster.getPoints().size(); - - // select the cluster with the largest number of points - if (number > maxNumber) { - maxNumber = number; - selected = cluster; - } - - } - - // did we find at least one non-empty cluster ? - if (selected == null) { - throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); - } - - // extract a random point from the cluster - final List selectedPoints = selected.getPoints(); - return selectedPoints.remove(random.nextInt(selectedPoints.size())); - - } - - /** - * Get the point farthest to its cluster center - * - * @param clusters the {@link Cluster}s to search - * @return point farthest to its cluster center - * @throws ConvergenceException if clusters are all empty - */ - private T getFarthestPoint(final Collection> clusters) throws ConvergenceException { - - double maxDistance = Double.NEGATIVE_INFINITY; - Cluster selectedCluster = null; - int selectedPoint = -1; - for (final Cluster cluster : clusters) { - - // get the farthest point - final T center = cluster.getCenter(); - final List points = cluster.getPoints(); - for (int i = 0; i < points.size(); ++i) { - final double distance = points.get(i).distanceFrom(center); - if (distance > maxDistance) { - maxDistance = distance; - selectedCluster = cluster; - selectedPoint = i; - } - } - - } - - // did we find at least one non-empty cluster ? - if (selectedCluster == null) { - throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); - } - - return selectedCluster.getPoints().remove(selectedPoint); - - } - - /** - * Returns the nearest {@link Cluster} to the given point - * - * @param type of the points to cluster - * @param clusters the {@link Cluster}s to search - * @param point the point to find the nearest {@link Cluster} for - * @return the index of the nearest {@link Cluster} to the given point - */ - private static > int - getNearestCluster(final Collection> clusters, final T point) { - double minDistance = Double.MAX_VALUE; - int clusterIndex = 0; - int minCluster = 0; - for (final Cluster c : clusters) { - final double distance = point.distanceFrom(c.getCenter()); - if (distance < minDistance) { - minDistance = distance; - minCluster = clusterIndex; - } - clusterIndex++; - } - return minCluster; - } - -} diff --git a/src/main/java/org/apache/commons/math4/stat/clustering/package-info.java b/src/main/java/org/apache/commons/math4/stat/clustering/package-info.java deleted file mode 100644 index bf5970983..000000000 --- a/src/main/java/org/apache/commons/math4/stat/clustering/package-info.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - *

All classes and sub-packages of this package are deprecated.

- *

Please use their replacements, to be found under - *
    - *
  • {@link org.apache.commons.math4.ml.clustering}
  • - *
- *

- * - *

- * Clustering algorithms. - *

- */ -package org.apache.commons.math4.stat.clustering; diff --git a/src/test/java/org/apache/commons/math4/stat/clustering/DBSCANClustererTest.java b/src/test/java/org/apache/commons/math4/stat/clustering/DBSCANClustererTest.java deleted file mode 100644 index 0534a5b65..000000000 --- a/src/test/java/org/apache/commons/math4/stat/clustering/DBSCANClustererTest.java +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.math4.stat.clustering; - -import java.util.Arrays; -import java.util.List; - -import org.apache.commons.math4.exception.MathIllegalArgumentException; -import org.apache.commons.math4.exception.NullArgumentException; -import org.apache.commons.math4.stat.clustering.Cluster; -import org.apache.commons.math4.stat.clustering.DBSCANClusterer; -import org.apache.commons.math4.stat.clustering.EuclideanDoublePoint; -import org.apache.commons.math4.stat.clustering.EuclideanIntegerPoint; -import org.junit.Assert; -import org.junit.Test; - -@Deprecated -public class DBSCANClustererTest { - - @Test - public void testCluster() { - // Test data generated using: http://people.cs.nctu.edu.tw/~rsliang/dbscan/testdatagen.html - final EuclideanDoublePoint[] points = new EuclideanDoublePoint[] { - new EuclideanDoublePoint(new double[] { 83.08303244924173, 58.83387754182331 }), - new EuclideanDoublePoint(new double[] { 45.05445510940626, 23.469642649637535 }), - new EuclideanDoublePoint(new double[] { 14.96417921432294, 69.0264096390456 }), - new EuclideanDoublePoint(new double[] { 73.53189604333602, 34.896145021310076 }), - new EuclideanDoublePoint(new double[] { 73.28498173551634, 33.96860806993209 }), - new EuclideanDoublePoint(new double[] { 73.45828098873608, 33.92584423092194 }), - new EuclideanDoublePoint(new double[] { 73.9657889183145, 35.73191006924026 }), - new EuclideanDoublePoint(new double[] { 74.0074097183533, 36.81735596177168 }), - new EuclideanDoublePoint(new double[] { 73.41247541410848, 34.27314856695011 }), - new EuclideanDoublePoint(new double[] { 73.9156256353017, 36.83206791547127 }), - new EuclideanDoublePoint(new double[] { 74.81499205809087, 37.15682749846019 }), - new EuclideanDoublePoint(new double[] { 74.03144880081527, 37.57399178552441 }), - new EuclideanDoublePoint(new double[] { 74.51870941207744, 38.674258946906775 }), - new EuclideanDoublePoint(new double[] { 74.50754595105536, 35.58903978415765 }), - new EuclideanDoublePoint(new double[] { 74.51322752749547, 36.030572259100154 }), - new EuclideanDoublePoint(new double[] { 59.27900996617973, 46.41091720294207 }), - new EuclideanDoublePoint(new double[] { 59.73744793841615, 46.20015558367595 }), - new EuclideanDoublePoint(new double[] { 58.81134076672606, 45.71150126331486 }), - new EuclideanDoublePoint(new double[] { 58.52225539437495, 47.416083617601544 }), - new EuclideanDoublePoint(new double[] { 58.218626647023484, 47.36228902172297 }), - new EuclideanDoublePoint(new double[] { 60.27139669447206, 46.606106348801404 }), - new EuclideanDoublePoint(new double[] { 60.894962462363765, 46.976924697402865 }), - new EuclideanDoublePoint(new double[] { 62.29048673878424, 47.66970563563518 }), - new EuclideanDoublePoint(new double[] { 61.03857608977705, 46.212924720020965 }), - new EuclideanDoublePoint(new double[] { 60.16916214139201, 45.18193661351688 }), - new EuclideanDoublePoint(new double[] { 59.90036905976012, 47.555364347063005 }), - new EuclideanDoublePoint(new double[] { 62.33003634144552, 47.83941489877179 }), - new EuclideanDoublePoint(new double[] { 57.86035536718555, 47.31117930193432 }), - new EuclideanDoublePoint(new double[] { 58.13715479685925, 48.985960494028404 }), - new EuclideanDoublePoint(new double[] { 56.131923963548616, 46.8508904252667 }), - new EuclideanDoublePoint(new double[] { 55.976329887053, 47.46384037658572 }), - new EuclideanDoublePoint(new double[] { 56.23245975235477, 47.940035191131756 }), - new EuclideanDoublePoint(new double[] { 58.51687048212625, 46.622885352699086 }), - new EuclideanDoublePoint(new double[] { 57.85411081905477, 45.95394361577928 }), - new EuclideanDoublePoint(new double[] { 56.445776311447844, 45.162093662656844 }), - new EuclideanDoublePoint(new double[] { 57.36691949656233, 47.50097194337286 }), - new EuclideanDoublePoint(new double[] { 58.243626387557015, 46.114052729681134 }), - new EuclideanDoublePoint(new double[] { 56.27224595635198, 44.799080066150054 }), - new EuclideanDoublePoint(new double[] { 57.606924816500396, 46.94291057763621 }), - new EuclideanDoublePoint(new double[] { 30.18714230041951, 13.877149710431695 }), - new EuclideanDoublePoint(new double[] { 30.449448810657486, 13.490778346545994 }), - new EuclideanDoublePoint(new double[] { 30.295018390286714, 13.264889000216499 }), - new EuclideanDoublePoint(new double[] { 30.160201832884923, 11.89278262341395 }), - new EuclideanDoublePoint(new double[] { 31.341509791789576, 15.282655921997502 }), - new EuclideanDoublePoint(new double[] { 31.68601630325429, 14.756873246748 }), - new EuclideanDoublePoint(new double[] { 29.325963742565364, 12.097849250072613 }), - new EuclideanDoublePoint(new double[] { 29.54820742388256, 13.613295356975868 }), - new EuclideanDoublePoint(new double[] { 28.79359608888626, 10.36352064087987 }), - new EuclideanDoublePoint(new double[] { 31.01284597092308, 12.788479208014905 }), - new EuclideanDoublePoint(new double[] { 27.58509216737002, 11.47570110601373 }), - new EuclideanDoublePoint(new double[] { 28.593799561727792, 10.780998203903437 }), - new EuclideanDoublePoint(new double[] { 31.356105766724795, 15.080316198524088 }), - new EuclideanDoublePoint(new double[] { 31.25948503636755, 13.674329151166603 }), - new EuclideanDoublePoint(new double[] { 32.31590076372959, 14.95261758659035 }), - new EuclideanDoublePoint(new double[] { 30.460413702763617, 15.88402809202671 }), - new EuclideanDoublePoint(new double[] { 32.56178203062154, 14.586076852632686 }), - new EuclideanDoublePoint(new double[] { 32.76138648530468, 16.239837325178087 }), - new EuclideanDoublePoint(new double[] { 30.1829453331884, 14.709592407103628 }), - new EuclideanDoublePoint(new double[] { 29.55088173528202, 15.0651247180067 }), - new EuclideanDoublePoint(new double[] { 29.004155302187428, 14.089665298582986 }), - new EuclideanDoublePoint(new double[] { 29.339624439831823, 13.29096065578051 }), - new EuclideanDoublePoint(new double[] { 30.997460327576846, 14.551914158277214 }), - new EuclideanDoublePoint(new double[] { 30.66784126125276, 16.269703107886016 }) - }; - - final DBSCANClusterer transformer = - new DBSCANClusterer(2.0, 5); - final List> clusters = transformer.cluster(Arrays.asList(points)); - - final List clusterOne = - Arrays.asList(points[3], points[4], points[5], points[6], points[7], points[8], points[9], points[10], - points[11], points[12], points[13], points[14]); - final List clusterTwo = - Arrays.asList(points[15], points[16], points[17], points[18], points[19], points[20], points[21], - points[22], points[23], points[24], points[25], points[26], points[27], points[28], - points[29], points[30], points[31], points[32], points[33], points[34], points[35], - points[36], points[37], points[38]); - final List clusterThree = - Arrays.asList(points[39], points[40], points[41], points[42], points[43], points[44], points[45], - points[46], points[47], points[48], points[49], points[50], points[51], points[52], - points[53], points[54], points[55], points[56], points[57], points[58], points[59], - points[60], points[61], points[62]); - - boolean cluster1Found = false; - boolean cluster2Found = false; - boolean cluster3Found = false; - Assert.assertEquals(3, clusters.size()); - for (final Cluster cluster : clusters) { - if (cluster.getPoints().containsAll(clusterOne)) { - cluster1Found = true; - } - if (cluster.getPoints().containsAll(clusterTwo)) { - cluster2Found = true; - } - if (cluster.getPoints().containsAll(clusterThree)) { - cluster3Found = true; - } - } - Assert.assertTrue(cluster1Found); - Assert.assertTrue(cluster2Found); - Assert.assertTrue(cluster3Found); - } - - @Test - public void testSingleLink() { - final EuclideanIntegerPoint[] points = { - new EuclideanIntegerPoint(new int[] {10, 10}), // A - new EuclideanIntegerPoint(new int[] {12, 9}), - new EuclideanIntegerPoint(new int[] {10, 8}), - new EuclideanIntegerPoint(new int[] {8, 8}), - new EuclideanIntegerPoint(new int[] {8, 6}), - new EuclideanIntegerPoint(new int[] {7, 7}), - new EuclideanIntegerPoint(new int[] {5, 6}), // B - new EuclideanIntegerPoint(new int[] {14, 8}), // C - new EuclideanIntegerPoint(new int[] {7, 15}), // N - Noise, should not be present - new EuclideanIntegerPoint(new int[] {17, 8}), // D - single-link connected to C should not be present - - }; - - final DBSCANClusterer clusterer = new DBSCANClusterer(3, 3); - List> clusters = clusterer.cluster(Arrays.asList(points)); - - Assert.assertEquals(1, clusters.size()); - - final List clusterOne = - Arrays.asList(points[0], points[1], points[2], points[3], points[4], points[5], points[6], points[7]); - Assert.assertTrue(clusters.get(0).getPoints().containsAll(clusterOne)); - } - - @Test - public void testGetEps() { - final DBSCANClusterer transformer = new DBSCANClusterer(2.0, 5); - Assert.assertEquals(2.0, transformer.getEps(), 0.0); - } - - @Test - public void testGetMinPts() { - final DBSCANClusterer transformer = new DBSCANClusterer(2.0, 5); - Assert.assertEquals(5, transformer.getMinPts()); - } - - @Test(expected = MathIllegalArgumentException.class) - public void testNegativeEps() { - new DBSCANClusterer(-2.0, 5); - } - - @Test(expected = MathIllegalArgumentException.class) - public void testNegativeMinPts() { - new DBSCANClusterer(2.0, -5); - } - - @Test(expected = NullArgumentException.class) - public void testNullDataset() { - DBSCANClusterer clusterer = new DBSCANClusterer(2.0, 5); - clusterer.cluster(null); - } - -} diff --git a/src/test/java/org/apache/commons/math4/stat/clustering/EuclideanDoublePointTest.java b/src/test/java/org/apache/commons/math4/stat/clustering/EuclideanDoublePointTest.java deleted file mode 100644 index 290b28426..000000000 --- a/src/test/java/org/apache/commons/math4/stat/clustering/EuclideanDoublePointTest.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.math4.stat.clustering; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.math4.TestUtils; -import org.apache.commons.math4.stat.clustering.EuclideanDoublePoint; -import org.apache.commons.math4.util.FastMath; -import org.junit.Assert; -import org.junit.Test; - -@Deprecated -public class EuclideanDoublePointTest { - - @Test - public void testArrayIsReference() { - final double[] array = { -3.0, -2.0, -1.0, 0.0, 1.0 }; - Assert.assertArrayEquals(array, new EuclideanDoublePoint(array).getPoint(), 1.0e-15); - } - - @Test - public void testDistance() { - final EuclideanDoublePoint e1 = new EuclideanDoublePoint(new double[] { -3.0, -2.0, -1.0, 0.0, 1.0 }); - final EuclideanDoublePoint e2 = new EuclideanDoublePoint(new double[] { 1.0, 0.0, -1.0, 1.0, 1.0 }); - Assert.assertEquals(FastMath.sqrt(21.0), e1.distanceFrom(e2), 1.0e-15); - Assert.assertEquals(0.0, e1.distanceFrom(e1), 1.0e-15); - Assert.assertEquals(0.0, e2.distanceFrom(e2), 1.0e-15); - } - - @Test - public void testCentroid() { - final List list = new ArrayList(); - list.add(new EuclideanDoublePoint(new double[] { 1.0, 3.0 })); - list.add(new EuclideanDoublePoint(new double[] { 2.0, 2.0 })); - list.add(new EuclideanDoublePoint(new double[] { 3.0, 3.0 })); - list.add(new EuclideanDoublePoint(new double[] { 2.0, 4.0 })); - final EuclideanDoublePoint c = list.get(0).centroidOf(list); - Assert.assertEquals(2.0, c.getPoint()[0], 1.0e-15); - Assert.assertEquals(3.0, c.getPoint()[1], 1.0e-15); - } - - @Test - public void testSerial() { - final EuclideanDoublePoint p = new EuclideanDoublePoint(new double[] { -3.0, -2.0, -1.0, 0.0, 1.0 }); - Assert.assertEquals(p, TestUtils.serializeAndRecover(p)); - } - -} diff --git a/src/test/java/org/apache/commons/math4/stat/clustering/EuclideanIntegerPointTest.java b/src/test/java/org/apache/commons/math4/stat/clustering/EuclideanIntegerPointTest.java deleted file mode 100644 index e38c0d0d5..000000000 --- a/src/test/java/org/apache/commons/math4/stat/clustering/EuclideanIntegerPointTest.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.math4.stat.clustering; - - -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.math4.TestUtils; -import org.apache.commons.math4.stat.clustering.EuclideanIntegerPoint; -import org.apache.commons.math4.util.FastMath; -import org.junit.Assert; -import org.junit.Test; - -@Deprecated -public class EuclideanIntegerPointTest { - - @Test - public void testArrayIsReference() { - int[] array = { -3, -2, -1, 0, 1 }; - Assert.assertTrue(array == new EuclideanIntegerPoint(array).getPoint()); - } - - @Test - public void testDistance() { - EuclideanIntegerPoint e1 = new EuclideanIntegerPoint(new int[] { -3, -2, -1, 0, 1 }); - EuclideanIntegerPoint e2 = new EuclideanIntegerPoint(new int[] { 1, 0, -1, 1, 1 }); - Assert.assertEquals(FastMath.sqrt(21.0), e1.distanceFrom(e2), 1.0e-15); - Assert.assertEquals(0.0, e1.distanceFrom(e1), 1.0e-15); - Assert.assertEquals(0.0, e2.distanceFrom(e2), 1.0e-15); - } - - @Test - public void testCentroid() { - List list = new ArrayList(); - list.add(new EuclideanIntegerPoint(new int[] { 1, 3 })); - list.add(new EuclideanIntegerPoint(new int[] { 2, 2 })); - list.add(new EuclideanIntegerPoint(new int[] { 3, 3 })); - list.add(new EuclideanIntegerPoint(new int[] { 2, 4 })); - EuclideanIntegerPoint c = list.get(0).centroidOf(list); - Assert.assertEquals(2, c.getPoint()[0]); - Assert.assertEquals(3, c.getPoint()[1]); - } - - @Test - public void testSerial() { - EuclideanIntegerPoint p = new EuclideanIntegerPoint(new int[] { -3, -2, -1, 0, 1 }); - Assert.assertEquals(p, TestUtils.serializeAndRecover(p)); - } - -} diff --git a/src/test/java/org/apache/commons/math4/stat/clustering/KMeansPlusPlusClustererTest.java b/src/test/java/org/apache/commons/math4/stat/clustering/KMeansPlusPlusClustererTest.java deleted file mode 100644 index 02538d99d..000000000 --- a/src/test/java/org/apache/commons/math4/stat/clustering/KMeansPlusPlusClustererTest.java +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.commons.math4.stat.clustering; - - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; -import java.util.Random; - -import org.apache.commons.math4.exception.NumberIsTooSmallException; -import org.apache.commons.math4.stat.clustering.Cluster; -import org.apache.commons.math4.stat.clustering.Clusterable; -import org.apache.commons.math4.stat.clustering.EuclideanIntegerPoint; -import org.apache.commons.math4.stat.clustering.KMeansPlusPlusClusterer; -import org.junit.Assert; -import org.junit.Test; - -@Deprecated -public class KMeansPlusPlusClustererTest { - - @Test - public void dimension2() { - KMeansPlusPlusClusterer transformer = - new KMeansPlusPlusClusterer(new Random(1746432956321l)); - EuclideanIntegerPoint[] points = new EuclideanIntegerPoint[] { - - // first expected cluster - new EuclideanIntegerPoint(new int[] { -15, 3 }), - new EuclideanIntegerPoint(new int[] { -15, 4 }), - new EuclideanIntegerPoint(new int[] { -15, 5 }), - new EuclideanIntegerPoint(new int[] { -14, 3 }), - new EuclideanIntegerPoint(new int[] { -14, 5 }), - new EuclideanIntegerPoint(new int[] { -13, 3 }), - new EuclideanIntegerPoint(new int[] { -13, 4 }), - new EuclideanIntegerPoint(new int[] { -13, 5 }), - - // second expected cluster - new EuclideanIntegerPoint(new int[] { -1, 0 }), - new EuclideanIntegerPoint(new int[] { -1, -1 }), - new EuclideanIntegerPoint(new int[] { 0, -1 }), - new EuclideanIntegerPoint(new int[] { 1, -1 }), - new EuclideanIntegerPoint(new int[] { 1, -2 }), - - // third expected cluster - new EuclideanIntegerPoint(new int[] { 13, 3 }), - new EuclideanIntegerPoint(new int[] { 13, 4 }), - new EuclideanIntegerPoint(new int[] { 14, 4 }), - new EuclideanIntegerPoint(new int[] { 14, 7 }), - new EuclideanIntegerPoint(new int[] { 16, 5 }), - new EuclideanIntegerPoint(new int[] { 16, 6 }), - new EuclideanIntegerPoint(new int[] { 17, 4 }), - new EuclideanIntegerPoint(new int[] { 17, 7 }) - - }; - List> clusters = - transformer.cluster(Arrays.asList(points), 3, 5, 10); - - Assert.assertEquals(3, clusters.size()); - boolean cluster1Found = false; - boolean cluster2Found = false; - boolean cluster3Found = false; - for (Cluster cluster : clusters) { - int[] center = cluster.getCenter().getPoint(); - if (center[0] < 0) { - cluster1Found = true; - Assert.assertEquals(8, cluster.getPoints().size()); - Assert.assertEquals(-14, center[0]); - Assert.assertEquals( 4, center[1]); - } else if (center[1] < 0) { - cluster2Found = true; - Assert.assertEquals(5, cluster.getPoints().size()); - Assert.assertEquals( 0, center[0]); - Assert.assertEquals(-1, center[1]); - } else { - cluster3Found = true; - Assert.assertEquals(8, cluster.getPoints().size()); - Assert.assertEquals(15, center[0]); - Assert.assertEquals(5, center[1]); - } - } - Assert.assertTrue(cluster1Found); - Assert.assertTrue(cluster2Found); - Assert.assertTrue(cluster3Found); - - } - - /** - * JIRA: MATH-305 - * - * Two points, one cluster, one iteration - */ - @Test - public void testPerformClusterAnalysisDegenerate() { - KMeansPlusPlusClusterer transformer = new KMeansPlusPlusClusterer( - new Random(1746432956321l)); - EuclideanIntegerPoint[] points = new EuclideanIntegerPoint[] { - new EuclideanIntegerPoint(new int[] { 1959, 325100 }), - new EuclideanIntegerPoint(new int[] { 1960, 373200 }), }; - List> clusters = transformer.cluster(Arrays.asList(points), 1, 1); - Assert.assertEquals(1, clusters.size()); - Assert.assertEquals(2, (clusters.get(0).getPoints().size())); - EuclideanIntegerPoint pt1 = new EuclideanIntegerPoint(new int[] { 1959, 325100 }); - EuclideanIntegerPoint pt2 = new EuclideanIntegerPoint(new int[] { 1960, 373200 }); - Assert.assertTrue(clusters.get(0).getPoints().contains(pt1)); - Assert.assertTrue(clusters.get(0).getPoints().contains(pt2)); - - } - - @Test - public void testCertainSpace() { - KMeansPlusPlusClusterer.EmptyClusterStrategy[] strategies = { - KMeansPlusPlusClusterer.EmptyClusterStrategy.LARGEST_VARIANCE, - KMeansPlusPlusClusterer.EmptyClusterStrategy.LARGEST_POINTS_NUMBER, - KMeansPlusPlusClusterer.EmptyClusterStrategy.FARTHEST_POINT - }; - for (KMeansPlusPlusClusterer.EmptyClusterStrategy strategy : strategies) { - KMeansPlusPlusClusterer transformer = - new KMeansPlusPlusClusterer(new Random(1746432956321l), strategy); - int numberOfVariables = 27; - // initialise testvalues - int position1 = 1; - int position2 = position1 + numberOfVariables; - int position3 = position2 + numberOfVariables; - int position4 = position3 + numberOfVariables; - // testvalues will be multiplied - int multiplier = 1000000; - - EuclideanIntegerPoint[] breakingPoints = new EuclideanIntegerPoint[numberOfVariables]; - // define the space which will break the cluster algorithm - for (int i = 0; i < numberOfVariables; i++) { - int points[] = { position1, position2, position3, position4 }; - // multiply the values - for (int j = 0; j < points.length; j++) { - points[j] *= multiplier; - } - EuclideanIntegerPoint euclideanIntegerPoint = new EuclideanIntegerPoint(points); - breakingPoints[i] = euclideanIntegerPoint; - position1 += numberOfVariables; - position2 += numberOfVariables; - position3 += numberOfVariables; - position4 += numberOfVariables; - } - - for (int n = 2; n < 27; ++n) { - List> clusters = - transformer.cluster(Arrays.asList(breakingPoints), n, 100); - Assert.assertEquals(n, clusters.size()); - int sum = 0; - for (Cluster cluster : clusters) { - sum += cluster.getPoints().size(); - } - Assert.assertEquals(numberOfVariables, sum); - } - } - - } - - /** - * A helper class for testSmallDistances(). This class is similar to EuclideanIntegerPoint, but - * it defines a different distanceFrom() method that tends to return distances less than 1. - */ - private class CloseIntegerPoint implements Clusterable { - public CloseIntegerPoint(EuclideanIntegerPoint point) { - euclideanPoint = point; - } - - public double distanceFrom(CloseIntegerPoint p) { - return euclideanPoint.distanceFrom(p.euclideanPoint) * 0.001; - } - - public CloseIntegerPoint centroidOf(Collection p) { - Collection euclideanPoints = - new ArrayList(); - for (CloseIntegerPoint point : p) { - euclideanPoints.add(point.euclideanPoint); - } - return new CloseIntegerPoint(euclideanPoint.centroidOf(euclideanPoints)); - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof CloseIntegerPoint)) { - return false; - } - CloseIntegerPoint p = (CloseIntegerPoint) o; - - return euclideanPoint.equals(p.euclideanPoint); - } - - @Override - public int hashCode() { - return euclideanPoint.hashCode(); - } - - private EuclideanIntegerPoint euclideanPoint; - } - - /** - * Test points that are very close together. See issue MATH-546. - */ - @Test - public void testSmallDistances() { - // Create a bunch of CloseIntegerPoints. Most are identical, but one is different by a - // small distance. - int[] repeatedArray = { 0 }; - int[] uniqueArray = { 1 }; - CloseIntegerPoint repeatedPoint = - new CloseIntegerPoint(new EuclideanIntegerPoint(repeatedArray)); - CloseIntegerPoint uniquePoint = - new CloseIntegerPoint(new EuclideanIntegerPoint(uniqueArray)); - - Collection points = new ArrayList(); - final int NUM_REPEATED_POINTS = 10 * 1000; - for (int i = 0; i < NUM_REPEATED_POINTS; ++i) { - points.add(repeatedPoint); - } - points.add(uniquePoint); - - // Ask a KMeansPlusPlusClusterer to run zero iterations (i.e., to simply choose initial - // cluster centers). - final long RANDOM_SEED = 0; - final int NUM_CLUSTERS = 2; - final int NUM_ITERATIONS = 0; - KMeansPlusPlusClusterer clusterer = - new KMeansPlusPlusClusterer(new Random(RANDOM_SEED)); - List> clusters = - clusterer.cluster(points, NUM_CLUSTERS, NUM_ITERATIONS); - - // Check that one of the chosen centers is the unique point. - boolean uniquePointIsCenter = false; - for (Cluster cluster : clusters) { - if (cluster.getCenter().equals(uniquePoint)) { - uniquePointIsCenter = true; - } - } - Assert.assertTrue(uniquePointIsCenter); - } - - /** - * 2 variables cannot be clustered into 3 clusters. See issue MATH-436. - */ - @Test(expected=NumberIsTooSmallException.class) - public void testPerformClusterAnalysisToManyClusters() { - KMeansPlusPlusClusterer transformer = - new KMeansPlusPlusClusterer( - new Random(1746432956321l)); - - EuclideanIntegerPoint[] points = new EuclideanIntegerPoint[] { - new EuclideanIntegerPoint(new int[] { - 1959, 325100 - }), new EuclideanIntegerPoint(new int[] { - 1960, 373200 - }) - }; - - transformer.cluster(Arrays.asList(points), 3, 1); - - } - -}