From 1b238344cd2c311fc68d998043602320c5f11970 Mon Sep 17 00:00:00 2001
From: Luc Maisonobe
+ * The stored value is a copy of the parameter array, not the array itself
+ *
+ * The stored array is the one which was set by previous calls to
+ *
- * To compute percentiles, the data must be (totally) ordered. Input arrays
- * are copied and then sorted using {@link java.util.Arrays#sort(double[])}.
+ * To compute percentiles, the data must be at least partially ordered. Input
+ * arrays are copied and recursively partitioned using an ordering definition.
* The ordering used by Arrays.sort(double[])
is the one determined
* by {@link java.lang.Double#compareTo(Double)}. This ordering makes
* Double.NaN
larger than any other value (including
@@ -60,6 +60,18 @@ import org.apache.commons.math.util.FastMath;
* elements, arrays containing NaN
or infinite values will often
* result in NaN
or infinite values returned.
+ * <p>
+ * Since 2.2, the Percentile implementation uses only selection instead of complete
+ * sorting and caches the selection algorithm state between calls to the various
+ * {@code evaluate} methods when several percentiles are to be computed on the same data.
+ * This greatly improves efficiency, both for single percentile and multiple
+ * percentiles computations. However, it also induces a need to be sure the data
+ * at one call to {@code evaluate} is the same as the data with the cached algorithm
+ * state from the previous calls. Percentile does this by checking the array reference
+ * itself and a checksum of its content by default. If users already know that they call
+ * {@code evaluate} on an immutable array, they can save the checking time by calling the
+ * {@code evaluate} methods that do <em>not</em> take the data array as an argument,
+ * after storing the data once with {@code setData}.
+ * </p>
+ * <p>
* Note that this implementation is not synchronized. If
* multiple threads access an instance of this class concurrently, and at least
* one of the threads invokes the increment()
or
@@ -72,10 +84,19 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
/** Serializable version identifier */
private static final long serialVersionUID = -8091216485095130416L;
+ /** Slice size at or below which a simple insertion sort is used rather than Hoare's select. */
+ private static final int MIN_SELECT_SIZE = 15;
+
+ /** Maximum number of levels of partitioning pivots cached (each level doubles the number of pivots). */
+ private static final int MAX_CACHED_LEVELS = 10;
+
/** Determines what percentile is computed when evaluate() is activated
* with no quantile argument */
private double quantile = 0.0;
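+ /** Cached pivot indices for the stored data, laid out as an implicit binary heap ({@code -1} marks an entry with no pivot yet); {@code null} when no data is stored. */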
+ private int[] cachedPivots;
+
/**
* Constructs a Percentile with a default quantile
* value of 50.0.
@@ -92,6 +113,7 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
*/
public Percentile(final double p) {
setQuantile(p);
+ cachedPivots = null;
}
/**
@@ -104,6 +126,42 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
copy(original, this);
}
+ /**
+ * {@inheritDoc}
+ * <p>
+ * The pivot cache is reset before the data is handed to the superclass:
+ * it is dropped when {@code values} is {@code null}, and otherwise replaced
+ * by a fresh array of {@code (1 << MAX_CACHED_LEVELS) - 1} entries that are
+ * all set to {@code -1}.
+ * </p>
+ */
+ @Override
+ public void setData(final double[] values) {
+ if (values == null) {
+ cachedPivots = null;
+ } else {
+ cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
+ Arrays.fill(cachedPivots, -1); // negative entry: no pivot recorded for this heap node yet
+ }
+ super.setData(values);
+ }
+
+ /**
+ * {@inheritDoc}
+ * <p>
+ * The pivot cache is reset before the data is handed to the superclass:
+ * it is dropped when {@code values} is {@code null}, and otherwise replaced
+ * by a fresh array of {@code (1 << MAX_CACHED_LEVELS) - 1} entries that are
+ * all set to {@code -1}. The size of the cache does not depend on
+ * {@code begin} or {@code length}.
+ * </p>
+ */
+ @Override
+ public void setData(final double[] values, final int begin, final int length) {
+ if (values == null) {
+ cachedPivots = null;
+ } else {
+ cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
+ Arrays.fill(cachedPivots, -1); // negative entry: no pivot recorded for this heap node yet
+ }
+ super.setData(values, begin, length);
+ }
+
+ /**
+ * Returns the result of evaluating the statistic over the stored data.
+ * <p>
+ * The stored array is the one which was set by previous calls to
+ * {@code setData}. It is obtained through {@code getDataRef()} and passed,
+ * together with {@code p}, to the array-based {@code evaluate} method.
+ * </p>
+ * @param p the percentile value to compute
+ * @return the value of the statistic applied to the stored data
+ */
+ public double evaluate(final double p) {
+ return evaluate(getDataRef(), p);
+ }
+
/**
* Returns an estimate of thep
th percentile of the values
* in the values
array.
@@ -214,21 +272,176 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
double fpos = FastMath.floor(pos);
int intPos = (int) fpos;
double dif = pos - fpos;
- double[] sorted = new double[length];
- System.arraycopy(values, begin, sorted, 0, length);
- Arrays.sort(sorted);
+ double[] work;
+ int[] pivotsHeap;
+ if (values == getDataRef()) {
+ work = getDataRef();
+ pivotsHeap = cachedPivots;
+ } else {
+ work = new double[length];
+ System.arraycopy(values, begin, work, 0, length);
+ pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
+ Arrays.fill(pivotsHeap, -1);
+ }
if (pos < 1) {
- return sorted[0];
+ return select(work, pivotsHeap, 0);
}
if (pos >= n) {
- return sorted[length - 1];
+ return select(work, pivotsHeap, length - 1);
}
- double lower = sorted[intPos - 1];
- double upper = sorted[intPos];
+ double lower = select(work, pivotsHeap, intPos - 1);
+ double upper = select(work, pivotsHeap, intPos);
return lower + dif * (upper - lower);
}
+ /**
+ * Select the k<sup>th</sup> smallest element from the work array.
+ * <p>
+ * The search keeps a half-open slice {@code [begin, end)} of {@code work},
+ * starting with the whole array. While the slice holds more than
+ * {@code MIN_SELECT_SIZE} elements it is split around a pivot: the pivot
+ * recorded in {@code pivotsHeap} for the current node is reused when that
+ * entry is non-negative, otherwise a pivot is chosen with
+ * {@code medianOf3}, the slice is partitioned around it and the resulting
+ * index is recorded if the node lies inside the heap. The search then
+ * continues in the left or right part, moving to the corresponding child
+ * node. Once the slice is small enough it is sorted with
+ * {@code insertionSort} and {@code work[k]} is returned.
+ * </p>
+ * @param work work array (will be reorganized during the call)
+ * @param pivotsHeap set of pivot indices corresponding to elements that
+ * are already at their sorted location, stored as an implicit heap
+ * (i.e. a sorted binary tree stored in a flat array, where the
+ * children of a node at index n are at indices 2n+1 for the left
+ * child and 2n+2 for the right child, with 0-based indices); a negative
+ * entry means no pivot is recorded for that node, and entries may be
+ * filled in during the call
+ * @param k index (0-based) of the desired element
+ * @return k<sup>th</sup> smallest element
+ */
+ private double select(final double[] work, final int[] pivotsHeap, final int k) {
+
+ int begin = 0;
+ int end = work.length;
+ int node = 0; // index of the current node in the implicit pivots heap
+
+ while (end - begin > MIN_SELECT_SIZE) {
+
+ final int pivot;
+ if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
+ // the pivot has already been found in a previous call
+ // and the array has already been partitioned around it
+ pivot = pivotsHeap[node];
+ } else {
+ // select a pivot (median of three) and partition the current slice of the work array around it
+ pivot = partition(work, begin, end, medianOf3(work, begin, end));
+ if (node < pivotsHeap.length) {
+ pivotsHeap[node] = pivot; // record it only when the node fits inside the heap
+ }
+ }
+
+ if (k == pivot) {
+ // the pivot was exactly the element we wanted
+ return work[k];
+ } else if (k < pivot) {
+ // the element is in the left partition: continue with the left child node
+ end = pivot;
+ node = Math.min(2 * node + 1, pivotsHeap.length); // the min clamps to an index outside the heap (nothing is cached there) and avoids integer overflow
+ } else {
+ // the element is in the right partition: continue with the right child node
+ begin = pivot + 1;
+ node = Math.min(2 * node + 2, pivotsHeap.length); // the min clamps to an index outside the heap (nothing is cached there) and avoids integer overflow
+ }
+
+ }
+
+ // the element is somewhere in the small sub-array [begin, end)
+ // sort the sub-array using insertion sort
+ insertionSort(work, begin, end);
+ return work[k];
+
+ }
+
+ /** Select a pivot index as the median of three.
+ * <p>
+ * The three candidates are the first element of the slice, the last
+ * element of the slice ({@code end - 1}) and the element halfway between
+ * them. Only strict {@code <} comparisons are used to pick among them.
+ * </p>
+ * <p>
+ * NOTE(review): this method is declared without an access modifier, whereas
+ * {@code select}, {@code partition} and {@code insertionSort} are private;
+ * confirm whether package visibility is intended.
+ * </p>
+ * @param work data array
+ * @param begin index of the first element of the slice
+ * @param end index after the last element of the slice
+ * @return the index of the median element chosen between the
+ * first, the middle and the last element of the array slice
+ */
+ int medianOf3(final double[] work, final int begin, final int end) {
+
+ final int inclusiveEnd = end - 1;
+ final int middle = begin + (inclusiveEnd - begin) / 2;
+ final double wBegin = work[begin];
+ final double wMiddle = work[middle];
+ final double wEnd = work[inclusiveEnd];
+
+ if (wBegin < wMiddle) {
+ if (wMiddle < wEnd) {
+ return middle; // wBegin < wMiddle < wEnd
+ } else {
+ return (wBegin < wEnd) ? inclusiveEnd : begin; // wMiddle is not below wEnd: pick the larger of wBegin and wEnd
+ }
+ } else {
+ if (wBegin < wEnd) {
+ return begin; // wBegin is not below wMiddle and is below wEnd
+ } else {
+ return (wMiddle < wEnd) ? inclusiveEnd : middle; // wBegin is not below wEnd: pick the larger of wMiddle and wEnd
+ }
+ }
+
+ }
+
+ /**
+ * Partition an array slice around a pivot.
+ * <p>
+ * Partitioning exchanges array elements such that all elements
+ * smaller than pivot are before it and all elements larger than
+ * pivot are after it.
+ * </p>
+ * <p>
+ * The pivot value is first taken out of the slice (the element at
+ * {@code begin} is moved into its slot), then two indices scan the rest of
+ * the slice from both ends and exchange misplaced pairs, and finally the
+ * pivot value is stored at its final index, which is returned.
+ * </p>
+ * @param work data array
+ * @param begin index of the first element of the slice
+ * @param end index after the last element of the slice
+ * @param pivot initial index of the pivot
+ * @return index of the pivot after partition
+ */
+ private int partition(final double[] work, final int begin, final int end, final int pivot) {
+
+ final double value = work[pivot];
+ work[pivot] = work[begin]; // slot begin is now free; it is refilled once the final pivot index is known
+
+ int i = begin + 1;
+ int j = end - 1;
+ while (i < j) {
+ while ((i < j) && (work[j] >= value)) {
+ --j; // skip elements on the right side that are not below the pivot value
+ }
+ while ((i < j) && (work[i] <= value)) {
+ ++i; // skip elements on the left side that are not above the pivot value
+ }
+
+ if (i < j) {
+ final double tmp = work[i];
+ work[i++] = work[j];
+ work[j--] = tmp;
+ }
+ }
+
+ if ((i >= end) || (work[i] > value)) {
+ --i; // step back so that work[i] may be moved to the front of the slice
+ }
+ work[begin] = work[i];
+ work[i] = value;
+ return i;
+
+ }
+
+ /**
+ * Sort in place a (small) array slice using insertion sort.
+ * <p>
+ * Each element of the slice is compared with {@code <} against the
+ * elements before it, which are shifted one position to the right until
+ * its insertion point is found. Elements outside {@code [begin, end)} are
+ * not touched.
+ * </p>
+ * @param work array to sort
+ * @param begin index of the first element of the slice to sort
+ * @param end index after the last element of the slice to sort
+ */
+ private void insertionSort(final double[] work, final int begin, final int end) {
+ for (int j = begin + 1; j < end; j++) {
+ final double saved = work[j];
+ int i = j - 1;
+ while ((i >= begin) && (saved < work[i])) {
+ work[i + 1] = work[i];
+ i--;
+ }
+ work[i + 1] = saved;
+ }
+ }
+
/**
* Returns the value of the quantile field (determines what percentile is
* computed when evaluate() is called with no quantile argument).
@@ -274,6 +487,10 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa * @throws NullPointerException if either source or dest is null */ public static void copy(Percentile source, Percentile dest) { + dest.setData(source.getDataRef()); + if (source.cachedPivots != null) { + System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length); + } dest.quantile = source.quantile; } diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/summary/Product.java b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Product.java index f9796b458..abe27d4de 100644 --- a/src/main/java/org/apache/commons/math/stat/descriptive/summary/Product.java +++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Product.java @@ -213,6 +213,7 @@ public class Product extends AbstractStorelessUnivariateStatistic implements Ser * @throws NullPointerException if either source or dest is null */ public static void copy(Product source, Product dest) { + dest.setData(source.getDataRef()); dest.n = source.n; dest.value = source.value; } diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/summary/Sum.java b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Sum.java index b1d9059d8..997cc3aec 100644 --- a/src/main/java/org/apache/commons/math/stat/descriptive/summary/Sum.java +++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Sum.java @@ -209,6 +209,7 @@ public class Sum extends AbstractStorelessUnivariateStatistic implements Seriali * @throws NullPointerException if either source or dest is null */ public static void copy(Sum source, Sum dest) { + dest.setData(source.getDataRef()); dest.n = source.n; dest.value = source.value; } diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfLogs.java b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfLogs.java index a4ce08ef1..27264a3a4 100644 --- 
a/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfLogs.java +++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfLogs.java @@ -155,6 +155,7 @@ public class SumOfLogs extends AbstractStorelessUnivariateStatistic implements S * @throws NullPointerException if either source or dest is null */ public static void copy(SumOfLogs source, SumOfLogs dest) { + dest.setData(source.getDataRef()); dest.n = source.n; dest.value = source.value; } diff --git a/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfSquares.java b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfSquares.java index 36a216817..ac2317703 100644 --- a/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfSquares.java +++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfSquares.java @@ -143,6 +143,7 @@ public class SumOfSquares extends AbstractStorelessUnivariateStatistic implement * @throws NullPointerException if either source or dest is null */ public static void copy(SumOfSquares source, SumOfSquares dest) { + dest.setData(source.getDataRef()); dest.n = source.n; dest.value = source.value; } diff --git a/src/site/xdoc/changes.xml b/src/site/xdoc/changes.xml index 1d9989434..aaf51d40a 100644 --- a/src/site/xdoc/changes.xml +++ b/src/site/xdoc/changes.xml @@ -85,6 +85,11 @@ The