Improved Percentile performance by using a selection algorithm instead of a

complete sort, and by allowing caching data array and pivots when several
different percentiles are desired
JIRA: MATH-417

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@1006301 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Luc Maisonobe 2010-10-10 14:50:56 +00:00
parent 4614f35efc
commit 1b238344cd
18 changed files with 307 additions and 12 deletions

View File

@ -92,6 +92,11 @@
<!-- the following expositions of internal representation are intentional and documented -->
<Match>
<Class name="org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic"/>
<Method name="getDataRef" params="" returns="double[]" />
<Bug pattern="EI_EXPOSE_REP" />
</Match>
<Match>
<Class name="org.apache.commons.math.optimization.RealPointValuePair"/>
<Method name="getPointRef" params="" returns="double[]" />

View File

@ -17,10 +17,10 @@
package org.apache.commons.math.stat.descriptive;
import org.apache.commons.math.MathRuntimeException;
import org.apache.commons.math.exception.util.LocalizedFormats;
import org.apache.commons.math.exception.NullArgumentException;
import org.apache.commons.math.exception.NotPositiveException;
import org.apache.commons.math.exception.DimensionMismatchException;
import org.apache.commons.math.exception.NotPositiveException;
import org.apache.commons.math.exception.NullArgumentException;
import org.apache.commons.math.exception.util.LocalizedFormats;
/**
* Abstract base class for all implementations of the
@ -38,6 +38,60 @@ import org.apache.commons.math.exception.DimensionMismatchException;
public abstract class AbstractUnivariateStatistic
implements UnivariateStatistic {
/** Stored data. */
private double[] storedData;
/**
* Set the data array.
* <p>
* The stored value is a copy of the parameter array, not the array itself
* </p>
* @param values data array to store (may be null to remove stored data)
* @see #evaluate()
*/
public void setData(final double[] values) {
storedData = (values == null) ? null : values.clone();
}
/**
* Get a copy of the stored data array.
* @return copy of the stored data array (may be null)
*/
public double[] getData() {
return (storedData == null) ? null : storedData.clone();
}
/**
* Get a reference to the stored data array.
* @return reference to the stored data array (may be null)
*/
protected double[] getDataRef() {
return storedData;
}
/**
* Set the data array.
* @param values data array to store
* @param begin the index of the first element to include
* @param length the number of elements to include
* @see #evaluate()
*/
public void setData(final double[] values, final int begin, final int length) {
storedData = new double[length];
System.arraycopy(values, begin, storedData, 0, length);
}
/**
* Returns the result of evaluating the statistic over the stored data.
* <p>
* The stored array is the one which was set by previous calls to
* </p>
* @return the value of the statistic applied to the stored data
*/
public double evaluate() {
return evaluate(storedData);
}
/**
* {@inheritDoc}
*/

View File

@ -151,6 +151,7 @@ public class FirstMoment extends AbstractStorelessUnivariateStatistic
* @throws NullPointerException if either source or dest is null
*/
public static void copy(FirstMoment source, FirstMoment dest) {
dest.setData(source.getDataRef());
dest.n = source.n;
dest.m1 = source.m1;
dest.dev = source.dev;

View File

@ -186,6 +186,7 @@ public class GeometricMean extends AbstractStorelessUnivariateStatistic implemen
* @throws NullPointerException if either source or dest is null
*/
public static void copy(GeometricMean source, GeometricMean dest) {
dest.setData(source.getDataRef());
dest.sumOfLogs = source.sumOfLogs.copy();
}

View File

@ -214,6 +214,7 @@ public class Kurtosis extends AbstractStorelessUnivariateStatistic implements S
* @throws NullPointerException if either source or dest is null
*/
public static void copy(Kurtosis source, Kurtosis dest) {
dest.setData(source.getDataRef());
dest.moment = source.moment.copy();
dest.incMoment = source.incMoment;
}

View File

@ -265,6 +265,7 @@ public class Mean extends AbstractStorelessUnivariateStatistic
* @throws NullPointerException if either source or dest is null
*/
public static void copy(Mean source, Mean dest) {
dest.setData(source.getDataRef());
dest.incMoment = source.incMoment;
dest.moment = source.moment.copy();
}

View File

@ -159,6 +159,7 @@ public class SemiVariance extends AbstractUnivariateStatistic implements Seriali
* @throws NullPointerException if either source or dest is null
*/
public static void copy(final SemiVariance source, SemiVariance dest) {
dest.setData(source.getDataRef());
dest.biasCorrected = source.biasCorrected;
dest.varianceDirection = source.varianceDirection;
}

View File

@ -206,6 +206,7 @@ public class Skewness extends AbstractStorelessUnivariateStatistic implements Se
* @throws NullPointerException if either source or dest is null
*/
public static void copy(Skewness source, Skewness dest) {
dest.setData(source.getDataRef());
dest.moment = new ThirdMoment(source.moment.copy());
dest.incMoment = source.incMoment;
}

View File

@ -264,6 +264,7 @@ public class StandardDeviation extends AbstractStorelessUnivariateStatistic
* @throws NullPointerException if either source or dest is null
*/
public static void copy(StandardDeviation source, StandardDeviation dest) {
dest.setData(source.getDataRef());
dest.variance = source.variance.copy();
}

View File

@ -602,6 +602,7 @@ public class Variance extends AbstractStorelessUnivariateStatistic implements Se
dest == null) {
throw new NullArgumentException();
}
dest.setData(source.getDataRef());
dest.moment = source.moment.copy();
dest.isBiasCorrected = source.isBiasCorrected;
dest.incMoment = source.incMoment;

View File

@ -156,6 +156,7 @@ public class Max extends AbstractStorelessUnivariateStatistic implements Seriali
* @throws NullPointerException if either source or dest is null
*/
public static void copy(Max source, Max dest) {
dest.setData(source.getDataRef());
dest.n = source.n;
dest.value = source.value;
}

View File

@ -156,6 +156,7 @@ public class Min extends AbstractStorelessUnivariateStatistic implements Seriali
* @throws NullPointerException if either source or dest is null
*/
public static void copy(Min source, Min dest) {
dest.setData(source.getDataRef());
dest.n = source.n;
dest.value = source.value;
}

View File

@ -47,8 +47,8 @@ import org.apache.commons.math.util.FastMath;
* </li>
* </ol></p>
* <p>
* To compute percentiles, the data must be (totally) ordered. Input arrays
* are copied and then sorted using {@link java.util.Arrays#sort(double[])}.
* To compute percentiles, the data must be at least partially ordered. Input
* arrays are copied and recursively partitioned using an ordering definition.
* The ordering used by <code>Arrays.sort(double[])</code> is the one determined
* by {@link java.lang.Double#compareTo(Double)}. This ordering makes
* <code>Double.NaN</code> larger than any other value (including
@ -60,6 +60,18 @@ import org.apache.commons.math.util.FastMath;
* elements, arrays containing <code>NaN</code> or infinite values will often
* result in <code>NaN<code> or infinite values returned.</p>
* <p>
* Since 2.2, Percentile implementation uses only selection instead of complete
* sorting and caches selection algorithm state between calls to the various
* {@code evaluate} methods when several percentiles are to be computed on the same data.
* This greatly improves efficiency, both for single percentile and multiple
* percentiles computations. However, it also induces a need to be sure the data
* at one call to {@code evaluate} is the same as the data with the cached algorithm
* state from the previous calls. Percentile does this by checking the array reference
* itself and a checksum of its content by default. If the user already knows he calls
* {@code evaluate} on an immutable array, he can save the checking time by calling the
* {@code evaluate} methods that do <em>not</em>
* </p>
* <p>
* <strong>Note that this implementation is not synchronized.</strong> If
* multiple threads access an instance of this class concurrently, and at least
* one of the threads invokes the <code>increment()</code> or
@ -72,10 +84,19 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
/** Serializable version identifier */
private static final long serialVersionUID = -8091216485095130416L;
/** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
private static final int MIN_SELECT_SIZE = 15;
/** Maximum number of partitioning pivots cached (each level double the number of pivots). */
private static final int MAX_CACHED_LEVELS = 10;
/** Determines what percentile is computed when evaluate() is activated
* with no quantile argument */
private double quantile = 0.0;
/** Cached pivots. */
private int[] cachedPivots;
/**
* Constructs a Percentile with a default quantile
* value of 50.0.
@ -92,6 +113,7 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
*/
public Percentile(final double p) {
setQuantile(p);
cachedPivots = null;
}
/**
@ -104,6 +126,42 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
copy(original, this);
}
/** {@inheritDoc} */
@Override
public void setData(final double[] values) {
if (values == null) {
cachedPivots = null;
} else {
cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
Arrays.fill(cachedPivots, -1);
}
super.setData(values);
}
/** {@inheritDoc} */
@Override
public void setData(final double[] values, final int begin, final int length) {
if (values == null) {
cachedPivots = null;
} else {
cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
Arrays.fill(cachedPivots, -1);
}
super.setData(values, begin, length);
}
/**
* Returns the result of evaluating the statistic over the stored data.
* <p>
* The stored array is the one which was set by previous calls to
* </p>
* @param p the percentile value to compute
* @return the value of the statistic applied to the stored data
*/
public double evaluate(final double p) {
return evaluate(getDataRef(), p);
}
/**
* Returns an estimate of the <code>p</code>th percentile of the values
* in the <code>values</code> array.
@ -214,21 +272,176 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
double fpos = FastMath.floor(pos);
int intPos = (int) fpos;
double dif = pos - fpos;
double[] sorted = new double[length];
System.arraycopy(values, begin, sorted, 0, length);
Arrays.sort(sorted);
double[] work;
int[] pivotsHeap;
if (values == getDataRef()) {
work = getDataRef();
pivotsHeap = cachedPivots;
} else {
work = new double[length];
System.arraycopy(values, begin, work, 0, length);
pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
Arrays.fill(pivotsHeap, -1);
}
if (pos < 1) {
return sorted[0];
return select(work, pivotsHeap, 0);
}
if (pos >= n) {
return sorted[length - 1];
return select(work, pivotsHeap, length - 1);
}
double lower = sorted[intPos - 1];
double upper = sorted[intPos];
double lower = select(work, pivotsHeap, intPos - 1);
double upper = select(work, pivotsHeap, intPos);
return lower + dif * (upper - lower);
}
/**
* Select the k<sup>th</sup> smallest element from work array
* @param work work array (will be reorganized during the call)
* @param pivotsHeap set of pivot index corresponding to elements that
* are already at their sorted location, stored as an implicit heap
* (i.e. a sorted binary tree stored in a flat array, where the
* children of a node at index n are at indices 2n+1 for the left
* child and 2n+2 for the right child, with 0-based indices)
* @param k index of the desired element
* @return k<sup>th</sup> smallest element
*/
private double select(final double[] work, final int[] pivotsHeap, final int k) {
int begin = 0;
int end = work.length;
int node = 0;
while (end - begin > MIN_SELECT_SIZE) {
final int pivot;
if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
// the pivot has already been found in a previous call
// and the array has already been partitioned around it
pivot = pivotsHeap[node];
} else {
// select a pivot and partition work array around it
pivot = partition(work, begin, end, medianOf3(work, begin, end));
if (node < pivotsHeap.length) {
pivotsHeap[node] = pivot;
}
}
if (k == pivot) {
// the pivot was exactly the element we wanted
return work[k];
} else if (k < pivot) {
// the element is in the left partition
end = pivot;
node = Math.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
} else {
// the element is in the right partition
begin = pivot + 1;
node = Math.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
}
}
// the element is somewhere in the small sub-array
// sort the sub-array using insertion sort
insertionSort(work, begin, end);
return work[k];
}
/** Select a pivot index as the median of three
* @param work data array
* @param begin index of the first element of the slice
* @param end index after the last element of the slice
* @return the index of the median element chosen between the
* first, the middle and the last element of the array slice
*/
int medianOf3(final double[] work, final int begin, final int end) {
final int inclusiveEnd = end - 1;
final int middle = begin + (inclusiveEnd - begin) / 2;
final double wBegin = work[begin];
final double wMiddle = work[middle];
final double wEnd = work[inclusiveEnd];
if (wBegin < wMiddle) {
if (wMiddle < wEnd) {
return middle;
} else {
return (wBegin < wEnd) ? inclusiveEnd : begin;
}
} else {
if (wBegin < wEnd) {
return begin;
} else {
return (wMiddle < wEnd) ? inclusiveEnd : middle;
}
}
}
/**
* Partition an array slice around a pivot
* <p>
* Partitioning exchanges array elements such that all elements
* smaller than pivot are before it and all elements larger than
* pivot are after it
* </p>
* @param work data array
* @param begin index of the first element of the slice
* @param end index after the last element of the slice
* @param pivot initial index of the pivot
* @return index of the pivot after partition
*/
private int partition(final double[] work, final int begin, final int end, final int pivot) {
final double value = work[pivot];
work[pivot] = work[begin];
int i = begin + 1;
int j = end - 1;
while (i < j) {
while ((i < j) && (work[j] >= value)) {
--j;
}
while ((i < j) && (work[i] <= value)) {
++i;
}
if (i < j) {
final double tmp = work[i];
work[i++] = work[j];
work[j--] = tmp;
}
}
if ((i >= end) || (work[i] > value)) {
--i;
}
work[begin] = work[i];
work[i] = value;
return i;
}
/**
* Sort in place a (small) array slice using insertion sort
* @param work array to sort
* @param begin index of the first element of the slice to sort
* @param end index after the last element of the slice to sort
*/
private void insertionSort(final double[] work, final int begin, final int end) {
for (int j = begin + 1; j < end; j++) {
final double saved = work[j];
int i = j - 1;
while ((i >= begin) && (saved < work[i])) {
work[i + 1] = work[i];
i--;
}
work[i + 1] = saved;
}
}
/**
* Returns the value of the quantile field (determines what percentile is
* computed when evaluate() is called with no quantile argument).
@ -274,6 +487,10 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
* @throws NullPointerException if either source or dest is null
*/
public static void copy(Percentile source, Percentile dest) {
dest.setData(source.getDataRef());
if (source.cachedPivots != null) {
System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
}
dest.quantile = source.quantile;
}

View File

@ -213,6 +213,7 @@ public class Product extends AbstractStorelessUnivariateStatistic implements Ser
* @throws NullPointerException if either source or dest is null
*/
public static void copy(Product source, Product dest) {
dest.setData(source.getDataRef());
dest.n = source.n;
dest.value = source.value;
}

View File

@ -209,6 +209,7 @@ public class Sum extends AbstractStorelessUnivariateStatistic implements Seriali
* @throws NullPointerException if either source or dest is null
*/
public static void copy(Sum source, Sum dest) {
dest.setData(source.getDataRef());
dest.n = source.n;
dest.value = source.value;
}

View File

@ -155,6 +155,7 @@ public class SumOfLogs extends AbstractStorelessUnivariateStatistic implements S
* @throws NullPointerException if either source or dest is null
*/
public static void copy(SumOfLogs source, SumOfLogs dest) {
dest.setData(source.getDataRef());
dest.n = source.n;
dest.value = source.value;
}

View File

@ -143,6 +143,7 @@ public class SumOfSquares extends AbstractStorelessUnivariateStatistic implement
* @throws NullPointerException if either source or dest is null
*/
public static void copy(SumOfSquares source, SumOfSquares dest) {
dest.setData(source.getDataRef());
dest.n = source.n;
dest.value = source.value;
}

View File

@ -85,6 +85,11 @@ The <action> type attribute can be add,update,fix,remove.
</action>
</release>
<release version="2.2" date="TBD" description="TBD">
<action dev="luc" type="update" issue="MATH-417">
Improved Percentile performance by using a selection algorithm instead of a
complete sort, and by allowing caching data array and pivots when several
different percentiles are desired
</action>
<action dev="luc" type="fix" issue="MATH-391">
Fixed an error preventing zero length vectors to be built by some constructors
</action>