Improved Percentile performance by using a selection algorithm instead of a

complete sort, and by allowing caching data array and pivots when several different percentiles are desired JIRA: MATH-417 git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/branches/MATH_2_X@1006299 13f79535-47bb-0310-9956-ffa450edef68
2010-10-10 14:47:17 +00:00 · 2010-10-10 14:47:17 +00:00 · fa55b9f280
parent 05dbd4e4cf
commit fa55b9f280
18 changed files with 307 additions and 12 deletions
--- a/findbugs-exclude-filter.xml
+++ b/findbugs-exclude-filter.xml
@ -92,6 +92,11 @@


  <!-- the following expositions of internal representation are intentional and documented -->
+  <Match>
+    <Class name="org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic"/>
+    <Method name="getDataRef" params="" returns="double[]" />
+    <Bug pattern="EI_EXPOSE_REP" />
+  </Match>
  <Match>
    <Class name="org.apache.commons.math.optimization.RealPointValuePair"/>
    <Method name="getPointRef" params="" returns="double[]" />
--- a/src/main/java/org/apache/commons/math/stat/descriptive/AbstractUnivariateStatistic.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/AbstractUnivariateStatistic.java
@ -17,10 +17,10 @@
 package org.apache.commons.math.stat.descriptive;

 import org.apache.commons.math.MathRuntimeException;
-import org.apache.commons.math.exception.util.LocalizedFormats;
-import org.apache.commons.math.exception.NullArgumentException;
-import org.apache.commons.math.exception.NotPositiveException;
 import org.apache.commons.math.exception.DimensionMismatchException;
+import org.apache.commons.math.exception.NotPositiveException;
+import org.apache.commons.math.exception.NullArgumentException;
+import org.apache.commons.math.exception.util.LocalizedFormats;

 /**
 * Abstract base class for all implementations of the
@ -38,6 +38,60 @@ import org.apache.commons.math.exception.DimensionMismatchException;
 public abstract class AbstractUnivariateStatistic
    implements UnivariateStatistic {

+    /** Stored data. */
+    private double[] storedData;
+
+    /**
+     * Set the data array.
+     * <p>
+     * The stored value is a copy of the parameter array, not the array itself
+     * </p>
+     * @param values data array to store (may be null to remove stored data)
+     * @see #evaluate()
+     */
+    public void setData(final double[] values) {
+        storedData = (values == null) ? null : values.clone();
+    }
+
+    /**
+     * Get a copy of the stored data array.
+     * @return copy of the stored data array (may be null)
+     */
+    public double[] getData() {
+        return (storedData == null) ? null : storedData.clone();
+    }
+
+    /**
+     * Get a reference to the stored data array.
+     * @return reference to the stored data array (may be null)
+     */
+    protected double[] getDataRef() {
+        return storedData;
+    }
+
+    /**
+     * Set the data array.
+     * @param values data array to store
+     * @param begin the index of the first element to include
+     * @param length the number of elements to include
+     * @see #evaluate()
+     */
+    public void setData(final double[] values, final int begin, final int length) {
+        storedData = new double[length];
+        System.arraycopy(values, begin, storedData, 0, length);
+    }
+
+    /**
+     * Returns the result of evaluating the statistic over the stored data.
+     * <p>
+     * The stored array is the one which was set by previous calls to
+     * </p>
+     * @return the value of the statistic applied to the stored data
+     */
+    public double evaluate() {
+        return evaluate(storedData);
+    }
+
    /**
     * {@inheritDoc}
     */
--- a/src/main/java/org/apache/commons/math/stat/descriptive/moment/FirstMoment.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/FirstMoment.java
@ -151,6 +151,7 @@ public class FirstMoment extends AbstractStorelessUnivariateStatistic
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(FirstMoment source, FirstMoment dest) {
+        dest.setData(source.getDataRef());
        dest.n = source.n;
        dest.m1 = source.m1;
        dest.dev = source.dev;
--- a/src/main/java/org/apache/commons/math/stat/descriptive/moment/GeometricMean.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/GeometricMean.java
@ -186,6 +186,7 @@ public class GeometricMean extends AbstractStorelessUnivariateStatistic implemen
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(GeometricMean source, GeometricMean dest) {
+        dest.setData(source.getDataRef());
        dest.sumOfLogs = source.sumOfLogs.copy();
    }

--- a/src/main/java/org/apache/commons/math/stat/descriptive/moment/Kurtosis.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Kurtosis.java
@ -214,6 +214,7 @@ public class Kurtosis extends AbstractStorelessUnivariateStatistic  implements S
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(Kurtosis source, Kurtosis dest) {
+        dest.setData(source.getDataRef());
        dest.moment = source.moment.copy();
        dest.incMoment = source.incMoment;
    }
--- a/src/main/java/org/apache/commons/math/stat/descriptive/moment/Mean.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Mean.java
@ -265,6 +265,7 @@ public class Mean extends AbstractStorelessUnivariateStatistic
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(Mean source, Mean dest) {
+        dest.setData(source.getDataRef());
        dest.incMoment = source.incMoment;
        dest.moment = source.moment.copy();
    }
--- a/src/main/java/org/apache/commons/math/stat/descriptive/moment/SemiVariance.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/SemiVariance.java
@ -159,6 +159,7 @@ public class SemiVariance extends AbstractUnivariateStatistic implements Seriali
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(final SemiVariance source, SemiVariance dest) {
+        dest.setData(source.getDataRef());
        dest.biasCorrected = source.biasCorrected;
        dest.varianceDirection = source.varianceDirection;
    }
--- a/src/main/java/org/apache/commons/math/stat/descriptive/moment/Skewness.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Skewness.java
@ -206,6 +206,7 @@ public class Skewness extends AbstractStorelessUnivariateStatistic implements Se
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(Skewness source, Skewness dest) {
+        dest.setData(source.getDataRef());
        dest.moment = new ThirdMoment(source.moment.copy());
        dest.incMoment = source.incMoment;
    }
--- a/src/main/java/org/apache/commons/math/stat/descriptive/moment/StandardDeviation.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/StandardDeviation.java
@ -264,6 +264,7 @@ public class StandardDeviation extends AbstractStorelessUnivariateStatistic
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(StandardDeviation source, StandardDeviation dest) {
+        dest.setData(source.getDataRef());
        dest.variance = source.variance.copy();
    }

--- a/src/main/java/org/apache/commons/math/stat/descriptive/moment/Variance.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/moment/Variance.java
@ -602,6 +602,7 @@ public class Variance extends AbstractStorelessUnivariateStatistic implements Se
            dest == null) {
            throw new NullArgumentException();
        }
+        dest.setData(source.getDataRef());
        dest.moment = source.moment.copy();
        dest.isBiasCorrected = source.isBiasCorrected;
        dest.incMoment = source.incMoment;
--- a/src/main/java/org/apache/commons/math/stat/descriptive/rank/Max.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Max.java
@ -156,6 +156,7 @@ public class Max extends AbstractStorelessUnivariateStatistic implements Seriali
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(Max source, Max dest) {
+        dest.setData(source.getDataRef());
        dest.n = source.n;
        dest.value = source.value;
    }
--- a/src/main/java/org/apache/commons/math/stat/descriptive/rank/Min.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Min.java
@ -156,6 +156,7 @@ public class Min extends AbstractStorelessUnivariateStatistic implements Seriali
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(Min source, Min dest) {
+        dest.setData(source.getDataRef());
        dest.n = source.n;
        dest.value = source.value;
    }
--- a/src/main/java/org/apache/commons/math/stat/descriptive/rank/Percentile.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/rank/Percentile.java
@ -47,8 +47,8 @@ import org.apache.commons.math.util.FastMath;
 * </li>
 * </ol></p>
 * <p>
- * To compute percentiles, the data must be (totally) ordered.  Input arrays
- * are copied and then sorted using  {@link java.util.Arrays#sort(double[])}.
+ * To compute percentiles, the data must be at least partially ordered.  Input
+ * arrays are copied and recursively partitioned using an ordering definition.
 * The ordering used by <code>Arrays.sort(double[])</code> is the one determined
 * by {@link java.lang.Double#compareTo(Double)}.  This ordering makes
 * <code>Double.NaN</code> larger than any other value (including
@ -60,6 +60,18 @@ import org.apache.commons.math.util.FastMath;
 * elements, arrays containing  <code>NaN</code> or infinite values will often
 * result in <code>NaN<code> or infinite values returned.</p>
 * <p>
+ * Since 2.2, Percentile implementation uses only selection instead of complete
+ * sorting and caches selection algorithm state between calls to the various
+ * {@code evaluate} methods when several percentiles are to be computed on the same data.
+ * This greatly improves efficiency, both for single percentile and multiple
+ * percentiles computations. However, it also induces a need to be sure the data
+ * at one call to {@code evaluate} is the same as the data with the cached algorithm
+ * state from the previous calls. Percentile does this by checking the array reference
+ * itself and a checksum of its content by default. If the user already knows he calls
+ * {@code evaluate} on an immutable array, he can save the checking time by calling the
+ * {@code evaluate} methods that do <em>not</em>
+ * </p>
+ * <p>
 * <strong>Note that this implementation is not synchronized.</strong> If
 * multiple threads access an instance of this class concurrently, and at least
 * one of the threads invokes the <code>increment()</code> or
@ -72,10 +84,19 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
    /** Serializable version identifier */
    private static final long serialVersionUID = -8091216485095130416L;

+    /** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
+    private static final int MIN_SELECT_SIZE = 15;
+
+    /** Maximum number of partitioning pivots cached (each level double the number of pivots). */
+    private static final int MAX_CACHED_LEVELS = 10;
+
    /** Determines what percentile is computed when evaluate() is activated
     * with no quantile argument */
    private double quantile = 0.0;

+    /** Cached pivots. */
+    private int[] cachedPivots;
+
    /**
     * Constructs a Percentile with a default quantile
     * value of 50.0.
@ -92,6 +113,7 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
     */
    public Percentile(final double p) {
        setQuantile(p);
+        cachedPivots = null;
    }

    /**
@ -104,6 +126,42 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
        copy(original, this);
    }

+    /** {@inheritDoc} */
+    @Override
+    public void setData(final double[] values) {
+        if (values == null) {
+            cachedPivots = null;
+        } else {
+            cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
+            Arrays.fill(cachedPivots, -1);
+        }
+        super.setData(values);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public void setData(final double[] values, final int begin, final int length) {
+        if (values == null) {
+            cachedPivots = null;
+        } else {
+            cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
+            Arrays.fill(cachedPivots, -1);
+        }
+        super.setData(values, begin, length);
+    }
+
+    /**
+     * Returns the result of evaluating the statistic over the stored data.
+     * <p>
+     * The stored array is the one which was set by previous calls to
+     * </p>
+     * @param p the percentile value to compute
+     * @return the value of the statistic applied to the stored data
+     */
+    public double evaluate(final double p) {
+        return evaluate(getDataRef(), p);
+    }
+
    /**
     * Returns an estimate of the <code>p</code>th percentile of the values
     * in the <code>values</code> array.
@ -214,21 +272,176 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
        double fpos = FastMath.floor(pos);
        int intPos = (int) fpos;
        double dif = pos - fpos;
-        double[] sorted = new double[length];
-        System.arraycopy(values, begin, sorted, 0, length);
-        Arrays.sort(sorted);
+        double[] work;
+        int[] pivotsHeap;
+        if (values == getDataRef()) {
+            work = getDataRef();
+            pivotsHeap = cachedPivots;
+        } else {
+            work = new double[length];
+            System.arraycopy(values, begin, work, 0, length);
+            pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
+            Arrays.fill(pivotsHeap, -1);
+        }

        if (pos < 1) {
-            return sorted[0];
+            return select(work, pivotsHeap, 0);
        }
        if (pos >= n) {
-            return sorted[length - 1];
+            return select(work, pivotsHeap, length - 1);
        }
-        double lower = sorted[intPos - 1];
-        double upper = sorted[intPos];
+        double lower = select(work, pivotsHeap, intPos - 1);
+        double upper = select(work, pivotsHeap, intPos);
        return lower + dif * (upper - lower);
    }

+    /**
+     * Select the k<sup>th</sup> smallest element from work array
+     * @param work work array (will be reorganized during the call)
+     * @param pivotsHeap set of pivot index corresponding to elements that
+     * are already at their sorted location, stored as an implicit heap
+     * (i.e. a sorted binary tree stored in a flat array, where the
+     * children of a node at index n are at indices 2n+1 for the left
+     * child and 2n+2 for the right child, with 0-based indices)
+     * @param k index of the desired element
+     * @return k<sup>th</sup> smallest element
+     */
+    private double select(final double[] work, final int[] pivotsHeap, final int k) {
+
+        int begin = 0;
+        int end   = work.length;
+        int node  = 0;
+
+        while (end - begin > MIN_SELECT_SIZE) {
+
+            final int pivot;
+            if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
+                // the pivot has already been found in a previous call
+                // and the array has already been partitioned around it
+                pivot = pivotsHeap[node];
+            } else {
+                // select a pivot and partition work array around it
+                pivot = partition(work, begin, end, medianOf3(work, begin, end));
+                if (node < pivotsHeap.length) {
+                    pivotsHeap[node] =  pivot;
+                }
+            }
+
+            if (k == pivot) {
+                // the pivot was exactly the element we wanted
+                return work[k];
+            } else if (k < pivot) {
+                // the element is in the left partition
+                end  = pivot;
+                node = Math.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
+            } else {
+                // the element is in the right partition
+                begin = pivot + 1;
+                node  = Math.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
+            }
+
+        }
+
+        // the element is somewhere in the small sub-array
+        // sort the sub-array using insertion sort
+        insertionSort(work, begin, end);
+        return work[k];
+
+    }
+
+    /** Select a pivot index as the median of three
+     * @param work data array
+     * @param begin index of the first element of the slice
+     * @param end index after the last element of the slice
+     * @return the index of the median element chosen between the
+     * first, the middle and the last element of the array slice
+     */
+    int medianOf3(final double[] work, final int begin, final int end) {
+
+        final int inclusiveEnd = end - 1;
+        final int    middle    = begin + (inclusiveEnd - begin) / 2;
+        final double wBegin    = work[begin];
+        final double wMiddle   = work[middle];
+        final double wEnd      = work[inclusiveEnd];
+
+        if (wBegin < wMiddle) {
+            if (wMiddle < wEnd) {
+                return middle;
+            } else {
+                return (wBegin < wEnd) ? inclusiveEnd : begin;
+            }
+        } else {
+            if (wBegin < wEnd) {
+                return begin;
+            } else {
+                return (wMiddle < wEnd) ? inclusiveEnd : middle;
+            }
+        }
+
+    }
+
+    /**
+     * Partition an array slice around a pivot
+     * <p>
+     * Partitioning exchanges array elements such that all elements
+     * smaller than pivot are before it and all elements larger than
+     * pivot are after it
+     * </p>
+     * @param work data array
+     * @param begin index of the first element of the slice
+     * @param end index after the last element of the slice
+     * @param pivot initial index of the pivot
+     * @return index of the pivot after partition
+     */
+    private int partition(final double[] work, final int begin, final int end, final int pivot) {
+
+        final double value = work[pivot];
+        work[pivot] = work[begin];
+
+        int i = begin + 1;
+        int j = end - 1;
+        while (i < j) {
+            while ((i < j) && (work[j] >= value)) {
+                --j;
+            }
+            while ((i < j) && (work[i] <= value)) {
+                ++i;
+            }
+
+            if (i < j) {
+                final double tmp = work[i];
+                work[i++] = work[j];
+                work[j--] = tmp;
+            }
+        }
+
+        if ((i >= end) || (work[i] > value)) {
+            --i;
+        }
+        work[begin] = work[i];
+        work[i]     = value;
+        return i;
+
+    }
+
+    /**
+     * Sort in place a (small) array slice using insertion sort
+     * @param work array to sort
+     * @param begin index of the first element of the slice to sort
+     * @param end index after the last element of the slice to sort
+     */
+    private void insertionSort(final double[] work, final int begin, final int end) {
+        for (int j = begin + 1; j < end; j++) {
+            final double saved = work[j];
+            int i = j - 1;
+            while ((i >= begin) && (saved < work[i])) {
+                work[i + 1] = work[i];
+                i--;
+            }
+            work[i + 1] = saved;
+        }
+    }
+
    /**
     * Returns the value of the quantile field (determines what percentile is
     * computed when evaluate() is called with no quantile argument).
@ -274,6 +487,10 @@ public class Percentile extends AbstractUnivariateStatistic implements Serializa
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(Percentile source, Percentile dest) {
+        dest.setData(source.getDataRef());
+        if (source.cachedPivots != null) {
+            System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
+        }
        dest.quantile = source.quantile;
    }

--- a/src/main/java/org/apache/commons/math/stat/descriptive/summary/Product.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Product.java
@ -216,6 +216,7 @@ public class Product extends AbstractStorelessUnivariateStatistic implements Ser
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(Product source, Product dest) {
+        dest.setData(source.getDataRef());
        dest.n = source.n;
        dest.value = source.value;
    }
--- a/src/main/java/org/apache/commons/math/stat/descriptive/summary/Sum.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/Sum.java
@ -212,6 +212,7 @@ public class Sum extends AbstractStorelessUnivariateStatistic implements Seriali
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(Sum source, Sum dest) {
+        dest.setData(source.getDataRef());
        dest.n = source.n;
        dest.value = source.value;
    }
--- a/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfLogs.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfLogs.java
@ -158,6 +158,7 @@ public class SumOfLogs extends AbstractStorelessUnivariateStatistic implements S
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(SumOfLogs source, SumOfLogs dest) {
+        dest.setData(source.getDataRef());
        dest.n = source.n;
        dest.value = source.value;
    }
--- a/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfSquares.java
+++ b/src/main/java/org/apache/commons/math/stat/descriptive/summary/SumOfSquares.java
@ -146,6 +146,7 @@ public class SumOfSquares extends AbstractStorelessUnivariateStatistic implement
     * @throws NullPointerException if either source or dest is null
     */
    public static void copy(SumOfSquares source, SumOfSquares dest) {
+        dest.setData(source.getDataRef());
        dest.n = source.n;
        dest.value = source.value;
    }
--- a/src/site/xdoc/changes.xml
+++ b/src/site/xdoc/changes.xml
@ -52,6 +52,11 @@ The <action> type attribute can be add,update,fix,remove.
    If the output is not quite correct, check for invisible trailing spaces!
     -->
    <release version="2.2" date="TBD" description="TBD">
+      <action dev="luc" type="update" issue="MATH-417">
+        Improved Percentile performance by using a selection algorithm instead of a
+        complete sort, and by allowing caching data array and pivots when several
+        different percentiles are desired
+      </action>
      <action dev="luc" type="fix" issue="MATH-391">
        Fixed an error preventing zero length vectors to be built by some constructors
      </action>