Added sampling methods to the distribution classes, based on the random

data generation methods in the random package.
JIRA: MATH-310

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@949895 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Phil Steitz 2010-05-31 23:39:25 +00:00
parent fb7bf199ff
commit 4fdbd67677
11 changed files with 306 additions and 86 deletions

View File

@ -753,6 +753,10 @@ public class MessagesResources_fr
{ "Discrete cumulative probability function returned NaN for argument {0}",
"Discr\u00e8tes fonction de probabilit\u00e9 cumulative retourn\u00e9 NaN \u00e0 l''argument de {0}" },
// org.apache.commons.math.distribution.AbstractIntegerDistribution
// org.apache.commons.math.distribution.AbstractContinuousDistribution
{ "Sample size must be positive",
"Taille de l'\u00e9chantillon doit \u00eatre positif" },
// org.apache.commons.math.distribution.BinomialDistributionImpl
{ "number of trials must be non-negative ({0})",

View File

@ -25,6 +25,7 @@ import org.apache.commons.math.MathRuntimeException;
import org.apache.commons.math.analysis.UnivariateRealFunction;
import org.apache.commons.math.analysis.solvers.BrentSolver;
import org.apache.commons.math.analysis.solvers.UnivariateRealSolverUtils;
import org.apache.commons.math.random.RandomDataImpl;
/**
* Base class for continuous distributions. Default implementations are
@ -40,6 +41,12 @@ public abstract class AbstractContinuousDistribution
/** Serializable version identifier */
private static final long serialVersionUID = -38038050983108802L;
/**
* RandomData instance used to generate samples from the distribution
* @since 2.2
*/
protected final RandomDataImpl randomData = new RandomDataImpl();
/**
* Solver absolute accuracy for inverse cum computation
* @since 2.1
@ -102,7 +109,7 @@ public abstract class AbstractContinuousDistribution
}
};
// Try to bracket root, test domain endoints if this fails
// Try to bracket root, test domain endpoints if this fails
double lowerBound = getDomainLowerBound(p);
double upperBound = getDomainUpperBound(p);
double[] bracket = null;
@ -134,6 +141,50 @@ public abstract class AbstractContinuousDistribution
return root;
}
/**
* Reseeds the random generator used to generate samples.
*
* @param seed the new seed
* @since 2.2
*/
public void reseedRandomGenerator(long seed) {
randomData.reSeed(seed);
}
/**
* Generates a random value sampled from this distribution. The default
* implementation uses the
* <a href="http://en.wikipedia.org/wiki/Inverse_transform_sampling"> inversion method.</a>
*
* @return random value
* @since 2.2
* @throws MathException if an error occurs generating the random value
*/
public double sample() throws MathException {
return randomData.nextInversionDeviate(this);
}
/**
* Generates a random sample from the distribution. The default implementation
* generates the sample by calling {@link #sample()} in a loop.
*
* @param sampleSize number of random values to generate
* @since 2.2
* @return an array representing the random sample
* @throws MathException if an error occurs generating the sample
* @throws IllegalArgumentException if sampleSize is not positive
*/
public double[] sample(int sampleSize) throws MathException {
if (sampleSize <= 0) {
MathRuntimeException.createIllegalArgumentException("Sample size must be positive");
}
double[] out = new double[sampleSize];
for (int i = 0; i < sampleSize; i++) {
out[i] = sample();
}
return out;
}
/**
* Access the initial domain value, based on <code>p</code>, used to
* bracket a CDF root. This method is used by
@ -175,4 +226,5 @@ public abstract class AbstractContinuousDistribution
protected double getSolverAbsoluteAccuracy() {
return solverAbsoluteAccuracy;
}
}

View File

@ -21,6 +21,7 @@ import java.io.Serializable;
import org.apache.commons.math.FunctionEvaluationException;
import org.apache.commons.math.MathException;
import org.apache.commons.math.MathRuntimeException;
import org.apache.commons.math.random.RandomDataImpl;
/**
@ -44,6 +45,12 @@ public abstract class AbstractIntegerDistribution extends AbstractDistribution
/** Serializable version identifier */
private static final long serialVersionUID = -1146319659338487221L;
/**
* RandomData instance used to generate samples from the distribution
* @since 2.2
*/
protected final RandomDataImpl randomData = new RandomDataImpl();
/**
* Default constructor.
*/
@ -209,7 +216,51 @@ public abstract class AbstractIntegerDistribution extends AbstractDistribution
}
/**
* Computes the cumulative probablity function and checks for NaN values returned.
* Reseeds the random generator used to generate samples.
*
* @param seed the new seed
* @since 2.2
*/
public void reseedRandomGenerator(long seed) {
randomData.reSeed(seed);
}
/**
* Generates a random value sampled from this distribution. The default
* implementation uses the
* <a href="http://en.wikipedia.org/wiki/Inverse_transform_sampling"> inversion method.</a>
*
* @return random value
* @since 2.2
* @throws MathException if an error occurs generating the random value
*/
public int sample() throws MathException {
return randomData.nextInversionDeviate(this);
}
/**
* Generates a random sample from the distribution. The default implementation
* generates the sample by calling {@link #sample()} in a loop.
*
* @param sampleSize number of random values to generate
* @since 2.2
* @return an array representing the random sample
* @throws MathException if an error occurs generating the sample
* @throws IllegalArgumentException if sampleSize is not positive
*/
public int[] sample(int sampleSize) throws MathException {
if (sampleSize <= 0) {
MathRuntimeException.createIllegalArgumentException("Sample size must be positive");
}
int[] out = new int[sampleSize];
for (int i = 0; i < sampleSize; i++) {
out[i] = sample();
}
return out;
}
/**
* Computes the cumulative probability function and checks for NaN values returned.
* Throws MathException if the value is NaN. Wraps and rethrows any MathException encountered
* evaluating the cumulative probability function in a FunctionEvaluationException. Throws
* FunctionEvaluationException of the cumulative probability function returns NaN.

View File

@ -175,6 +175,23 @@ public class ExponentialDistributionImpl extends AbstractContinuousDistribution
return ret;
}
/**
* Generates a random value sampled from this distribution.
*
* <p><strong>Algorithm Description</strong>: Uses the <a
* href="http://www.jesus.ox.ac.uk/~clifford/a5/chap1/node5.html"> Inversion
* Method</a> to generate exponentially distributed random values from
* uniform deviates. </p>
*
* @return random value
* @since 2.2
* @throws MathException if an error occurs generating the random value
*/
@Override
public double sample() throws MathException {
return randomData.nextExponential(mean);
}
/**
* Access the domain value lower bound, based on <code>p</code>, used to
* bracket a CDF root.

View File

@ -228,6 +228,18 @@ public class NormalDistributionImpl extends AbstractContinuousDistribution
return super.inverseCumulativeProbability(p);
}
/**
* Generates a random value sampled from this distribution.
*
* @return random value
* @since 2.2
* @throws MathException if an error occurs generating the random value
*/
@Override
public double sample() throws MathException {
return randomData.nextGaussian(mean, standardDeviation);
}
/**
* Access the domain value lower bound, based on <code>p</code>, used to
* bracket a CDF root. This method is used by

View File

@ -237,6 +237,28 @@ public class PoissonDistributionImpl extends AbstractIntegerDistribution
return normal.cumulativeProbability(x + 0.5);
}
/**
* Generates a random value sampled from this distribution.
*
* <p><strong>Algorithm Description</strong>:
* <ul><li> For small means, uses simulation of a Poisson process
* using Uniform deviates, as described
* <a href="http://irmi.epfl.ch/cmos/Pmmi/interactive/rng7.htm"> here.</a>
* The Poisson process (and hence value returned) is bounded by 1000 * mean.</li><
*
* <li> For large means, uses the rejection algorithm described in <br/>
* Devroye, Luc. (1981).<i>The Computer Generation of Poisson Random Variables</i>
* <strong>Computing</strong> vol. 26 pp. 197-207.</li></ul></p>
*
* @return random value
* @since 2.2
* @throws MathException if an error occurs generating the random value
*/
@Override
public int sample() throws MathException {
return (int) Math.min(randomData.nextPoisson(mean), Integer.MAX_VALUE);
}
/**
* Access the domain value lower bound, based on <code>p</code>, used to
* bracket a CDF root. This method is used by

View File

@ -56,6 +56,7 @@ The <action> type attribute can be add,update,fix,remove.
Added random data generation methods to RandomDataImpl for the remaining distributions in the
distributions package. Added a generic nextInversionDeviate method that takes a discrete
or continuous distribution as argument and generates a random deviate from the distribution.
Also added sampling methods based on the implementations in RandomDataImpl to distributions.
</action>
<action dev="luc" type="fix" issue="MATH-362" >
Fixed Levenberg-Marquardt optimizer that did not use the vectorial convergence checker.

View File

@ -29,6 +29,7 @@ import junit.framework.AssertionFailedError;
import org.apache.commons.math.complex.Complex;
import org.apache.commons.math.complex.ComplexFormat;
import org.apache.commons.math.distribution.ContinuousDistribution;
import org.apache.commons.math.linear.FieldMatrix;
import org.apache.commons.math.linear.RealMatrix;
import org.apache.commons.math.stat.inference.ChiSquareTest;
@ -438,4 +439,77 @@ public class TestUtils {
assertChiSquareAccept(labels, expected, observed, alpha);
}
/**
* Asserts the null hypothesis for a ChiSquare test. Fails and dumps arguments and test
* statistics if the null hypothesis can be rejected with confidence 100 * (1 - alpha)%
*
* @param expected expected counts
* @param observed observed counts
* @param alpha significance level of the test
*/
public static void assertChiSquareAccept(double[] expected, long[] observed, double alpha) throws Exception {
String[] labels = new String[expected.length];
for (int i = 0; i < labels.length; i++) {
labels[i] = Integer.toString(i + 1);
}
assertChiSquareAccept(labels, expected, observed, alpha);
}
/**
* Computes the 25th, 50th and 75th percentiles of the given distribution and returns
* these values in an array.
*/
public static double[] getDistributionQuartiles(ContinuousDistribution distribution) throws Exception {
double[] quantiles = new double[3];
quantiles[0] = distribution.inverseCumulativeProbability(0.25d);
quantiles[1] = distribution.inverseCumulativeProbability(0.5d);
quantiles[2] = distribution.inverseCumulativeProbability(0.75d);
return quantiles;
}
/**
* Updates observed counts of values in quartiles.
* counts[0] <-> 1st quartile ... counts[3] <-> top quartile
*/
public static void updateCounts(double value, long[] counts, double[] quartiles) {
if (value < quartiles[0]) {
counts[0]++;
} else if (value > quartiles[2]) {
counts[3]++;
} else if (value > quartiles[1]) {
counts[2]++;
} else {
counts[1]++;
}
}
/**
* Eliminates points with zero mass from densityPoints and densityValues parallel
* arrays. Returns the number of positive mass points and collapses the arrays so
* that the first <returned value> elements of the input arrays represent the positive
* mass points.
*/
public static int eliminateZeroMassPoints(int[] densityPoints, double[] densityValues) {
int positiveMassCount = 0;
for (int i = 0; i < densityValues.length; i++) {
if (densityValues[i] > 0) {
positiveMassCount++;
}
}
if (positiveMassCount < densityValues.length) {
int[] newPoints = new int[positiveMassCount];
double[] newValues = new double[positiveMassCount];
int j = 0;
for (int i = 0; i < densityValues.length; i++) {
if (densityValues[i] > 0) {
newPoints[j] = densityPoints[i];
newValues[j] = densityValues[i];
j++;
}
}
System.arraycopy(newPoints,0,densityPoints,0,positiveMassCount);
System.arraycopy(newValues,0,densityValues,0,positiveMassCount);
}
return positiveMassCount;
}
}

View File

@ -257,6 +257,23 @@ public abstract class ContinuousDistributionAbstractTest extends TestCase {
}
}
/**
* Test sampling
*/
public void testSampling() throws Exception {
AbstractContinuousDistribution dist = (AbstractContinuousDistribution) makeDistribution();
final int sampleSize = 1000;
double[] sample = dist.sample(sampleSize);
double[] quartiles = TestUtils.getDistributionQuartiles(dist);
double[] expected = {250, 250, 250, 250};
long[] counts = new long[4];
dist.reseedRandomGenerator(1000); // Use fixed seed
for (int i = 0; i < sampleSize; i++) {
TestUtils.updateCounts(sample[i], counts, quartiles);
}
TestUtils.assertChiSquareAccept(expected, counts, 0.001);
}
//------------------ Getters / Setters for test instance data -----------
/**
* @return Returns the cumulativeTestPoints.

View File

@ -16,6 +16,8 @@
*/
package org.apache.commons.math.distribution;
import org.apache.commons.math.TestUtils;
import junit.framework.TestCase;
/**
@ -270,6 +272,32 @@ public abstract class IntegerDistributionAbstractTest extends TestCase {
}
}
/**
* Test sampling
*/
public void testSampling() throws Exception {
int[] densityPoints = makeDensityTestPoints();
double[] densityValues = makeDensityTestValues();
int sampleSize = 1000;
int length = TestUtils.eliminateZeroMassPoints(densityPoints, densityValues);
AbstractIntegerDistribution distribution = (AbstractIntegerDistribution) makeDistribution();
double[] expectedCounts = new double[length];
long[] observedCounts = new long[length];
for (int i = 0; i < length; i++) {
expectedCounts[i] = sampleSize * densityValues[i];
}
distribution.reseedRandomGenerator(1000); // Use fixed seed
int[] sample = distribution.sample(sampleSize);
for (int i = 0; i < sampleSize; i++) {
for (int j = 0; j < length; j++) {
if (sample[i] == densityPoints[j]) {
observedCounts[j]++;
}
}
}
TestUtils.assertChiSquareAccept(densityPoints, expectedCounts, observedCounts, .001);
}
//------------------ Getters / Setters for test instance data -----------
/**
* @return Returns the cumulativeTestPoints.

View File

@ -815,108 +815,80 @@ public class RandomDataTest extends RetryTestCase {
}
public void testNextBeta() throws Exception {
double[] quartiles = getDistributionQuartiles(new BetaDistributionImpl(2,5));
double[] quartiles = TestUtils.getDistributionQuartiles(new BetaDistributionImpl(2,5));
long[] counts = new long[4];
randomData.reSeed(1000);
for (int i = 0; i < 1000; i++) {
double value = randomData.nextBeta(2, 5);
updateCounts(value, counts, quartiles);
TestUtils.updateCounts(value, counts, quartiles);
}
TestUtils.assertChiSquareAccept(quartiles, expected, counts, 0.001);
TestUtils.assertChiSquareAccept(expected, counts, 0.001);
}
public void testNextCauchy() throws Exception {
double[] quartiles = getDistributionQuartiles(new CauchyDistributionImpl(1.2, 2.1));
double[] quartiles = TestUtils.getDistributionQuartiles(new CauchyDistributionImpl(1.2, 2.1));
long[] counts = new long[4];
randomData.reSeed(1000);
for (int i = 0; i < 1000; i++) {
double value = randomData.nextCauchy(1.2, 2.1);
updateCounts(value, counts, quartiles);
TestUtils.updateCounts(value, counts, quartiles);
}
TestUtils.assertChiSquareAccept(quartiles, expected, counts, 0.001);
TestUtils.assertChiSquareAccept(expected, counts, 0.001);
}
public void testNextChiSquare() throws Exception {
double[] quartiles = getDistributionQuartiles(new ChiSquaredDistributionImpl(12));
double[] quartiles = TestUtils.getDistributionQuartiles(new ChiSquaredDistributionImpl(12));
long[] counts = new long[4];
randomData.reSeed(1000);
for (int i = 0; i < 1000; i++) {
double value = randomData.nextChiSquare(12);
updateCounts(value, counts, quartiles);
TestUtils.updateCounts(value, counts, quartiles);
}
TestUtils.assertChiSquareAccept(quartiles, expected, counts, 0.001);
TestUtils.assertChiSquareAccept(expected, counts, 0.001);
}
public void testNextF() throws Exception {
double[] quartiles = getDistributionQuartiles(new FDistributionImpl(12, 5));
double[] quartiles = TestUtils.getDistributionQuartiles(new FDistributionImpl(12, 5));
long[] counts = new long[4];
randomData.reSeed(1000);
for (int i = 0; i < 1000; i++) {
double value = randomData.nextF(12, 5);
updateCounts(value, counts, quartiles);
TestUtils.updateCounts(value, counts, quartiles);
}
TestUtils.assertChiSquareAccept(quartiles, expected, counts, 0.001);
TestUtils.assertChiSquareAccept(expected, counts, 0.001);
}
public void testNextGamma() throws Exception {
double[] quartiles = getDistributionQuartiles(new GammaDistributionImpl(4, 2));
double[] quartiles = TestUtils.getDistributionQuartiles(new GammaDistributionImpl(4, 2));
long[] counts = new long[4];
randomData.reSeed(1000);
for (int i = 0; i < 1000; i++) {
double value = randomData.nextGamma(4, 2);
updateCounts(value, counts, quartiles);
TestUtils.updateCounts(value, counts, quartiles);
}
TestUtils.assertChiSquareAccept(quartiles, expected, counts, 0.001);
TestUtils.assertChiSquareAccept(expected, counts, 0.001);
}
public void testNextT() throws Exception {
double[] quartiles = getDistributionQuartiles(new TDistributionImpl(10));
double[] quartiles = TestUtils.getDistributionQuartiles(new TDistributionImpl(10));
long[] counts = new long[4];
randomData.reSeed(1000);
for (int i = 0; i < 1000; i++) {
double value = randomData.nextT(10);
updateCounts(value, counts, quartiles);
TestUtils.updateCounts(value, counts, quartiles);
}
TestUtils.assertChiSquareAccept(quartiles, expected, counts, 0.001);
TestUtils.assertChiSquareAccept(expected, counts, 0.001);
}
public void testNextWeibull() throws Exception {
double[] quartiles = getDistributionQuartiles(new WeibullDistributionImpl(1.2, 2.1));
double[] quartiles = TestUtils.getDistributionQuartiles(new WeibullDistributionImpl(1.2, 2.1));
long[] counts = new long[4];
randomData.reSeed(1000);
for (int i = 0; i < 1000; i++) {
double value = randomData.nextWeibull(1.2, 2.1);
updateCounts(value, counts, quartiles);
}
TestUtils.assertChiSquareAccept(quartiles, expected, counts, 0.001);
}
/**
* Computes the 25th, 50th and 75th percentiles of the given distribution and returns
* these values in an array.
*/
private double[] getDistributionQuartiles(ContinuousDistribution distribution) throws Exception {
double[] quantiles = new double[3];
quantiles[0] = distribution.inverseCumulativeProbability(0.25d);
quantiles[1] = distribution.inverseCumulativeProbability(0.5d);
quantiles[2] = distribution.inverseCumulativeProbability(0.75d);
return quantiles;
}
/**
* Updates observed counts of values in quartiles.
* counts[0] <-> 1st quartile ... counts[3] <-> top quartile
*/
private void updateCounts(double value, long[] counts, double[] quantiles) {
if (value < quantiles[0]) {
counts[0]++;
} else if (value > quantiles[2]) {
counts[3]++;
} else if (value > quantiles[1]) {
counts[2]++;
} else {
counts[1]++;
TestUtils.updateCounts(value, counts, quartiles);
}
TestUtils.assertChiSquareAccept(expected, counts, 0.001);
}
public void testNextBinomial() throws Exception {
@ -924,7 +896,7 @@ public class RandomDataTest extends RetryTestCase {
int[] densityPoints = testInstance.makeDensityTestPoints();
double[] densityValues = testInstance.makeDensityTestValues();
int sampleSize = 1000;
int length = eliminateZeroMassPoints(densityPoints, densityValues);
int length = TestUtils.eliminateZeroMassPoints(densityPoints, densityValues);
BinomialDistributionImpl distribution = (BinomialDistributionImpl) testInstance.makeDistribution();
double[] expectedCounts = new double[length];
long[] observedCounts = new long[length];
@ -949,7 +921,7 @@ public class RandomDataTest extends RetryTestCase {
int[] densityPoints = testInstance.makeDensityTestPoints();
double[] densityValues = testInstance.makeDensityTestValues();
int sampleSize = 1000;
int length = eliminateZeroMassPoints(densityPoints, densityValues);
int length = TestUtils.eliminateZeroMassPoints(densityPoints, densityValues);
HypergeometricDistributionImpl distribution = (HypergeometricDistributionImpl) testInstance.makeDistribution();
double[] expectedCounts = new double[length];
long[] observedCounts = new long[length];
@ -974,7 +946,7 @@ public class RandomDataTest extends RetryTestCase {
int[] densityPoints = testInstance.makeDensityTestPoints();
double[] densityValues = testInstance.makeDensityTestValues();
int sampleSize = 1000;
int length = eliminateZeroMassPoints(densityPoints, densityValues);
int length = TestUtils.eliminateZeroMassPoints(densityPoints, densityValues);
PascalDistributionImpl distribution = (PascalDistributionImpl) testInstance.makeDistribution();
double[] expectedCounts = new double[length];
long[] observedCounts = new long[length];
@ -998,7 +970,7 @@ public class RandomDataTest extends RetryTestCase {
int[] densityPoints = testInstance.makeDensityTestPoints();
double[] densityValues = testInstance.makeDensityTestValues();
int sampleSize = 1000;
int length = eliminateZeroMassPoints(densityPoints, densityValues);
int length = TestUtils.eliminateZeroMassPoints(densityPoints, densityValues);
ZipfDistributionImpl distribution = (ZipfDistributionImpl) testInstance.makeDistribution();
double[] expectedCounts = new double[length];
long[] observedCounts = new long[length];
@ -1017,34 +989,4 @@ public class RandomDataTest extends RetryTestCase {
TestUtils.assertChiSquareAccept(densityPoints, expectedCounts, observedCounts, .001);
}
/**
* Eliminates points with zero mass from densityPoints and densityValues parallel
* arrays. Returns the number of positive mass points and collapses the arrays so
* that the first <returned value> elements of the input arrays represent the positive
* mass points.
*/
private int eliminateZeroMassPoints(int[] densityPoints, double[] densityValues) {
int positiveMassCount = 0;
for (int i = 0; i < densityValues.length; i++) {
if (densityValues[i] > 0) {
positiveMassCount++;
}
}
if (positiveMassCount < densityValues.length) {
int[] newPoints = new int[positiveMassCount];
double[] newValues = new double[positiveMassCount];
int j = 0;
for (int i = 0; i < densityValues.length; i++) {
if (densityValues[i] > 0) {
newPoints[j] = densityPoints[i];
newValues[j] = densityValues[i];
j++;
}
}
System.arraycopy(newPoints,0,densityPoints,0,positiveMassCount);
System.arraycopy(newValues,0,densityValues,0,positiveMassCount);
}
return positiveMassCount;
}
}