Code style and Javadoc nits.
This commit is contained in:
parent
509f50b0a5
commit
693d560427
|
@ -46,13 +46,14 @@ import org.apache.commons.math4.legacy.core.jdkmath.AccurateMath;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>Represents an <a href="http://en.wikipedia.org/wiki/Empirical_distribution_function">
|
* <p>Represents an <a href="http://en.wikipedia.org/wiki/Empirical_distribution_function">
|
||||||
* empirical probability distribution</a> -- a probability distribution derived
|
* empirical probability distribution</a>: Probability distribution derived
|
||||||
* from observed data without making any assumptions about the functional form
|
* from observed data without making any assumptions about the functional form
|
||||||
* of the population distribution that the data come from.</p>
|
* of the population distribution that the data come from.</p>
|
||||||
*
|
*
|
||||||
* <p>An <code>EmpiricalDistribution</code> maintains data structures, called
|
* <p>An {@code EmpiricalDistribution} maintains data structures called
|
||||||
* <i>distribution digests</i>, that describe empirical distributions and
|
* <i>distribution digests</i> that describe empirical distributions and
|
||||||
* support the following operations: <ul>
|
* support the following operations:
|
||||||
|
* <ul>
|
||||||
* <li>loading the distribution from a file of observed data values</li>
|
* <li>loading the distribution from a file of observed data values</li>
|
||||||
* <li>dividing the input data into "bin ranges" and reporting bin frequency
|
* <li>dividing the input data into "bin ranges" and reporting bin frequency
|
||||||
* counts (data for histogram)</li>
|
* counts (data for histogram)</li>
|
||||||
|
@ -60,26 +61,31 @@ import org.apache.commons.math4.legacy.core.jdkmath.AccurateMath;
|
||||||
* as well as the observations within each bin</li>
|
* as well as the observations within each bin</li>
|
||||||
* <li>generating random values from the distribution</li>
|
* <li>generating random values from the distribution</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
* Applications can use <code>EmpiricalDistribution</code> to build grouped
|
*
|
||||||
|
* Applications can use {@code EmpiricalDistribution} to build grouped
|
||||||
* frequency histograms representing the input data or to generate random values
|
* frequency histograms representing the input data or to generate random values
|
||||||
* "like" those in the input file -- i.e., the values generated will follow the
|
* "like" those in the input file, i.e. the values generated will follow the
|
||||||
* distribution of the values in the file.
|
* distribution of the values in the file.
|
||||||
*
|
*
|
||||||
* <p>The implementation uses what amounts to the
|
* <p>The implementation uses what amounts to the
|
||||||
* <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
|
* <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
|
||||||
* Variable Kernel Method</a> with Gaussian smoothing:<p>
|
* Variable Kernel Method</a> with Gaussian smoothing:<p>
|
||||||
* <strong>Digesting the input file</strong>
|
* <strong>Digesting the input file</strong>
|
||||||
* <ol><li>Pass the file once to compute min and max.</li>
|
* <ol>
|
||||||
* <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
|
* <li>Pass the file once to compute min and max.</li>
|
||||||
|
* <li>Divide the range from min to max into {@code binCount} bins.</li>
|
||||||
* <li>Pass the data file again, computing bin counts and univariate
|
* <li>Pass the data file again, computing bin counts and univariate
|
||||||
* statistics (mean, std dev.) for each of the bins </li>
|
* statistics (mean and std dev.) for each bin.</li>
|
||||||
* <li>Divide the interval (0,1) into subintervals associated with the bins,
|
* <li>Divide the interval (0,1) into subintervals associated with the bins,
|
||||||
* with the length of a bin's subinterval proportional to its count.</li></ol>
|
* with the length of a bin's subinterval proportional to its count.</li>
|
||||||
* <strong>Generating random values from the distribution</strong><ol>
|
* </ol>
|
||||||
|
* <strong>Generating random values from the distribution</strong>
|
||||||
|
* <ol>
|
||||||
* <li>Generate a uniformly distributed value in (0,1) </li>
|
* <li>Generate a uniformly distributed value in (0,1) </li>
|
||||||
* <li>Select the subinterval to which the value belongs.
|
* <li>Select the subinterval to which the value belongs.</li>
|
||||||
* <li>Generate a random Gaussian value with mean = mean of the associated
|
* <li>Generate a random Gaussian value with mean = mean of the associated
|
||||||
* bin and std dev = std dev of associated bin.</li></ol>
|
* bin and std dev = std dev of associated bin.</li>
|
||||||
|
* </ol>
|
||||||
*
|
*
|
||||||
* <p>EmpiricalDistribution implements the {@link ContinuousDistribution} interface
|
* <p>EmpiricalDistribution implements the {@link ContinuousDistribution} interface
|
||||||
* as follows. Given x within the range of values in the dataset, let B
|
* as follows. Given x within the range of values in the dataset, let B
|
||||||
|
@ -91,49 +97,38 @@ import org.apache.commons.math4.legacy.core.jdkmath.AccurateMath;
|
||||||
* grouped frequency distribution at the bin endpoints and interpolates within
|
* grouped frequency distribution at the bin endpoints and interpolates within
|
||||||
* bins using within-bin kernels.</p>
|
* bins using within-bin kernels.</p>
|
||||||
*
|
*
|
||||||
*<strong>USAGE NOTES:</strong><ul>
|
* <strong>USAGE NOTES:</strong>
|
||||||
*<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb
|
* <ul>
|
||||||
|
* <li>The {@code binCount} is set by default to 1000. A good rule of thumb
|
||||||
* is to set the bin count to approximately the length of the input file divided
|
* is to set the bin count to approximately the length of the input file divided
|
||||||
* by 10. </li>
|
* by 10. </li>
|
||||||
*<li>The input file <i>must</i> be a plain text file containing one valid numeric
|
* <li>The input file <i>must</i> be a plain text file containing one valid numeric
|
||||||
* entry per line.</li>
|
* entry per line.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
public class EmpiricalDistribution extends AbstractRealDistribution
|
public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
implements ContinuousDistribution {
|
implements ContinuousDistribution {
|
||||||
|
|
||||||
/** Default bin count. */
|
/** Default bin count. */
|
||||||
public static final int DEFAULT_BIN_COUNT = 1000;
|
public static final int DEFAULT_BIN_COUNT = 1000;
|
||||||
|
|
||||||
/** Character set for file input. */
|
/** Character set for file input. */
|
||||||
private static final String FILE_CHARSET = "US-ASCII";
|
private static final String FILE_CHARSET = "US-ASCII";
|
||||||
|
|
||||||
/** Serializable version identifier. */
|
/** Serializable version identifier. */
|
||||||
private static final long serialVersionUID = 5729073523949762654L;
|
private static final long serialVersionUID = 5729073523949762654L;
|
||||||
|
/** Bins' characteristics. */
|
||||||
/** List of SummaryStatistics objects characterizing the bins. */
|
|
||||||
private final List<SummaryStatistics> binStats;
|
private final List<SummaryStatistics> binStats;
|
||||||
|
|
||||||
/** Sample statistics. */
|
/** Sample statistics. */
|
||||||
private SummaryStatistics sampleStats;
|
private SummaryStatistics sampleStats;
|
||||||
|
|
||||||
/** Max loaded value. */
|
/** Max loaded value. */
|
||||||
private double max = Double.NEGATIVE_INFINITY;
|
private double max = Double.NEGATIVE_INFINITY;
|
||||||
|
|
||||||
/** Min loaded value. */
|
/** Min loaded value. */
|
||||||
private double min = Double.POSITIVE_INFINITY;
|
private double min = Double.POSITIVE_INFINITY;
|
||||||
|
|
||||||
/** Grid size. */
|
/** Grid size. */
|
||||||
private double delta;
|
private double delta;
|
||||||
|
/** Number of bins. */
|
||||||
/** number of bins. */
|
|
||||||
private final int binCount;
|
private final int binCount;
|
||||||
|
/** Whether the distribution is loaded. */
|
||||||
/** is the distribution loaded? */
|
|
||||||
private boolean loaded;
|
private boolean loaded;
|
||||||
|
/** Upper bounds of subintervals in (0,1) belonging to the bins. */
|
||||||
/** upper bounds of subintervals in (0,1) "belonging" to the bins. */
|
|
||||||
private double[] upperBounds;
|
private double[] upperBounds;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -247,11 +242,10 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides methods for computing <code>sampleStats</code> and
|
* Provides methods for computing {@code sampleStats} and
|
||||||
* <code>beanStats</code> abstracting the source of data.
|
* {@code binStats} abstracting the source of data.
|
||||||
*/
|
*/
|
||||||
private abstract class DataAdapter{
|
private abstract class DataAdapter {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compute bin stats.
|
* Compute bin stats.
|
||||||
*
|
*
|
||||||
|
@ -265,16 +259,14 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
* @throws IOException if an error occurs computing sample stats
|
* @throws IOException if an error occurs computing sample stats
|
||||||
*/
|
*/
|
||||||
public abstract void computeStats() throws IOException;
|
public abstract void computeStats() throws IOException;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <code>DataAdapter</code> for data provided through some input stream.
|
* {@code DataAdapter} for data provided through some input stream.
|
||||||
*/
|
*/
|
||||||
private class StreamDataAdapter extends DataAdapter{
|
private class StreamDataAdapter extends DataAdapter {
|
||||||
|
|
||||||
/** Input stream providing access to the data. */
|
/** Input stream providing access to the data. */
|
||||||
private BufferedReader inputStream;
|
private final BufferedReader inputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a StreamDataAdapter from a BufferedReader.
|
* Create a StreamDataAdapter from a BufferedReader.
|
||||||
|
@ -282,7 +274,6 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
* @param in BufferedReader input stream
|
* @param in BufferedReader input stream
|
||||||
*/
|
*/
|
||||||
StreamDataAdapter(BufferedReader in){
|
StreamDataAdapter(BufferedReader in){
|
||||||
super();
|
|
||||||
inputStream = in;
|
inputStream = in;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -298,7 +289,6 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
}
|
}
|
||||||
|
|
||||||
inputStream.close();
|
inputStream.close();
|
||||||
inputStream = null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** {@inheritDoc} */
|
/** {@inheritDoc} */
|
||||||
|
@ -312,15 +302,13 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
sampleStats.addValue(val);
|
sampleStats.addValue(val);
|
||||||
}
|
}
|
||||||
inputStream.close();
|
inputStream.close();
|
||||||
inputStream = null;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <code>DataAdapter</code> for data provided as array of doubles.
|
* {@code DataAdapter} for data provided as array of doubles.
|
||||||
*/
|
*/
|
||||||
private class ArrayDataAdapter extends DataAdapter {
|
private class ArrayDataAdapter extends DataAdapter {
|
||||||
|
|
||||||
/** Array of input data values. */
|
/** Array of input data values. */
|
||||||
private final double[] inputArray;
|
private final double[] inputArray;
|
||||||
|
|
||||||
|
@ -331,7 +319,6 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
* @throws NullArgumentException if in is null
|
* @throws NullArgumentException if in is null
|
||||||
*/
|
*/
|
||||||
ArrayDataAdapter(double[] in) {
|
ArrayDataAdapter(double[] in) {
|
||||||
super();
|
|
||||||
NullArgumentException.check(in);
|
NullArgumentException.check(in);
|
||||||
inputArray = in;
|
inputArray = in;
|
||||||
}
|
}
|
||||||
|
@ -349,8 +336,7 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
@Override
|
@Override
|
||||||
public void computeBinStats() throws IOException {
|
public void computeBinStats() throws IOException {
|
||||||
for (int i = 0; i < inputArray.length; i++) {
|
for (int i = 0; i < inputArray.length; i++) {
|
||||||
SummaryStatistics stats =
|
SummaryStatistics stats = binStats.get(findBin(inputArray[i]));
|
||||||
binStats.get(findBin(inputArray[i]));
|
|
||||||
stats.addValue(inputArray[i]);
|
stats.addValue(inputArray[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -362,12 +348,11 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
* @param da object providing access to the data
|
* @param da object providing access to the data
|
||||||
* @throws IOException if an IO error occurs
|
* @throws IOException if an IO error occurs
|
||||||
*/
|
*/
|
||||||
private void fillBinStats(final DataAdapter da)
|
private void fillBinStats(final DataAdapter da) throws IOException {
|
||||||
throws IOException {
|
|
||||||
// Set up grid
|
// Set up grid
|
||||||
min = sampleStats.getMin();
|
min = sampleStats.getMin();
|
||||||
max = sampleStats.getMax();
|
max = sampleStats.getMax();
|
||||||
delta = (max - min)/binCount;
|
delta = (max - min) / binCount;
|
||||||
|
|
||||||
// Initialize binStats ArrayList
|
// Initialize binStats ArrayList
|
||||||
if (!binStats.isEmpty()) {
|
if (!binStats.isEmpty()) {
|
||||||
|
@ -375,7 +360,7 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
}
|
}
|
||||||
for (int i = 0; i < binCount; i++) {
|
for (int i = 0; i < binCount; i++) {
|
||||||
SummaryStatistics stats = new SummaryStatistics();
|
SummaryStatistics stats = new SummaryStatistics();
|
||||||
binStats.add(i,stats);
|
binStats.add(i, stats);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Filling data in binStats Array
|
// Filling data in binStats Array
|
||||||
|
@ -383,13 +368,12 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
|
|
||||||
// Assign upperBounds based on bin counts
|
// Assign upperBounds based on bin counts
|
||||||
upperBounds = new double[binCount];
|
upperBounds = new double[binCount];
|
||||||
upperBounds[0] =
|
upperBounds[0] = binStats.get(0).getN() / (double) sampleStats.getN();
|
||||||
((double) binStats.get(0).getN()) / (double) sampleStats.getN();
|
for (int i = 1; i < binCount - 1; i++) {
|
||||||
for (int i = 1; i < binCount-1; i++) {
|
upperBounds[i] = upperBounds[i - 1] +
|
||||||
upperBounds[i] = upperBounds[i-1] +
|
binStats.get(i).getN() / (double) sampleStats.getN();
|
||||||
((double) binStats.get(i).getN()) / (double) sampleStats.getN();
|
|
||||||
}
|
}
|
||||||
upperBounds[binCount-1] = 1.0d;
|
upperBounds[binCount - 1] = 1d;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -399,8 +383,7 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
* @return the index of the bin containing the value
|
* @return the index of the bin containing the value
|
||||||
*/
|
*/
|
||||||
private int findBin(double value) {
|
private int findBin(double value) {
|
||||||
return AccurateMath.min(
|
return AccurateMath.min(AccurateMath.max((int) AccurateMath.ceil((value - min) / delta) - 1, 0),
|
||||||
AccurateMath.max((int) AccurateMath.ceil((value - min) / delta) - 1, 0),
|
|
||||||
binCount - 1);
|
binCount - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -490,7 +473,7 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
return loaded;
|
return loaded;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Distribution methods ---------------------------
|
// Distribution methods.
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@inheritDoc}
|
* {@inheritDoc}
|
||||||
|
@ -588,21 +571,22 @@ public class EmpiricalDistribution extends AbstractRealDistribution
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public double inverseCumulativeProbability(final double p) {
|
public double inverseCumulativeProbability(final double p) {
|
||||||
if (p < 0.0 || p > 1.0) {
|
if (p < 0 ||
|
||||||
|
p > 1) {
|
||||||
throw new OutOfRangeException(p, 0, 1);
|
throw new OutOfRangeException(p, 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p == 0.0) {
|
if (p == 0) {
|
||||||
return getSupportLowerBound();
|
return getSupportLowerBound();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p == 1.0) {
|
if (p == 1) {
|
||||||
return getSupportUpperBound();
|
return getSupportUpperBound();
|
||||||
}
|
}
|
||||||
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while (cumBinP(i) < p) {
|
while (cumBinP(i) < p) {
|
||||||
i++;
|
++i;
|
||||||
}
|
}
|
||||||
|
|
||||||
final ContinuousDistribution kernel = getKernel(binStats.get(i));
|
final ContinuousDistribution kernel = getKernel(binStats.get(i));
|
||||||
|
|
|
@ -667,8 +667,7 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Ignore
|
@Ignore@Test
|
||||||
@Test
|
|
||||||
public void testMath1462() {
|
public void testMath1462() {
|
||||||
final double[] data = {
|
final double[] data = {
|
||||||
6464.0205, 6449.1328, 6489.4569, 6497.5533, 6251.6487,
|
6464.0205, 6449.1328, 6489.4569, 6497.5533, 6251.6487,
|
||||||
|
@ -689,13 +688,20 @@ public final class EmpiricalDistributionTest extends RealDistributionAbstractTes
|
||||||
final EmpiricalDistribution ed = new EmpiricalDistribution(data.length);
|
final EmpiricalDistribution ed = new EmpiricalDistribution(data.length);
|
||||||
ed.load(data);
|
ed.load(data);
|
||||||
|
|
||||||
final double p50 = ed.inverseCumulativeProbability(0.5);
|
double v;
|
||||||
final double p51 = ed.inverseCumulativeProbability(0.51111);
|
double p;
|
||||||
final double p49 = ed.inverseCumulativeProbability(0.49999);
|
|
||||||
|
|
||||||
Assert.assertTrue(p51 < 6350);
|
p = 0.49999;
|
||||||
Assert.assertTrue(p49 < 6341);
|
v = ed.inverseCumulativeProbability(p);
|
||||||
Assert.assertTrue(p50 < 7000);
|
Assert.assertTrue("p=" + p + " => v=" + v, v < 6341);
|
||||||
|
|
||||||
|
p = 0.5;
|
||||||
|
v = ed.inverseCumulativeProbability(p);
|
||||||
|
Assert.assertTrue("p=" + p + " => v=" + v, v < 7000);
|
||||||
|
|
||||||
|
p = 0.51111;
|
||||||
|
v = ed.inverseCumulativeProbability(p);
|
||||||
|
Assert.assertTrue("p=" + p + " => v=" + v, v < 6350);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in New Issue