Added interface and reporting class for updating regression. JIRA: MATH-607. Contributed by Greg Sterijevski.
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@1144986 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
60f99d976d
commit
d3d5c6fb05
3
pom.xml
3
pom.xml
|
@ -219,6 +219,9 @@
|
|||
<contributor>
|
||||
<name>David Stefka</name>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Greg Sterijevski</name>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Mauro Talevi</name>
|
||||
</contributor>
|
||||
|
|
|
@ -0,0 +1,385 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.commons.math.stat.regression;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import org.apache.commons.math.util.FastMath;
|
||||
|
||||
/**
|
||||
* Results of a Multiple Linear Regression model fit.
|
||||
*
|
||||
* @version $Id$
|
||||
* @since 3.0
|
||||
*/
|
||||
public class RegressionResults implements Serializable {
|
||||
|
||||
private static final int SSE_IDX = 0;
|
||||
private static final int SST_IDX = 1;
|
||||
private static final int RSQ_IDX = 2;
|
||||
private static final int MSE_IDX = 3;
|
||||
private static final int ADJRSQ_IDX = 4;
|
||||
private static final long serialVersionUID = 1l;
|
||||
private final double[] parameters;
|
||||
private final double[][] varCovData;
|
||||
private final boolean isSymmetricVCD;
|
||||
private final int rank;
|
||||
private final long nobs;
|
||||
private final boolean containsConstant;
|
||||
private final double[] globalFitInfo;
|
||||
|
||||
/**
|
||||
* Set the default constructor to private access
|
||||
* to prevent inadvertent instantiation
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
private RegressionResults() {
|
||||
this.parameters = null;
|
||||
this.varCovData = null;
|
||||
this.rank = -1;
|
||||
this.nobs = -1;
|
||||
this.containsConstant = false;
|
||||
this.isSymmetricVCD = false;
|
||||
this.globalFitInfo = null;
|
||||
}
|
||||
|
||||
public RegressionResults(
|
||||
final double[] parameters, final double[][] varcov,
|
||||
final boolean isSymmetricCompressed,
|
||||
final long nobs, final int rank,
|
||||
final double sumy, final double sumysq, final double sse,
|
||||
final boolean containsConstant,
|
||||
final boolean copyData) {
|
||||
if (copyData) {
|
||||
this.parameters = Arrays.copyOf(parameters, parameters.length);
|
||||
this.varCovData = new double[varcov.length][];
|
||||
for (int i = 0; i < varcov.length; i++) {
|
||||
this.varCovData[i] = Arrays.copyOf(varcov[i], varcov[i].length);
|
||||
}
|
||||
} else {
|
||||
this.parameters = parameters;
|
||||
this.varCovData = varcov;
|
||||
}
|
||||
this.isSymmetricVCD = isSymmetricCompressed;
|
||||
this.nobs = nobs;
|
||||
this.rank = rank;
|
||||
this.containsConstant = containsConstant;
|
||||
this.globalFitInfo = new double[5];
|
||||
Arrays.fill(this.globalFitInfo, Double.NaN);
|
||||
|
||||
if (rank > 2) {
|
||||
this.globalFitInfo[SST_IDX] = containsConstant ?
|
||||
(sumysq - sumy * sumy / ((double) nobs)) : sumysq;
|
||||
}
|
||||
this.globalFitInfo[SSE_IDX] = sse;
|
||||
this.globalFitInfo[MSE_IDX] = this.globalFitInfo[SSE_IDX] /
|
||||
((double) (nobs - rank));
|
||||
this.globalFitInfo[RSQ_IDX] = 1.0 -
|
||||
this.globalFitInfo[SSE_IDX] /
|
||||
this.globalFitInfo[SST_IDX];
|
||||
|
||||
if (!containsConstant) {
|
||||
this.globalFitInfo[ADJRSQ_IDX] = 1.0 - (1.0 - this.globalFitInfo[RSQ_IDX]) *
|
||||
(nobs / (nobs - rank));
|
||||
} else {
|
||||
this.globalFitInfo[ADJRSQ_IDX] = 1.0 - (sse * (nobs - 1.0)) /
|
||||
(globalFitInfo[SST_IDX] * (nobs - rank));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the parameter estimate for the regressor at the given index.</p>
|
||||
*
|
||||
* <p>A redundant regressor will have its redundancy flag set, as well as
|
||||
* a parameters estimated equal to {@code Double.NaN}</p>
|
||||
*
|
||||
* @param index an integer index which must be in the range [0, numberOfParameters-1]
|
||||
* @return parameters estimated for regressor at index
|
||||
* @throws IndexOutOfBoundsException thrown if the index >= numberOfParameters
|
||||
*/
|
||||
public double getParameterEstimate(int index) throws IndexOutOfBoundsException {
|
||||
if (parameters == null) {
|
||||
return Double.NaN;
|
||||
}
|
||||
if (index < 0 || index >= this.parameters.length) {
|
||||
throw new IndexOutOfBoundsException("Index is outside of the 0 to number of variables - 1 range");
|
||||
}
|
||||
return this.parameters[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns a copy of the regression parameters estimates.</p>
|
||||
*
|
||||
* <p>The parameter estimates are returned in the natural order of the data.</p>
|
||||
*
|
||||
* <p>A redundant regressor will have its redundancy flag set, as will
|
||||
* a parameter estimate equal to {@code Double.NaN}.</p>
|
||||
*
|
||||
* @return array of parameter estimates, null if no estimation occurred
|
||||
*/
|
||||
public double[] getParameterEstimates() {
|
||||
if (this.parameters == null) {
|
||||
return null;
|
||||
}
|
||||
return Arrays.copyOf(parameters, parameters.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
|
||||
* error of the parameter estimate at index</a>,
|
||||
* usually denoted s(b<sub>index</sub>).
|
||||
*
|
||||
* @param index an integer index which must be in the range [0, numberOfParameters-1]
|
||||
* @return standard errors associated with parameters estimated at index
|
||||
* @throws IndexOutOfBoundsException thrown if the index >= numberOfParameters
|
||||
*/
|
||||
public double getStdErrorOfEstimate(int index) throws IndexOutOfBoundsException {
|
||||
if (parameters == null) {
|
||||
return Double.NaN;
|
||||
}
|
||||
if (index < 0 || index >= this.parameters.length) {
|
||||
throw new IndexOutOfBoundsException("Index is outside of the 0 to number of variables - 1 range");
|
||||
}
|
||||
double var = this.getVcvElement(index, index);
|
||||
if (!Double.isNaN(var) && var > Double.MIN_VALUE) {
|
||||
return FastMath.sqrt(rank);
|
||||
}
|
||||
return Double.NaN;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
|
||||
* error of the parameter estimates</a>,
|
||||
* usually denoted s(b<sub>i</sub>).</p>
|
||||
*
|
||||
* <p>If there are problems with an ill conditioned design matrix then the regressor
|
||||
* which is redundant will be assigned <code>Double.NaN</code>. </p>
|
||||
*
|
||||
* @return an array standard errors associated with parameters estimates,
|
||||
* null if no estimation occurred
|
||||
*/
|
||||
public double[] getStdErrorOfEstimates() {
|
||||
if (parameters == null) {
|
||||
return null;
|
||||
}
|
||||
double[] se = new double[this.parameters.length];
|
||||
for (int i = 0; i < this.parameters.length; i++) {
|
||||
double var = this.getVcvElement(i, i);
|
||||
if (!Double.isNaN(var) && var > Double.MIN_VALUE) {
|
||||
se[i] = FastMath.sqrt(rank);
|
||||
continue;
|
||||
}
|
||||
se[i] = Double.NaN;
|
||||
}
|
||||
return se;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the covariance between regression parameters i and j.</p>
|
||||
*
|
||||
* <p>If there are problems with an ill conditioned design matrix then the covariance
|
||||
* which involves redundant columns will be assigned {@code Double.NaN}. </p>
|
||||
*
|
||||
* @param i - the ith regression parameter
|
||||
* @param j - the jth regression parameter
|
||||
* @return the covariance of the parameter estimates
|
||||
*/
|
||||
public double getCovarianceOfParameters(int i, int j) throws IndexOutOfBoundsException {
|
||||
if (parameters == null) {
|
||||
return Double.NaN;
|
||||
}
|
||||
if (i < 0 || i >= this.parameters.length) {
|
||||
throw new IndexOutOfBoundsException(" Row index is outside of the 0 " +
|
||||
"to number of variables - 1 range");
|
||||
}
|
||||
if (j < 0 || j >= this.parameters.length) {
|
||||
throw new IndexOutOfBoundsException(" Column index is outside of the 0" +
|
||||
" to number of variables - 1 range");
|
||||
}
|
||||
return this.getVcvElement(i, j);
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the number of parameters estimated in the model.</p>
|
||||
*
|
||||
* <p>This is the maximum number of regressors, some techniques may drop
|
||||
* redundant parameters</p>
|
||||
*
|
||||
* @return number of regressors, -1 if not estimated
|
||||
*/
|
||||
public int getNumberOfParameters() {
|
||||
if (this.parameters == null) {
|
||||
return -1;
|
||||
}
|
||||
return this.parameters.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of observations added to the regression model.
|
||||
*
|
||||
* @return Number of observations, -1 if an error condition prevents estimation
|
||||
*/
|
||||
public long getN() {
|
||||
return this.nobs;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the sum of squared deviations of the y values about their mean.</p>
|
||||
*
|
||||
* <p>This is defined as SSTO
|
||||
* <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
|
||||
*
|
||||
* <p>If {@code n < 2}, this returns {@code Double.NaN}.</p>
|
||||
*
|
||||
* @return sum of squared deviations of y values
|
||||
*/
|
||||
public double getTotalSumSquares() {
|
||||
return this.globalFitInfo[SST_IDX];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the sum of squared deviations of the predicted y values about
|
||||
* their mean (which equals the mean of y).</p>
|
||||
*
|
||||
* <p>This is usually abbreviated SSR or SSM. It is defined as SSM
|
||||
* <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
|
||||
*
|
||||
* <p><strong>Preconditions</strong>: <ul>
|
||||
* <li>At least two observations (with at least two different x values)
|
||||
* must have been added before invoking this method. If this method is
|
||||
* invoked before a model can be estimated, <code>Double.NaN</code> is
|
||||
* returned.
|
||||
* </li></ul></p>
|
||||
*
|
||||
* @return sum of squared deviations of predicted y values
|
||||
*/
|
||||
public double getRegressionSumSquares() {
|
||||
return this.globalFitInfo[SST_IDX] - this.globalFitInfo[SSE_IDX];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
|
||||
* sum of squared errors</a> (SSE) associated with the regression
|
||||
* model.</p>
|
||||
*
|
||||
* <p>The return value is constrained to be non-negative - i.e., if due to
|
||||
* rounding errors the computational formula returns a negative result,
|
||||
* 0 is returned.</p>
|
||||
*
|
||||
* <p><strong>Preconditions</strong>: <ul>
|
||||
* <li>numberOfParameters data pairs
|
||||
* must have been added before invoking this method. If this method is
|
||||
* invoked before a model can be estimated, <code>Double,NaN</code> is
|
||||
* returned.
|
||||
* </li></ul></p>
|
||||
*
|
||||
* @return sum of squared errors associated with the regression model
|
||||
*/
|
||||
public double getErrorSumSquares() {
|
||||
return this.globalFitInfo[ SSE_IDX];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the sum of squared errors divided by the degrees of freedom,
|
||||
* usually abbreviated MSE.</p>
|
||||
*
|
||||
* <p>If there are fewer than <strong>numberOfParameters + 1</strong> data pairs in the model,
|
||||
* or if there is no variation in <code>x</code>, this returns
|
||||
* <code>Double.NaN</code>.</p>
|
||||
*
|
||||
* @return sum of squared deviations of y values
|
||||
*/
|
||||
public double getMeanSquareError() {
|
||||
return this.globalFitInfo[ MSE_IDX];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the <a href="http://www.xycoon.com/coefficient1.htm">
|
||||
* coefficient of multiple determination</a>,
|
||||
* usually denoted r-square.</p>
|
||||
*
|
||||
* <p><strong>Preconditions</strong>: <ul>
|
||||
* <li>At least numberOfParameters observations (with at least numberOfParameters different x values)
|
||||
* must have been added before invoking this method. If this method is
|
||||
* invoked before a model can be estimated, {@code Double,NaN} is
|
||||
* returned.
|
||||
* </li></ul></p>
|
||||
*
|
||||
* @return r-square, a double in the interval [0, 1]
|
||||
*/
|
||||
public double getRSquared() {
|
||||
return this.globalFitInfo[ RSQ_IDX];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Returns the adjusted R-squared statistic, defined by the formula <pre>
|
||||
* R<sup>2</sup><sub>adj</sub> = 1 - [SSR (n - 1)] / [SSTO (n - p)]
|
||||
* </pre>
|
||||
* where SSR is the sum of squared residuals},
|
||||
* SSTO is the total sum of squares}, n is the number
|
||||
* of observations and p is the number of parameters estimated (including the intercept).</p>
|
||||
*
|
||||
* <p>If the regression is estimated without an intercept term, what is returned is <pre>
|
||||
* <code> 1 - (1 - {@link #getRSquared()} ) * (n / (n - p)) </code>
|
||||
* </pre></p>
|
||||
*
|
||||
* @return adjusted R-Squared statistic
|
||||
*/
|
||||
public double getAdjustedRSquared() {
|
||||
return this.globalFitInfo[ ADJRSQ_IDX];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the regression model has been computed including an intercept.
|
||||
* In this case, the coefficient of the intercept is the first element of the
|
||||
* {@link #getParameterEstimates() parameter estimates}.
|
||||
* @return true if the model has an intercept term
|
||||
*/
|
||||
public boolean hasIntercept() {
|
||||
return this.containsConstant;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the i-jth element of the variance-covariance matrix.
|
||||
*
|
||||
* @param i first variable index
|
||||
* @param j second variable index
|
||||
* @return the requested variance-covariance matrix entry
|
||||
*/
|
||||
private double getVcvElement(int i, int j) {
|
||||
if (this.isSymmetricVCD) {
|
||||
if (this.varCovData.length > 1) {
|
||||
//could be stored in upper or lower triangular
|
||||
if (i == j) {
|
||||
return varCovData[i][i];
|
||||
} else if (i >= varCovData[j].length) {
|
||||
return varCovData[i][j];
|
||||
} else {
|
||||
return varCovData[j][i];
|
||||
}
|
||||
} else {//could be in single array
|
||||
if (i > j) {
|
||||
return varCovData[0][(i + 1) * i / 2 + j];
|
||||
} else {
|
||||
return varCovData[0][(j + 1) * j / 2 + i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return this.varCovData[i][j];
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,86 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.commons.math.stat.regression;
|
||||
|
||||
import org.apache.commons.math.MathException;
|
||||
|
||||
/**
|
||||
* An interface for regression models allowing for dynamic updating of the data.
|
||||
* That is, the entire data set need not be loaded into memory. As observations
|
||||
* become available, they can be added to the regression model and an updated
|
||||
* estimate regression statistics can be calculated.
|
||||
*
|
||||
* @version $Id$
|
||||
* @since 3.0
|
||||
*/
|
||||
public interface UpdatingMultipleLinearRegression {
|
||||
|
||||
/**
|
||||
* Returns true if a constant has been included false otherwise.
|
||||
*
|
||||
* @return true if constant exists, false otherwise
|
||||
*/
|
||||
boolean hasIntercept();
|
||||
|
||||
/**
|
||||
* Returns the number of observations added to the regression model.
|
||||
*
|
||||
* @return Number of observations
|
||||
*/
|
||||
long getN();
|
||||
|
||||
/**
|
||||
* Adds one observation to the regression model.
|
||||
*
|
||||
* @param x the independent variables which form the design matrix
|
||||
* @param y the dependent or response variable
|
||||
*/
|
||||
void addObservation(double[] x, double y);
|
||||
|
||||
/**
|
||||
* Adds a series of observations to the regression model. The lengths of
|
||||
* x and y must be the same and x must be rectangular.
|
||||
*
|
||||
* @param x a series of observations on the independent variables
|
||||
* @param y a series of observations on the dependent variable
|
||||
* The length of x and y must be the same
|
||||
*/
|
||||
void addObservations(double[][] x, double[] y);
|
||||
|
||||
/**
|
||||
* Clears internal buffers and resets the regression model. This means all
|
||||
* data and derived values are initialized
|
||||
*/
|
||||
void clear();
|
||||
|
||||
|
||||
/**
|
||||
* Performs a regression on data present in buffers and outputs a RegressionResults object
|
||||
* @return RegressionResults acts as a container of regression output
|
||||
* @throws MathException a wide variety of exception cases are possible, check message
|
||||
*/
|
||||
RegressionResults regress() throws MathException;
|
||||
|
||||
/**
|
||||
* Performs a regression on data present in buffers including only regressors
|
||||
* indexed in variablesToInclude and outputs a RegressionResults object
|
||||
* @param variablesToInclude an array of indices of regressors to include
|
||||
* @return RegressionResults acts as a container of regression output
|
||||
* @throws MathException a wide variety of exception cases are possible, check message
|
||||
*/
|
||||
RegressionResults regress(int[] variablesToInclude) throws MathException;
|
||||
}
|
Loading…
Reference in New Issue