Added interface and reporting class for updating regression. JIRA: MATH-607. Contributed by Greg Sterijevski.

git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@1144986 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Phil Steitz 2011-07-10 23:45:43 +00:00
parent 60f99d976d
commit d3d5c6fb05
3 changed files with 474 additions and 0 deletions

View File

@ -219,6 +219,9 @@
<contributor>
<name>David Stefka</name>
</contributor>
<contributor>
<name>Greg Sterijevski</name>
</contributor>
<contributor>
<name>Mauro Talevi</name>
</contributor>

View File

@ -0,0 +1,385 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math.stat.regression;
import java.io.Serializable;
import java.util.Arrays;
import org.apache.commons.math.util.FastMath;
/**
* Results of a Multiple Linear Regression model fit.
*
* @version $Id$
* @since 3.0
*/
public class RegressionResults implements Serializable {
private static final int SSE_IDX = 0;
private static final int SST_IDX = 1;
private static final int RSQ_IDX = 2;
private static final int MSE_IDX = 3;
private static final int ADJRSQ_IDX = 4;
private static final long serialVersionUID = 1l;
private final double[] parameters;
private final double[][] varCovData;
private final boolean isSymmetricVCD;
private final int rank;
private final long nobs;
private final boolean containsConstant;
private final double[] globalFitInfo;
/**
* Set the default constructor to private access
* to prevent inadvertent instantiation
*/
@SuppressWarnings("unused")
private RegressionResults() {
this.parameters = null;
this.varCovData = null;
this.rank = -1;
this.nobs = -1;
this.containsConstant = false;
this.isSymmetricVCD = false;
this.globalFitInfo = null;
}
public RegressionResults(
final double[] parameters, final double[][] varcov,
final boolean isSymmetricCompressed,
final long nobs, final int rank,
final double sumy, final double sumysq, final double sse,
final boolean containsConstant,
final boolean copyData) {
if (copyData) {
this.parameters = Arrays.copyOf(parameters, parameters.length);
this.varCovData = new double[varcov.length][];
for (int i = 0; i < varcov.length; i++) {
this.varCovData[i] = Arrays.copyOf(varcov[i], varcov[i].length);
}
} else {
this.parameters = parameters;
this.varCovData = varcov;
}
this.isSymmetricVCD = isSymmetricCompressed;
this.nobs = nobs;
this.rank = rank;
this.containsConstant = containsConstant;
this.globalFitInfo = new double[5];
Arrays.fill(this.globalFitInfo, Double.NaN);
if (rank > 2) {
this.globalFitInfo[SST_IDX] = containsConstant ?
(sumysq - sumy * sumy / ((double) nobs)) : sumysq;
}
this.globalFitInfo[SSE_IDX] = sse;
this.globalFitInfo[MSE_IDX] = this.globalFitInfo[SSE_IDX] /
((double) (nobs - rank));
this.globalFitInfo[RSQ_IDX] = 1.0 -
this.globalFitInfo[SSE_IDX] /
this.globalFitInfo[SST_IDX];
if (!containsConstant) {
this.globalFitInfo[ADJRSQ_IDX] = 1.0 - (1.0 - this.globalFitInfo[RSQ_IDX]) *
(nobs / (nobs - rank));
} else {
this.globalFitInfo[ADJRSQ_IDX] = 1.0 - (sse * (nobs - 1.0)) /
(globalFitInfo[SST_IDX] * (nobs - rank));
}
}
/**
* <p>Returns the parameter estimate for the regressor at the given index.</p>
*
* <p>A redundant regressor will have its redundancy flag set, as well as
* a parameters estimated equal to {@code Double.NaN}</p>
*
* @param index an integer index which must be in the range [0, numberOfParameters-1]
* @return parameters estimated for regressor at index
* @throws IndexOutOfBoundsException thrown if the index >= numberOfParameters
*/
public double getParameterEstimate(int index) throws IndexOutOfBoundsException {
if (parameters == null) {
return Double.NaN;
}
if (index < 0 || index >= this.parameters.length) {
throw new IndexOutOfBoundsException("Index is outside of the 0 to number of variables - 1 range");
}
return this.parameters[index];
}
/**
* <p>Returns a copy of the regression parameters estimates.</p>
*
* <p>The parameter estimates are returned in the natural order of the data.</p>
*
* <p>A redundant regressor will have its redundancy flag set, as will
* a parameter estimate equal to {@code Double.NaN}.</p>
*
* @return array of parameter estimates, null if no estimation occurred
*/
public double[] getParameterEstimates() {
if (this.parameters == null) {
return null;
}
return Arrays.copyOf(parameters, parameters.length);
}
/**
* Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
* error of the parameter estimate at index</a>,
* usually denoted s(b<sub>index</sub>).
*
* @param index an integer index which must be in the range [0, numberOfParameters-1]
* @return standard errors associated with parameters estimated at index
* @throws IndexOutOfBoundsException thrown if the index >= numberOfParameters
*/
public double getStdErrorOfEstimate(int index) throws IndexOutOfBoundsException {
if (parameters == null) {
return Double.NaN;
}
if (index < 0 || index >= this.parameters.length) {
throw new IndexOutOfBoundsException("Index is outside of the 0 to number of variables - 1 range");
}
double var = this.getVcvElement(index, index);
if (!Double.isNaN(var) && var > Double.MIN_VALUE) {
return FastMath.sqrt(rank);
}
return Double.NaN;
}
/**
* <p>Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
* error of the parameter estimates</a>,
* usually denoted s(b<sub>i</sub>).</p>
*
* <p>If there are problems with an ill conditioned design matrix then the regressor
* which is redundant will be assigned <code>Double.NaN</code>. </p>
*
* @return an array standard errors associated with parameters estimates,
* null if no estimation occurred
*/
public double[] getStdErrorOfEstimates() {
if (parameters == null) {
return null;
}
double[] se = new double[this.parameters.length];
for (int i = 0; i < this.parameters.length; i++) {
double var = this.getVcvElement(i, i);
if (!Double.isNaN(var) && var > Double.MIN_VALUE) {
se[i] = FastMath.sqrt(rank);
continue;
}
se[i] = Double.NaN;
}
return se;
}
/**
* <p>Returns the covariance between regression parameters i and j.</p>
*
* <p>If there are problems with an ill conditioned design matrix then the covariance
* which involves redundant columns will be assigned {@code Double.NaN}. </p>
*
* @param i - the ith regression parameter
* @param j - the jth regression parameter
* @return the covariance of the parameter estimates
*/
public double getCovarianceOfParameters(int i, int j) throws IndexOutOfBoundsException {
if (parameters == null) {
return Double.NaN;
}
if (i < 0 || i >= this.parameters.length) {
throw new IndexOutOfBoundsException(" Row index is outside of the 0 " +
"to number of variables - 1 range");
}
if (j < 0 || j >= this.parameters.length) {
throw new IndexOutOfBoundsException(" Column index is outside of the 0" +
" to number of variables - 1 range");
}
return this.getVcvElement(i, j);
}
/**
* <p>Returns the number of parameters estimated in the model.</p>
*
* <p>This is the maximum number of regressors, some techniques may drop
* redundant parameters</p>
*
* @return number of regressors, -1 if not estimated
*/
public int getNumberOfParameters() {
if (this.parameters == null) {
return -1;
}
return this.parameters.length;
}
/**
* Returns the number of observations added to the regression model.
*
* @return Number of observations, -1 if an error condition prevents estimation
*/
public long getN() {
return this.nobs;
}
/**
* <p>Returns the sum of squared deviations of the y values about their mean.</p>
*
* <p>This is defined as SSTO
* <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
*
* <p>If {@code n < 2}, this returns {@code Double.NaN}.</p>
*
* @return sum of squared deviations of y values
*/
public double getTotalSumSquares() {
return this.globalFitInfo[SST_IDX];
}
/**
* <p>Returns the sum of squared deviations of the predicted y values about
* their mean (which equals the mean of y).</p>
*
* <p>This is usually abbreviated SSR or SSM. It is defined as SSM
* <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
*
* <p><strong>Preconditions</strong>: <ul>
* <li>At least two observations (with at least two different x values)
* must have been added before invoking this method. If this method is
* invoked before a model can be estimated, <code>Double.NaN</code> is
* returned.
* </li></ul></p>
*
* @return sum of squared deviations of predicted y values
*/
public double getRegressionSumSquares() {
return this.globalFitInfo[SST_IDX] - this.globalFitInfo[SSE_IDX];
}
/**
* <p>Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
* sum of squared errors</a> (SSE) associated with the regression
* model.</p>
*
* <p>The return value is constrained to be non-negative - i.e., if due to
* rounding errors the computational formula returns a negative result,
* 0 is returned.</p>
*
* <p><strong>Preconditions</strong>: <ul>
* <li>numberOfParameters data pairs
* must have been added before invoking this method. If this method is
* invoked before a model can be estimated, <code>Double,NaN</code> is
* returned.
* </li></ul></p>
*
* @return sum of squared errors associated with the regression model
*/
public double getErrorSumSquares() {
return this.globalFitInfo[ SSE_IDX];
}
/**
* <p>Returns the sum of squared errors divided by the degrees of freedom,
* usually abbreviated MSE.</p>
*
* <p>If there are fewer than <strong>numberOfParameters + 1</strong> data pairs in the model,
* or if there is no variation in <code>x</code>, this returns
* <code>Double.NaN</code>.</p>
*
* @return sum of squared deviations of y values
*/
public double getMeanSquareError() {
return this.globalFitInfo[ MSE_IDX];
}
/**
* <p>Returns the <a href="http://www.xycoon.com/coefficient1.htm">
* coefficient of multiple determination</a>,
* usually denoted r-square.</p>
*
* <p><strong>Preconditions</strong>: <ul>
* <li>At least numberOfParameters observations (with at least numberOfParameters different x values)
* must have been added before invoking this method. If this method is
* invoked before a model can be estimated, {@code Double,NaN} is
* returned.
* </li></ul></p>
*
* @return r-square, a double in the interval [0, 1]
*/
public double getRSquared() {
return this.globalFitInfo[ RSQ_IDX];
}
/**
* <p>Returns the adjusted R-squared statistic, defined by the formula <pre>
* R<sup>2</sup><sub>adj</sub> = 1 - [SSR (n - 1)] / [SSTO (n - p)]
* </pre>
* where SSR is the sum of squared residuals},
* SSTO is the total sum of squares}, n is the number
* of observations and p is the number of parameters estimated (including the intercept).</p>
*
* <p>If the regression is estimated without an intercept term, what is returned is <pre>
* <code> 1 - (1 - {@link #getRSquared()} ) * (n / (n - p)) </code>
* </pre></p>
*
* @return adjusted R-Squared statistic
*/
public double getAdjustedRSquared() {
return this.globalFitInfo[ ADJRSQ_IDX];
}
/**
* Returns true if the regression model has been computed including an intercept.
* In this case, the coefficient of the intercept is the first element of the
* {@link #getParameterEstimates() parameter estimates}.
* @return true if the model has an intercept term
*/
public boolean hasIntercept() {
return this.containsConstant;
}
/**
* Gets the i-jth element of the variance-covariance matrix.
*
* @param i first variable index
* @param j second variable index
* @return the requested variance-covariance matrix entry
*/
private double getVcvElement(int i, int j) {
if (this.isSymmetricVCD) {
if (this.varCovData.length > 1) {
//could be stored in upper or lower triangular
if (i == j) {
return varCovData[i][i];
} else if (i >= varCovData[j].length) {
return varCovData[i][j];
} else {
return varCovData[j][i];
}
} else {//could be in single array
if (i > j) {
return varCovData[0][(i + 1) * i / 2 + j];
} else {
return varCovData[0][(j + 1) * j / 2 + i];
}
}
} else {
return this.varCovData[i][j];
}
}
}

View File

@ -0,0 +1,86 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math.stat.regression;
import org.apache.commons.math.MathException;
/**
* An interface for regression models allowing for dynamic updating of the data.
* That is, the entire data set need not be loaded into memory. As observations
* become available, they can be added to the regression model and updated
* estimates of the regression statistics can be calculated.
*
* <p>NOTE(review): this interface does not specify how implementations react
* to invalid input (e.g. mismatched array lengths) or whether they are
* thread-safe; consult the implementing class.</p>
*
* @version $Id$
* @since 3.0
*/
public interface UpdatingMultipleLinearRegression {
/**
* Returns true if a constant (intercept) term has been included in the model,
* false otherwise.
*
* @return true if constant exists, false otherwise
*/
boolean hasIntercept();
/**
* Returns the number of observations added to the regression model.
*
* @return Number of observations
*/
long getN();
/**
* Adds one observation to the regression model.
*
* @param x the independent variables which form the design matrix
* @param y the dependent or response variable
*/
void addObservation(double[] x, double y);
/**
* Adds a series of observations to the regression model. The lengths of
* x and y must be the same and x must be rectangular.
*
* @param x a series of observations on the independent variables,
* presumably one row per observation (to be confirmed against implementations)
* @param y a series of observations on the dependent variable.
* The length of x and y must be the same
*/
void addObservations(double[][] x, double[] y);
/**
* Clears internal buffers and resets the regression model. This means all
* data and derived values are returned to their initial state.
*/
void clear();
/**
* Performs a regression on the data present in the buffers and outputs a
* RegressionResults object.
*
* @return RegressionResults acts as a container of regression output
* @throws MathException a wide variety of exception cases are possible, check message
*/
RegressionResults regress() throws MathException;
/**
* Performs a regression on the data present in the buffers, including only
* the regressors indexed in variablesToInclude, and outputs a
* RegressionResults object.
*
* @param variablesToInclude an array of indices of regressors to include
* @return RegressionResults acts as a container of regression output
* @throws MathException a wide variety of exception cases are possible, check message
*/
RegressionResults regress(int[] variablesToInclude) throws MathException;
}