Added Pearsons correlation implemendation. JIRA: MATH-114
git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@744802 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2d73d9f99a
commit
64fa01cd78
|
@ -0,0 +1,270 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.commons.math.stat.correlation;
|
||||
|
||||
import org.apache.commons.math.MathException;
|
||||
import org.apache.commons.math.MathRuntimeException;
|
||||
import org.apache.commons.math.distribution.TDistribution;
|
||||
import org.apache.commons.math.distribution.TDistributionImpl;
|
||||
import org.apache.commons.math.linear.RealMatrix;
|
||||
import org.apache.commons.math.linear.DenseRealMatrix;
|
||||
import org.apache.commons.math.stat.regression.SimpleRegression;
|
||||
|
||||
/**
|
||||
* Computes Pearson's product-moment correlation coefficients for pairs of arrays
|
||||
* or columns of a matrix.
|
||||
*
|
||||
* <p>The constructors that take <code>RealMatrix</code> or
|
||||
* <code>double[][]</code> arguments generate correlation matrices. The
|
||||
* columns of the input matrices are assumed to represent variable values.
|
||||
* Correlations are given by the formula</p>
|
||||
* <code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
|
||||
* where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
|
||||
* is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.
|
||||
*
|
||||
* @version $Revision$ $Date$
|
||||
* @since 2.0
|
||||
*/
|
||||
public class PearsonsCorrelation {
|
||||
|
||||
/** correlation matrix */
|
||||
private final RealMatrix correlationMatrix;
|
||||
|
||||
/** number of observations */
|
||||
private final int nObs;
|
||||
|
||||
/**
|
||||
* Create a PearsonsCorrelation instance without data
|
||||
*/
|
||||
public PearsonsCorrelation() {
|
||||
super();
|
||||
correlationMatrix = null;
|
||||
nObs = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a PearsonsCorrelation from a rectangular array
|
||||
* whose columns represent values of variables to be correlated.
|
||||
*
|
||||
* @param data rectangular array with columns representing variables
|
||||
* @throws IllegalArgumentException if the input data array is not
|
||||
* rectangular with at least two rows and two columns.
|
||||
*/
|
||||
public PearsonsCorrelation(double[][] data) {
|
||||
this(new DenseRealMatrix(data));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a PearsonsCorrelation from a RealMatrix whose columns
|
||||
* represent variables to be correlated.
|
||||
*
|
||||
* @param matrix matrix with columns representing variables to correlate
|
||||
*/
|
||||
public PearsonsCorrelation(RealMatrix matrix) {
|
||||
checkSufficientData(matrix);
|
||||
nObs = matrix.getRowDimension();
|
||||
correlationMatrix = computeCorrelation(matrix);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a PearsonsCorrelation from a {@link Covariance}. The correlation
|
||||
* matrix is computed by scaling the Covariance's covariance matrix.
|
||||
* The Covariance instance must have been created from a data matrix with
|
||||
* columns representing variable values.
|
||||
*
|
||||
* @param covariance Covariance instance
|
||||
*/
|
||||
public PearsonsCorrelation(Covariance covariance) {
|
||||
RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
|
||||
if (covarianceMatrix == null) {
|
||||
throw MathRuntimeException.createIllegalArgumentException(
|
||||
"Covariance matrix is null", null);
|
||||
}
|
||||
nObs = covariance.getN();
|
||||
correlationMatrix = covarianceToCorrelation(covarianceMatrix);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a PearsonsCorrelation from a covariance matrix. The correlation
|
||||
* matrix is computed by scaling the covariance matrix.
|
||||
*
|
||||
* @param covarianceMatrix covariance matrix
|
||||
* @param numberOfObservations the number of observations in the dataset used to compute
|
||||
* the covariance matrix
|
||||
*/
|
||||
public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
|
||||
nObs = numberOfObservations;
|
||||
correlationMatrix = covarianceToCorrelation(covarianceMatrix);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the correlation matrix
|
||||
*
|
||||
* @return correlation matrix
|
||||
*/
|
||||
public RealMatrix getCorrelationMatrix() {
|
||||
return correlationMatrix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a matrix of standard errors associated with the estimates
|
||||
* in the correlation matrix.<br/>
|
||||
* <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
|
||||
* error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
|
||||
* <p>The formula used to compute the standard error is <br/>
|
||||
* <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
|
||||
* where <code>r</code> is the estimated correlation coefficient and
|
||||
* <code>n</code> is the number of observations in the source dataset.</p>
|
||||
*
|
||||
* @return matrix of correlation standard errors
|
||||
*/
|
||||
public RealMatrix getCorrelationStandardErrors() {
|
||||
int nVars = correlationMatrix.getColumnDimension();
|
||||
double[][] out = new double[nVars][nVars];
|
||||
for (int i = 0; i < nVars; i++) {
|
||||
for (int j = 0; j < nVars; j++) {
|
||||
double r = correlationMatrix.getEntry(i, j);
|
||||
out[i][j] = Math.sqrt((1 - r * r) /(nObs - 2));
|
||||
}
|
||||
}
|
||||
return new DenseRealMatrix(out);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a matrix of p-values associated with the (two-sided) null
|
||||
* hypothesis that the corresponding correlation coefficient is zero.
|
||||
* <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
|
||||
* that a random variable distributed as <code>t<sub>n-2</sub></code> takes
|
||||
* a value with absolute value greater than or equal to <br>
|
||||
* <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
|
||||
* <p>The values in the matrix are sometimes referred to as the
|
||||
* <i>significance</i> of the corresponding correlation coefficients.</p>
|
||||
*
|
||||
* @return matrix of p-values
|
||||
* @throws MathException if an error occurs estimating probabilities
|
||||
*/
|
||||
public RealMatrix getCorrelationPValues() throws MathException {
|
||||
TDistribution tDistribution = new TDistributionImpl(nObs - 2);
|
||||
int nVars = correlationMatrix.getColumnDimension();
|
||||
double[][] out = new double[nVars][nVars];
|
||||
for (int i = 0; i < nVars; i++) {
|
||||
for (int j = 0; j < nVars; j++) {
|
||||
if (i == j) {
|
||||
out[i][j] = 0d;
|
||||
} else {
|
||||
double r = correlationMatrix.getEntry(i, j);
|
||||
double t = Math.abs(r * Math.sqrt((nObs - 2)/(1 - r * r)));
|
||||
out[i][j] = 2 * (1 - tDistribution.cumulativeProbability(t));
|
||||
}
|
||||
}
|
||||
}
|
||||
return new DenseRealMatrix(out);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Computes the correlation matrix for the columns of the
|
||||
* input matrix.
|
||||
*
|
||||
* @param matrix matrix with columns representing variables to correlate
|
||||
* @return correlation matrix
|
||||
*/
|
||||
public RealMatrix computeCorrelation(RealMatrix matrix) {
|
||||
int nVars = matrix.getColumnDimension();
|
||||
RealMatrix outMatrix = new DenseRealMatrix(nVars, nVars);
|
||||
for (int i = 0; i < nVars; i++) {
|
||||
for (int j = 0; j < i; j++) {
|
||||
double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
|
||||
outMatrix.setEntry(i, j, corr);
|
||||
outMatrix.setEntry(j, i, corr);
|
||||
}
|
||||
outMatrix.setEntry(i, i, 1d);
|
||||
}
|
||||
return outMatrix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the Pearson's product-moment correlation coefficient between the two arrays.
|
||||
*
|
||||
* </p>Throws IllegalArgumentException if the arrays do not have the same length
|
||||
* or their common length is less than 2</p>
|
||||
*
|
||||
* @param xArray first data array
|
||||
* @param yArray second data array
|
||||
* @return Returns Pearson's correlation coefficient for the two arrays
|
||||
* @throws IllegalArgumentException if the arrays lengths do not match or
|
||||
* there is insufficient data
|
||||
*/
|
||||
public double correlation(final double[] xArray, final double[] yArray) throws IllegalArgumentException {
|
||||
SimpleRegression regression = new SimpleRegression();
|
||||
if(xArray.length == yArray.length && xArray.length > 1) {
|
||||
for(int i=0; i<xArray.length; i++) {
|
||||
regression.addData(xArray[i], yArray[i]);
|
||||
}
|
||||
return regression.getR();
|
||||
}
|
||||
else {
|
||||
throw MathRuntimeException.createIllegalArgumentException(
|
||||
"Invalid array dimensions. xArray has size {0}; yArray has {1} elements",
|
||||
new Object[] {xArray.length, yArray.length});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Derives a correlation matrix from a covariance matrix.
|
||||
*
|
||||
* <p>Uses the formula <br/>
|
||||
* <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
|
||||
* <code>r(·,·)</code> is the correlation coefficient and
|
||||
* <code>s(·)</code> means standard deviation.</p>
|
||||
*
|
||||
* @param covarianceMatrix the covariance matrix
|
||||
* @return correlation matrix
|
||||
*/
|
||||
public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
|
||||
int nVars = covarianceMatrix.getColumnDimension();
|
||||
RealMatrix outMatrix = new DenseRealMatrix(nVars, nVars);
|
||||
for (int i = 0; i < nVars; i++) {
|
||||
double sigma = Math.sqrt(covarianceMatrix.getEntry(i, i));
|
||||
outMatrix.setEntry(i, i, 1d);
|
||||
for (int j = 0; j < i; j++) {
|
||||
double entry = covarianceMatrix.getEntry(i, j) /
|
||||
(sigma * Math.sqrt(covarianceMatrix.getEntry(j, j)));
|
||||
outMatrix.setEntry(i, j, entry);
|
||||
outMatrix.setEntry(j, i, entry);
|
||||
}
|
||||
}
|
||||
return outMatrix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Throws IllegalArgumentException of the matrix does not have at least
|
||||
* two columns and two rows
|
||||
*
|
||||
* @param matrix matrix to check for sufficiency
|
||||
*/
|
||||
private void checkSufficientData(final RealMatrix matrix) {
|
||||
int nRows = matrix.getRowDimension();
|
||||
int nCols = matrix.getColumnDimension();
|
||||
if (nRows < 2 || nCols < 2) {
|
||||
throw MathRuntimeException.createIllegalArgumentException(
|
||||
"Insufficient data: only {0} rows and {1} columns.",
|
||||
new Object[]{nRows, nCols});
|
||||
}
|
||||
}
|
||||
}
|
|
@ -39,6 +39,10 @@ The <action> type attribute can be add,update,fix,remove.
|
|||
</properties>
|
||||
<body>
|
||||
<release version="2.0" date="TBD" description="TBD">
|
||||
<action dev="psteitz" type="add" issue="MATH-114" due-to="John Gant">
|
||||
Added PearsonsCorrelation class to compute correlation matrices, standard
|
||||
errors and p-values for correlation coefficients.
|
||||
</action>
|
||||
<action dev="psteitz" type="add" issue="MATH-114">
|
||||
Added Covariance class to compute variance-covariance matrices in new
|
||||
correlation package.
|
||||
|
|
|
@ -0,0 +1,185 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
#------------------------------------------------------------------------------
|
||||
# R source file to validate Pearson's correlation tests in
|
||||
# org.apache.commons.math.stat.correlation.PearsonsCorrelationTest
|
||||
#
|
||||
# To run the test, install R, put this file and testFunctions
|
||||
# into the same directory, launch R from this directory and then enter
|
||||
# source("<name-of-this-file>")
|
||||
#
|
||||
#------------------------------------------------------------------------------
|
||||
tol <- 1E-9 # error tolerance for tests
|
||||
#------------------------------------------------------------------------------
|
||||
# Function definitions
|
||||
|
||||
source("testFunctions") # utility test functions
|
||||
options(digits=16) # override number of digits displayed
|
||||
|
||||
# function to verify correlation computations
|
||||
verifyCorrelation <- function(matrix, expectedCorrelation, name) {
|
||||
correlation <- cor(matrix)
|
||||
output <- c("Correlation matrix test dataset = ", name)
|
||||
if (assertEquals(expectedCorrelation, correlation,tol,"Correlations")) {
|
||||
displayPadded(output, SUCCEEDED, WIDTH)
|
||||
} else {
|
||||
displayPadded(output, FAILED, WIDTH)
|
||||
}
|
||||
}
|
||||
|
||||
# function to verify p-values
|
||||
verifyPValues <- function(matrix, pValues, name) {
|
||||
dimension <- dim(matrix)[2]
|
||||
corValues <- matrix(nrow=dimension,ncol=dimension)
|
||||
expectedValues <- matrix(nrow=dimension,ncol=dimension)
|
||||
for (i in 2:dimension) {
|
||||
for (j in 1:(i-1)) {
|
||||
corValues[i,j]<-cor.test(matrix[,i], matrix[,j])$p.value
|
||||
corValues[j,i]<-corValues[i,j]
|
||||
}
|
||||
}
|
||||
for (i in 1:dimension) {
|
||||
corValues[i,i] <- 1
|
||||
expectedValues[i,i] <- 1
|
||||
}
|
||||
ptr <- 1
|
||||
for (i in 2:dimension) {
|
||||
for (j in 1:(i-1)) {
|
||||
expectedValues[i,j] <- pValues[ptr]
|
||||
expectedValues[j,i] <- expectedValues[i,j]
|
||||
ptr <- ptr + 1
|
||||
}
|
||||
}
|
||||
output <- c("Correlation p-values test dataset = ", name)
|
||||
if (assertEquals(expectedValues, corValues,tol,"p-values")) {
|
||||
displayPadded(output, SUCCEEDED, WIDTH)
|
||||
} else {
|
||||
displayPadded(output, FAILED, WIDTH)
|
||||
}
|
||||
}
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
cat("Correlation test cases\n")
|
||||
|
||||
# Longley
|
||||
|
||||
longley <- matrix(c(60323,83.0,234289,2356,1590,107608,1947,
|
||||
61122,88.5,259426,2325,1456,108632,1948,
|
||||
60171,88.2,258054,3682,1616,109773,1949,
|
||||
61187,89.5,284599,3351,1650,110929,1950,
|
||||
63221,96.2,328975,2099,3099,112075,1951,
|
||||
63639,98.1,346999,1932,3594,113270,1952,
|
||||
64989,99.0,365385,1870,3547,115094,1953,
|
||||
63761,100.0,363112,3578,3350,116219,1954,
|
||||
66019,101.2,397469,2904,3048,117388,1955,
|
||||
67857,104.6,419180,2822,2857,118734,1956,
|
||||
68169,108.4,442769,2936,2798,120445,1957,
|
||||
66513,110.8,444546,4681,2637,121950,1958,
|
||||
68655,112.6,482704,3813,2552,123366,1959,
|
||||
69564,114.2,502601,3931,2514,125368,1960,
|
||||
69331,115.7,518173,4806,2572,127852,1961,
|
||||
70551,116.9,554894,4007,2827,130081,1962),
|
||||
nrow = 16, ncol = 7, byrow = TRUE)
|
||||
|
||||
expectedCorrelation <- matrix(c(
|
||||
1.000000000000000, 0.9708985250610560, 0.9835516111796693, 0.5024980838759942,
|
||||
0.4573073999764817, 0.960390571594376, 0.9713294591921188,
|
||||
0.970898525061056, 1.0000000000000000, 0.9915891780247822, 0.6206333925590966,
|
||||
0.4647441876006747, 0.979163432977498, 0.9911491900672053,
|
||||
0.983551611179669, 0.9915891780247822, 1.0000000000000000, 0.6042609398895580,
|
||||
0.4464367918926265, 0.991090069458478, 0.9952734837647849,
|
||||
0.502498083875994, 0.6206333925590966, 0.6042609398895580, 1.0000000000000000,
|
||||
-0.1774206295018783, 0.686551516365312, 0.6682566045621746,
|
||||
0.457307399976482, 0.4647441876006747, 0.4464367918926265, -0.1774206295018783,
|
||||
1.0000000000000000, 0.364416267189032, 0.4172451498349454,
|
||||
0.960390571594376, 0.9791634329774981, 0.9910900694584777, 0.6865515163653120,
|
||||
0.3644162671890320, 1.000000000000000, 0.9939528462329257,
|
||||
0.971329459192119, 0.9911491900672053, 0.9952734837647849, 0.6682566045621746,
|
||||
0.4172451498349454, 0.993952846232926, 1.0000000000000000),
|
||||
nrow = 7, ncol = 7, byrow = TRUE)
|
||||
verifyCorrelation(longley, expectedCorrelation, "longley")
|
||||
|
||||
expectedPValues <- c(
|
||||
4.38904690369668e-10,
|
||||
8.36353208910623e-12, 7.8159700933611e-14,
|
||||
0.0472894097790304, 0.01030636128354301, 0.01316878049026582,
|
||||
0.0749178049642416, 0.06971758330341182, 0.0830166169296545, 0.510948586323452,
|
||||
3.693245043123738e-09, 4.327782576751815e-11, 1.167954621905665e-13, 0.00331028281967516, 0.1652293725106684,
|
||||
3.95834476307755e-10, 1.114663916723657e-13, 1.332267629550188e-15, 0.00466039138541463, 0.1078477071581498, 7.771561172376096e-15)
|
||||
verifyPValues(longley, expectedPValues, "longley")
|
||||
|
||||
# Swiss Fertility
|
||||
|
||||
fertility <- matrix(c(80.2,17.0,15,12,9.96,
|
||||
83.1,45.1,6,9,84.84,
|
||||
92.5,39.7,5,5,93.40,
|
||||
85.8,36.5,12,7,33.77,
|
||||
76.9,43.5,17,15,5.16,
|
||||
76.1,35.3,9,7,90.57,
|
||||
83.8,70.2,16,7,92.85,
|
||||
92.4,67.8,14,8,97.16,
|
||||
82.4,53.3,12,7,97.67,
|
||||
82.9,45.2,16,13,91.38,
|
||||
87.1,64.5,14,6,98.61,
|
||||
64.1,62.0,21,12,8.52,
|
||||
66.9,67.5,14,7,2.27,
|
||||
68.9,60.7,19,12,4.43,
|
||||
61.7,69.3,22,5,2.82,
|
||||
68.3,72.6,18,2,24.20,
|
||||
71.7,34.0,17,8,3.30,
|
||||
55.7,19.4,26,28,12.11,
|
||||
54.3,15.2,31,20,2.15,
|
||||
65.1,73.0,19,9,2.84,
|
||||
65.5,59.8,22,10,5.23,
|
||||
65.0,55.1,14,3,4.52,
|
||||
56.6,50.9,22,12,15.14,
|
||||
57.4,54.1,20,6,4.20,
|
||||
72.5,71.2,12,1,2.40,
|
||||
74.2,58.1,14,8,5.23,
|
||||
72.0,63.5,6,3,2.56,
|
||||
60.5,60.8,16,10,7.72,
|
||||
58.3,26.8,25,19,18.46,
|
||||
65.4,49.5,15,8,6.10,
|
||||
75.5,85.9,3,2,99.71,
|
||||
69.3,84.9,7,6,99.68,
|
||||
77.3,89.7,5,2,100.00,
|
||||
70.5,78.2,12,6,98.96,
|
||||
79.4,64.9,7,3,98.22,
|
||||
65.0,75.9,9,9,99.06,
|
||||
92.2,84.6,3,3,99.46,
|
||||
79.3,63.1,13,13,96.83,
|
||||
70.4,38.4,26,12,5.62,
|
||||
65.7,7.7,29,11,13.79,
|
||||
72.7,16.7,22,13,11.22,
|
||||
64.4,17.6,35,32,16.92,
|
||||
77.6,37.6,15,7,4.97,
|
||||
67.6,18.7,25,7,8.65,
|
||||
35.0,1.2,37,53,42.34,
|
||||
44.7,46.6,16,29,50.43,
|
||||
42.8,27.7,22,29,58.33),
|
||||
nrow = 47, ncol = 5, byrow = TRUE)
|
||||
|
||||
expectedCorrelation <- matrix(c(
|
||||
1.0000000000000000, 0.3530791836199747, -0.6458827064572875, -0.6637888570350691, 0.4636847006517939,
|
||||
0.3530791836199747, 1.0000000000000000,-0.6865422086171366, -0.6395225189483201, 0.4010950530487398,
|
||||
-0.6458827064572875, -0.6865422086171366, 1.0000000000000000, 0.6984152962884830, -0.5727418060641666,
|
||||
-0.6637888570350691, -0.6395225189483201, 0.6984152962884830, 1.0000000000000000, -0.1538589170909148,
|
||||
0.4636847006517939, 0.4010950530487398, -0.5727418060641666, -0.1538589170909148, 1.0000000000000000),
|
||||
nrow = 5, ncol = 5, byrow = TRUE)
|
||||
|
||||
verifyCorrelation(fertility, expectedCorrelation, "swiss fertility")
|
||||
|
||||
displayDashes(WIDTH)
|
|
@ -50,6 +50,9 @@ source("multipleOLSRegressionTestCases")
|
|||
# covariance
|
||||
source("covarianceTestCases")
|
||||
|
||||
# correlation
|
||||
source("correlationTestCases")
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# if output has been diverted, change it back
|
||||
if (sink.number()) {
|
||||
|
|
|
@ -0,0 +1,274 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.commons.math.stat.correlation;
|
||||
|
||||
import org.apache.commons.math.TestUtils;
|
||||
import org.apache.commons.math.distribution.TDistribution;
|
||||
import org.apache.commons.math.distribution.TDistributionImpl;
|
||||
import org.apache.commons.math.linear.RealMatrix;
|
||||
import org.apache.commons.math.linear.DenseRealMatrix;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class PearsonsCorrelationTest extends TestCase {
|
||||
|
||||
protected final double[] longleyData = new double[] {
|
||||
60323,83.0,234289,2356,1590,107608,1947,
|
||||
61122,88.5,259426,2325,1456,108632,1948,
|
||||
60171,88.2,258054,3682,1616,109773,1949,
|
||||
61187,89.5,284599,3351,1650,110929,1950,
|
||||
63221,96.2,328975,2099,3099,112075,1951,
|
||||
63639,98.1,346999,1932,3594,113270,1952,
|
||||
64989,99.0,365385,1870,3547,115094,1953,
|
||||
63761,100.0,363112,3578,3350,116219,1954,
|
||||
66019,101.2,397469,2904,3048,117388,1955,
|
||||
67857,104.6,419180,2822,2857,118734,1956,
|
||||
68169,108.4,442769,2936,2798,120445,1957,
|
||||
66513,110.8,444546,4681,2637,121950,1958,
|
||||
68655,112.6,482704,3813,2552,123366,1959,
|
||||
69564,114.2,502601,3931,2514,125368,1960,
|
||||
69331,115.7,518173,4806,2572,127852,1961,
|
||||
70551,116.9,554894,4007,2827,130081,1962
|
||||
};
|
||||
|
||||
protected final double[] swissData = new double[] {
|
||||
80.2,17.0,15,12,9.96,
|
||||
83.1,45.1,6,9,84.84,
|
||||
92.5,39.7,5,5,93.40,
|
||||
85.8,36.5,12,7,33.77,
|
||||
76.9,43.5,17,15,5.16,
|
||||
76.1,35.3,9,7,90.57,
|
||||
83.8,70.2,16,7,92.85,
|
||||
92.4,67.8,14,8,97.16,
|
||||
82.4,53.3,12,7,97.67,
|
||||
82.9,45.2,16,13,91.38,
|
||||
87.1,64.5,14,6,98.61,
|
||||
64.1,62.0,21,12,8.52,
|
||||
66.9,67.5,14,7,2.27,
|
||||
68.9,60.7,19,12,4.43,
|
||||
61.7,69.3,22,5,2.82,
|
||||
68.3,72.6,18,2,24.20,
|
||||
71.7,34.0,17,8,3.30,
|
||||
55.7,19.4,26,28,12.11,
|
||||
54.3,15.2,31,20,2.15,
|
||||
65.1,73.0,19,9,2.84,
|
||||
65.5,59.8,22,10,5.23,
|
||||
65.0,55.1,14,3,4.52,
|
||||
56.6,50.9,22,12,15.14,
|
||||
57.4,54.1,20,6,4.20,
|
||||
72.5,71.2,12,1,2.40,
|
||||
74.2,58.1,14,8,5.23,
|
||||
72.0,63.5,6,3,2.56,
|
||||
60.5,60.8,16,10,7.72,
|
||||
58.3,26.8,25,19,18.46,
|
||||
65.4,49.5,15,8,6.10,
|
||||
75.5,85.9,3,2,99.71,
|
||||
69.3,84.9,7,6,99.68,
|
||||
77.3,89.7,5,2,100.00,
|
||||
70.5,78.2,12,6,98.96,
|
||||
79.4,64.9,7,3,98.22,
|
||||
65.0,75.9,9,9,99.06,
|
||||
92.2,84.6,3,3,99.46,
|
||||
79.3,63.1,13,13,96.83,
|
||||
70.4,38.4,26,12,5.62,
|
||||
65.7,7.7,29,11,13.79,
|
||||
72.7,16.7,22,13,11.22,
|
||||
64.4,17.6,35,32,16.92,
|
||||
77.6,37.6,15,7,4.97,
|
||||
67.6,18.7,25,7,8.65,
|
||||
35.0,1.2,37,53,42.34,
|
||||
44.7,46.6,16,29,50.43,
|
||||
42.8,27.7,22,29,58.33
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Test Longley dataset against R.
|
||||
*/
|
||||
public void testLongly() throws Exception {
|
||||
RealMatrix matrix = createRealMatrix(longleyData, 16, 7);
|
||||
PearsonsCorrelation corrInstance = new PearsonsCorrelation(matrix);
|
||||
RealMatrix correlationMatrix = corrInstance.getCorrelationMatrix();
|
||||
double[] rData = new double[] {
|
||||
1.000000000000000, 0.9708985250610560, 0.9835516111796693, 0.5024980838759942,
|
||||
0.4573073999764817, 0.960390571594376, 0.9713294591921188,
|
||||
0.970898525061056, 1.0000000000000000, 0.9915891780247822, 0.6206333925590966,
|
||||
0.4647441876006747, 0.979163432977498, 0.9911491900672053,
|
||||
0.983551611179669, 0.9915891780247822, 1.0000000000000000, 0.6042609398895580,
|
||||
0.4464367918926265, 0.991090069458478, 0.9952734837647849,
|
||||
0.502498083875994, 0.6206333925590966, 0.6042609398895580, 1.0000000000000000,
|
||||
-0.1774206295018783, 0.686551516365312, 0.6682566045621746,
|
||||
0.457307399976482, 0.4647441876006747, 0.4464367918926265, -0.1774206295018783,
|
||||
1.0000000000000000, 0.364416267189032, 0.4172451498349454,
|
||||
0.960390571594376, 0.9791634329774981, 0.9910900694584777, 0.6865515163653120,
|
||||
0.3644162671890320, 1.000000000000000, 0.9939528462329257,
|
||||
0.971329459192119, 0.9911491900672053, 0.9952734837647849, 0.6682566045621746,
|
||||
0.4172451498349454, 0.993952846232926, 1.0000000000000000
|
||||
};
|
||||
TestUtils.assertEquals("correlation matrix", createRealMatrix(rData, 7, 7), correlationMatrix, 10E-15);
|
||||
|
||||
double[] rPvalues = new double[] {
|
||||
4.38904690369668e-10,
|
||||
8.36353208910623e-12, 7.8159700933611e-14,
|
||||
0.0472894097790304, 0.01030636128354301, 0.01316878049026582,
|
||||
0.0749178049642416, 0.06971758330341182, 0.0830166169296545, 0.510948586323452,
|
||||
3.693245043123738e-09, 4.327782576751815e-11, 1.167954621905665e-13, 0.00331028281967516, 0.1652293725106684,
|
||||
3.95834476307755e-10, 1.114663916723657e-13, 1.332267629550188e-15, 0.00466039138541463, 0.1078477071581498, 7.771561172376096e-15
|
||||
};
|
||||
RealMatrix rPMatrix = createLowerTriangularRealMatrix(rPvalues, 7);
|
||||
fillUpper(rPMatrix, 0d);
|
||||
TestUtils.assertEquals("correlation p values", rPMatrix, corrInstance.getCorrelationPValues(), 10E-15);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test R Swiss fertility dataset against R.
|
||||
*/
|
||||
public void testSwissFertility() throws Exception {
|
||||
RealMatrix matrix = createRealMatrix(swissData, 47, 5);
|
||||
PearsonsCorrelation corrInstance = new PearsonsCorrelation(matrix);
|
||||
RealMatrix correlationMatrix = corrInstance.getCorrelationMatrix();
|
||||
double[] rData = new double[] {
|
||||
1.0000000000000000, 0.3530791836199747, -0.6458827064572875, -0.6637888570350691, 0.4636847006517939,
|
||||
0.3530791836199747, 1.0000000000000000,-0.6865422086171366, -0.6395225189483201, 0.4010950530487398,
|
||||
-0.6458827064572875, -0.6865422086171366, 1.0000000000000000, 0.6984152962884830, -0.5727418060641666,
|
||||
-0.6637888570350691, -0.6395225189483201, 0.6984152962884830, 1.0000000000000000, -0.1538589170909148,
|
||||
0.4636847006517939, 0.4010950530487398, -0.5727418060641666, -0.1538589170909148, 1.0000000000000000
|
||||
};
|
||||
TestUtils.assertEquals("correlation matrix", createRealMatrix(rData, 5, 5), correlationMatrix, 10E-15);
|
||||
|
||||
double[] rPvalues = new double[] {
|
||||
0.01491720061472623,
|
||||
9.45043734069043e-07, 9.95151527133974e-08,
|
||||
3.658616965962355e-07, 1.304590105694471e-06, 4.811397236181847e-08,
|
||||
0.001028523190118147, 0.005204433539191644, 2.588307925380906e-05, 0.301807756132683
|
||||
};
|
||||
RealMatrix rPMatrix = createLowerTriangularRealMatrix(rPvalues, 5);
|
||||
fillUpper(rPMatrix, 0d);
|
||||
TestUtils.assertEquals("correlation p values", rPMatrix, corrInstance.getCorrelationPValues(), 10E-15);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constant column
|
||||
*/
|
||||
public void testConstant() {
|
||||
double[] noVariance = new double[] {1, 1, 1, 1};
|
||||
double[] values = new double[] {1, 2, 3, 4};
|
||||
assertTrue(Double.isNaN(new PearsonsCorrelation().correlation(noVariance, values)));
|
||||
assertTrue(Double.isNaN(new PearsonsCorrelation().correlation(noVariance, values)));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Insufficient data
|
||||
*/
|
||||
|
||||
public void testInsufficientData() {
|
||||
double[] one = new double[] {1};
|
||||
double[] two = new double[] {2};
|
||||
try {
|
||||
new PearsonsCorrelation().correlation(one, two);
|
||||
fail("Expecting IllegalArgumentException");
|
||||
} catch (IllegalArgumentException ex) {
|
||||
// Expected
|
||||
}
|
||||
RealMatrix matrix = new DenseRealMatrix(new double[][] {{0},{1}});
|
||||
try {
|
||||
new PearsonsCorrelation(matrix);
|
||||
fail("Expecting IllegalArgumentException");
|
||||
} catch (IllegalArgumentException ex) {
|
||||
// Expected
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify that direct t-tests using standard error estimates are consistent
|
||||
* with reported p-values
|
||||
*/
|
||||
public void testStdErrorConsistency() throws Exception {
|
||||
TDistribution tDistribution = new TDistributionImpl(45);
|
||||
RealMatrix matrix = createRealMatrix(swissData, 47, 5);
|
||||
PearsonsCorrelation corrInstance = new PearsonsCorrelation(matrix);
|
||||
RealMatrix rValues = corrInstance.getCorrelationMatrix();
|
||||
RealMatrix pValues = corrInstance.getCorrelationPValues();
|
||||
RealMatrix stdErrors = corrInstance.getCorrelationStandardErrors();
|
||||
for (int i = 0; i < 5; i++) {
|
||||
for (int j = 0; j < i; j++) {
|
||||
double t = Math.abs(rValues.getEntry(i, j)) / stdErrors.getEntry(i, j);
|
||||
double p = 2 * (1 - tDistribution.cumulativeProbability(t));
|
||||
assertEquals(p, pValues.getEntry(i, j), 10E-15);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verify that creating correlation from covariance gives same results as
|
||||
* direct computation from the original matrix
|
||||
*/
|
||||
public void testCovarianceConsistency() throws Exception {
|
||||
RealMatrix matrix = createRealMatrix(longleyData, 16, 7);
|
||||
PearsonsCorrelation corrInstance = new PearsonsCorrelation(matrix);
|
||||
Covariance covInstance = new Covariance(matrix);
|
||||
PearsonsCorrelation corrFromCovInstance = new PearsonsCorrelation(covInstance);
|
||||
TestUtils.assertEquals("correlation values", corrInstance.getCorrelationMatrix(),
|
||||
corrFromCovInstance.getCorrelationMatrix(), 10E-15);
|
||||
TestUtils.assertEquals("p values", corrInstance.getCorrelationPValues(),
|
||||
corrFromCovInstance.getCorrelationPValues(), 10E-15);
|
||||
TestUtils.assertEquals("standard errors", corrInstance.getCorrelationStandardErrors(),
|
||||
corrFromCovInstance.getCorrelationStandardErrors(), 10E-15);
|
||||
|
||||
PearsonsCorrelation corrFromCovInstance2 =
|
||||
new PearsonsCorrelation(covInstance.getCovarianceMatrix(), 16);
|
||||
TestUtils.assertEquals("correlation values", corrInstance.getCorrelationMatrix(),
|
||||
corrFromCovInstance2.getCorrelationMatrix(), 10E-15);
|
||||
TestUtils.assertEquals("p values", corrInstance.getCorrelationPValues(),
|
||||
corrFromCovInstance2.getCorrelationPValues(), 10E-15);
|
||||
TestUtils.assertEquals("standard errors", corrInstance.getCorrelationStandardErrors(),
|
||||
corrFromCovInstance2.getCorrelationStandardErrors(), 10E-15);
|
||||
}
|
||||
|
||||
protected RealMatrix createRealMatrix(double[] data, int nRows, int nCols) {
|
||||
double[][] matrixData = new double[nRows][nCols];
|
||||
int ptr = 0;
|
||||
for (int i = 0; i < nRows; i++) {
|
||||
System.arraycopy(data, ptr, matrixData[i], 0, nCols);
|
||||
ptr += nCols;
|
||||
}
|
||||
return new DenseRealMatrix(matrixData);
|
||||
}
|
||||
|
||||
protected RealMatrix createLowerTriangularRealMatrix(double[] data, int dimension) {
|
||||
int ptr = 0;
|
||||
RealMatrix result = new DenseRealMatrix(dimension, dimension);
|
||||
for (int i = 1; i < dimension; i++) {
|
||||
for (int j = 0; j < i; j++) {
|
||||
result.setEntry(i, j, data[ptr]);
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
protected void fillUpper(RealMatrix matrix, double diagonalValue) {
|
||||
int dimension = matrix.getColumnDimension();
|
||||
for (int i = 0; i < dimension; i++) {
|
||||
matrix.setEntry(i, i, diagonalValue);
|
||||
for (int j = i+1; j < dimension; j++) {
|
||||
matrix.setEntry(i, j, matrix.getEntry(j, i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue