Added check and rescaling of expected counts to sum to sum of observed

counts if necessary in ChiSquare test.
JIRA: MATH-175
Reported and patched by Carl Anderson.


git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@610274 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Phil Steitz 2008-01-09 05:16:00 +00:00
parent fb4949bd07
commit a3dc59a94d
5 changed files with 56 additions and 19 deletions

View File

@ -50,6 +50,11 @@ public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest {
setDistribution(x); setDistribution(x);
} }
/** /**
* {@inheritDoc}
* <p><strong>Note: </strong>This implementation rescales the
* <code>expected</code> array if necessary to ensure that the sums of the
* expected and observed counts are equal.</p>
*
* @param observed array of observed frequency counts * @param observed array of observed frequency counts
* @param expected array of expected frequency counts * @param expected array of expected frequency counts
* @return chi-square test statistic * @return chi-square test statistic
@ -58,8 +63,6 @@ public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest {
*/ */
public double chiSquare(double[] expected, long[] observed) public double chiSquare(double[] expected, long[] observed)
throws IllegalArgumentException { throws IllegalArgumentException {
double sumSq = 0.0d;
double dev = 0.0d;
if ((expected.length < 2) || (expected.length != observed.length)) { if ((expected.length < 2) || (expected.length != observed.length)) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"observed, expected array lengths incorrect"); "observed, expected array lengths incorrect");
@ -68,14 +71,38 @@ public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest {
throw new IllegalArgumentException( throw new IllegalArgumentException(
"observed counts must be non-negative and expected counts must be postive"); "observed counts must be non-negative and expected counts must be postive");
} }
double sumExpected = 0d;
double sumObserved = 0d;
for (int i = 0; i < observed.length; i++) { for (int i = 0; i < observed.length; i++) {
dev = ((double) observed[i] - expected[i]); sumExpected += expected[i];
sumSq += dev * dev / expected[i]; sumObserved += observed[i];
}
double ratio = 1.0d;
boolean rescale = false;
if (Math.abs(sumExpected - sumObserved) > 10E-6) {
ratio = sumObserved / sumExpected;
rescale = true;
}
double sumSq = 0.0d;
double dev = 0.0d;
for (int i = 0; i < observed.length; i++) {
if (rescale) {
dev = ((double) observed[i] - ratio * expected[i]);
sumSq += dev * dev / (ratio * expected[i]);
} else {
dev = ((double) observed[i] - expected[i]);
sumSq += dev * dev / expected[i];
}
} }
return sumSq; return sumSq;
} }
/** /**
* {@inheritDoc}
* <p><strong>Note: </strong>This implementation rescales the
* <code>expected</code> array if necessary to ensure that the sums of the
* expected and observed counts are equal.</p>
*
* @param observed array of observed frequency counts * @param observed array of observed frequency counts
* @param expected array of expected frequency counts * @param expected array of expected frequency counts
* @return p-value * @return p-value
@ -90,6 +117,11 @@ public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest {
} }
/** /**
* {@inheritDoc}
* <p><strong>Note: </strong>This implementation rescales the
* <code>expected</code> array if necessary to ensure that the sums of the
* expected and observed counts are equal.</p>
*
* @param observed array of observed frequency counts * @param observed array of observed frequency counts
* @param expected array of expected frequency counts * @param expected array of expected frequency counts
* @param alpha significance level of the test * @param alpha significance level of the test

View File

@ -49,8 +49,9 @@ verifyTable <- function(counts, expectedP, expectedStat, tol, desc) {
verifyHomogeneity <- function(obs, exp, expectedP, expectedStat, verifyHomogeneity <- function(obs, exp, expectedP, expectedStat,
tol, desc) { tol, desc) {
chi <- sum((obs - exp)^2/exp) results <- chisq.test(obs,p=exp,rescale.p=TRUE)
p <- 1 - pchisq(sum((obs - exp)^2/exp), length(obs) - 1) chi <- results$statistic
p <- results$p.value
if (assertEquals(expectedP, p, tol, "p-value")) { if (assertEquals(expectedP, p, tol, "p-value")) {
displayPadded(c(desc, " p-value test"), SUCCEEDED, WIDTH) displayPadded(c(desc, " p-value test"), SUCCEEDED, WIDTH)
} else { } else {
@ -73,14 +74,14 @@ verifyHomogeneity(observed, expected, 0.904837418036, 0.2, tol,
observed <- c(500, 623, 72, 70, 31) observed <- c(500, 623, 72, 70, 31)
expected <- c(485, 541, 82, 61, 37) expected <- c(485, 541, 82, 61, 37)
verifyHomogeneity(observed, expected, 0.002512096, 16.4131070362, tol, verifyHomogeneity(observed, expected, 0.06051952647453607, 9.023307936427388,
"testChiSquare2") tol, "testChiSquare2")
observed <- c(2372383, 584222, 257170, 17750155, 7903832, 489265, observed <- c(2372383, 584222, 257170, 17750155, 7903832, 489265,
209628, 393899) 209628, 393899)
expected <- c(3389119.5, 649136.6, 285745.4, 25357364.76, 11291189.78, expected <- c(3389119.5, 649136.6, 285745.4, 25357364.76, 11291189.78,
543628.0, 232921.0, 437665.75) 543628.0, 232921.0, 437665.75)
verifyHomogeneity(observed, expected, 0, 3624883.342907764, tol, verifyHomogeneity(observed, expected, 0, 114875.90421929007, tol,
"testChiSquareLargeTestStatistic") "testChiSquareLargeTestStatistic")
counts <- matrix(c(40, 22, 43, 91, 21, 28, 60, 10, 22), nc = 3); counts <- matrix(c(40, 22, 43, 91, 21, 28, 60, 10, 22), nc = 3);

View File

@ -57,10 +57,10 @@ public class ChiSquareTestTest extends TestCase {
long[] observed1 = { 500, 623, 72, 70, 31 }; long[] observed1 = { 500, 623, 72, 70, 31 };
double[] expected1 = { 485, 541, 82, 61, 37 }; double[] expected1 = { 485, 541, 82, 61, 37 };
assertEquals( "chi-square test statistic", 16.4131070362, testStatistic.chiSquare(expected1, observed1), 1E-10); assertEquals( "chi-square test statistic", 9.023307936427388, testStatistic.chiSquare(expected1, observed1), 1E-10);
assertEquals("chi-square p-value", 0.002512096, testStatistic.chiSquareTest(expected1, observed1), 1E-9); assertEquals("chi-square p-value", 0.06051952647453607, testStatistic.chiSquareTest(expected1, observed1), 1E-9);
assertTrue("chi-square test reject", testStatistic.chiSquareTest(expected1, observed1, 0.003)); assertTrue("chi-square test reject", testStatistic.chiSquareTest(expected1, observed1, 0.08));
assertTrue("chi-square test accept", !testStatistic.chiSquareTest(expected1, observed1, 0.002)); assertTrue("chi-square test accept", !testStatistic.chiSquareTest(expected1, observed1, 0.05));
try { try {
testStatistic.chiSquareTest(expected1, observed1, 95); testStatistic.chiSquareTest(expected1, observed1, 95);
@ -181,7 +181,7 @@ public class ChiSquareTestTest extends TestCase {
double cst = csti.chiSquareTest(exp, obs); double cst = csti.chiSquareTest(exp, obs);
assertEquals("chi-square p-value", 0.0, cst, 1E-3); assertEquals("chi-square p-value", 0.0, cst, 1E-3);
assertEquals( "chi-square test statistic", assertEquals( "chi-square test statistic",
3624883.342907764, testStatistic.chiSquare(exp, obs), 1E-9); 114875.90421929007, testStatistic.chiSquare(exp, obs), 1E-9);
} }
/** Contingency table containing zeros - PR # 32531 */ /** Contingency table containing zeros - PR # 32531 */

View File

@ -55,10 +55,10 @@ public class TestUtilsTest extends TestCase {
long[] observed1 = { 500, 623, 72, 70, 31 }; long[] observed1 = { 500, 623, 72, 70, 31 };
double[] expected1 = { 485, 541, 82, 61, 37 }; double[] expected1 = { 485, 541, 82, 61, 37 };
assertEquals( "chi-square test statistic", 16.4131070362, TestUtils.chiSquare(expected1, observed1), 1E-10); assertEquals( "chi-square test statistic", 9.023307936427388, TestUtils.chiSquare(expected1, observed1), 1E-10);
assertEquals("chi-square p-value", 0.002512096, TestUtils.chiSquareTest(expected1, observed1), 1E-9); assertEquals("chi-square p-value", 0.06051952647453607, TestUtils.chiSquareTest(expected1, observed1), 1E-9);
assertTrue("chi-square test reject", TestUtils.chiSquareTest(expected1, observed1, 0.003)); assertTrue("chi-square test reject", TestUtils.chiSquareTest(expected1, observed1, 0.07));
assertTrue("chi-square test accept", !TestUtils.chiSquareTest(expected1, observed1, 0.002)); assertTrue("chi-square test accept", !TestUtils.chiSquareTest(expected1, observed1, 0.05));
try { try {
TestUtils.chiSquareTest(expected1, observed1, 95); TestUtils.chiSquareTest(expected1, observed1, 95);
@ -179,7 +179,7 @@ public class TestUtilsTest extends TestCase {
double cst = csti.chiSquareTest(exp, obs); double cst = csti.chiSquareTest(exp, obs);
assertEquals("chi-square p-value", 0.0, cst, 1E-3); assertEquals("chi-square p-value", 0.0, cst, 1E-3);
assertEquals( "chi-square test statistic", assertEquals( "chi-square test statistic",
3624883.342907764, TestUtils.chiSquare(exp, obs), 1E-9); 114875.90421929007, TestUtils.chiSquare(exp, obs), 1E-9);
} }
/** Contingency table containing zeros - PR # 32531 */ /** Contingency table containing zeros - PR # 32531 */

View File

@ -117,6 +117,10 @@ Commons Math Release Notes</title>
by exploiting the fact that this method has access to the full by exploiting the fact that this method has access to the full
array of data values. array of data values.
</action> </action>
<action dev="psteitz" type="fix" issue="MATH-175" due-to="Carl Anderson">
Added check and rescaling of expected counts to sum to sum of observed
counts if necessary in ChiSquare test.
</action>
</release> </release>
<release version="1.1" date="2005-12-17" <release version="1.1" date="2005-12-17"
description="This is a maintenance release containing bug fixes and enhancements. description="This is a maintenance release containing bug fixes and enhancements.