Added check and rescaling of expected counts to sum to sum of observed

counts if necessary in ChiSquare test.
JIRA: MATH-175
Reported and patched by Carl Anderson.


git-svn-id: https://svn.apache.org/repos/asf/commons/proper/math/trunk@610274 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Phil Steitz 2008-01-09 05:16:00 +00:00
parent fb4949bd07
commit a3dc59a94d
5 changed files with 56 additions and 19 deletions

View File

@ -50,6 +50,11 @@ public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest {
setDistribution(x);
}
/**
* {@inheritDoc}
* <p><strong>Note: </strong>This implementation rescales the
* <code>expected</code> array if necessary to ensure that the sums of the
* expected and observed counts are equal.</p>
*
* @param observed array of observed frequency counts
* @param expected array of expected frequency counts
* @return chi-square test statistic
@ -58,8 +63,6 @@ public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest {
*/
public double chiSquare(double[] expected, long[] observed)
throws IllegalArgumentException {
double sumSq = 0.0d;
double dev = 0.0d;
if ((expected.length < 2) || (expected.length != observed.length)) {
throw new IllegalArgumentException(
"observed, expected array lengths incorrect");
@ -68,14 +71,38 @@ public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest {
throw new IllegalArgumentException(
"observed counts must be non-negative and expected counts must be postive");
}
double sumExpected = 0d;
double sumObserved = 0d;
for (int i = 0; i < observed.length; i++) {
dev = ((double) observed[i] - expected[i]);
sumSq += dev * dev / expected[i];
sumExpected += expected[i];
sumObserved += observed[i];
}
double ratio = 1.0d;
boolean rescale = false;
if (Math.abs(sumExpected - sumObserved) > 10E-6) {
ratio = sumObserved / sumExpected;
rescale = true;
}
double sumSq = 0.0d;
double dev = 0.0d;
for (int i = 0; i < observed.length; i++) {
if (rescale) {
dev = ((double) observed[i] - ratio * expected[i]);
sumSq += dev * dev / (ratio * expected[i]);
} else {
dev = ((double) observed[i] - expected[i]);
sumSq += dev * dev / expected[i];
}
}
return sumSq;
}
/**
* {@inheritDoc}
* <p><strong>Note: </strong>This implementation rescales the
* <code>expected</code> array if necessary to ensure that the sums of the
* expected and observed counts are equal.</p>
*
* @param observed array of observed frequency counts
* @param expected array of expected frequency counts
* @return p-value
@ -90,6 +117,11 @@ public class ChiSquareTestImpl implements UnknownDistributionChiSquareTest {
}
/**
* {@inheritDoc}
* <p><strong>Note: </strong>This implementation rescales the
* <code>expected</code> array if necessary to ensure that the sums of the
* expected and observed counts are equal.</p>
*
* @param observed array of observed frequency counts
* @param expected array of expected frequency counts
* @param alpha significance level of the test

View File

@ -49,8 +49,9 @@ verifyTable <- function(counts, expectedP, expectedStat, tol, desc) {
verifyHomogeneity <- function(obs, exp, expectedP, expectedStat,
tol, desc) {
chi <- sum((obs - exp)^2/exp)
p <- 1 - pchisq(sum((obs - exp)^2/exp), length(obs) - 1)
results <- chisq.test(obs,p=exp,rescale.p=TRUE)
chi <- results$statistic
p <- results$p.value
if (assertEquals(expectedP, p, tol, "p-value")) {
displayPadded(c(desc, " p-value test"), SUCCEEDED, WIDTH)
} else {
@ -73,14 +74,14 @@ verifyHomogeneity(observed, expected, 0.904837418036, 0.2, tol,
observed <- c(500, 623, 72, 70, 31)
expected <- c(485, 541, 82, 61, 37)
verifyHomogeneity(observed, expected, 0.002512096, 16.4131070362, tol,
"testChiSquare2")
verifyHomogeneity(observed, expected, 0.06051952647453607, 9.023307936427388,
tol, "testChiSquare2")
observed <- c(2372383, 584222, 257170, 17750155, 7903832, 489265,
209628, 393899)
expected <- c(3389119.5, 649136.6, 285745.4, 25357364.76, 11291189.78,
543628.0, 232921.0, 437665.75)
verifyHomogeneity(observed, expected, 0, 3624883.342907764, tol,
verifyHomogeneity(observed, expected, 0, 114875.90421929007, tol,
"testChiSquareLargeTestStatistic")
counts <- matrix(c(40, 22, 43, 91, 21, 28, 60, 10, 22), nc = 3);

View File

@ -57,10 +57,10 @@ public class ChiSquareTestTest extends TestCase {
long[] observed1 = { 500, 623, 72, 70, 31 };
double[] expected1 = { 485, 541, 82, 61, 37 };
assertEquals( "chi-square test statistic", 16.4131070362, testStatistic.chiSquare(expected1, observed1), 1E-10);
assertEquals("chi-square p-value", 0.002512096, testStatistic.chiSquareTest(expected1, observed1), 1E-9);
assertTrue("chi-square test reject", testStatistic.chiSquareTest(expected1, observed1, 0.003));
assertTrue("chi-square test accept", !testStatistic.chiSquareTest(expected1, observed1, 0.002));
assertEquals( "chi-square test statistic", 9.023307936427388, testStatistic.chiSquare(expected1, observed1), 1E-10);
assertEquals("chi-square p-value", 0.06051952647453607, testStatistic.chiSquareTest(expected1, observed1), 1E-9);
assertTrue("chi-square test reject", testStatistic.chiSquareTest(expected1, observed1, 0.08));
assertTrue("chi-square test accept", !testStatistic.chiSquareTest(expected1, observed1, 0.05));
try {
testStatistic.chiSquareTest(expected1, observed1, 95);
@ -181,7 +181,7 @@ public class ChiSquareTestTest extends TestCase {
double cst = csti.chiSquareTest(exp, obs);
assertEquals("chi-square p-value", 0.0, cst, 1E-3);
assertEquals( "chi-square test statistic",
3624883.342907764, testStatistic.chiSquare(exp, obs), 1E-9);
114875.90421929007, testStatistic.chiSquare(exp, obs), 1E-9);
}
/** Contingency table containing zeros - PR # 32531 */

View File

@ -55,10 +55,10 @@ public class TestUtilsTest extends TestCase {
long[] observed1 = { 500, 623, 72, 70, 31 };
double[] expected1 = { 485, 541, 82, 61, 37 };
assertEquals( "chi-square test statistic", 16.4131070362, TestUtils.chiSquare(expected1, observed1), 1E-10);
assertEquals("chi-square p-value", 0.002512096, TestUtils.chiSquareTest(expected1, observed1), 1E-9);
assertTrue("chi-square test reject", TestUtils.chiSquareTest(expected1, observed1, 0.003));
assertTrue("chi-square test accept", !TestUtils.chiSquareTest(expected1, observed1, 0.002));
assertEquals( "chi-square test statistic", 9.023307936427388, TestUtils.chiSquare(expected1, observed1), 1E-10);
assertEquals("chi-square p-value", 0.06051952647453607, TestUtils.chiSquareTest(expected1, observed1), 1E-9);
assertTrue("chi-square test reject", TestUtils.chiSquareTest(expected1, observed1, 0.07));
assertTrue("chi-square test accept", !TestUtils.chiSquareTest(expected1, observed1, 0.05));
try {
TestUtils.chiSquareTest(expected1, observed1, 95);
@ -179,7 +179,7 @@ public class TestUtilsTest extends TestCase {
double cst = csti.chiSquareTest(exp, obs);
assertEquals("chi-square p-value", 0.0, cst, 1E-3);
assertEquals( "chi-square test statistic",
3624883.342907764, TestUtils.chiSquare(exp, obs), 1E-9);
114875.90421929007, TestUtils.chiSquare(exp, obs), 1E-9);
}
/** Contingency table containing zeros - PR # 32531 */

View File

@ -116,6 +116,10 @@ Commons Math Release Notes</title>
Changed Mean.evaluate() to use a two-pass algorithm, improving accuracy
by exploiting the fact that this method has access to the full
array of data values.
</action>
<action dev="psteitz" type="fix" issue="MATH-175" due-to="Carl Anderson">
Added check and rescaling of expected counts to sum to sum of observed
counts if necessary in ChiSquare test.
</action>
</release>
<release version="1.1" date="2005-12-17"