HADOOP-11149. Increase the timeout of TestZKFailoverController. Contributed by Steve Loughran.
parent 529103c51f
commit 579f9030da
@@ -833,6 +833,9 @@ Release 2.8.0 - UNRELEASED
     HADOOP-10068. Improve log4j regex in testFindContainingJar.
     (Robert Rati via wheat9)
 
+    HADOOP-11149. Increase the timeout of TestZKFailoverController.
+    (Steve Loughran via wheat9)
+
 Release 2.7.3 - UNRELEASED
 
   INCOMPATIBLE CHANGES
@@ -102,9 +102,11 @@ public class MiniZKFCCluster {
    * @throws Exception if either of the services had encountered a fatal error
    */
   public void stop() throws Exception {
-    for (DummyZKFCThread thr : thrs) {
-      if (thr != null) {
-        thr.interrupt();
+    if (thrs != null) {
+      for (DummyZKFCThread thr : thrs) {
+        if (thr != null) {
+          thr.interrupt();
+        }
       }
     }
     if (ctx != null) {
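Note: the new guard matters because `stop()` may now be called from a JUnit `@After` hook even when the cluster was never started, at which point `thrs` is still null. A minimal sketch of the same defensive-shutdown pattern; the names `WorkerPool` and `workers` are illustrative, not from the patch:

```java
import java.util.List;

class WorkerPool {
  private List<Thread> workers; // stays null until start() runs

  // Safe to call whether or not start() was ever invoked.
  void stop() {
    if (workers != null) {
      for (Thread worker : workers) {
        if (worker != null) {
          worker.interrupt();
        }
      }
    }
  }
}
```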
@@ -34,14 +34,23 @@ import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.ZooKeeper;
 import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
+import org.junit.After;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.Timeout;
 import org.mockito.Mockito;
 
 public class TestZKFailoverController extends ClientBaseWithFixes {
   private Configuration conf;
   private MiniZKFCCluster cluster;
 
+  /**
+   * Set the timeout for every test
+   */
+  @Rule
+  public Timeout testTimeout = new Timeout(3 * 60 * 1000);
+
   // Set up ZK digest-based credentials for the purposes of the tests,
   // to make sure all of our functionality works with auth and ACLs
   // present.
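For reference, a class-level JUnit 4 `Timeout` rule applies one limit to every test method in the class, which is what lets the per-method `timeout` arguments be dropped throughout the hunks below. A minimal self-contained sketch; the class and test names are illustrative, not from the patch:

```java
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.Timeout;

public class TimeoutRuleExample {
  // One rule covers every @Test in this class, replacing
  // individual @Test(timeout=...) annotations.
  @Rule
  public Timeout testTimeout = new Timeout(3 * 60 * 1000); // 3 minutes, in ms

  @Test
  public void finishesWellUnderTheLimit() throws Exception {
    Thread.sleep(10); // any test exceeding the rule's limit fails
  }
}
```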
@@ -74,11 +83,21 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
     this.cluster = new MiniZKFCCluster(conf, getServer(serverFactory));
   }
 
+  @After
+  public void teardown() {
+    if (cluster != null) {
+      try {
+        cluster.stop();
+      } catch (Exception e) {
+        LOG.warn("When stopping the cluster", e);
+      }
+    }
+  }
   /**
    * Test that the various command lines for formatting the ZK directory
    * function correctly.
    */
-  @Test(timeout=15000)
+  @Test
   public void testFormatZK() throws Exception {
     DummyHAService svc = cluster.getService(1);
     // Run without formatting the base dir,
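JUnit invokes an `@After` method once per test, whether the test passes or throws, so centralizing cleanup here is what allows each test in the later hunks to shed its own try/finally around `cluster.stop()`. A minimal sketch of the pattern, with illustrative names:

```java
import org.junit.After;
import org.junit.Test;

public class TeardownExample {
  private AutoCloseable resource;

  @Test
  public void usesResource() throws Exception {
    resource = () -> System.out.println("closed"); // stand-in resource
    // ... exercise the resource; no try/finally needed here ...
  }

  // Runs after every test method, even one that threw.
  @After
  public void teardown() throws Exception {
    if (resource != null) {
      resource.close();
    }
  }
}
```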
@@ -101,14 +120,14 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
    * Test that if ZooKeeper is not running, the correct error
    * code is returned.
    */
-  @Test(timeout=15000)
+  @Test
   public void testNoZK() throws Exception {
     stopServer();
     DummyHAService svc = cluster.getService(1);
     assertEquals(ZKFailoverController.ERR_CODE_NO_ZK,
         runFC(svc));
   }
 
   @Test
   public void testFormatOneClusterLeavesOtherClustersAlone() throws Exception {
     DummyHAService svc = cluster.getService(1);
@@ -146,7 +165,7 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
    * Test that automatic failover won't run against a target that hasn't
    * explicitly enabled the feature.
    */
-  @Test(timeout=10000)
+  @Test
   public void testWontRunWhenAutoFailoverDisabled() throws Exception {
     DummyHAService svc = cluster.getService(1);
     svc = Mockito.spy(svc);
@@ -162,7 +181,7 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
    * Test that, if ACLs are specified in the configuration, that
    * it sets the ACLs when formatting the parent node.
    */
-  @Test(timeout=15000)
+  @Test
   public void testFormatSetsAcls() throws Exception {
     // Format the base dir, should succeed
     DummyHAService svc = cluster.getService(1);
@@ -184,7 +203,7 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
    * Test that the ZKFC won't run if fencing is not configured for the
    * local service.
    */
-  @Test(timeout=15000)
+  @Test
   public void testFencingMustBeConfigured() throws Exception {
     DummyHAService svc = Mockito.spy(cluster.getService(0));
     Mockito.doThrow(new BadFencingConfigurationException("no fencing"))
@@ -202,31 +221,27 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
    * transition is used when possible, falling back to fencing when
    * the graceful approach fails.
    */
-  @Test(timeout=15000)
+  @Test
   public void testAutoFailoverOnBadHealth() throws Exception {
-    try {
-      cluster.start();
-      DummyHAService svc1 = cluster.getService(1);
-
-      LOG.info("Faking svc0 unhealthy, should failover to svc1");
-      cluster.setHealthy(0, false);
-
-      LOG.info("Waiting for svc0 to enter initializing state");
-      cluster.waitForHAState(0, HAServiceState.INITIALIZING);
-      cluster.waitForHAState(1, HAServiceState.ACTIVE);
-
-      LOG.info("Allowing svc0 to be healthy again, making svc1 unreachable " +
-          "and fail to gracefully go to standby");
-      cluster.setUnreachable(1, true);
-      cluster.setHealthy(0, true);
-
-      // Should fail back to svc0 at this point
-      cluster.waitForHAState(0, HAServiceState.ACTIVE);
-      // and fence svc1
-      Mockito.verify(svc1.fencer).fence(Mockito.same(svc1));
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+    DummyHAService svc1 = cluster.getService(1);
+
+    LOG.info("Faking svc0 unhealthy, should failover to svc1");
+    cluster.setHealthy(0, false);
+
+    LOG.info("Waiting for svc0 to enter initializing state");
+    cluster.waitForHAState(0, HAServiceState.INITIALIZING);
+    cluster.waitForHAState(1, HAServiceState.ACTIVE);
+
+    LOG.info("Allowing svc0 to be healthy again, making svc1 unreachable " +
+        "and fail to gracefully go to standby");
+    cluster.setUnreachable(1, true);
+    cluster.setHealthy(0, true);
+
+    // Should fail back to svc0 at this point
+    cluster.waitForHAState(0, HAServiceState.ACTIVE);
+    // and fence svc1
+    Mockito.verify(svc1.fencer).fence(Mockito.same(svc1));
   }
 
   /**
@@ -235,120 +250,104 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
    * transition is used when possible, falling back to fencing when
    * the graceful approach fails.
    */
-  @Test(timeout=15000)
+  @Test
   public void testAutoFailoverOnBadState() throws Exception {
-    try {
-      cluster.start();
-      DummyHAService svc0 = cluster.getService(0);
-      LOG.info("Faking svc0 to change the state, should failover to svc1");
-      svc0.state = HAServiceState.STANDBY;
-
-      // Should fail back to svc0 at this point
-      cluster.waitForHAState(1, HAServiceState.ACTIVE);
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+    DummyHAService svc0 = cluster.getService(0);
+    LOG.info("Faking svc0 to change the state, should failover to svc1");
+    svc0.state = HAServiceState.STANDBY;
+
+    // Should fail back to svc0 at this point
+    cluster.waitForHAState(1, HAServiceState.ACTIVE);
   }
 
-  @Test(timeout=15000)
+  @Test
   public void testAutoFailoverOnLostZKSession() throws Exception {
-    try {
-      cluster.start();
-
-      // Expire svc0, it should fail over to svc1
-      cluster.expireAndVerifyFailover(0, 1);
-
-      // Expire svc1, it should fail back to svc0
-      cluster.expireAndVerifyFailover(1, 0);
-
-      LOG.info("======= Running test cases second time to test " +
-          "re-establishment =========");
-      // Expire svc0, it should fail over to svc1
-      cluster.expireAndVerifyFailover(0, 1);
-
-      // Expire svc1, it should fail back to svc0
-      cluster.expireAndVerifyFailover(1, 0);
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+
+    // Expire svc0, it should fail over to svc1
+    cluster.expireAndVerifyFailover(0, 1);
+
+    // Expire svc1, it should fail back to svc0
+    cluster.expireAndVerifyFailover(1, 0);
+
+    LOG.info("======= Running test cases second time to test " +
+        "re-establishment =========");
+    // Expire svc0, it should fail over to svc1
+    cluster.expireAndVerifyFailover(0, 1);
+
+    // Expire svc1, it should fail back to svc0
+    cluster.expireAndVerifyFailover(1, 0);
   }
 
   /**
    * Test that, if the standby node is unhealthy, it doesn't try to become
    * active
    */
-  @Test(timeout=15000)
+  @Test
   public void testDontFailoverToUnhealthyNode() throws Exception {
-    try {
-      cluster.start();
-
-      // Make svc1 unhealthy, and wait for its FC to notice the bad health.
-      cluster.setHealthy(1, false);
-      cluster.waitForHealthState(1, HealthMonitor.State.SERVICE_UNHEALTHY);
-
-      // Expire svc0
-      cluster.getElector(0).preventSessionReestablishmentForTests();
-      try {
-        cluster.expireActiveLockHolder(0);
-
-        LOG.info("Expired svc0's ZK session. Waiting a second to give svc1" +
-            " a chance to take the lock, if it is ever going to.");
-        Thread.sleep(1000);
-
-        // Ensure that no one holds the lock.
-        cluster.waitForActiveLockHolder(null);
-
-      } finally {
-        LOG.info("Allowing svc0's elector to re-establish its connection");
-        cluster.getElector(0).allowSessionReestablishmentForTests();
-      }
-      // svc0 should get the lock again
-      cluster.waitForActiveLockHolder(0);
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+
+    // Make svc1 unhealthy, and wait for its FC to notice the bad health.
+    cluster.setHealthy(1, false);
+    cluster.waitForHealthState(1, HealthMonitor.State.SERVICE_UNHEALTHY);
+
+    // Expire svc0
+    cluster.getElector(0).preventSessionReestablishmentForTests();
+    try {
+      cluster.expireActiveLockHolder(0);
+
+      LOG.info("Expired svc0's ZK session. Waiting a second to give svc1" +
+          " a chance to take the lock, if it is ever going to.");
+      Thread.sleep(1000);
+
+      // Ensure that no one holds the lock.
+      cluster.waitForActiveLockHolder(null);
+
+    } finally {
+      LOG.info("Allowing svc0's elector to re-establish its connection");
+      cluster.getElector(0).allowSessionReestablishmentForTests();
+    }
+    // svc0 should get the lock again
+    cluster.waitForActiveLockHolder(0);
   }
 
   /**
    * Test that the ZKFC successfully quits the election when it fails to
    * become active. This allows the old node to successfully fail back.
    */
-  @Test(timeout=15000)
+  @Test
   public void testBecomingActiveFails() throws Exception {
-    try {
-      cluster.start();
-      DummyHAService svc1 = cluster.getService(1);
-
-      LOG.info("Making svc1 fail to become active");
-      cluster.setFailToBecomeActive(1, true);
-
-      LOG.info("Faking svc0 unhealthy, should NOT successfully " +
-          "failover to svc1");
-      cluster.setHealthy(0, false);
-      cluster.waitForHealthState(0, State.SERVICE_UNHEALTHY);
-      cluster.waitForActiveLockHolder(null);
-
-      Mockito.verify(svc1.proxy, Mockito.timeout(2000).atLeastOnce())
-          .transitionToActive(Mockito.<StateChangeRequestInfo>any());
-
-      cluster.waitForHAState(0, HAServiceState.INITIALIZING);
-      cluster.waitForHAState(1, HAServiceState.STANDBY);
-
-      LOG.info("Faking svc0 healthy again, should go back to svc0");
-      cluster.setHealthy(0, true);
-      cluster.waitForHAState(0, HAServiceState.ACTIVE);
-      cluster.waitForHAState(1, HAServiceState.STANDBY);
-      cluster.waitForActiveLockHolder(0);
-
-      // Ensure that we can fail back to svc1 once it it is able
-      // to become active (e.g the admin has restarted it)
-      LOG.info("Allowing svc1 to become active, expiring svc0");
-      svc1.failToBecomeActive = false;
-      cluster.expireAndVerifyFailover(0, 1);
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+    DummyHAService svc1 = cluster.getService(1);
+
+    LOG.info("Making svc1 fail to become active");
+    cluster.setFailToBecomeActive(1, true);
+
+    LOG.info("Faking svc0 unhealthy, should NOT successfully " +
+        "failover to svc1");
+    cluster.setHealthy(0, false);
+    cluster.waitForHealthState(0, State.SERVICE_UNHEALTHY);
+    cluster.waitForActiveLockHolder(null);
+
+    Mockito.verify(svc1.proxy, Mockito.timeout(2000).atLeastOnce())
+        .transitionToActive(Mockito.<StateChangeRequestInfo>any());
+
+    cluster.waitForHAState(0, HAServiceState.INITIALIZING);
+    cluster.waitForHAState(1, HAServiceState.STANDBY);
+
+    LOG.info("Faking svc0 healthy again, should go back to svc0");
+    cluster.setHealthy(0, true);
+    cluster.waitForHAState(0, HAServiceState.ACTIVE);
+    cluster.waitForHAState(1, HAServiceState.STANDBY);
+    cluster.waitForActiveLockHolder(0);
+
+    // Ensure that we can fail back to svc1 once it it is able
+    // to become active (e.g the admin has restarted it)
+    LOG.info("Allowing svc1 to become active, expiring svc0");
+    svc1.failToBecomeActive = false;
+    cluster.expireAndVerifyFailover(0, 1);
   }
 
   /**
@@ -356,211 +355,183 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
    * current state, without triggering any failovers, and without
    * causing the active node to enter standby state.
    */
-  @Test(timeout=15000)
+  @Test
   public void testZooKeeperFailure() throws Exception {
-    try {
-      cluster.start();
-
-      // Record initial ZK sessions
-      long session0 = cluster.getElector(0).getZKSessionIdForTests();
-      long session1 = cluster.getElector(1).getZKSessionIdForTests();
-
-      LOG.info("====== Stopping ZK server");
-      stopServer();
-      waitForServerDown(hostPort, CONNECTION_TIMEOUT);
-
-      LOG.info("====== Waiting for services to enter NEUTRAL mode");
-      cluster.waitForElectorState(0,
-          ActiveStandbyElector.State.NEUTRAL);
-      cluster.waitForElectorState(1,
-          ActiveStandbyElector.State.NEUTRAL);
-
-      LOG.info("====== Checking that the services didn't change HA state");
-      assertEquals(HAServiceState.ACTIVE, cluster.getService(0).state);
-      assertEquals(HAServiceState.STANDBY, cluster.getService(1).state);
-
-      LOG.info("====== Restarting server");
-      startServer();
-      waitForServerUp(hostPort, CONNECTION_TIMEOUT);
-
-      // Nodes should go back to their original states, since they re-obtain
-      // the same sessions.
-      cluster.waitForElectorState(0, ActiveStandbyElector.State.ACTIVE);
-      cluster.waitForElectorState(1, ActiveStandbyElector.State.STANDBY);
-      // Check HA states didn't change.
-      cluster.waitForHAState(0, HAServiceState.ACTIVE);
-      cluster.waitForHAState(1, HAServiceState.STANDBY);
-
-      // Check they re-used the same sessions and didn't spuriously reconnect
-      assertEquals(session0,
-          cluster.getElector(0).getZKSessionIdForTests());
-      assertEquals(session1,
-          cluster.getElector(1).getZKSessionIdForTests());
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+
+    // Record initial ZK sessions
+    long session0 = cluster.getElector(0).getZKSessionIdForTests();
+    long session1 = cluster.getElector(1).getZKSessionIdForTests();
+
+    LOG.info("====== Stopping ZK server");
+    stopServer();
+    waitForServerDown(hostPort, CONNECTION_TIMEOUT);
+
+    LOG.info("====== Waiting for services to enter NEUTRAL mode");
+    cluster.waitForElectorState(0,
+        ActiveStandbyElector.State.NEUTRAL);
+    cluster.waitForElectorState(1,
+        ActiveStandbyElector.State.NEUTRAL);
+
+    LOG.info("====== Checking that the services didn't change HA state");
+    assertEquals(HAServiceState.ACTIVE, cluster.getService(0).state);
+    assertEquals(HAServiceState.STANDBY, cluster.getService(1).state);
+
+    LOG.info("====== Restarting server");
+    startServer();
+    waitForServerUp(hostPort, CONNECTION_TIMEOUT);
+
+    // Nodes should go back to their original states, since they re-obtain
+    // the same sessions.
+    cluster.waitForElectorState(0, ActiveStandbyElector.State.ACTIVE);
+    cluster.waitForElectorState(1, ActiveStandbyElector.State.STANDBY);
+    // Check HA states didn't change.
+    cluster.waitForHAState(0, HAServiceState.ACTIVE);
+    cluster.waitForHAState(1, HAServiceState.STANDBY);
+
+    // Check they re-used the same sessions and didn't spuriously reconnect
+    assertEquals(session0,
+        cluster.getElector(0).getZKSessionIdForTests());
+    assertEquals(session1,
+        cluster.getElector(1).getZKSessionIdForTests());
   }
 
   /**
    * Test that the ZKFC can gracefully cede its active status.
    */
-  @Test(timeout=15000)
+  @Test
   public void testCedeActive() throws Exception {
-    try {
-      cluster.start();
-      DummyZKFC zkfc = cluster.getZkfc(0);
-      // It should be in active to start.
-      assertEquals(ActiveStandbyElector.State.ACTIVE,
-          zkfc.getElectorForTests().getStateForTests());
-
-      // Ask it to cede active for 3 seconds. It should respond promptly
-      // (i.e. the RPC itself should not take 3 seconds!)
-      ZKFCProtocol proxy = zkfc.getLocalTarget().getZKFCProxy(conf, 5000);
-      long st = Time.now();
-      proxy.cedeActive(3000);
-      long et = Time.now();
-      assertTrue("RPC to cedeActive took " + (et - st) + " ms",
-          et - st < 1000);
-
-      // Should be in "INIT" state since it's not in the election
-      // at this point.
-      assertEquals(ActiveStandbyElector.State.INIT,
-          zkfc.getElectorForTests().getStateForTests());
-
-      // After the prescribed 3 seconds, should go into STANDBY state,
-      // since the other node in the cluster would have taken ACTIVE.
-      cluster.waitForElectorState(0, ActiveStandbyElector.State.STANDBY);
-      long et2 = Time.now();
-      assertTrue("Should take ~3 seconds to rejoin. Only took " + (et2 - et) +
-          "ms before rejoining.",
-          et2 - et > 2800);
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+    DummyZKFC zkfc = cluster.getZkfc(0);
+    // It should be in active to start.
+    assertEquals(ActiveStandbyElector.State.ACTIVE,
+        zkfc.getElectorForTests().getStateForTests());
+
+    // Ask it to cede active for 3 seconds. It should respond promptly
+    // (i.e. the RPC itself should not take 3 seconds!)
+    ZKFCProtocol proxy = zkfc.getLocalTarget().getZKFCProxy(conf, 5000);
+    long st = Time.now();
+    proxy.cedeActive(3000);
+    long et = Time.now();
+    assertTrue("RPC to cedeActive took " + (et - st) + " ms",
+        et - st < 1000);
+
+    // Should be in "INIT" state since it's not in the election
+    // at this point.
+    assertEquals(ActiveStandbyElector.State.INIT,
+        zkfc.getElectorForTests().getStateForTests());
+
+    // After the prescribed 3 seconds, should go into STANDBY state,
+    // since the other node in the cluster would have taken ACTIVE.
+    cluster.waitForElectorState(0, ActiveStandbyElector.State.STANDBY);
+    long et2 = Time.now();
+    assertTrue("Should take ~3 seconds to rejoin. Only took " + (et2 - et) +
+        "ms before rejoining.",
+        et2 - et > 2800);
   }
 
-  @Test(timeout=25000)
+  @Test
   public void testGracefulFailover() throws Exception {
-    try {
-      cluster.start();
-
-      cluster.waitForActiveLockHolder(0);
-      cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
-      cluster.waitForActiveLockHolder(1);
-
-      cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover();
-      cluster.waitForActiveLockHolder(0);
-
-      Thread.sleep(10000); // allow to quiesce
-
-      assertEquals(0, cluster.getService(0).fenceCount);
-      assertEquals(0, cluster.getService(1).fenceCount);
-      assertEquals(2, cluster.getService(0).activeTransitionCount);
-      assertEquals(1, cluster.getService(1).activeTransitionCount);
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+
+    cluster.waitForActiveLockHolder(0);
+    cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
+    cluster.waitForActiveLockHolder(1);
+
+    cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover();
+    cluster.waitForActiveLockHolder(0);
+
+    Thread.sleep(10000); // allow to quiesce
+
+    assertEquals(0, cluster.getService(0).fenceCount);
+    assertEquals(0, cluster.getService(1).fenceCount);
+    assertEquals(2, cluster.getService(0).activeTransitionCount);
+    assertEquals(1, cluster.getService(1).activeTransitionCount);
   }
 
-  @Test(timeout=15000)
+  @Test
   public void testGracefulFailoverToUnhealthy() throws Exception {
-    try {
-      cluster.start();
-
-      cluster.waitForActiveLockHolder(0);
-
-      // Mark it unhealthy, wait for it to exit election
-      cluster.setHealthy(1, false);
-      cluster.waitForElectorState(1, ActiveStandbyElector.State.INIT);
-
-      // Ask for failover, it should fail, because it's unhealthy
-      try {
-        cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
-        fail("Did not fail to graceful failover to unhealthy service!");
-      } catch (ServiceFailedException sfe) {
-        GenericTestUtils.assertExceptionContains(
-            cluster.getService(1).toString() +
-            " is not currently healthy.", sfe);
-      }
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+
+    cluster.waitForActiveLockHolder(0);
+
+    // Mark it unhealthy, wait for it to exit election
+    cluster.setHealthy(1, false);
+    cluster.waitForElectorState(1, ActiveStandbyElector.State.INIT);
+
+    // Ask for failover, it should fail, because it's unhealthy
+    try {
+      cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
+      fail("Did not fail to graceful failover to unhealthy service!");
+    } catch (ServiceFailedException sfe) {
+      GenericTestUtils.assertExceptionContains(
+          cluster.getService(1).toString() +
+          " is not currently healthy.", sfe);
+    }
   }
 
-  @Test(timeout=15000)
+  @Test
   public void testGracefulFailoverFailBecomingActive() throws Exception {
-    try {
-      cluster.start();
-
-      cluster.waitForActiveLockHolder(0);
-      cluster.setFailToBecomeActive(1, true);
-
-      // Ask for failover, it should fail and report back to user.
-      try {
-        cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
-        fail("Did not fail to graceful failover when target failed " +
-            "to become active!");
-      } catch (ServiceFailedException sfe) {
-        GenericTestUtils.assertExceptionContains(
-            "Couldn't make " + cluster.getService(1) + " active", sfe);
-        GenericTestUtils.assertExceptionContains(
-            "injected failure", sfe);
-      }
-
-      // No fencing
-      assertEquals(0, cluster.getService(0).fenceCount);
-      assertEquals(0, cluster.getService(1).fenceCount);
-
-      // Service 0 should go back to being active after the failed failover
-      cluster.waitForActiveLockHolder(0);
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+
+    cluster.waitForActiveLockHolder(0);
+    cluster.setFailToBecomeActive(1, true);
+
+    // Ask for failover, it should fail and report back to user.
+    try {
+      cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
+      fail("Did not fail to graceful failover when target failed " +
+          "to become active!");
+    } catch (ServiceFailedException sfe) {
+      GenericTestUtils.assertExceptionContains(
+          "Couldn't make " + cluster.getService(1) + " active", sfe);
+      GenericTestUtils.assertExceptionContains(
+          "injected failure", sfe);
+    }
+
+    // No fencing
+    assertEquals(0, cluster.getService(0).fenceCount);
+    assertEquals(0, cluster.getService(1).fenceCount);
+
+    // Service 0 should go back to being active after the failed failover
+    cluster.waitForActiveLockHolder(0);
   }
 
-  @Test(timeout=15000)
+  @Test
   public void testGracefulFailoverFailBecomingStandby() throws Exception {
-    try {
-      cluster.start();
-
-      cluster.waitForActiveLockHolder(0);
-
-      // Ask for failover when old node fails to transition to standby.
-      // This should trigger fencing, since the cedeActive() command
-      // still works, but leaves the breadcrumb in place.
-      cluster.setFailToBecomeStandby(0, true);
-      cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
-
-      // Check that the old node was fenced
-      assertEquals(1, cluster.getService(0).fenceCount);
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+
+    cluster.waitForActiveLockHolder(0);
+
+    // Ask for failover when old node fails to transition to standby.
+    // This should trigger fencing, since the cedeActive() command
+    // still works, but leaves the breadcrumb in place.
+    cluster.setFailToBecomeStandby(0, true);
+    cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
+
+    // Check that the old node was fenced
+    assertEquals(1, cluster.getService(0).fenceCount);
   }
 
-  @Test(timeout=15000)
+  @Test
   public void testGracefulFailoverFailBecomingStandbyAndFailFence()
       throws Exception {
-    try {
-      cluster.start();
-
-      cluster.waitForActiveLockHolder(0);
-
-      // Ask for failover when old node fails to transition to standby.
-      // This should trigger fencing, since the cedeActive() command
-      // still works, but leaves the breadcrumb in place.
-      cluster.setFailToBecomeStandby(0, true);
-      cluster.setFailToFence(0, true);
-
-      try {
-        cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
-        fail("Failover should have failed when old node wont fence");
-      } catch (ServiceFailedException sfe) {
-        GenericTestUtils.assertExceptionContains(
-            "Unable to fence " + cluster.getService(0), sfe);
-      }
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+
+    cluster.waitForActiveLockHolder(0);
+
+    // Ask for failover when old node fails to transition to standby.
+    // This should trigger fencing, since the cedeActive() command
+    // still works, but leaves the breadcrumb in place.
+    cluster.setFailToBecomeStandby(0, true);
+    cluster.setFailToFence(0, true);
+
+    try {
+      cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
+      fail("Failover should have failed when old node wont fence");
+    } catch (ServiceFailedException sfe) {
+      GenericTestUtils.assertExceptionContains(
+          "Unable to fence " + cluster.getService(0), sfe);
+    }
   }
@@ -568,43 +539,39 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
    * Test which exercises all of the inputs into ZKFC. This is particularly
    * useful for running under jcarder to check for lock order violations.
    */
-  @Test(timeout=30000)
+  @Test
   public void testOneOfEverything() throws Exception {
-    try {
-      cluster.start();
-
-      // Failover by session expiration
-      LOG.info("====== Failing over by session expiration");
-      cluster.expireAndVerifyFailover(0, 1);
-      cluster.expireAndVerifyFailover(1, 0);
-
-      // Restart ZK
-      LOG.info("====== Restarting server");
-      stopServer();
-      waitForServerDown(hostPort, CONNECTION_TIMEOUT);
-      startServer();
-      waitForServerUp(hostPort, CONNECTION_TIMEOUT);
-
-      // Failover by bad health
-      cluster.setHealthy(0, false);
-      cluster.waitForHAState(0, HAServiceState.INITIALIZING);
-      cluster.waitForHAState(1, HAServiceState.ACTIVE);
-      cluster.setHealthy(1, true);
-      cluster.setHealthy(0, false);
-      cluster.waitForHAState(1, HAServiceState.ACTIVE);
-      cluster.waitForHAState(0, HAServiceState.INITIALIZING);
-      cluster.setHealthy(0, true);
-
-      cluster.waitForHealthState(0, State.SERVICE_HEALTHY);
-
-      // Graceful failovers
-      cluster.getZkfc(1).gracefulFailoverToYou();
-      cluster.getZkfc(0).gracefulFailoverToYou();
-    } finally {
-      cluster.stop();
-    }
+    cluster.start();
+
+    // Failover by session expiration
+    LOG.info("====== Failing over by session expiration");
+    cluster.expireAndVerifyFailover(0, 1);
+    cluster.expireAndVerifyFailover(1, 0);
+
+    // Restart ZK
+    LOG.info("====== Restarting server");
+    stopServer();
+    waitForServerDown(hostPort, CONNECTION_TIMEOUT);
+    startServer();
+    waitForServerUp(hostPort, CONNECTION_TIMEOUT);
+
+    // Failover by bad health
+    cluster.setHealthy(0, false);
+    cluster.waitForHAState(0, HAServiceState.INITIALIZING);
+    cluster.waitForHAState(1, HAServiceState.ACTIVE);
+    cluster.setHealthy(1, true);
+    cluster.setHealthy(0, false);
+    cluster.waitForHAState(1, HAServiceState.ACTIVE);
+    cluster.waitForHAState(0, HAServiceState.INITIALIZING);
+    cluster.setHealthy(0, true);
+
+    cluster.waitForHealthState(0, State.SERVICE_HEALTHY);
+
+    // Graceful failovers
+    cluster.getZkfc(1).gracefulFailoverToYou();
+    cluster.getZkfc(0).gracefulFailoverToYou();
   }
 
   private int runFC(DummyHAService target, String ... args) throws Exception {
     DummyZKFC zkfc = new DummyZKFC(conf, target);
     return zkfc.run(args);