HADOOP-11149. TestZKFailoverController times out. Contributed by Steve Loughran. closes apache/hadoop#51

This commit is contained in:
Tsuyoshi Ozawa 2015-11-23 05:38:42 +09:00
parent a4bd54f9d7
commit 053a511919
3 changed files with 325 additions and 357 deletions

View File

@ -1355,6 +1355,9 @@ Release 2.8.0 - UNRELEASED
HADOOP-8419. Fixed GzipCode NPE reset for IBM JDK. (Yu Li via eyang) HADOOP-8419. Fixed GzipCode NPE reset for IBM JDK. (Yu Li via eyang)
HADOOP-11149. TestZKFailoverController times out. (Steve Loughran
via ozawa)
OPTIMIZATIONS OPTIMIZATIONS
HADOOP-12051. ProtobufRpcEngine.invoke() should use Exception.toString() HADOOP-12051. ProtobufRpcEngine.invoke() should use Exception.toString()

View File

@ -133,9 +133,11 @@ public void start(int count) throws Exception {
* @throws Exception if either of the services had encountered a fatal error * @throws Exception if either of the services had encountered a fatal error
*/ */
public void stop() throws Exception { public void stop() throws Exception {
for (DummyZKFCThread thr : thrs) { if (thrs != null) {
if (thr != null) { for (DummyZKFCThread thr : thrs) {
thr.interrupt(); if (thr != null) {
thr.interrupt();
}
} }
} }
if (ctx != null) { if (ctx != null) {

View File

@ -34,14 +34,23 @@
import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat; import org.apache.zookeeper.data.Stat;
import org.apache.zookeeper.server.auth.DigestAuthenticationProvider; import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
import org.junit.After;
import org.junit.Before; import org.junit.Before;
import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.Timeout;
import org.mockito.Mockito; import org.mockito.Mockito;
public class TestZKFailoverController extends ClientBaseWithFixes { public class TestZKFailoverController extends ClientBaseWithFixes {
private Configuration conf; private Configuration conf;
private MiniZKFCCluster cluster; private MiniZKFCCluster cluster;
/**
 * Class-wide JUnit timeout rule, applied to every test method in this
 * class in place of a per-test {@code @Test(timeout=...)} value.
 * The limit is 3 * 60 * 1000, i.e. three minutes when interpreted as
 * milliseconds (the unit taken by the JUnit 4 {@code Timeout(int)}
 * constructor).
 */
@Rule
public Timeout testTimeout = new Timeout(3 * 60 * 1000);
// Set up ZK digest-based credentials for the purposes of the tests, // Set up ZK digest-based credentials for the purposes of the tests,
// to make sure all of our functionality works with auth and ACLs // to make sure all of our functionality works with auth and ACLs
// present. // present.
@ -74,11 +83,21 @@ public void setupConfAndServices() {
this.cluster = new MiniZKFCCluster(conf, getServer(serverFactory)); this.cluster = new MiniZKFCCluster(conf, getServer(serverFactory));
} }
/**
 * Stops the mini ZKFC cluster after each test, if one was created.
 * Any exception raised while stopping is logged at WARN level rather
 * than propagated.
 */
@After
public void teardown() {
  if (cluster == null) {
    // Nothing was set up, so there is nothing to stop.
    return;
  }
  try {
    cluster.stop();
  } catch (Exception e) {
    LOG.warn("When stopping the cluster", e);
  }
}
/** /**
* Test that the various command lines for formatting the ZK directory * Test that the various command lines for formatting the ZK directory
* function correctly. * function correctly.
*/ */
@Test(timeout=15000) @Test
public void testFormatZK() throws Exception { public void testFormatZK() throws Exception {
DummyHAService svc = cluster.getService(1); DummyHAService svc = cluster.getService(1);
// Run without formatting the base dir, // Run without formatting the base dir,
@ -101,14 +120,14 @@ public void testFormatZK() throws Exception {
* Test that if ZooKeeper is not running, the correct error * Test that if ZooKeeper is not running, the correct error
* code is returned. * code is returned.
*/ */
@Test(timeout=15000) @Test
public void testNoZK() throws Exception { public void testNoZK() throws Exception {
stopServer(); stopServer();
DummyHAService svc = cluster.getService(1); DummyHAService svc = cluster.getService(1);
assertEquals(ZKFailoverController.ERR_CODE_NO_ZK, assertEquals(ZKFailoverController.ERR_CODE_NO_ZK,
runFC(svc)); runFC(svc));
} }
@Test @Test
public void testFormatOneClusterLeavesOtherClustersAlone() throws Exception { public void testFormatOneClusterLeavesOtherClustersAlone() throws Exception {
DummyHAService svc = cluster.getService(1); DummyHAService svc = cluster.getService(1);
@ -146,7 +165,7 @@ protected String getScopeInsideParentNode() {
* Test that automatic failover won't run against a target that hasn't * Test that automatic failover won't run against a target that hasn't
* explicitly enabled the feature. * explicitly enabled the feature.
*/ */
@Test(timeout=10000) @Test
public void testWontRunWhenAutoFailoverDisabled() throws Exception { public void testWontRunWhenAutoFailoverDisabled() throws Exception {
DummyHAService svc = cluster.getService(1); DummyHAService svc = cluster.getService(1);
svc = Mockito.spy(svc); svc = Mockito.spy(svc);
@ -162,7 +181,7 @@ public void testWontRunWhenAutoFailoverDisabled() throws Exception {
* Test that, if ACLs are specified in the configuration, that * Test that, if ACLs are specified in the configuration, that
* it sets the ACLs when formatting the parent node. * it sets the ACLs when formatting the parent node.
*/ */
@Test(timeout=15000) @Test
public void testFormatSetsAcls() throws Exception { public void testFormatSetsAcls() throws Exception {
// Format the base dir, should succeed // Format the base dir, should succeed
DummyHAService svc = cluster.getService(1); DummyHAService svc = cluster.getService(1);
@ -184,7 +203,7 @@ public void testFormatSetsAcls() throws Exception {
* Test that the ZKFC won't run if fencing is not configured for the * Test that the ZKFC won't run if fencing is not configured for the
* local service. * local service.
*/ */
@Test(timeout=15000) @Test
public void testFencingMustBeConfigured() throws Exception { public void testFencingMustBeConfigured() throws Exception {
DummyHAService svc = Mockito.spy(cluster.getService(0)); DummyHAService svc = Mockito.spy(cluster.getService(0));
Mockito.doThrow(new BadFencingConfigurationException("no fencing")) Mockito.doThrow(new BadFencingConfigurationException("no fencing"))
@ -202,31 +221,27 @@ public void testFencingMustBeConfigured() throws Exception {
* transition is used when possible, falling back to fencing when * transition is used when possible, falling back to fencing when
* the graceful approach fails. * the graceful approach fails.
*/ */
@Test(timeout=15000) @Test
public void testAutoFailoverOnBadHealth() throws Exception { public void testAutoFailoverOnBadHealth() throws Exception {
try { cluster.start();
cluster.start(); DummyHAService svc1 = cluster.getService(1);
DummyHAService svc1 = cluster.getService(1);
LOG.info("Faking svc0 unhealthy, should failover to svc1");
LOG.info("Faking svc0 unhealthy, should failover to svc1"); cluster.setHealthy(0, false);
cluster.setHealthy(0, false);
LOG.info("Waiting for svc0 to enter initializing state");
LOG.info("Waiting for svc0 to enter initializing state"); cluster.waitForHAState(0, HAServiceState.INITIALIZING);
cluster.waitForHAState(0, HAServiceState.INITIALIZING); cluster.waitForHAState(1, HAServiceState.ACTIVE);
cluster.waitForHAState(1, HAServiceState.ACTIVE);
LOG.info("Allowing svc0 to be healthy again, making svc1 unreachable " +
LOG.info("Allowing svc0 to be healthy again, making svc1 unreachable " + "and fail to gracefully go to standby");
"and fail to gracefully go to standby"); cluster.setUnreachable(1, true);
cluster.setUnreachable(1, true); cluster.setHealthy(0, true);
cluster.setHealthy(0, true);
// Should fail back to svc0 at this point
// Should fail back to svc0 at this point cluster.waitForHAState(0, HAServiceState.ACTIVE);
cluster.waitForHAState(0, HAServiceState.ACTIVE); // and fence svc1
// and fence svc1 Mockito.verify(svc1.fencer).fence(Mockito.same(svc1));
Mockito.verify(svc1.fencer).fence(Mockito.same(svc1));
} finally {
cluster.stop();
}
} }
/** /**
@ -235,120 +250,104 @@ public void testAutoFailoverOnBadHealth() throws Exception {
* transition is used when possible, falling back to fencing when * transition is used when possible, falling back to fencing when
* the graceful approach fails. * the graceful approach fails.
*/ */
@Test(timeout=15000) @Test
public void testAutoFailoverOnBadState() throws Exception { public void testAutoFailoverOnBadState() throws Exception {
try { cluster.start();
cluster.start(); DummyHAService svc0 = cluster.getService(0);
DummyHAService svc0 = cluster.getService(0); LOG.info("Faking svc0 to change the state, should failover to svc1");
LOG.info("Faking svc0 to change the state, should failover to svc1"); svc0.state = HAServiceState.STANDBY;
svc0.state = HAServiceState.STANDBY;
// Should fail back to svc0 at this point
cluster.waitForHAState(1, HAServiceState.ACTIVE);
} finally {
cluster.stop();
}
}
@Test(timeout=15000)
public void testAutoFailoverOnLostZKSession() throws Exception {
try {
cluster.start();
// Expire svc0, it should fail over to svc1 // Should fail back to svc0 at this point
cluster.expireAndVerifyFailover(0, 1); cluster.waitForHAState(1, HAServiceState.ACTIVE);
}
// Expire svc1, it should fail back to svc0
cluster.expireAndVerifyFailover(1, 0); @Test
public void testAutoFailoverOnLostZKSession() throws Exception {
LOG.info("======= Running test cases second time to test " + cluster.start();
"re-establishment =========");
// Expire svc0, it should fail over to svc1 // Expire svc0, it should fail over to svc1
cluster.expireAndVerifyFailover(0, 1); cluster.expireAndVerifyFailover(0, 1);
// Expire svc1, it should fail back to svc0 // Expire svc1, it should fail back to svc0
cluster.expireAndVerifyFailover(1, 0); cluster.expireAndVerifyFailover(1, 0);
} finally {
cluster.stop(); LOG.info("======= Running test cases second time to test " +
} "re-establishment =========");
// Expire svc0, it should fail over to svc1
cluster.expireAndVerifyFailover(0, 1);
// Expire svc1, it should fail back to svc0
cluster.expireAndVerifyFailover(1, 0);
} }
/** /**
* Test that, if the standby node is unhealthy, it doesn't try to become * Test that, if the standby node is unhealthy, it doesn't try to become
* active * active
*/ */
@Test(timeout=15000) @Test
public void testDontFailoverToUnhealthyNode() throws Exception { public void testDontFailoverToUnhealthyNode() throws Exception {
cluster.start();
// Make svc1 unhealthy, and wait for its FC to notice the bad health.
cluster.setHealthy(1, false);
cluster.waitForHealthState(1, HealthMonitor.State.SERVICE_UNHEALTHY);
// Expire svc0
cluster.getElector(0).preventSessionReestablishmentForTests();
try { try {
cluster.start(); cluster.expireActiveLockHolder(0);
// Make svc1 unhealthy, and wait for its FC to notice the bad health. LOG.info("Expired svc0's ZK session. Waiting a second to give svc1" +
cluster.setHealthy(1, false); " a chance to take the lock, if it is ever going to.");
cluster.waitForHealthState(1, HealthMonitor.State.SERVICE_UNHEALTHY); Thread.sleep(1000);
// Expire svc0 // Ensure that no one holds the lock.
cluster.getElector(0).preventSessionReestablishmentForTests(); cluster.waitForActiveLockHolder(null);
try {
cluster.expireActiveLockHolder(0);
LOG.info("Expired svc0's ZK session. Waiting a second to give svc1" +
" a chance to take the lock, if it is ever going to.");
Thread.sleep(1000);
// Ensure that no one holds the lock.
cluster.waitForActiveLockHolder(null);
} finally {
LOG.info("Allowing svc0's elector to re-establish its connection");
cluster.getElector(0).allowSessionReestablishmentForTests();
}
// svc0 should get the lock again
cluster.waitForActiveLockHolder(0);
} finally { } finally {
cluster.stop(); LOG.info("Allowing svc0's elector to re-establish its connection");
cluster.getElector(0).allowSessionReestablishmentForTests();
} }
// svc0 should get the lock again
cluster.waitForActiveLockHolder(0);
} }
/** /**
* Test that the ZKFC successfully quits the election when it fails to * Test that the ZKFC successfully quits the election when it fails to
* become active. This allows the old node to successfully fail back. * become active. This allows the old node to successfully fail back.
*/ */
@Test(timeout=15000) @Test
public void testBecomingActiveFails() throws Exception { public void testBecomingActiveFails() throws Exception {
try { cluster.start();
cluster.start(); DummyHAService svc1 = cluster.getService(1);
DummyHAService svc1 = cluster.getService(1);
LOG.info("Making svc1 fail to become active");
cluster.setFailToBecomeActive(1, true);
LOG.info("Faking svc0 unhealthy, should NOT successfully " +
"failover to svc1");
cluster.setHealthy(0, false);
cluster.waitForHealthState(0, State.SERVICE_UNHEALTHY);
cluster.waitForActiveLockHolder(null);
LOG.info("Making svc1 fail to become active");
Mockito.verify(svc1.proxy, Mockito.timeout(2000).atLeastOnce()) cluster.setFailToBecomeActive(1, true);
.transitionToActive(Mockito.<StateChangeRequestInfo>any());
cluster.waitForHAState(0, HAServiceState.INITIALIZING); LOG.info("Faking svc0 unhealthy, should NOT successfully " +
cluster.waitForHAState(1, HAServiceState.STANDBY); "failover to svc1");
cluster.setHealthy(0, false);
LOG.info("Faking svc0 healthy again, should go back to svc0"); cluster.waitForHealthState(0, State.SERVICE_UNHEALTHY);
cluster.setHealthy(0, true); cluster.waitForActiveLockHolder(null);
cluster.waitForHAState(0, HAServiceState.ACTIVE);
cluster.waitForHAState(1, HAServiceState.STANDBY);
cluster.waitForActiveLockHolder(0); Mockito.verify(svc1.proxy, Mockito.timeout(2000).atLeastOnce())
.transitionToActive(Mockito.<StateChangeRequestInfo>any());
// Ensure that we can fail back to svc1 once it it is able
// to become active (e.g the admin has restarted it) cluster.waitForHAState(0, HAServiceState.INITIALIZING);
LOG.info("Allowing svc1 to become active, expiring svc0"); cluster.waitForHAState(1, HAServiceState.STANDBY);
svc1.failToBecomeActive = false;
cluster.expireAndVerifyFailover(0, 1); LOG.info("Faking svc0 healthy again, should go back to svc0");
} finally { cluster.setHealthy(0, true);
cluster.stop(); cluster.waitForHAState(0, HAServiceState.ACTIVE);
} cluster.waitForHAState(1, HAServiceState.STANDBY);
cluster.waitForActiveLockHolder(0);
// Ensure that we can fail back to svc1 once it it is able
// to become active (e.g the admin has restarted it)
LOG.info("Allowing svc1 to become active, expiring svc0");
svc1.failToBecomeActive = false;
cluster.expireAndVerifyFailover(0, 1);
} }
/** /**
@ -356,211 +355,183 @@ public void testBecomingActiveFails() throws Exception {
* current state, without triggering any failovers, and without * current state, without triggering any failovers, and without
* causing the active node to enter standby state. * causing the active node to enter standby state.
*/ */
@Test(timeout=15000) @Test
public void testZooKeeperFailure() throws Exception { public void testZooKeeperFailure() throws Exception {
try { cluster.start();
cluster.start();
// Record initial ZK sessions // Record initial ZK sessions
long session0 = cluster.getElector(0).getZKSessionIdForTests(); long session0 = cluster.getElector(0).getZKSessionIdForTests();
long session1 = cluster.getElector(1).getZKSessionIdForTests(); long session1 = cluster.getElector(1).getZKSessionIdForTests();
LOG.info("====== Stopping ZK server"); LOG.info("====== Stopping ZK server");
stopServer(); stopServer();
waitForServerDown(hostPort, CONNECTION_TIMEOUT); waitForServerDown(hostPort, CONNECTION_TIMEOUT);
LOG.info("====== Waiting for services to enter NEUTRAL mode");
cluster.waitForElectorState(0,
ActiveStandbyElector.State.NEUTRAL);
cluster.waitForElectorState(1,
ActiveStandbyElector.State.NEUTRAL);
LOG.info("====== Checking that the services didn't change HA state"); LOG.info("====== Waiting for services to enter NEUTRAL mode");
assertEquals(HAServiceState.ACTIVE, cluster.getService(0).state); cluster.waitForElectorState(0,
assertEquals(HAServiceState.STANDBY, cluster.getService(1).state); ActiveStandbyElector.State.NEUTRAL);
cluster.waitForElectorState(1,
LOG.info("====== Restarting server"); ActiveStandbyElector.State.NEUTRAL);
startServer();
waitForServerUp(hostPort, CONNECTION_TIMEOUT);
// Nodes should go back to their original states, since they re-obtain LOG.info("====== Checking that the services didn't change HA state");
// the same sessions. assertEquals(HAServiceState.ACTIVE, cluster.getService(0).state);
cluster.waitForElectorState(0, ActiveStandbyElector.State.ACTIVE); assertEquals(HAServiceState.STANDBY, cluster.getService(1).state);
cluster.waitForElectorState(1, ActiveStandbyElector.State.STANDBY);
// Check HA states didn't change.
cluster.waitForHAState(0, HAServiceState.ACTIVE);
cluster.waitForHAState(1, HAServiceState.STANDBY);
// Check they re-used the same sessions and didn't spuriously reconnect LOG.info("====== Restarting server");
assertEquals(session0, startServer();
cluster.getElector(0).getZKSessionIdForTests()); waitForServerUp(hostPort, CONNECTION_TIMEOUT);
assertEquals(session1,
cluster.getElector(1).getZKSessionIdForTests()); // Nodes should go back to their original states, since they re-obtain
} finally { // the same sessions.
cluster.stop(); cluster.waitForElectorState(0, ActiveStandbyElector.State.ACTIVE);
} cluster.waitForElectorState(1, ActiveStandbyElector.State.STANDBY);
// Check HA states didn't change.
cluster.waitForHAState(0, HAServiceState.ACTIVE);
cluster.waitForHAState(1, HAServiceState.STANDBY);
// Check they re-used the same sessions and didn't spuriously reconnect
assertEquals(session0,
cluster.getElector(0).getZKSessionIdForTests());
assertEquals(session1,
cluster.getElector(1).getZKSessionIdForTests());
} }
/** /**
* Test that the ZKFC can gracefully cede its active status. * Test that the ZKFC can gracefully cede its active status.
*/ */
@Test(timeout=15000) @Test
public void testCedeActive() throws Exception { public void testCedeActive() throws Exception {
try { cluster.start();
cluster.start(); DummyZKFC zkfc = cluster.getZkfc(0);
DummyZKFC zkfc = cluster.getZkfc(0); // It should be in active to start.
// It should be in active to start. assertEquals(ActiveStandbyElector.State.ACTIVE,
assertEquals(ActiveStandbyElector.State.ACTIVE, zkfc.getElectorForTests().getStateForTests());
zkfc.getElectorForTests().getStateForTests());
// Ask it to cede active for 3 seconds. It should respond promptly // Ask it to cede active for 3 seconds. It should respond promptly
// (i.e. the RPC itself should not take 3 seconds!) // (i.e. the RPC itself should not take 3 seconds!)
ZKFCProtocol proxy = zkfc.getLocalTarget().getZKFCProxy(conf, 5000); ZKFCProtocol proxy = zkfc.getLocalTarget().getZKFCProxy(conf, 5000);
long st = Time.now(); long st = Time.now();
proxy.cedeActive(3000); proxy.cedeActive(3000);
long et = Time.now(); long et = Time.now();
assertTrue("RPC to cedeActive took " + (et - st) + " ms", assertTrue("RPC to cedeActive took " + (et - st) + " ms",
et - st < 1000); et - st < 1000);
// Should be in "INIT" state since it's not in the election
// at this point.
assertEquals(ActiveStandbyElector.State.INIT,
zkfc.getElectorForTests().getStateForTests());
// After the prescribed 3 seconds, should go into STANDBY state, // Should be in "INIT" state since it's not in the election
// since the other node in the cluster would have taken ACTIVE. // at this point.
cluster.waitForElectorState(0, ActiveStandbyElector.State.STANDBY); assertEquals(ActiveStandbyElector.State.INIT,
long et2 = Time.now(); zkfc.getElectorForTests().getStateForTests());
assertTrue("Should take ~3 seconds to rejoin. Only took " + (et2 - et) +
"ms before rejoining.", // After the prescribed 3 seconds, should go into STANDBY state,
et2 - et > 2800); // since the other node in the cluster would have taken ACTIVE.
} finally { cluster.waitForElectorState(0, ActiveStandbyElector.State.STANDBY);
cluster.stop(); long et2 = Time.now();
} assertTrue("Should take ~3 seconds to rejoin. Only took " + (et2 - et) +
"ms before rejoining.",
et2 - et > 2800);
} }
@Test(timeout=25000) @Test
public void testGracefulFailover() throws Exception { public void testGracefulFailover() throws Exception {
try { cluster.start();
cluster.start();
cluster.waitForActiveLockHolder(0); cluster.waitForActiveLockHolder(0);
cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
cluster.waitForActiveLockHolder(1); cluster.waitForActiveLockHolder(1);
cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover();
cluster.waitForActiveLockHolder(0); cluster.waitForActiveLockHolder(0);
Thread.sleep(10000); // allow to quiesce Thread.sleep(10000); // allow to quiesce
assertEquals(0, cluster.getService(0).fenceCount); assertEquals(0, cluster.getService(0).fenceCount);
assertEquals(0, cluster.getService(1).fenceCount); assertEquals(0, cluster.getService(1).fenceCount);
assertEquals(2, cluster.getService(0).activeTransitionCount); assertEquals(2, cluster.getService(0).activeTransitionCount);
assertEquals(1, cluster.getService(1).activeTransitionCount); assertEquals(1, cluster.getService(1).activeTransitionCount);
} finally {
cluster.stop();
}
} }
@Test(timeout=15000) @Test
public void testGracefulFailoverToUnhealthy() throws Exception { public void testGracefulFailoverToUnhealthy() throws Exception {
cluster.start();
cluster.waitForActiveLockHolder(0);
// Mark it unhealthy, wait for it to exit election
cluster.setHealthy(1, false);
cluster.waitForElectorState(1, ActiveStandbyElector.State.INIT);
// Ask for failover, it should fail, because it's unhealthy
try { try {
cluster.start();
cluster.waitForActiveLockHolder(0);
// Mark it unhealthy, wait for it to exit election
cluster.setHealthy(1, false);
cluster.waitForElectorState(1, ActiveStandbyElector.State.INIT);
// Ask for failover, it should fail, because it's unhealthy
try {
cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
fail("Did not fail to graceful failover to unhealthy service!");
} catch (ServiceFailedException sfe) {
GenericTestUtils.assertExceptionContains(
cluster.getService(1).toString() +
" is not currently healthy.", sfe);
}
} finally {
cluster.stop();
}
}
@Test(timeout=15000)
public void testGracefulFailoverFailBecomingActive() throws Exception {
try {
cluster.start();
cluster.waitForActiveLockHolder(0);
cluster.setFailToBecomeActive(1, true);
// Ask for failover, it should fail and report back to user.
try {
cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
fail("Did not fail to graceful failover when target failed " +
"to become active!");
} catch (ServiceFailedException sfe) {
GenericTestUtils.assertExceptionContains(
"Couldn't make " + cluster.getService(1) + " active", sfe);
GenericTestUtils.assertExceptionContains(
"injected failure", sfe);
}
// No fencing
assertEquals(0, cluster.getService(0).fenceCount);
assertEquals(0, cluster.getService(1).fenceCount);
// Service 0 should go back to being active after the failed failover
cluster.waitForActiveLockHolder(0);
} finally {
cluster.stop();
}
}
@Test(timeout=15000)
public void testGracefulFailoverFailBecomingStandby() throws Exception {
try {
cluster.start();
cluster.waitForActiveLockHolder(0);
// Ask for failover when old node fails to transition to standby.
// This should trigger fencing, since the cedeActive() command
// still works, but leaves the breadcrumb in place.
cluster.setFailToBecomeStandby(0, true);
cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
fail("Did not fail to graceful failover to unhealthy service!");
// Check that the old node was fenced } catch (ServiceFailedException sfe) {
assertEquals(1, cluster.getService(0).fenceCount); GenericTestUtils.assertExceptionContains(
} finally { cluster.getService(1).toString() +
cluster.stop(); " is not currently healthy.", sfe);
} }
} }
@Test(timeout=15000) @Test
public void testGracefulFailoverFailBecomingStandbyAndFailFence() public void testGracefulFailoverFailBecomingActive() throws Exception {
throws Exception { cluster.start();
cluster.waitForActiveLockHolder(0);
cluster.setFailToBecomeActive(1, true);
// Ask for failover, it should fail and report back to user.
try { try {
cluster.start(); cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
fail("Did not fail to graceful failover when target failed " +
"to become active!");
} catch (ServiceFailedException sfe) {
GenericTestUtils.assertExceptionContains(
"Couldn't make " + cluster.getService(1) + " active", sfe);
GenericTestUtils.assertExceptionContains(
"injected failure", sfe);
}
cluster.waitForActiveLockHolder(0); // No fencing
assertEquals(0, cluster.getService(0).fenceCount);
// Ask for failover when old node fails to transition to standby. assertEquals(0, cluster.getService(1).fenceCount);
// This should trigger fencing, since the cedeActive() command
// still works, but leaves the breadcrumb in place.
cluster.setFailToBecomeStandby(0, true);
cluster.setFailToFence(0, true);
try { // Service 0 should go back to being active after the failed failover
cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.waitForActiveLockHolder(0);
fail("Failover should have failed when old node wont fence"); }
} catch (ServiceFailedException sfe) {
GenericTestUtils.assertExceptionContains( @Test
"Unable to fence " + cluster.getService(0), sfe); public void testGracefulFailoverFailBecomingStandby() throws Exception {
} cluster.start();
} finally {
cluster.stop(); cluster.waitForActiveLockHolder(0);
// Ask for failover when old node fails to transition to standby.
// This should trigger fencing, since the cedeActive() command
// still works, but leaves the breadcrumb in place.
cluster.setFailToBecomeStandby(0, true);
cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
// Check that the old node was fenced
assertEquals(1, cluster.getService(0).fenceCount);
}
@Test
public void testGracefulFailoverFailBecomingStandbyAndFailFence()
throws Exception {
cluster.start();
cluster.waitForActiveLockHolder(0);
// Ask for failover when old node fails to transition to standby.
// This should trigger fencing, since the cedeActive() command
// still works, but leaves the breadcrumb in place.
cluster.setFailToBecomeStandby(0, true);
cluster.setFailToFence(0, true);
try {
cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
fail("Failover should have failed when old node wont fence");
} catch (ServiceFailedException sfe) {
GenericTestUtils.assertExceptionContains(
"Unable to fence " + cluster.getService(0), sfe);
} }
} }
@ -568,73 +539,65 @@ public void testGracefulFailoverFailBecomingStandbyAndFailFence()
* Test which exercises all of the inputs into ZKFC. This is particularly * Test which exercises all of the inputs into ZKFC. This is particularly
* useful for running under jcarder to check for lock order violations. * useful for running under jcarder to check for lock order violations.
*/ */
@Test(timeout=30000) @Test
public void testOneOfEverything() throws Exception { public void testOneOfEverything() throws Exception {
try { cluster.start();
cluster.start();
// Failover by session expiration
LOG.info("====== Failing over by session expiration");
cluster.expireAndVerifyFailover(0, 1);
cluster.expireAndVerifyFailover(1, 0);
// Restart ZK
LOG.info("====== Restarting server");
stopServer();
waitForServerDown(hostPort, CONNECTION_TIMEOUT);
startServer();
waitForServerUp(hostPort, CONNECTION_TIMEOUT);
// Failover by bad health // Failover by session expiration
cluster.setHealthy(0, false); LOG.info("====== Failing over by session expiration");
cluster.waitForHAState(0, HAServiceState.INITIALIZING); cluster.expireAndVerifyFailover(0, 1);
cluster.waitForHAState(1, HAServiceState.ACTIVE); cluster.expireAndVerifyFailover(1, 0);
cluster.setHealthy(1, true);
cluster.setHealthy(0, false); // Restart ZK
cluster.waitForHAState(1, HAServiceState.ACTIVE); LOG.info("====== Restarting server");
cluster.waitForHAState(0, HAServiceState.INITIALIZING); stopServer();
cluster.setHealthy(0, true); waitForServerDown(hostPort, CONNECTION_TIMEOUT);
startServer();
cluster.waitForHealthState(0, State.SERVICE_HEALTHY); waitForServerUp(hostPort, CONNECTION_TIMEOUT);
// Graceful failovers // Failover by bad health
cluster.getZkfc(1).gracefulFailoverToYou(); cluster.setHealthy(0, false);
cluster.getZkfc(0).gracefulFailoverToYou(); cluster.waitForHAState(0, HAServiceState.INITIALIZING);
} finally { cluster.waitForHAState(1, HAServiceState.ACTIVE);
cluster.stop(); cluster.setHealthy(1, true);
} cluster.setHealthy(0, false);
cluster.waitForHAState(1, HAServiceState.ACTIVE);
cluster.waitForHAState(0, HAServiceState.INITIALIZING);
cluster.setHealthy(0, true);
cluster.waitForHealthState(0, State.SERVICE_HEALTHY);
// Graceful failovers
cluster.getZkfc(1).gracefulFailoverToYou();
cluster.getZkfc(0).gracefulFailoverToYou();
} }
@Test(timeout = 25000) @Test
public void testGracefulFailoverMultipleZKfcs() throws Exception { public void testGracefulFailoverMultipleZKfcs() throws Exception {
try { cluster.start(3);
cluster.start(3);
cluster.waitForActiveLockHolder(0); cluster.waitForActiveLockHolder(0);
// failover to first // failover to first
cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
cluster.waitForActiveLockHolder(1); cluster.waitForActiveLockHolder(1);
// failover to second // failover to second
cluster.getService(2).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.getService(2).getZKFCProxy(conf, 5000).gracefulFailover();
cluster.waitForActiveLockHolder(2); cluster.waitForActiveLockHolder(2);
// failover back to original // failover back to original
cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover();
cluster.waitForActiveLockHolder(0); cluster.waitForActiveLockHolder(0);
Thread.sleep(10000); // allow to quiesce Thread.sleep(10000); // allow to quiesce
assertEquals(0, cluster.getService(0).fenceCount); assertEquals(0, cluster.getService(0).fenceCount);
assertEquals(0, cluster.getService(1).fenceCount); assertEquals(0, cluster.getService(1).fenceCount);
assertEquals(0, cluster.getService(2).fenceCount); assertEquals(0, cluster.getService(2).fenceCount);
assertEquals(2, cluster.getService(0).activeTransitionCount); assertEquals(2, cluster.getService(0).activeTransitionCount);
assertEquals(1, cluster.getService(1).activeTransitionCount); assertEquals(1, cluster.getService(1).activeTransitionCount);
assertEquals(1, cluster.getService(2).activeTransitionCount); assertEquals(1, cluster.getService(2).activeTransitionCount);
} finally {
cluster.stop();
}
} }
private int runFC(DummyHAService target, String ... args) throws Exception { private int runFC(DummyHAService target, String ... args) throws Exception {