HADOOP-7991. HA: the FailoverController should check the standby is ready before failing over. Contributed by Eli Collins

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1239774 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Eli Collins 2012-02-02 19:20:32 +00:00
parent 4d779e088a
commit 4324e1bcd7
8 changed files with 122 additions and 25 deletions

View File

@ -39,3 +39,6 @@ HADOOP-7983. HA: failover should be able to pass args to fencers. (eli)
HADOOP-7938. HA: the FailoverController should optionally fence the
active during failover. (eli)
HADOOP-7991. HA: the FailoverController should check the standby is
ready before failing over. (eli)

View File

@ -46,12 +46,19 @@ public class FailoverController {
* failover to, eg to prevent failing over to a service (eg due * failover to, eg to prevent failing over to a service (eg due
* to it being inaccessible, already active, not healthy, etc). * to it being inaccessible, already active, not healthy, etc).
* *
* An option to proceed even if toSvc claims it is not ready to
* become active is provided in case performing the failover will
* allow it to become ready, eg because it triggers a log roll
* so the standby can learn about new blocks and leave safemode.
*
* @param toSvc service to make active * @param toSvc service to make active
* @param toSvcName name of service to make active * @param toSvcName name of service to make active
* @param forceActive ignore toSvc if it reports that it is not ready
* @throws FailoverFailedException if we should avoid failover * @throws FailoverFailedException if we should avoid failover
*/ */
private static void preFailoverChecks(HAServiceProtocol toSvc, private static void preFailoverChecks(HAServiceProtocol toSvc,
InetSocketAddress toSvcAddr) InetSocketAddress toSvcAddr,
boolean forceActive)
throws FailoverFailedException { throws FailoverFailedException {
HAServiceState toSvcState; HAServiceState toSvcState;
try { try {
@ -74,7 +81,17 @@ public class FailoverController {
throw new FailoverFailedException( throw new FailoverFailedException(
"Got an IO exception", e); "Got an IO exception", e);
} }
// TODO(HA): ask toSvc if it's capable. Eg not in SM. try {
if (!toSvc.readyToBecomeActive()) {
if (!forceActive) {
throw new FailoverFailedException(
toSvcAddr + " is not ready to become active");
}
}
} catch (IOException e) {
throw new FailoverFailedException(
"Got an IO exception", e);
}
} }
/** /**
@ -87,16 +104,19 @@ public class FailoverController {
* @param toSvcAddr addr of the service to make active * @param toSvcAddr addr of the service to make active
* @param fencer for fencing fromSvc * @param fencer for fencing fromSvc
* @param forceFence to fence fromSvc even if not strictly necessary * @param forceFence to fence fromSvc even if not strictly necessary
* @param forceActive try to make toSvc active even if it is not ready
* @throws FailoverFailedException if the failover fails * @throws FailoverFailedException if the failover fails
*/ */
public static void failover(HAServiceProtocol fromSvc, public static void failover(HAServiceProtocol fromSvc,
InetSocketAddress fromSvcAddr, InetSocketAddress fromSvcAddr,
HAServiceProtocol toSvc, HAServiceProtocol toSvc,
InetSocketAddress toSvcAddr, InetSocketAddress toSvcAddr,
NodeFencer fencer, boolean forceFence) NodeFencer fencer,
boolean forceFence,
boolean forceActive)
throws FailoverFailedException { throws FailoverFailedException {
Preconditions.checkArgument(fencer != null, "failover requires a fencer"); Preconditions.checkArgument(fencer != null, "failover requires a fencer");
preFailoverChecks(toSvc, toSvcAddr); preFailoverChecks(toSvc, toSvcAddr, forceActive);
// Try to make fromSvc standby // Try to make fromSvc standby
boolean tryFence = true; boolean tryFence = true;
@ -145,7 +165,9 @@ public class FailoverController {
try { try {
// Unconditionally fence toSvc in case it is still trying to // Unconditionally fence toSvc in case it is still trying to
// become active, eg we timed out waiting for its response. // become active, eg we timed out waiting for its response.
failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true); // Unconditionally force fromSvc to become active since it
// was previously active when we initiated failover.
failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true, true);
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
msg += ". Failback to " + fromSvcAddr + msg += ". Failback to " + fromSvcAddr +
" failed (" + ffe.getMessage() + ")"; " failed (" + ffe.getMessage() + ")";

View File

@ -48,6 +48,7 @@ import com.google.common.collect.ImmutableMap;
public abstract class HAAdmin extends Configured implements Tool { public abstract class HAAdmin extends Configured implements Tool {
private static final String FORCEFENCE = "forcefence"; private static final String FORCEFENCE = "forcefence";
private static final String FORCEACTIVE = "forceactive";
private static Map<String, UsageInfo> USAGE = private static Map<String, UsageInfo> USAGE =
ImmutableMap.<String, UsageInfo>builder() ImmutableMap.<String, UsageInfo>builder()
@ -56,9 +57,11 @@ public abstract class HAAdmin extends Configured implements Tool {
.put("-transitionToStandby", .put("-transitionToStandby",
new UsageInfo("<host:port>", "Transitions the daemon into Standby state")) new UsageInfo("<host:port>", "Transitions the daemon into Standby state"))
.put("-failover", .put("-failover",
new UsageInfo("[--"+FORCEFENCE+"] <host:port> <host:port>", new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <host:port> <host:port>",
"Failover from the first daemon to the second.\n" + "Failover from the first daemon to the second.\n" +
"Unconditionally fence services if the "+FORCEFENCE+" option is used.")) "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
"Try to failover to the target service even if it is not ready if the " +
FORCEACTIVE + " option is used."))
.put("-getServiceState", .put("-getServiceState",
new UsageInfo("<host:port>", "Returns the state of the daemon")) new UsageInfo("<host:port>", "Returns the state of the daemon"))
.put("-checkHealth", .put("-checkHealth",
@ -124,12 +127,14 @@ public abstract class HAAdmin extends Configured implements Tool {
throws IOException, ServiceFailedException { throws IOException, ServiceFailedException {
Configuration conf = getConf(); Configuration conf = getConf();
boolean forceFence = false; boolean forceFence = false;
boolean forceActive = false;
Options failoverOpts = new Options(); Options failoverOpts = new Options();
// "-failover" isn't really an option but we need to add // "-failover" isn't really an option but we need to add
// it to appease CommandLineParser // it to appease CommandLineParser
failoverOpts.addOption("failover", false, "failover"); failoverOpts.addOption("failover", false, "failover");
failoverOpts.addOption(FORCEFENCE, false, "force fencing"); failoverOpts.addOption(FORCEFENCE, false, "force fencing");
failoverOpts.addOption(FORCEACTIVE, false, "force failover");
CommandLineParser parser = new GnuParser(); CommandLineParser parser = new GnuParser();
CommandLine cmd; CommandLine cmd;
@ -137,6 +142,7 @@ public abstract class HAAdmin extends Configured implements Tool {
try { try {
cmd = parser.parse(failoverOpts, argv); cmd = parser.parse(failoverOpts, argv);
forceFence = cmd.hasOption(FORCEFENCE); forceFence = cmd.hasOption(FORCEFENCE);
forceActive = cmd.hasOption(FORCEACTIVE);
} catch (ParseException pe) { } catch (ParseException pe) {
errOut.println("failover: incorrect arguments"); errOut.println("failover: incorrect arguments");
printUsage(errOut, "-failover"); printUsage(errOut, "-failover");
@ -172,7 +178,7 @@ public abstract class HAAdmin extends Configured implements Tool {
try { try {
FailoverController.failover(proto1, addr1, proto2, addr2, FailoverController.failover(proto1, addr1, proto2, addr2,
fencer, forceFence); fencer, forceFence, forceActive);
out.println("Failover from "+args[0]+" to "+args[1]+" successful"); out.println("Failover from "+args[0]+" to "+args[1]+" successful");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
errOut.println("Failover failed: " + ffe.getLocalizedMessage()); errOut.println("Failover failed: " + ffe.getLocalizedMessage());

View File

@ -112,4 +112,15 @@ public interface HAServiceProtocol extends VersionedProtocol {
* if other errors happen * if other errors happen
*/ */
public HAServiceState getServiceState() throws IOException; public HAServiceState getServiceState() throws IOException;
/**
* Return true if the service is capable and ready to transition
* from the standby state to the active state.
*
* @return true if the service is ready to become active, false otherwise.
* @throws ServiceFailedException
* if the service is unable to answer, eg because HA is not
* enabled for it
* @throws IOException
* if other errors happen
*/
public boolean readyToBecomeActive() throws ServiceFailedException,
IOException;
} }

View File

@ -79,6 +79,11 @@ public class TestFailoverController {
public HAServiceState getServiceState() throws IOException { public HAServiceState getServiceState() throws IOException {
return state; return state;
} }
@Override
public boolean readyToBecomeActive() throws ServiceFailedException, IOException {
// The dummy service always reports itself as ready; a test that
// needs an unready service overrides this method.
return true;
}
} }
@Test @Test
@ -88,13 +93,13 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
AlwaysSucceedFencer.fenceCalled = 0; AlwaysSucceedFencer.fenceCalled = 0;
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled); assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
AlwaysSucceedFencer.fenceCalled = 0; AlwaysSucceedFencer.fenceCalled = 0;
FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false); FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false, false);
assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled); assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
@ -106,7 +111,7 @@ public class TestFailoverController {
DummyService svc2 = new DummyService(HAServiceState.STANDBY); DummyService svc2 = new DummyService(HAServiceState.STANDBY);
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
} }
@ -118,7 +123,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Can't failover to an already active service"); fail("Can't failover to an already active service");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected
@ -128,6 +133,33 @@ public class TestFailoverController {
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
} }
/**
 * A failover to a standby that reports it is not ready must be rejected
 * and leave both services untouched, unless forceActive is set, in which
 * case the readiness report is ignored and the failover goes through.
 */
@Test
public void testFailoverToUnreadyService() throws Exception {
  DummyService activeSvc = new DummyService(HAServiceState.ACTIVE);
  // Standby that always claims it is not ready to become active
  DummyService unreadySvc = new DummyService(HAServiceState.STANDBY) {
    @Override
    public boolean readyToBecomeActive() throws ServiceFailedException, IOException {
      return false;
    }
  };
  NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());

  // Without forceActive the readiness check has to veto the failover
  boolean rejected = false;
  try {
    FailoverController.failover(activeSvc, svc1Addr, unreadySvc, svc2Addr, fencer, false, false);
  } catch (FailoverFailedException expected) {
    rejected = true;
  }
  if (!rejected) {
    fail("Can't failover to a service that's not ready");
  }

  // The rejected attempt must not have changed either service's state
  assertEquals(HAServiceState.ACTIVE, activeSvc.getServiceState());
  assertEquals(HAServiceState.STANDBY, unreadySvc.getServiceState());

  // Forcing it means we ignore readyToBecomeActive
  FailoverController.failover(activeSvc, svc1Addr, unreadySvc, svc2Addr, fencer, false, true);
  assertEquals(HAServiceState.STANDBY, activeSvc.getServiceState());
  assertEquals(HAServiceState.ACTIVE, unreadySvc.getServiceState());
}
@Test @Test
public void testFailoverToUnhealthyServiceFailsAndFailsback() throws Exception { public void testFailoverToUnhealthyServiceFailsAndFailsback() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.ACTIVE); DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
@ -140,7 +172,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failover to unhealthy service"); fail("Failover to unhealthy service");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected
@ -162,7 +194,7 @@ public class TestFailoverController {
AlwaysSucceedFencer.fenceCalled = 0; AlwaysSucceedFencer.fenceCalled = 0;
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
fail("Faulty active prevented failover"); fail("Faulty active prevented failover");
} }
@ -187,7 +219,7 @@ public class TestFailoverController {
AlwaysFailFencer.fenceCalled = 0; AlwaysFailFencer.fenceCalled = 0;
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failed over even though fencing failed"); fail("Failed over even though fencing failed");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected
@ -207,7 +239,7 @@ public class TestFailoverController {
AlwaysFailFencer.fenceCalled = 0; AlwaysFailFencer.fenceCalled = 0;
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false);
fail("Failed over even though fencing requested and failed"); fail("Failed over even though fencing requested and failed");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected
@ -238,7 +270,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
fail("Non-existant active prevented failover"); fail("Non-existant active prevented failover");
} }
@ -254,7 +286,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failed over to a non-existant standby"); fail("Failed over to a non-existant standby");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected
@ -275,7 +307,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failover to already active service"); fail("Failover to already active service");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected
@ -300,7 +332,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false);
fail("Failed over to service that won't transition to active"); fail("Failed over to service that won't transition to active");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected
@ -325,7 +357,7 @@ public class TestFailoverController {
AlwaysSucceedFencer.fenceCalled = 0; AlwaysSucceedFencer.fenceCalled = 0;
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failed over to service that won't transition to active"); fail("Failed over to service that won't transition to active");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected
@ -352,7 +384,7 @@ public class TestFailoverController {
AlwaysFailFencer.fenceCalled = 0; AlwaysFailFencer.fenceCalled = 0;
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failed over to service that won't transition to active"); fail("Failed over to service that won't transition to active");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected
@ -383,7 +415,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try { try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failover to already active service"); fail("Failover to already active service");
} catch (FailoverFailedException ffe) { } catch (FailoverFailedException ffe) {
// Expected // Expected

View File

@ -31,6 +31,7 @@ import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import org.mockito.Mockito; import org.mockito.Mockito;
import static org.mockito.Mockito.when;
import com.google.common.base.Charsets; import com.google.common.base.Charsets;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
@ -44,8 +45,9 @@ public class TestHAAdmin {
private HAServiceProtocol mockProtocol; private HAServiceProtocol mockProtocol;
@Before @Before
public void setup() { public void setup() throws IOException {
mockProtocol = Mockito.mock(HAServiceProtocol.class); mockProtocol = Mockito.mock(HAServiceProtocol.class);
when(mockProtocol.readyToBecomeActive()).thenReturn(true);
tool = new HAAdmin() { tool = new HAAdmin() {
@Override @Override
protected HAServiceProtocol getProtocol(String target) throws IOException { protected HAServiceProtocol getProtocol(String target) throws IOException {
@ -130,6 +132,15 @@ public class TestHAAdmin {
assertEquals(0, runTool("-failover", "foo:1234", "bar:5678", "--forcefence")); assertEquals(0, runTool("-failover", "foo:1234", "bar:5678", "--forcefence"));
} }
/**
 * Runs -failover with the --forceactive option against a mocked
 * standby and expects the tool to exit with 0.
 */
@Test
public void testFailoverWithForceActive() throws Exception {
  // Configure a fencing method that trivially succeeds
  Configuration fencingConf = new Configuration();
  fencingConf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
  tool.setConf(fencingConf);

  // The mocked service reports itself as standby
  Mockito.when(mockProtocol.getServiceState()).thenReturn(HAServiceState.STANDBY);

  assertEquals(0, runTool("-failover", "foo:1234", "bar:5678", "--forceactive"));
}
@Test @Test
public void testFailoverWithInvalidFenceArg() throws Exception { public void testFailoverWithInvalidFenceArg() throws Exception {
Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();

View File

@ -929,6 +929,13 @@ public class NameNode {
return state.getServiceState(); return state.getServiceState();
} }
/**
 * Reports whether this NameNode is ready to become active, which is
 * the case when it is not in safe mode.
 *
 * @return true if the NameNode is not in safe mode, false otherwise
 * @throws ServiceFailedException if HA is not enabled for this NameNode
 */
synchronized boolean readyToBecomeActive() throws ServiceFailedException {
  if (haEnabled) {
    return !isInSafeMode();
  }
  throw new ServiceFailedException("HA for namenode is not enabled");
}
/** /**
* Class used to expose {@link NameNode} as context to {@link HAState}

View File

@ -1007,6 +1007,11 @@ class NameNodeRpcServer implements NamenodeProtocols {
return nn.getServiceState(); return nn.getServiceState();
} }
/**
* Delegates the readiness check to nn and returns its answer.
*/
@Override // HAServiceProtocol
public synchronized boolean readyToBecomeActive() throws ServiceFailedException {
return nn.readyToBecomeActive();
}
/** /**
* Verify version. * Verify version.
* *