HADOOP-7991. HA: the FailoverController should check the standby is ready before failing over. Contributed by Eli Collins

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1239774 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Eli Collins 2012-02-02 19:20:32 +00:00
parent 4d779e088a
commit 4324e1bcd7
8 changed files with 122 additions and 25 deletions

View File

@ -39,3 +39,6 @@ HADOOP-7983. HA: failover should be able to pass args to fencers. (eli)
HADOOP-7938. HA: the FailoverController should optionally fence the
active during failover. (eli)
HADOOP-7991. HA: the FailoverController should check the standby is
ready before failing over. (eli)

View File

@ -46,12 +46,19 @@ public class FailoverController {
* failover to, eg to prevent failing over to a service (eg due
* to it being inaccessible, already active, not healthy, etc).
*
* An option to ignore toSvc if it claims it is not ready to
* become active is provided in case performing a failover will
* allow it to become active, eg because it triggers a log roll
* so the standby can learn about new blocks and leave safemode.
*
* @param toSvc service to make active
* @param toSvcName name of service to make active
* @param forceActive ignore toSvc if it reports that it is not ready
* @throws FailoverFailedException if we should avoid failover
*/
private static void preFailoverChecks(HAServiceProtocol toSvc,
InetSocketAddress toSvcAddr)
InetSocketAddress toSvcAddr,
boolean forceActive)
throws FailoverFailedException {
HAServiceState toSvcState;
try {
@ -74,7 +81,17 @@ public class FailoverController {
throw new FailoverFailedException(
"Got an IO exception", e);
}
// TODO(HA): ask toSvc if it's capable. Eg not in SM.
try {
if (!toSvc.readyToBecomeActive()) {
if (!forceActive) {
throw new FailoverFailedException(
toSvcAddr + " is not ready to become active");
}
}
} catch (IOException e) {
throw new FailoverFailedException(
"Got an IO exception", e);
}
}
/**
@ -87,16 +104,19 @@ public class FailoverController {
* @param toSvcAddr addr of the service to make active
* @param fencer for fencing fromSvc
* @param forceFence to fence fromSvc even if not strictly necessary
* @param forceActive try to make toSvc active even if it is not ready
* @throws FailoverFailedException if the failover fails
*/
public static void failover(HAServiceProtocol fromSvc,
InetSocketAddress fromSvcAddr,
HAServiceProtocol toSvc,
InetSocketAddress toSvcAddr,
NodeFencer fencer, boolean forceFence)
NodeFencer fencer,
boolean forceFence,
boolean forceActive)
throws FailoverFailedException {
Preconditions.checkArgument(fencer != null, "failover requires a fencer");
preFailoverChecks(toSvc, toSvcAddr);
preFailoverChecks(toSvc, toSvcAddr, forceActive);
// Try to make fromSvc standby
boolean tryFence = true;
@ -145,7 +165,9 @@ public class FailoverController {
try {
// Unconditionally fence toSvc in case it is still trying to
// become active, eg we timed out waiting for its response.
failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true);
// Unconditionally force fromSvc to become active since it
// was previously active when we initiated failover.
failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true, true);
} catch (FailoverFailedException ffe) {
msg += ". Failback to " + fromSvcAddr +
" failed (" + ffe.getMessage() + ")";

View File

@ -47,7 +47,8 @@ import com.google.common.collect.ImmutableMap;
public abstract class HAAdmin extends Configured implements Tool {
private static final String FORCEFENCE = "forcefence";
private static final String FORCEFENCE = "forcefence";
private static final String FORCEACTIVE = "forceactive";
private static Map<String, UsageInfo> USAGE =
ImmutableMap.<String, UsageInfo>builder()
@ -56,9 +57,11 @@ public abstract class HAAdmin extends Configured implements Tool {
.put("-transitionToStandby",
new UsageInfo("<host:port>", "Transitions the daemon into Standby state"))
.put("-failover",
new UsageInfo("[--"+FORCEFENCE+"] <host:port> <host:port>",
new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <host:port> <host:port>",
"Failover from the first daemon to the second.\n" +
"Unconditionally fence services if the "+FORCEFENCE+" option is used."))
"Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" +
"Try to failover to the target service even if it is not ready if the " +
FORCEACTIVE + " option is used."))
.put("-getServiceState",
new UsageInfo("<host:port>", "Returns the state of the daemon"))
.put("-checkHealth",
@ -124,12 +127,14 @@ public abstract class HAAdmin extends Configured implements Tool {
throws IOException, ServiceFailedException {
Configuration conf = getConf();
boolean forceFence = false;
boolean forceActive = false;
Options failoverOpts = new Options();
// "-failover" isn't really an option but we need to add
// it to appease CommandLineParser
failoverOpts.addOption("failover", false, "failover");
failoverOpts.addOption(FORCEFENCE, false, "force fencing");
failoverOpts.addOption(FORCEACTIVE, false, "force failover");
CommandLineParser parser = new GnuParser();
CommandLine cmd;
@ -137,6 +142,7 @@ public abstract class HAAdmin extends Configured implements Tool {
try {
cmd = parser.parse(failoverOpts, argv);
forceFence = cmd.hasOption(FORCEFENCE);
forceActive = cmd.hasOption(FORCEACTIVE);
} catch (ParseException pe) {
errOut.println("failover: incorrect arguments");
printUsage(errOut, "-failover");
@ -172,7 +178,7 @@ public abstract class HAAdmin extends Configured implements Tool {
try {
FailoverController.failover(proto1, addr1, proto2, addr2,
fencer, forceFence);
fencer, forceFence, forceActive);
out.println("Failover from "+args[0]+" to "+args[1]+" successful");
} catch (FailoverFailedException ffe) {
errOut.println("Failover failed: " + ffe.getLocalizedMessage());

View File

@ -112,4 +112,15 @@ public interface HAServiceProtocol extends VersionedProtocol {
* if other errors happen
*/
public HAServiceState getServiceState() throws IOException;
/**
* Return true if the service is capable and ready to transition
* from the standby state to the active state.
*
* @return true if the service is ready to become active, false otherwise.
* @throws IOException
* if other errors happen
*/
public boolean readyToBecomeActive() throws ServiceFailedException,
IOException;
}

View File

@ -79,6 +79,11 @@ public class TestFailoverController {
public HAServiceState getServiceState() throws IOException {
return state;
}
@Override
public boolean readyToBecomeActive() throws ServiceFailedException, IOException {
return true;
}
}
@Test
@ -88,13 +93,13 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
AlwaysSucceedFencer.fenceCalled = 0;
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
AlwaysSucceedFencer.fenceCalled = 0;
FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false);
FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false, false);
assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
@ -106,7 +111,7 @@ public class TestFailoverController {
DummyService svc2 = new DummyService(HAServiceState.STANDBY);
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
}
@ -118,7 +123,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Can't failover to an already active service");
} catch (FailoverFailedException ffe) {
// Expected
@ -128,6 +133,33 @@ public class TestFailoverController {
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
}
@Test
public void testFailoverToUnreadyService() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
DummyService svc2 = new DummyService(HAServiceState.STANDBY) {
@Override
public boolean readyToBecomeActive() throws ServiceFailedException, IOException {
return false;
}
};
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Can't failover to a service that's not ready");
} catch (FailoverFailedException ffe) {
// Expected
}
assertEquals(HAServiceState.ACTIVE, svc1.getServiceState());
assertEquals(HAServiceState.STANDBY, svc2.getServiceState());
// Forcing it means we ignore readyToBecomeActive
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, true);
assertEquals(HAServiceState.STANDBY, svc1.getServiceState());
assertEquals(HAServiceState.ACTIVE, svc2.getServiceState());
}
@Test
public void testFailoverToUnhealthyServiceFailsAndFailsback() throws Exception {
DummyService svc1 = new DummyService(HAServiceState.ACTIVE);
@ -140,7 +172,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failover to unhealthy service");
} catch (FailoverFailedException ffe) {
// Expected
@ -162,7 +194,7 @@ public class TestFailoverController {
AlwaysSucceedFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
} catch (FailoverFailedException ffe) {
fail("Faulty active prevented failover");
}
@ -187,7 +219,7 @@ public class TestFailoverController {
AlwaysFailFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failed over even though fencing failed");
} catch (FailoverFailedException ffe) {
// Expected
@ -207,7 +239,7 @@ public class TestFailoverController {
AlwaysFailFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false);
fail("Failed over even though fencing requested and failed");
} catch (FailoverFailedException ffe) {
// Expected
@ -238,7 +270,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
} catch (FailoverFailedException ffe) {
fail("Non-existant active prevented failover");
}
@ -254,7 +286,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failed over to a non-existant standby");
} catch (FailoverFailedException ffe) {
// Expected
@ -275,7 +307,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failover to already active service");
} catch (FailoverFailedException ffe) {
// Expected
@ -300,7 +332,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false);
fail("Failed over to service that won't transition to active");
} catch (FailoverFailedException ffe) {
// Expected
@ -325,7 +357,7 @@ public class TestFailoverController {
AlwaysSucceedFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failed over to service that won't transition to active");
} catch (FailoverFailedException ffe) {
// Expected
@ -352,7 +384,7 @@ public class TestFailoverController {
AlwaysFailFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failed over to service that won't transition to active");
} catch (FailoverFailedException ffe) {
// Expected
@ -383,7 +415,7 @@ public class TestFailoverController {
NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false);
FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false);
fail("Failover to already active service");
} catch (FailoverFailedException ffe) {
// Expected

View File

@ -31,6 +31,7 @@ import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
import static org.mockito.Mockito.when;
import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
@ -44,8 +45,9 @@ public class TestHAAdmin {
private HAServiceProtocol mockProtocol;
@Before
public void setup() {
public void setup() throws IOException {
mockProtocol = Mockito.mock(HAServiceProtocol.class);
when(mockProtocol.readyToBecomeActive()).thenReturn(true);
tool = new HAAdmin() {
@Override
protected HAServiceProtocol getProtocol(String target) throws IOException {
@ -130,6 +132,15 @@ public class TestHAAdmin {
assertEquals(0, runTool("-failover", "foo:1234", "bar:5678", "--forcefence"));
}
@Test
public void testFailoverWithForceActive() throws Exception {
Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();
Configuration conf = new Configuration();
conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)");
tool.setConf(conf);
assertEquals(0, runTool("-failover", "foo:1234", "bar:5678", "--forceactive"));
}
@Test
public void testFailoverWithInvalidFenceArg() throws Exception {
Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState();

View File

@ -929,6 +929,13 @@ public class NameNode {
return state.getServiceState();
}
synchronized boolean readyToBecomeActive() throws ServiceFailedException {
if (!haEnabled) {
throw new ServiceFailedException("HA for namenode is not enabled");
}
return !isInSafeMode();
}
/**
* Class used as expose {@link NameNode} as context to {@link HAState}

View File

@ -1007,6 +1007,11 @@ class NameNodeRpcServer implements NamenodeProtocols {
return nn.getServiceState();
}
@Override // HAServiceProtocol
public synchronized boolean readyToBecomeActive() throws ServiceFailedException {
return nn.readyToBecomeActive();
}
/**
* Verify version.
*