From 4324e1bcd78a98e2fb92c81ee959e25b4193da4f Mon Sep 17 00:00:00 2001 From: Eli Collins Date: Thu, 2 Feb 2012 19:20:32 +0000 Subject: [PATCH] HADOOP-7991. HA: the FailoverController should check the standby is ready before failing over. Contributed by Eli Collins git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1239774 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-common/CHANGES.HDFS-1623.txt | 3 + .../apache/hadoop/ha/FailoverController.java | 32 ++++++++-- .../java/org/apache/hadoop/ha/HAAdmin.java | 14 +++-- .../apache/hadoop/ha/HAServiceProtocol.java | 11 ++++ .../hadoop/ha/TestFailoverController.java | 62 ++++++++++++++----- .../org/apache/hadoop/ha/TestHAAdmin.java | 13 +++- .../hadoop/hdfs/server/namenode/NameNode.java | 7 +++ .../server/namenode/NameNodeRpcServer.java | 5 ++ 8 files changed, 122 insertions(+), 25 deletions(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt b/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt index f62c7177214..2170cd2a69a 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.HDFS-1623.txt @@ -39,3 +39,6 @@ HADOOP-7983. HA: failover should be able to pass args to fencers. (eli) HADOOP-7938. HA: the FailoverController should optionally fence the active during failover. (eli) + +HADOOP-7991. HA: the FailoverController should check the standby is +ready before failing over. (eli) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java index 711296d342f..7205f9f53b5 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java @@ -46,12 +46,19 @@ public class FailoverController { * failover to, eg to prevent failing over to a service (eg due * to it being inaccessible, already active, not healthy, etc). * + * An option to ignore toSvc if it claims it is not ready to + * become active is provided in case performing a failover will + * allow it to become active, eg because it triggers a log roll + * so the standby can learn about new blocks and leave safemode. + * * @param toSvc service to make active * @param toSvcName name of service to make active + * @param forceActive ignore toSvc if it reports that it is not ready * @throws FailoverFailedException if we should avoid failover */ private static void preFailoverChecks(HAServiceProtocol toSvc, - InetSocketAddress toSvcAddr) + InetSocketAddress toSvcAddr, + boolean forceActive) throws FailoverFailedException { HAServiceState toSvcState; try { @@ -74,7 +81,17 @@ public class FailoverController { throw new FailoverFailedException( "Got an IO exception", e); } - // TODO(HA): ask toSvc if it's capable. Eg not in SM. + try { + if (!toSvc.readyToBecomeActive()) { + if (!forceActive) { + throw new FailoverFailedException( + toSvcAddr + " is not ready to become active"); + } + } + } catch (IOException e) { + throw new FailoverFailedException( + "Got an IO exception", e); + } } /** @@ -87,16 +104,19 @@ public class FailoverController { * @param toSvcAddr addr of the service to make active * @param fencer for fencing fromSvc * @param forceFence to fence fromSvc even if not strictly necessary + * @param forceActive try to make toSvc active even if it is not ready * @throws FailoverFailedException if the failover fails */ public static void failover(HAServiceProtocol fromSvc, InetSocketAddress fromSvcAddr, HAServiceProtocol toSvc, InetSocketAddress toSvcAddr, - NodeFencer fencer, boolean forceFence) + NodeFencer fencer, + boolean forceFence, + boolean forceActive) throws FailoverFailedException { Preconditions.checkArgument(fencer != null, "failover requires a fencer"); - preFailoverChecks(toSvc, toSvcAddr); + preFailoverChecks(toSvc, toSvcAddr, forceActive); // Try to make fromSvc standby boolean tryFence = true; @@ -145,7 +165,9 @@ public class FailoverController { try { // Unconditionally fence toSvc in case it is still trying to // become active, eg we timed out waiting for its response. - failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true); + // Unconditionally force fromSvc to become active since it + // was previously active when we initiated failover. + failover(toSvc, toSvcAddr, fromSvc, fromSvcAddr, fencer, true, true); } catch (FailoverFailedException ffe) { msg += ". Failback to " + fromSvcAddr + " failed (" + ffe.getMessage() + ")"; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java index 714fe6c110c..2286a357662 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java @@ -47,7 +47,8 @@ import com.google.common.collect.ImmutableMap; public abstract class HAAdmin extends Configured implements Tool { - private static final String FORCEFENCE = "forcefence"; + private static final String FORCEFENCE = "forcefence"; + private static final String FORCEACTIVE = "forceactive"; private static Map USAGE = ImmutableMap.builder() @@ -56,9 +57,11 @@ public abstract class HAAdmin extends Configured implements Tool { .put("-transitionToStandby", new UsageInfo("", "Transitions the daemon into Standby state")) .put("-failover", - new UsageInfo("[--"+FORCEFENCE+"] ", + new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] ", "Failover from the first daemon to the second.\n" + - "Unconditionally fence services if the "+FORCEFENCE+" option is used.")) + "Unconditionally fence services if the "+FORCEFENCE+" option is used.\n" + + "Try to failover to the target service even if it is not ready if the " + + FORCEACTIVE + " option is used.")) .put("-getServiceState", new UsageInfo("", "Returns the state of the daemon")) .put("-checkHealth", @@ -124,12 +127,14 @@ public abstract class HAAdmin extends Configured implements Tool { throws IOException, ServiceFailedException { Configuration conf = getConf(); boolean forceFence = false; + boolean forceActive = false; Options failoverOpts = new Options(); // "-failover" isn't really an option but we need to add // it to appease CommandLineParser failoverOpts.addOption("failover", false, "failover"); failoverOpts.addOption(FORCEFENCE, false, "force fencing"); + failoverOpts.addOption(FORCEACTIVE, false, "force failover"); CommandLineParser parser = new GnuParser(); CommandLine cmd; @@ -137,6 +142,7 @@ public abstract class HAAdmin extends Configured implements Tool { try { cmd = parser.parse(failoverOpts, argv); forceFence = cmd.hasOption(FORCEFENCE); + forceActive = cmd.hasOption(FORCEACTIVE); } catch (ParseException pe) { errOut.println("failover: incorrect arguments"); printUsage(errOut, "-failover"); @@ -172,7 +178,7 @@ public abstract class HAAdmin extends Configured implements Tool { try { FailoverController.failover(proto1, addr1, proto2, addr2, - fencer, forceFence); + fencer, forceFence, forceActive); out.println("Failover from "+args[0]+" to "+args[1]+" successful"); } catch (FailoverFailedException ffe) { errOut.println("Failover failed: " + ffe.getLocalizedMessage()); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java index 672c6d6fba3..9a7316db054 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceProtocol.java @@ -112,4 +112,15 @@ public interface HAServiceProtocol extends VersionedProtocol { * if other errors happen */ public HAServiceState getServiceState() throws IOException; + + /** + * Return true if the service is capable and ready to transition + * from the standby state to the active state. + * + * @return true if the service is ready to become active, false otherwise. + * @throws IOException + * if other errors happen + */ + public boolean readyToBecomeActive() throws ServiceFailedException, + IOException; } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java index 36aead56b95..7b5cc32b765 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java @@ -79,6 +79,11 @@ public class TestFailoverController { public HAServiceState getServiceState() throws IOException { return state; } + + @Override + public boolean readyToBecomeActive() throws ServiceFailedException, IOException { + return true; + } } @Test @@ -88,13 +93,13 @@ public class TestFailoverController { NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); AlwaysSucceedFencer.fenceCalled = 0; - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled); assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); AlwaysSucceedFencer.fenceCalled = 0; - FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false); + FailoverController.failover(svc2, svc2Addr, svc1, svc1Addr, fencer, false, false); assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled); assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); @@ -106,7 +111,7 @@ public class TestFailoverController { DummyService svc2 = new DummyService(HAServiceState.STANDBY); NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); } @@ -118,7 +123,7 @@ public class TestFailoverController { NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); fail("Can't failover to an already active service"); } catch (FailoverFailedException ffe) { // Expected @@ -128,6 +133,33 @@ public class TestFailoverController { assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); } + @Test + public void testFailoverToUnreadyService() throws Exception { + DummyService svc1 = new DummyService(HAServiceState.ACTIVE); + DummyService svc2 = new DummyService(HAServiceState.STANDBY) { + @Override + public boolean readyToBecomeActive() throws ServiceFailedException, IOException { + return false; + } + }; + NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); + + try { + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); + fail("Can't failover to a service that's not ready"); + } catch (FailoverFailedException ffe) { + // Expected + } + + assertEquals(HAServiceState.ACTIVE, svc1.getServiceState()); + assertEquals(HAServiceState.STANDBY, svc2.getServiceState()); + + // Forcing it means we ignore readyToBecomeActive + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, true); + assertEquals(HAServiceState.STANDBY, svc1.getServiceState()); + assertEquals(HAServiceState.ACTIVE, svc2.getServiceState()); + } + @Test public void testFailoverToUnhealthyServiceFailsAndFailsback() throws Exception { DummyService svc1 = new DummyService(HAServiceState.ACTIVE); @@ -140,7 +172,7 @@ public class TestFailoverController { NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); fail("Failover to unhealthy service"); } catch (FailoverFailedException ffe) { // Expected @@ -162,7 +194,7 @@ public class TestFailoverController { AlwaysSucceedFencer.fenceCalled = 0; try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); } catch (FailoverFailedException ffe) { fail("Faulty active prevented failover"); } @@ -187,7 +219,7 @@ public class TestFailoverController { AlwaysFailFencer.fenceCalled = 0; try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); fail("Failed over even though fencing failed"); } catch (FailoverFailedException ffe) { // Expected @@ -207,7 +239,7 @@ public class TestFailoverController { AlwaysFailFencer.fenceCalled = 0; try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false); fail("Failed over even though fencing requested and failed"); } catch (FailoverFailedException ffe) { // Expected @@ -238,7 +270,7 @@ public class TestFailoverController { NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); } catch (FailoverFailedException ffe) { fail("Non-existant active prevented failover"); } @@ -254,7 +286,7 @@ public class TestFailoverController { NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); fail("Failed over to a non-existant standby"); } catch (FailoverFailedException ffe) { // Expected @@ -275,7 +307,7 @@ public class TestFailoverController { NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); fail("Failover to already active service"); } catch (FailoverFailedException ffe) { // Expected @@ -300,7 +332,7 @@ public class TestFailoverController { NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, true, false); fail("Failed over to service that won't transition to active"); } catch (FailoverFailedException ffe) { // Expected @@ -325,7 +357,7 @@ public class TestFailoverController { AlwaysSucceedFencer.fenceCalled = 0; try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); fail("Failed over to service that won't transition to active"); } catch (FailoverFailedException ffe) { // Expected @@ -352,7 +384,7 @@ public class TestFailoverController { AlwaysFailFencer.fenceCalled = 0; try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); fail("Failed over to service that won't transition to active"); } catch (FailoverFailedException ffe) { // Expected @@ -383,7 +415,7 @@ public class TestFailoverController { NodeFencer fencer = setupFencer(AlwaysSucceedFencer.class.getName()); try { - FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false); + FailoverController.failover(svc1, svc1Addr, svc2, svc2Addr, fencer, false, false); fail("Failover to already active service"); } catch (FailoverFailedException ffe) { // Expected diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java index 02e7fffff32..a5a58648d46 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestHAAdmin.java @@ -31,6 +31,7 @@ import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; import org.junit.Before; import org.junit.Test; import org.mockito.Mockito; +import static org.mockito.Mockito.when; import com.google.common.base.Charsets; import com.google.common.base.Joiner; @@ -44,8 +45,9 @@ public class TestHAAdmin { private HAServiceProtocol mockProtocol; @Before - public void setup() { + public void setup() throws IOException { mockProtocol = Mockito.mock(HAServiceProtocol.class); + when(mockProtocol.readyToBecomeActive()).thenReturn(true); tool = new HAAdmin() { @Override protected HAServiceProtocol getProtocol(String target) throws IOException { @@ -130,6 +132,15 @@ public class TestHAAdmin { assertEquals(0, runTool("-failover", "foo:1234", "bar:5678", "--forcefence")); } + @Test + public void testFailoverWithForceActive() throws Exception { + Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); + Configuration conf = new Configuration(); + conf.set(NodeFencer.CONF_METHODS_KEY, "shell(true)"); + tool.setConf(conf); + assertEquals(0, runTool("-failover", "foo:1234", "bar:5678", "--forceactive")); + } + @Test public void testFailoverWithInvalidFenceArg() throws Exception { Mockito.doReturn(HAServiceState.STANDBY).when(mockProtocol).getServiceState(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java index bf31695eae8..27bd92d874d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java @@ -929,6 +929,13 @@ public class NameNode { return state.getServiceState(); } + synchronized boolean readyToBecomeActive() throws ServiceFailedException { + if (!haEnabled) { + throw new ServiceFailedException("HA for namenode is not enabled"); + } + return !isInSafeMode(); + } + /** * Class used as expose {@link NameNode} as context to {@link HAState} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java index b293b5a14fc..45dd8ec55ce 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeRpcServer.java @@ -1007,6 +1007,11 @@ class NameNodeRpcServer implements NamenodeProtocols { return nn.getServiceState(); } + @Override // HAServiceProtocol + public synchronized boolean readyToBecomeActive() throws ServiceFailedException { + return nn.readyToBecomeActive(); + } + /** * Verify version. *