HADOOP-8236. haadmin should have configurable timeouts for failover commands. Contributed by Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1308233 13f79535-47bb-0310-9956-ffa450edef68
Todd Lipcon 2012-04-02 03:48:21 +00:00
parent 4bebc505a5
commit 8033cb5881
6 changed files with 114 additions and 42 deletions

CHANGES.txt

@@ -125,6 +125,9 @@ Release 2.0.0 - UNRELEASED
HADOOP-8211. Update commons-net version to 3.1. (eli)
HADOOP-8236. haadmin should have configurable timeouts for failover
commands. (todd)
OPTIMIZATIONS
BUG FIXES

CommonConfigurationKeys.java

@@ -145,5 +145,21 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic {
public static final String HA_HM_RPC_TIMEOUT_KEY =
"ha.health-monitor.rpc-timeout.ms";
public static final int HA_HM_RPC_TIMEOUT_DEFAULT = 45000;
/* Timeout that the FC waits for the new active to become active */
public static final String HA_FC_NEW_ACTIVE_TIMEOUT_KEY =
"ha.failover-controller.new-active.rpc-timeout.ms";
public static final int HA_FC_NEW_ACTIVE_TIMEOUT_DEFAULT = 60000;
/* Timeout that the FC waits for the old active to go to standby */
public static final String HA_FC_GRACEFUL_FENCE_TIMEOUT_KEY =
"ha.failover-controller.graceful-fence.rpc-timeout.ms";
public static final int HA_FC_GRACEFUL_FENCE_TIMEOUT_DEFAULT = 5000;
/* Timeout that the CLI (manual) FC waits for monitorHealth, getServiceState */
public static final String HA_FC_CLI_CHECK_TIMEOUT_KEY =
"ha.failover-controller.cli-check.rpc-timeout.ms";
public static final int HA_FC_CLI_CHECK_TIMEOUT_DEFAULT = 20000;
}
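
Note: the three new keys above are ordinary Configuration integers in milliseconds, so operators can override them in core-site.xml or callers can set them programmatically before constructing the failover machinery. A minimal sketch of overriding one key and reading it back (TimeoutProbe is a hypothetical driver, not part of this patch):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;

public class TimeoutProbe {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Override the graceful-fence timeout; the other keys keep their defaults.
    conf.setInt(CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_KEY, 2000);
    System.out.println(conf.getInt(
        CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_KEY,
        CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_DEFAULT)); // 2000
    System.out.println(conf.getInt(
        CommonConfigurationKeys.HA_FC_NEW_ACTIVE_TIMEOUT_KEY,
        CommonConfigurationKeys.HA_FC_NEW_ACTIVE_TIMEOUT_DEFAULT));     // 60000
  }
}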

FailoverController.java

@@ -25,6 +25,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ipc.RPC;
@@ -42,7 +43,22 @@ public class FailoverController {
private static final Log LOG = LogFactory.getLog(FailoverController.class);
private static final int GRACEFUL_FENCE_TIMEOUT = 5000;
private final int gracefulFenceTimeout;
private final int rpcTimeoutToNewActive;
private final Configuration conf;
public FailoverController(Configuration conf) {
this.conf = conf;
this.gracefulFenceTimeout = conf.getInt(
CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_KEY,
CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_DEFAULT);
this.rpcTimeoutToNewActive = conf.getInt(
CommonConfigurationKeys.HA_FC_NEW_ACTIVE_TIMEOUT_KEY,
CommonConfigurationKeys.HA_FC_NEW_ACTIVE_TIMEOUT_DEFAULT);
}
/**
* Perform pre-failover checks on the given service we plan to
@@ -59,9 +75,9 @@
* @param forceActive ignore toSvc if it reports that it is not ready
* @throws FailoverFailedException if we should avoid failover
*/
private static void preFailoverChecks(HAServiceTarget from,
HAServiceTarget target,
boolean forceActive)
private void preFailoverChecks(HAServiceTarget from,
HAServiceTarget target,
boolean forceActive)
throws FailoverFailedException {
HAServiceStatus toSvcStatus;
HAServiceProtocol toSvc;
@@ -72,7 +88,7 @@
}
try {
toSvc = target.getProxy();
toSvc = target.getProxy(conf, rpcTimeoutToNewActive);
toSvcStatus = toSvc.getServiceStatus();
} catch (IOException e) {
String msg = "Unable to get service state for " + target;
@@ -115,11 +131,10 @@
* and no retries. Its only purpose is to avoid fencing a node that
* has already restarted.
*/
static boolean tryGracefulFence(Configuration conf,
HAServiceTarget svc) {
boolean tryGracefulFence(HAServiceTarget svc) {
HAServiceProtocol proxy = null;
try {
proxy = svc.getProxy(conf, GRACEFUL_FENCE_TIMEOUT);
proxy = svc.getProxy(conf, gracefulFenceTimeout);
proxy.transitionToStandby();
return true;
} catch (ServiceFailedException sfe) {
@@ -146,10 +161,10 @@
* @param forceActive try to make toSvc active even if it is not ready
* @throws FailoverFailedException if the failover fails
*/
public static void failover(HAServiceTarget fromSvc,
HAServiceTarget toSvc,
boolean forceFence,
boolean forceActive)
public void failover(HAServiceTarget fromSvc,
HAServiceTarget toSvc,
boolean forceFence,
boolean forceActive)
throws FailoverFailedException {
Preconditions.checkArgument(fromSvc.getFencer() != null,
"failover requires a fencer");
@@ -158,7 +173,7 @@
// Try to make fromSvc standby
boolean tryFence = true;
if (tryGracefulFence(new Configuration(), fromSvc)) {
if (tryGracefulFence(fromSvc)) {
tryFence = forceFence;
}
@@ -174,7 +189,8 @@
boolean failed = false;
Throwable cause = null;
try {
HAServiceProtocolHelper.transitionToActive(toSvc.getProxy());
HAServiceProtocolHelper.transitionToActive(
toSvc.getProxy(conf, rpcTimeoutToNewActive));
} catch (ServiceFailedException sfe) {
LOG.error("Unable to make " + toSvc + " active (" +
sfe.getMessage() + "). Failing back.");
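
Net effect of the FailoverController changes: the static entry points become instance methods, the controller is constructed with a Configuration, and the two RPC timeouts are read once in the constructor and threaded through every proxy it creates. A sketch of the resulting call pattern, mirroring the HAAdmin change below (ManualFailover is a hypothetical wrapper; the targets would come from something like HAAdmin.resolveTarget()):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ha.FailoverController;
import org.apache.hadoop.ha.FailoverFailedException;
import org.apache.hadoop.ha.HAServiceTarget;

public class ManualFailover {
  static void doFailover(HAServiceTarget fromNode, HAServiceTarget toNode,
      Configuration conf) throws FailoverFailedException {
    FailoverController fc = new FailoverController(conf);
    // Graceful fencing of fromNode uses the graceful-fence timeout;
    // RPCs to toNode use the new-active timeout.
    fc.failover(fromNode, toNode, false /* forceFence */, false /* forceActive */);
  }
}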

HAAdmin.java

@@ -30,7 +30,9 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
@@ -49,6 +51,8 @@ public abstract class HAAdmin extends Configured implements Tool {
private static final String FORCEACTIVE = "forceactive";
private static final Log LOG = LogFactory.getLog(HAAdmin.class);
private int rpcTimeoutForChecks = -1;
private static Map<String, UsageInfo> USAGE =
ImmutableMap.<String, UsageInfo>builder()
.put("-transitionToActive",
@@ -165,9 +169,10 @@ public abstract class HAAdmin extends Configured implements Tool {
HAServiceTarget fromNode = resolveTarget(args[0]);
HAServiceTarget toNode = resolveTarget(args[1]);
FailoverController fc = new FailoverController(getConf());
try {
FailoverController.failover(fromNode, toNode,
forceFence, forceActive);
fc.failover(fromNode, toNode, forceFence, forceActive);
out.println("Failover from "+args[0]+" to "+args[1]+" successful");
} catch (FailoverFailedException ffe) {
errOut.println("Failover failed: " + ffe.getLocalizedMessage());
@@ -184,7 +189,8 @@
return -1;
}
HAServiceProtocol proto = resolveTarget(argv[1]).getProxy();
HAServiceProtocol proto = resolveTarget(argv[1]).getProxy(
getConf(), rpcTimeoutForChecks);
try {
HAServiceProtocolHelper.monitorHealth(proto);
} catch (HealthCheckFailedException e) {
@@ -202,7 +208,8 @@
return -1;
}
HAServiceProtocol proto = resolveTarget(argv[1]).getProxy();
HAServiceProtocol proto = resolveTarget(argv[1]).getProxy(
getConf(), rpcTimeoutForChecks);
out.println(proto.getServiceStatus().getState());
return 0;
}
@@ -215,6 +222,16 @@
return serviceId;
}
@Override
public void setConf(Configuration conf) {
super.setConf(conf);
if (conf != null) {
rpcTimeoutForChecks = conf.getInt(
CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
}
}
@Override
public int run(String[] argv) throws Exception {
try {
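
HAAdmin now picks up the CLI-check timeout in setConf(), which ToolRunner invokes before run(), so rpcTimeoutForChecks is in place before -checkHealth or -getServiceState issue any RPC. A hedged usage sketch; FooHAAdmin stands in for a concrete subclass (HAAdmin itself is abstract) and "nn1" for a valid service ID:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.util.ToolRunner;

public class CheckHealthDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Drop the monitorHealth/getServiceState RPC timeout from 20s to 10s.
    conf.setInt(CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY, 10000);
    // ToolRunner calls setConf(conf) on the tool, then run(args).
    int rc = ToolRunner.run(conf, new FooHAAdmin(), // hypothetical subclass
        new String[] {"-checkHealth", "nn1"});
    System.exit(rc);
  }
}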

ZKFailoverController.java

@@ -320,8 +320,8 @@ public abstract class ZKFailoverController implements Tool {
HAServiceTarget target = dataToTarget(data);
LOG.info("Should fence: " + target);
boolean gracefulWorked =
FailoverController.tryGracefulFence(conf, target);
boolean gracefulWorked = new FailoverController(conf)
.tryGracefulFence(target);
if (gracefulWorked) {
// It's possible that it's in standby but just about to go into active,
// no? Is there some race here?

TestFailoverController.java

@@ -25,11 +25,13 @@ import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.verify;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ha.TestNodeFencer.AlwaysSucceedFencer;
import org.apache.hadoop.ha.TestNodeFencer.AlwaysFailFencer;
import static org.apache.hadoop.ha.TestNodeFencer.setupFencer;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.test.MockitoUtil;
import org.junit.Test;
import org.mockito.Mockito;
@@ -40,6 +42,8 @@ import static org.junit.Assert.*;
public class TestFailoverController {
private InetSocketAddress svc1Addr = new InetSocketAddress("svc1", 1234);
private InetSocketAddress svc2Addr = new InetSocketAddress("svc2", 5678);
private Configuration conf = new Configuration();
HAServiceStatus STATE_NOT_READY = new HAServiceStatus(HAServiceState.STANDBY)
.setNotReadyToBecomeActive("injected not ready");
@@ -51,13 +55,13 @@
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
AlwaysSucceedFencer.fenceCalled = 0;
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
assertEquals(HAServiceState.STANDBY, svc1.state);
assertEquals(HAServiceState.ACTIVE, svc2.state);
AlwaysSucceedFencer.fenceCalled = 0;
FailoverController.failover(svc2, svc1, false, false);
doFailover(svc2, svc1, false, false);
assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
assertEquals(HAServiceState.ACTIVE, svc1.state);
assertEquals(HAServiceState.STANDBY, svc2.state);
@@ -69,7 +73,7 @@
DummyHAService svc2 = new DummyHAService(HAServiceState.STANDBY, svc2Addr);
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
assertEquals(HAServiceState.STANDBY, svc1.state);
assertEquals(HAServiceState.ACTIVE, svc2.state);
}
@@ -81,7 +85,7 @@
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Can't failover to an already active service");
} catch (FailoverFailedException ffe) {
// Expected
@@ -102,7 +106,7 @@
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Can't failover when access is denied");
} catch (FailoverFailedException ffe) {
assertTrue(ffe.getCause().getMessage().contains("Access denied"));
@@ -118,7 +122,7 @@
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Can't failover to a service that's not ready");
} catch (FailoverFailedException ffe) {
// Expected
@@ -131,7 +135,7 @@
assertEquals(HAServiceState.STANDBY, svc2.state);
// Forcing it means we ignore readyToBecomeActive
FailoverController.failover(svc1, svc2, false, true);
doFailover(svc1, svc2, false, true);
assertEquals(HAServiceState.STANDBY, svc1.state);
assertEquals(HAServiceState.ACTIVE, svc2.state);
}
@@ -145,7 +149,7 @@
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Failover to unhealthy service");
} catch (FailoverFailedException ffe) {
// Expected
@@ -165,7 +169,7 @@
AlwaysSucceedFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
} catch (FailoverFailedException ffe) {
fail("Faulty active prevented failover");
}
@@ -188,7 +192,7 @@
AlwaysFailFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Failed over even though fencing failed");
} catch (FailoverFailedException ffe) {
// Expected
@@ -208,7 +212,7 @@
AlwaysFailFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc2, true, false);
doFailover(svc1, svc2, true, false);
fail("Failed over even though fencing requested and failed");
} catch (FailoverFailedException ffe) {
// Expected
@@ -232,16 +236,26 @@
.defaultAnswer(new ThrowsException(
new IOException("Could not connect to host")))
.extraInterfaces(Closeable.class));
Mockito.doReturn(errorThrowingProxy).when(svc1).getProxy();
Mockito.doNothing().when((Closeable)errorThrowingProxy).close();
Mockito.doReturn(errorThrowingProxy).when(svc1).getProxy(
Mockito.<Configuration>any(),
Mockito.anyInt());
DummyHAService svc2 = new DummyHAService(HAServiceState.STANDBY, svc2Addr);
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
} catch (FailoverFailedException ffe) {
fail("Non-existant active prevented failover");
}
// Verify that the proxy created to try to make it go to standby
// gracefully used the right rpc timeout
Mockito.verify(svc1).getProxy(
Mockito.<Configuration>any(),
Mockito.eq(
CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_DEFAULT));
// Don't check svc1 because we can't reach it, but that's OK, it's been fenced.
assertEquals(HAServiceState.ACTIVE, svc2.state);
}
@@ -256,7 +270,7 @@
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Failed over to a non-existant standby");
} catch (FailoverFailedException ffe) {
// Expected
@@ -274,7 +288,7 @@
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Failover to already active service");
} catch (FailoverFailedException ffe) {
// Expected
@@ -296,7 +310,7 @@
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc2, true, false);
doFailover(svc1, svc2, true, false);
fail("Failed over to service that won't transition to active");
} catch (FailoverFailedException ffe) {
// Expected
@@ -318,7 +332,7 @@
AlwaysSucceedFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Failed over to service that won't transition to active");
} catch (FailoverFailedException ffe) {
// Expected
@@ -342,7 +356,7 @@
AlwaysFailFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Failed over to service that won't transition to active");
} catch (FailoverFailedException ffe) {
// Expected
@@ -368,7 +382,7 @@
svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
try {
FailoverController.failover(svc1, svc2, false, false);
doFailover(svc1, svc2, false, false);
fail("Failover to already active service");
} catch (FailoverFailedException ffe) {
// Expected
@@ -386,7 +400,7 @@
AlwaysSucceedFencer.fenceCalled = 0;
try {
FailoverController.failover(svc1, svc1, false, false);
doFailover(svc1, svc1, false, false);
fail("Can't failover to yourself");
} catch (FailoverFailedException ffe) {
// Expected
@@ -395,7 +409,7 @@
assertEquals(HAServiceState.ACTIVE, svc1.state);
try {
FailoverController.failover(svc2, svc2, false, false);
doFailover(svc2, svc2, false, false);
fail("Can't failover to yourself");
} catch (FailoverFailedException ffe) {
// Expected
@@ -403,5 +417,11 @@
assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
assertEquals(HAServiceState.STANDBY, svc2.state);
}
private void doFailover(HAServiceTarget tgt1, HAServiceTarget tgt2,
boolean forceFence, boolean forceActive) throws FailoverFailedException {
FailoverController fc = new FailoverController(conf);
fc.failover(tgt1, tgt2, forceFence, forceActive);
}
}