HADOOP-8236. haadmin should have configurable timeouts for failover commands. Contributed by Todd Lipcon.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1308235 13f79535-47bb-0310-9956-ffa450edef68
parent be8a0d1e62
commit a4ccb8f504
@@ -236,6 +236,9 @@ Release 2.0.0 - UNRELEASED
 
     HADOOP-8211. Update commons-net version to 3.1. (eli)
 
+    HADOOP-8236. haadmin should have configurable timeouts for failover
+    commands. (todd)
+
   OPTIMIZATIONS
 
   BUG FIXES
@@ -145,5 +145,21 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic {
   public static final String HA_HM_RPC_TIMEOUT_KEY =
     "ha.health-monitor.rpc-timeout.ms";
   public static final int HA_HM_RPC_TIMEOUT_DEFAULT = 45000;
+
+  /* Timeout that the FC waits for the new active to become active */
+  public static final String HA_FC_NEW_ACTIVE_TIMEOUT_KEY =
+    "ha.failover-controller.new-active.rpc-timeout.ms";
+  public static final int HA_FC_NEW_ACTIVE_TIMEOUT_DEFAULT = 60000;
+
+  /* Timeout that the FC waits for the old active to go to standby */
+  public static final String HA_FC_GRACEFUL_FENCE_TIMEOUT_KEY =
+    "ha.failover-controller.graceful-fence.rpc-timeout.ms";
+  public static final int HA_FC_GRACEFUL_FENCE_TIMEOUT_DEFAULT = 5000;
+
+  /* Timeout that the CLI (manual) FC waits for monitorHealth, getServiceState */
+  public static final String HA_FC_CLI_CHECK_TIMEOUT_KEY =
+    "ha.failover-controller.cli-check.rpc-timeout.ms";
+  public static final int HA_FC_CLI_CHECK_TIMEOUT_DEFAULT = 20000;
+
 }
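
For illustration only, not part of this commit: the three new keys are ordinary Configuration entries, so they can be overridden in core-site.xml or programmatically before the failover machinery is constructed. A minimal sketch, assuming hadoop-common on the classpath:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.CommonConfigurationKeys;

    public class TimeoutOverrideSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Allow a slow-starting new active two minutes to come up
        // instead of the 60s default.
        conf.setInt(CommonConfigurationKeys.HA_FC_NEW_ACTIVE_TIMEOUT_KEY, 120000);
        // Give up on graceful fencing after 2s so forceful fencing kicks in sooner.
        conf.setInt(CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_KEY, 2000);
        System.out.println(conf.getInt(
            CommonConfigurationKeys.HA_FC_NEW_ACTIVE_TIMEOUT_KEY, -1)); // prints 120000
      }
    }
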
@@ -25,6 +25,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
 import org.apache.hadoop.ipc.RPC;
 
@@ -42,7 +43,22 @@ public class FailoverController {
 
   private static final Log LOG = LogFactory.getLog(FailoverController.class);
 
-  private static final int GRACEFUL_FENCE_TIMEOUT = 5000;
+  private final int gracefulFenceTimeout;
+  private final int rpcTimeoutToNewActive;
+
+  private final Configuration conf;
+
+
+  public FailoverController(Configuration conf) {
+    this.conf = conf;
+
+    this.gracefulFenceTimeout = conf.getInt(
+        CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_KEY,
+        CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_DEFAULT);
+    this.rpcTimeoutToNewActive = conf.getInt(
+        CommonConfigurationKeys.HA_FC_NEW_ACTIVE_TIMEOUT_KEY,
+        CommonConfigurationKeys.HA_FC_NEW_ACTIVE_TIMEOUT_DEFAULT);
+  }
 
   /**
    * Perform pre-failover checks on the given service we plan to
@@ -59,9 +75,9 @@ public class FailoverController {
    * @param forceActive ignore toSvc if it reports that it is not ready
    * @throws FailoverFailedException if we should avoid failover
    */
-  private static void preFailoverChecks(HAServiceTarget from,
-                                        HAServiceTarget target,
-                                        boolean forceActive)
+  private void preFailoverChecks(HAServiceTarget from,
+                                 HAServiceTarget target,
+                                 boolean forceActive)
       throws FailoverFailedException {
     HAServiceStatus toSvcStatus;
     HAServiceProtocol toSvc;
@@ -72,7 +88,7 @@ public class FailoverController {
     }
 
     try {
-      toSvc = target.getProxy();
+      toSvc = target.getProxy(conf, rpcTimeoutToNewActive);
       toSvcStatus = toSvc.getServiceStatus();
     } catch (IOException e) {
       String msg = "Unable to get service state for " + target;
@@ -115,11 +131,10 @@ public class FailoverController {
    * and no retries. Its only purpose is to avoid fencing a node that
    * has already restarted.
    */
-  static boolean tryGracefulFence(Configuration conf,
-                                  HAServiceTarget svc) {
+  boolean tryGracefulFence(HAServiceTarget svc) {
     HAServiceProtocol proxy = null;
     try {
-      proxy = svc.getProxy(conf, GRACEFUL_FENCE_TIMEOUT);
+      proxy = svc.getProxy(conf, gracefulFenceTimeout);
       proxy.transitionToStandby();
       return true;
     } catch (ServiceFailedException sfe) {
@@ -146,10 +161,10 @@ public class FailoverController {
    * @param forceActive try to make toSvc active even if it is not ready
    * @throws FailoverFailedException if the failover fails
    */
-  public static void failover(HAServiceTarget fromSvc,
-                              HAServiceTarget toSvc,
-                              boolean forceFence,
-                              boolean forceActive)
+  public void failover(HAServiceTarget fromSvc,
+                       HAServiceTarget toSvc,
+                       boolean forceFence,
+                       boolean forceActive)
       throws FailoverFailedException {
     Preconditions.checkArgument(fromSvc.getFencer() != null,
         "failover requires a fencer");
@@ -158,7 +173,7 @@ public class FailoverController {
     // Try to make fromSvc standby
     boolean tryFence = true;
 
-    if (tryGracefulFence(new Configuration(), fromSvc)) {
+    if (tryGracefulFence(fromSvc)) {
       tryFence = forceFence;
     }
 
@@ -174,7 +189,8 @@ public class FailoverController {
     boolean failed = false;
     Throwable cause = null;
     try {
-      HAServiceProtocolHelper.transitionToActive(toSvc.getProxy());
+      HAServiceProtocolHelper.transitionToActive(
+          toSvc.getProxy(conf, rpcTimeoutToNewActive));
     } catch (ServiceFailedException sfe) {
       LOG.error("Unable to make " + toSvc + " active (" +
           sfe.getMessage() + "). Failing back.");
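
Taken together, these hunks change FailoverController from a static utility into an instance that carries its Configuration, which is where the RPC timeouts now come from. A hedged sketch of a caller under the new API (runFailover, fromNode, and toNode are placeholder names, not part of the commit):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.ha.FailoverController;
    import org.apache.hadoop.ha.FailoverFailedException;
    import org.apache.hadoop.ha.HAServiceTarget;

    class FailoverCallerSketch {
      // Before: FailoverController.failover(fromNode, toNode, false, false);
      // After: construct with a Configuration; the graceful-fence and
      // new-active RPC timeouts are read once, in the constructor.
      static void runFailover(HAServiceTarget fromNode, HAServiceTarget toNode,
          Configuration conf) throws FailoverFailedException {
        FailoverController fc = new FailoverController(conf);
        fc.failover(fromNode, toNode, false /* forceFence */, false /* forceActive */);
      }
    }
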
@@ -30,7 +30,9 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 
@@ -49,6 +51,8 @@ public abstract class HAAdmin extends Configured implements Tool {
   private static final String FORCEACTIVE = "forceactive";
   private static final Log LOG = LogFactory.getLog(HAAdmin.class);
 
+  private int rpcTimeoutForChecks = -1;
+
   private static Map<String, UsageInfo> USAGE =
       ImmutableMap.<String, UsageInfo>builder()
       .put("-transitionToActive",
@@ -165,9 +169,10 @@ public abstract class HAAdmin extends Configured implements Tool {
     HAServiceTarget fromNode = resolveTarget(args[0]);
     HAServiceTarget toNode = resolveTarget(args[1]);
 
+    FailoverController fc = new FailoverController(getConf());
+
     try {
-      FailoverController.failover(fromNode, toNode,
-          forceFence, forceActive);
+      fc.failover(fromNode, toNode, forceFence, forceActive);
       out.println("Failover from "+args[0]+" to "+args[1]+" successful");
     } catch (FailoverFailedException ffe) {
       errOut.println("Failover failed: " + ffe.getLocalizedMessage());
@@ -184,7 +189,8 @@ public abstract class HAAdmin extends Configured implements Tool {
       return -1;
     }
 
-    HAServiceProtocol proto = resolveTarget(argv[1]).getProxy();
+    HAServiceProtocol proto = resolveTarget(argv[1]).getProxy(
+        getConf(), rpcTimeoutForChecks);
     try {
       HAServiceProtocolHelper.monitorHealth(proto);
     } catch (HealthCheckFailedException e) {
@@ -202,7 +208,8 @@ public abstract class HAAdmin extends Configured implements Tool {
       return -1;
     }
 
-    HAServiceProtocol proto = resolveTarget(argv[1]).getProxy();
+    HAServiceProtocol proto = resolveTarget(argv[1]).getProxy(
+        getConf(), rpcTimeoutForChecks);
     out.println(proto.getServiceStatus().getState());
     return 0;
   }
@@ -215,6 +222,16 @@ public abstract class HAAdmin extends Configured implements Tool {
     return serviceId;
   }
 
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf != null) {
+      rpcTimeoutForChecks = conf.getInt(
+          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
+          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
+    }
+  }
+
   @Override
   public int run(String[] argv) throws Exception {
     try {
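
A note on how the CLI picks this value up: ToolRunner.run() calls setConf() on the tool before run(), so the override above caches rpcTimeoutForChecks before any command executes. An illustrative sketch, not from the commit, where MyHAAdmin stands in for a concrete subclass (HAAdmin itself is abstract):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.CommonConfigurationKeys;

    public class CliTimeoutSketch {
      public static void main(String[] argv) throws Exception {
        Configuration conf = new Configuration();
        // Drop the -checkHealth / -getServiceState RPC timeout from 20s to 5s.
        conf.setInt(CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY, 5000);
        // ToolRunner invokes setConf(conf) on the tool, then run(argv):
        // int rc = org.apache.hadoop.util.ToolRunner.run(conf, new MyHAAdmin(), argv);
      }
    }
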
@@ -320,8 +320,8 @@ public abstract class ZKFailoverController implements Tool {
     HAServiceTarget target = dataToTarget(data);
 
     LOG.info("Should fence: " + target);
-    boolean gracefulWorked =
-        FailoverController.tryGracefulFence(conf, target);
+    boolean gracefulWorked = new FailoverController(conf)
+        .tryGracefulFence(target);
     if (gracefulWorked) {
       // It's possible that it's in standby but just about to go into active,
       // no? Is there some race here?
@@ -25,11 +25,13 @@ import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.verify;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
 import org.apache.hadoop.ha.TestNodeFencer.AlwaysSucceedFencer;
 import org.apache.hadoop.ha.TestNodeFencer.AlwaysFailFencer;
 import static org.apache.hadoop.ha.TestNodeFencer.setupFencer;
 import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.test.MockitoUtil;
 
 import org.junit.Test;
 import org.mockito.Mockito;
@@ -40,6 +42,8 @@ import static org.junit.Assert.*;
 public class TestFailoverController {
   private InetSocketAddress svc1Addr = new InetSocketAddress("svc1", 1234);
   private InetSocketAddress svc2Addr = new InetSocketAddress("svc2", 5678);
+
+  private Configuration conf = new Configuration();
 
   HAServiceStatus STATE_NOT_READY = new HAServiceStatus(HAServiceState.STANDBY)
       .setNotReadyToBecomeActive("injected not ready");
@@ -51,13 +55,13 @@ public class TestFailoverController {
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     AlwaysSucceedFencer.fenceCalled = 0;
-    FailoverController.failover(svc1, svc2, false, false);
+    doFailover(svc1, svc2, false, false);
     assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
     assertEquals(HAServiceState.STANDBY, svc1.state);
     assertEquals(HAServiceState.ACTIVE, svc2.state);
 
     AlwaysSucceedFencer.fenceCalled = 0;
-    FailoverController.failover(svc2, svc1, false, false);
+    doFailover(svc2, svc1, false, false);
     assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
     assertEquals(HAServiceState.ACTIVE, svc1.state);
     assertEquals(HAServiceState.STANDBY, svc2.state);
@@ -69,7 +73,7 @@ public class TestFailoverController {
     DummyHAService svc2 = new DummyHAService(HAServiceState.STANDBY, svc2Addr);
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
-    FailoverController.failover(svc1, svc2, false, false);
+    doFailover(svc1, svc2, false, false);
     assertEquals(HAServiceState.STANDBY, svc1.state);
     assertEquals(HAServiceState.ACTIVE, svc2.state);
   }
@@ -81,7 +85,7 @@ public class TestFailoverController {
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Can't failover to an already active service");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -102,7 +106,7 @@ public class TestFailoverController {
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Can't failover when access is denied");
     } catch (FailoverFailedException ffe) {
       assertTrue(ffe.getCause().getMessage().contains("Access denied"));
@@ -118,7 +122,7 @@ public class TestFailoverController {
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Can't failover to a service that's not ready");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -131,7 +135,7 @@ public class TestFailoverController {
     assertEquals(HAServiceState.STANDBY, svc2.state);
 
     // Forcing it means we ignore readyToBecomeActive
-    FailoverController.failover(svc1, svc2, false, true);
+    doFailover(svc1, svc2, false, true);
     assertEquals(HAServiceState.STANDBY, svc1.state);
     assertEquals(HAServiceState.ACTIVE, svc2.state);
   }
@@ -145,7 +149,7 @@ public class TestFailoverController {
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Failover to unhealthy service");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -165,7 +169,7 @@ public class TestFailoverController {
 
     AlwaysSucceedFencer.fenceCalled = 0;
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
     } catch (FailoverFailedException ffe) {
       fail("Faulty active prevented failover");
     }
@@ -188,7 +192,7 @@ public class TestFailoverController {
 
     AlwaysFailFencer.fenceCalled = 0;
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Failed over even though fencing failed");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -208,7 +212,7 @@ public class TestFailoverController {
 
     AlwaysFailFencer.fenceCalled = 0;
    try {
-      FailoverController.failover(svc1, svc2, true, false);
+      doFailover(svc1, svc2, true, false);
       fail("Failed over even though fencing requested and failed");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -232,16 +236,26 @@ public class TestFailoverController {
         .defaultAnswer(new ThrowsException(
             new IOException("Could not connect to host")))
         .extraInterfaces(Closeable.class));
-    Mockito.doReturn(errorThrowingProxy).when(svc1).getProxy();
     Mockito.doNothing().when((Closeable)errorThrowingProxy).close();
+
+    Mockito.doReturn(errorThrowingProxy).when(svc1).getProxy(
+        Mockito.<Configuration>any(),
+        Mockito.anyInt());
     DummyHAService svc2 = new DummyHAService(HAServiceState.STANDBY, svc2Addr);
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
     } catch (FailoverFailedException ffe) {
       fail("Non-existant active prevented failover");
     }
 
+    // Verify that the proxy created to try to make it go to standby
+    // gracefully used the right rpc timeout
+    Mockito.verify(svc1).getProxy(
+        Mockito.<Configuration>any(),
+        Mockito.eq(
+            CommonConfigurationKeys.HA_FC_GRACEFUL_FENCE_TIMEOUT_DEFAULT));
+
     // Don't check svc1 because we can't reach it, but that's OK, it's been fenced.
     assertEquals(HAServiceState.ACTIVE, svc2.state);
   }
@@ -256,7 +270,7 @@ public class TestFailoverController {
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Failed over to a non-existant standby");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -274,7 +288,7 @@ public class TestFailoverController {
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Failover to already active service");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -296,7 +310,7 @@ public class TestFailoverController {
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     try {
-      FailoverController.failover(svc1, svc2, true, false);
+      doFailover(svc1, svc2, true, false);
       fail("Failed over to service that won't transition to active");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -318,7 +332,7 @@ public class TestFailoverController {
     AlwaysSucceedFencer.fenceCalled = 0;
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Failed over to service that won't transition to active");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -342,7 +356,7 @@ public class TestFailoverController {
     AlwaysFailFencer.fenceCalled = 0;
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Failed over to service that won't transition to active");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -368,7 +382,7 @@ public class TestFailoverController {
     svc1.fencer = svc2.fencer = setupFencer(AlwaysSucceedFencer.class.getName());
 
     try {
-      FailoverController.failover(svc1, svc2, false, false);
+      doFailover(svc1, svc2, false, false);
       fail("Failover to already active service");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -386,7 +400,7 @@ public class TestFailoverController {
     AlwaysSucceedFencer.fenceCalled = 0;
 
     try {
-      FailoverController.failover(svc1, svc1, false, false);
+      doFailover(svc1, svc1, false, false);
       fail("Can't failover to yourself");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -395,7 +409,7 @@ public class TestFailoverController {
     assertEquals(HAServiceState.ACTIVE, svc1.state);
 
     try {
-      FailoverController.failover(svc2, svc2, false, false);
+      doFailover(svc2, svc2, false, false);
       fail("Can't failover to yourself");
     } catch (FailoverFailedException ffe) {
       // Expected
@@ -403,5 +417,11 @@ public class TestFailoverController {
     assertEquals(0, TestNodeFencer.AlwaysSucceedFencer.fenceCalled);
     assertEquals(HAServiceState.STANDBY, svc2.state);
   }
+
+  private void doFailover(HAServiceTarget tgt1, HAServiceTarget tgt2,
+      boolean forceFence, boolean forceActive) throws FailoverFailedException {
+    FailoverController fc = new FailoverController(conf);
+    fc.failover(tgt1, tgt2, forceFence, forceActive);
+  }
 }