YARN-1861. Fixed a bug in RM to reset leader-election on fencing that was causing both RMs to be stuck in standby mode when automatic failover is enabled. Contributed by Karthik Kambatla and Xuan Gong.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1594356 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Vinod Kumar Vavilapalli 2014-05-13 20:00:44 +00:00
parent dac9028ef9
commit 2f87e77cb7
6 changed files with 51 additions and 5 deletions

View File

@ -209,6 +209,10 @@ Release 2.4.1 - UNRELEASED
YARN-1201. TestAMAuthorization fails with local hostname cannot be resolved.
(Wangda Tan via junping_du)
YARN-1861. Fixed a bug in RM to reset leader-election on fencing that was
causing both RMs to be stuck in standby mode when automatic failover is
enabled. (Karthik Kambatla and Xuan Gong via vinodkv)
Release 2.4.0 - 2014-04-07
INCOMPATIBLE CHANGES

View File

@ -34,6 +34,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ha.ClientBaseWithFixes;
import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.service.Service.STATE;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.client.api.YarnClient;
@ -42,6 +43,9 @@
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.MiniYARNCluster;
import org.apache.hadoop.yarn.server.resourcemanager.AdminService;
import org.apache.hadoop.yarn.server.resourcemanager.RMFatalEvent;
import org.apache.hadoop.yarn.server.resourcemanager.RMFatalEventType;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.webproxy.WebAppProxyServer;
import org.junit.After;
import org.junit.Assert;
@ -169,6 +173,7 @@ public void testExplicitFailover()
verifyConnections();
}
@SuppressWarnings("unchecked")
@Test
public void testAutomaticFailover()
throws YarnException, InterruptedException, IOException {
@ -186,6 +191,25 @@ public void testAutomaticFailover()
failover();
verifyConnections();
// Make the current Active handle an RMFatalEvent,
// so it transitions to standby.
ResourceManager rm = cluster.getResourceManager(
cluster.getActiveRMIndex());
RMFatalEvent event =
new RMFatalEvent(RMFatalEventType.STATE_STORE_FENCED,
"Fake RMFatalEvent");
rm.getRMContext().getDispatcher().getEventHandler().handle(event);
// Poll up to 2000 times, sleeping 1 ms between attempts, until the RM that
// handled the fatal event reports the STANDBY HA state.
int maxWaitingAttempts = 2000;
boolean transitionedToStandby = false;
while (maxWaitingAttempts-- > 0) {
  if (rm.getRMContext().getHAServiceState() == HAServiceState.STANDBY) {
    transitionedToStandby = true;
    break;
  }
  Thread.sleep(1);
}
// Assert on an explicit flag rather than on the counter: because of the
// post-decrement in the loop condition the counter is -1 after a timeout and
// can be exactly 0 after a success on the last attempt, so comparing it with
// 0 would let a timeout pass and could fail a genuine success.
Assert.assertTrue("RM didn't transition to Standby ",
    transitionedToStandby);
verifyConnections();
}
@Test

View File

@ -26,6 +26,7 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.ha.HAServiceProtocol;
@ -86,6 +87,7 @@ public class AdminService extends CompositeService implements
private String rmId;
private boolean autoFailoverEnabled;
private EmbeddedElectorService embeddedElector;
private Server server;
private InetSocketAddress masterServiceAddress;
@ -106,7 +108,8 @@ public void serviceInit(Configuration conf) throws Exception {
autoFailoverEnabled = HAUtil.isAutomaticFailoverEnabled(conf);
if (autoFailoverEnabled) {
if (HAUtil.isAutomaticFailoverEmbedded(conf)) {
addIfService(createEmbeddedElectorService());
embeddedElector = createEmbeddedElectorService();
addIfService(embeddedElector);
}
}
}
@ -181,6 +184,13 @@ protected EmbeddedElectorService createEmbeddedElectorService() {
return new EmbeddedElectorService(rmContext);
}
/**
 * Asks the embedded elector service to reset its leader election.
 *
 * Does nothing when no embedded elector has been created for this
 * service (the {@code embeddedElector} field is still {@code null}).
 */
@InterfaceAudience.Private
void resetLeaderElection() {
  if (embeddedElector == null) {
    // No embedded elector, hence no election to reset.
    return;
  }
  embeddedElector.resetLeaderElection();
}
private UserGroupInformation checkAccess(String method) throws IOException {
return RMServerUtils.verifyAccess(adminAcl, method, LOG);
}

View File

@ -194,4 +194,9 @@ private boolean isParentZnodeSafe(String clusterId)
}
return true;
}
/**
 * Resets this service's participation in leader election by quitting the
 * election and then joining it again with {@code localActiveNodeInfo}.
 */
public void resetLeaderElection() {
// NOTE(review): the false argument presumably means "do not fence" when
// quitting -- confirm against the elector's quitElection contract.
elector.quitElection(false);
// Rejoin right after quitting; the order of these two calls matters.
elector.joinElection(localActiveNodeInfo);
}
}

View File

@ -664,6 +664,7 @@ public void handle(RMFatalEvent event) {
// Transition to standby and reinit active services
LOG.info("Transitioning RM to Standby mode");
rm.transitionToStandby(true);
rm.adminService.resetLeaderElection();
return;
} catch (Exception e) {
LOG.fatal("Failed to transition RM to Standby mode.");

View File

@ -652,12 +652,14 @@ protected void stopRMProxy() { }
*/
public boolean waitForNodeManagersToConnect(long timeout)
throws YarnException, InterruptedException {
ResourceManager rm = getResourceManager();
GetClusterMetricsRequest req = GetClusterMetricsRequest.newInstance();
for (int i = 0; i < timeout / 100; i++) {
if (nodeManagers.length == rm.getClientRMService().getClusterMetrics(req)
.getClusterMetrics().getNumNodeManagers()) {
ResourceManager rm = getResourceManager();
if (rm == null) {
throw new YarnException("Can not find the active RM.");
}
else if (nodeManagers.length == rm.getClientRMService()
.getClusterMetrics(req).getClusterMetrics().getNumNodeManagers()) {
return true;
}
Thread.sleep(100);