YARN-1929. Fixed a deadlock in ResourceManager that occurs when failover happens right at the time of shutdown. Contributed by Karthik Kambatla.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1591071 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Vinod Kumar Vavilapalli 2014-04-29 19:49:44 +00:00
parent a9775b4e49
commit 7a241aee90
5 changed files with 153 additions and 13 deletions

View File

@ -141,8 +141,7 @@ protected void serviceStop() throws Exception {
* @throws RuntimeException the first exception raised during the * @throws RuntimeException the first exception raised during the
* stop process -<i>after all services are stopped</i> * stop process -<i>after all services are stopped</i>
*/ */
private synchronized void stop(int numOfServicesStarted, private void stop(int numOfServicesStarted, boolean stopOnlyStartedServices) {
boolean stopOnlyStartedServices) {
// stop in reverse order of start // stop in reverse order of start
Exception firstException = null; Exception firstException = null;
List<Service> services = getServices(); List<Service> services = getServices();

View File

@ -164,6 +164,9 @@ Release 2.4.1 - UNRELEASED
YARN-1975. Used resources shows escaped html in CapacityScheduler and YARN-1975. Used resources shows escaped html in CapacityScheduler and
FairScheduler page (Mit Desai via jlowe) FairScheduler page (Mit Desai via jlowe)
YARN-1929. Fixed a deadlock in ResourceManager that occurs when failover
happens right at the time of shutdown. (Karthik Kambatla via vinodkv)
Release 2.4.0 - 2014-04-07 Release 2.4.0 - 2014-04-07
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -101,7 +101,7 @@ public AdminService(ResourceManager rm, RMContext rmContext) {
} }
@Override @Override
public synchronized void serviceInit(Configuration conf) throws Exception { public void serviceInit(Configuration conf) throws Exception {
if (rmContext.isHAEnabled()) { if (rmContext.isHAEnabled()) {
autoFailoverEnabled = HAUtil.isAutomaticFailoverEnabled(conf); autoFailoverEnabled = HAUtil.isAutomaticFailoverEnabled(conf);
if (autoFailoverEnabled) { if (autoFailoverEnabled) {
@ -123,13 +123,13 @@ public synchronized void serviceInit(Configuration conf) throws Exception {
} }
@Override @Override
protected synchronized void serviceStart() throws Exception { protected void serviceStart() throws Exception {
startServer(); startServer();
super.serviceStart(); super.serviceStart();
} }
@Override @Override
protected synchronized void serviceStop() throws Exception { protected void serviceStop() throws Exception {
stopServer(); stopServer();
super.serviceStop(); super.serviceStop();
} }

View File

@ -61,7 +61,7 @@ public class EmbeddedElectorService extends AbstractService
} }
@Override @Override
protected synchronized void serviceInit(Configuration conf) protected void serviceInit(Configuration conf)
throws Exception { throws Exception {
conf = conf instanceof YarnConfiguration ? conf : new YarnConfiguration(conf); conf = conf instanceof YarnConfiguration ? conf : new YarnConfiguration(conf);
@ -102,20 +102,20 @@ protected synchronized void serviceInit(Configuration conf)
} }
@Override @Override
protected synchronized void serviceStart() throws Exception { protected void serviceStart() throws Exception {
elector.joinElection(localActiveNodeInfo); elector.joinElection(localActiveNodeInfo);
super.serviceStart(); super.serviceStart();
} }
@Override @Override
protected synchronized void serviceStop() throws Exception { protected void serviceStop() throws Exception {
elector.quitElection(false); elector.quitElection(false);
elector.terminateConnection(); elector.terminateConnection();
super.serviceStop(); super.serviceStop();
} }
@Override @Override
public synchronized void becomeActive() throws ServiceFailedException { public void becomeActive() throws ServiceFailedException {
try { try {
rmContext.getRMAdminService().transitionToActive(req); rmContext.getRMAdminService().transitionToActive(req);
} catch (Exception e) { } catch (Exception e) {
@ -124,7 +124,7 @@ public synchronized void becomeActive() throws ServiceFailedException {
} }
@Override @Override
public synchronized void becomeStandby() { public void becomeStandby() {
try { try {
rmContext.getRMAdminService().transitionToStandby(req); rmContext.getRMAdminService().transitionToStandby(req);
} catch (Exception e) { } catch (Exception e) {
@ -143,13 +143,13 @@ public void enterNeutralMode() {
@SuppressWarnings(value = "unchecked") @SuppressWarnings(value = "unchecked")
@Override @Override
public synchronized void notifyFatalError(String errorMessage) { public void notifyFatalError(String errorMessage) {
rmContext.getDispatcher().getEventHandler().handle( rmContext.getDispatcher().getEventHandler().handle(
new RMFatalEvent(RMFatalEventType.EMBEDDED_ELECTOR_FAILED, errorMessage)); new RMFatalEvent(RMFatalEventType.EMBEDDED_ELECTOR_FAILED, errorMessage));
} }
@Override @Override
public synchronized void fenceOldActive(byte[] oldActiveData) { public void fenceOldActive(byte[] oldActiveData) {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Request to fence old active being ignored, " + LOG.debug("Request to fence old active being ignored, " +
"as embedded leader election doesn't support fencing"); "as embedded leader election doesn't support fencing");
@ -166,7 +166,7 @@ private static byte[] createActiveNodeInfo(String clusterId, String rmId)
.toByteArray(); .toByteArray();
} }
private synchronized boolean isParentZnodeSafe(String clusterId) private boolean isParentZnodeSafe(String clusterId)
throws InterruptedException, IOException, KeeperException { throws InterruptedException, IOException, KeeperException {
byte[] data; byte[] data;
try { try {

View File

@ -0,0 +1,138 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.resourcemanager;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ha.ClientBaseWithFixes;
import org.apache.hadoop.ha.ServiceFailedException;
import org.apache.hadoop.yarn.conf.HAUtil;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;
/**
 * Tests a ResourceManager that is configured for HA with automatic, embedded
 * failover, focusing on the interaction between RM shutdown and the elector's
 * transition-to-active callback.
 */
public class TestRMEmbeddedElector extends ClientBaseWithFixes {
  private static final Log LOG =
      LogFactory.getLog(TestRMEmbeddedElector.class.getName());

  private static final String RM1_NODE_ID = "rm1";
  private static final int RM1_PORT_BASE = 10000;
  private static final String RM2_NODE_ID = "rm2";
  private static final int RM2_PORT_BASE = 20000;

  /**
   * How often the test thread re-checks {@link #callbackCalled}. Kept far
   * below the delay injected into the callback so that the RM is still
   * stopped while the callback is in progress.
   */
  private static final long CALLBACK_POLL_INTERVAL_MS = 10L;

  private Configuration conf;

  /** Set to true by the mocked elector once becomeActive() has been entered. */
  private AtomicBoolean callbackCalled;

  /**
   * Sets a per-RM configuration entry: the key is {@code prefix} with the
   * RM id appended via {@link HAUtil#addSuffix}.
   */
  private void setConfForRM(String rmId, String prefix, String value) {
    conf.set(HAUtil.addSuffix(prefix, rmId), value);
  }

  /**
   * Configures every RM endpoint address for {@code rmId} on 0.0.0.0, using
   * the default port of each endpoint shifted by {@code base} so that the two
   * RM ids used by this test get distinct ports.
   */
  private void setRpcAddressForRM(String rmId, int base) {
    setConfForRM(rmId, YarnConfiguration.RM_ADDRESS, "0.0.0.0:" +
        (base + YarnConfiguration.DEFAULT_RM_PORT));
    setConfForRM(rmId, YarnConfiguration.RM_SCHEDULER_ADDRESS, "0.0.0.0:" +
        (base + YarnConfiguration.DEFAULT_RM_SCHEDULER_PORT));
    setConfForRM(rmId, YarnConfiguration.RM_ADMIN_ADDRESS, "0.0.0.0:" +
        (base + YarnConfiguration.DEFAULT_RM_ADMIN_PORT));
    setConfForRM(rmId, YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS, "0.0.0.0:" +
        (base + YarnConfiguration.DEFAULT_RM_RESOURCE_TRACKER_PORT));
    setConfForRM(rmId, YarnConfiguration.RM_WEBAPP_ADDRESS, "0.0.0.0:" +
        (base + YarnConfiguration.DEFAULT_RM_WEBAPP_PORT));
    setConfForRM(rmId, YarnConfiguration.RM_WEBAPP_HTTPS_ADDRESS, "0.0.0.0:" +
        (base + YarnConfiguration.DEFAULT_RM_WEBAPP_HTTPS_PORT));
  }

  /**
   * Builds a fresh HA configuration with embedded automatic failover enabled
   * for two RM ids (this process acting as rm1) and resets the callback flag.
   */
  @Before
  public void setup() throws IOException {
    conf = new YarnConfiguration();
    conf.setBoolean(YarnConfiguration.RM_HA_ENABLED, true);
    conf.setBoolean(YarnConfiguration.AUTO_FAILOVER_ENABLED, true);
    conf.setBoolean(YarnConfiguration.AUTO_FAILOVER_EMBEDDED, true);
    conf.set(YarnConfiguration.RM_CLUSTER_ID, "yarn-test-cluster");
    // hostPort is inherited; presumably the address of the test ZooKeeper
    // server started by ClientBaseWithFixes -- confirm against the base class.
    conf.set(YarnConfiguration.RM_ZK_ADDRESS, hostPort);
    conf.setInt(YarnConfiguration.RM_ZK_TIMEOUT_MS, 2000);

    conf.set(YarnConfiguration.RM_HA_IDS, RM1_NODE_ID + "," + RM2_NODE_ID);
    conf.set(YarnConfiguration.RM_HA_ID, RM1_NODE_ID);
    setRpcAddressForRM(RM1_NODE_ID, RM1_PORT_BASE);
    setRpcAddressForRM(RM2_NODE_ID, RM2_PORT_BASE);

    conf.setLong(YarnConfiguration.CLIENT_FAILOVER_SLEEPTIME_BASE_MS, 100L);

    callbackCalled = new AtomicBoolean(false);
  }

  /**
   * Test that tries to see if there is a deadlock between
   * (a) the thread stopping the RM
   * (b) thread processing the ZK event asking RM to transition to active
   *
   * The test times out if there is a deadlock.
   */
  @Test (timeout = 10000)
  public void testDeadlockShutdownBecomeActive() throws InterruptedException {
    MockRM rm = new MockRMWithElector(conf, 1000);
    rm.start();
    LOG.info("Waiting for callback");
    // Poll with a short sleep instead of spinning on an empty loop body, so
    // the waiting thread does not monopolise a core while the elector runs.
    while (!callbackCalled.get()) {
      Thread.sleep(CALLBACK_POLL_INTERVAL_MS);
    }
    LOG.info("Stopping RM");
    rm.stop();
    LOG.info("Stopped RM");
  }

  /**
   * MockRM whose embedded elector records that becomeActive() was entered and
   * then sleeps for a configurable time before delegating to the real
   * implementation, giving the test a window in which to stop the RM.
   */
  private class MockRMWithElector extends MockRM {
    /** Time the becomeActive() callback sleeps before delegating. */
    private long delayMs = 0;

    MockRMWithElector(Configuration conf) {
      super(conf);
    }

    MockRMWithElector(Configuration conf, long delayMs) {
      this(conf);
      this.delayMs = delayMs;
    }

    @Override
    protected AdminService createAdminService() {
      return new AdminService(MockRMWithElector.this, getRMContext()) {
        @Override
        protected EmbeddedElectorService createEmbeddedElectorService() {
          return new EmbeddedElectorService(getRMContext()) {
            @Override
            public void becomeActive() throws
                ServiceFailedException {
              try {
                callbackCalled.set(true);
                LOG.info("Callback called. Sleeping now");
                Thread.sleep(delayMs);
                LOG.info("Sleep done");
              } catch (InterruptedException e) {
                // Route the failure through the test log and keep the
                // interrupt status visible to the code that runs next.
                LOG.warn("Interrupted while delaying becomeActive", e);
                Thread.currentThread().interrupt();
              }
              super.becomeActive();
            }
          };
        }
      };
    }
  }
}