YARN-3802. Two RMNodes for the same NodeId are used in RM sometimes

after NM is reconnected. Contributed by zhihai xu.

(cherry picked from commit 5b5bb8dcdc)
This commit is contained in:
Xuan 2015-06-18 14:37:49 -07:00
parent 2946e92f79
commit 86b75ac544
3 changed files with 74 additions and 4 deletions

View File

@ -491,6 +491,9 @@ Release 2.8.0 - UNRELEASED
YARN-3824. Fix two minor nits in member variable properties
of YarnConfiguration. (Ray Chiang via devaraj)
YARN-3802. Two RMNodes for the same NodeId are used in RM sometimes
after NM is reconnected. (zhihai xu via xgong)
Release 2.7.1 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -597,10 +597,14 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
if (rmNode.getHttpPort() == newNode.getHttpPort()) {
// Reset heartbeat ID since node just restarted.
rmNode.getLastNodeHeartBeatResponse().setResponseId(0);
if (!rmNode.getTotalCapability().equals(
newNode.getTotalCapability())) {
rmNode.totalCapability = newNode.getTotalCapability();
}
if (rmNode.getState().equals(NodeState.RUNNING)) {
// Only add new node if old state is RUNNING
// Only add old node if old state is RUNNING
rmNode.context.getDispatcher().getEventHandler().handle(
new NodeAddedSchedulerEvent(newNode));
new NodeAddedSchedulerEvent(rmNode));
}
} else {
// Reconnected node differs, so replace old node and start new node

View File

@ -25,6 +25,9 @@ import org.junit.Assert;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.ConfigurationProvider;
import org.apache.hadoop.yarn.conf.ConfigurationProviderFactory;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.event.InlineDispatcher;
@ -32,6 +35,7 @@ import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.resourcemanager.NMLivelinessMonitor;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.resourcemanager.NodesListManager;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.RMContextImpl;
@ -39,10 +43,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.NodeEventDi
import org.apache.hadoop.yarn.server.resourcemanager.ResourceTrackerService;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
import org.apache.hadoop.yarn.server.resourcemanager.security.NMTokenSecretManagerInRM;
import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@ -51,6 +58,8 @@ public class TestNMReconnect {
RecordFactoryProvider.getRecordFactory(null);
private List<RMNodeEvent> rmNodeEvents = new ArrayList<RMNodeEvent>();
private Dispatcher dispatcher;
private RMContextImpl context;
private class TestRMNodeEventDispatcher implements
EventHandler<RMNodeEvent> {
@ -68,12 +77,12 @@ public class TestNMReconnect {
public void setUp() {
Configuration conf = new Configuration();
// Dispatcher that processes events inline
Dispatcher dispatcher = new InlineDispatcher();
dispatcher = new InlineDispatcher();
dispatcher.register(RMNodeEventType.class,
new TestRMNodeEventDispatcher());
RMContext context = new RMContextImpl(dispatcher, null,
context = new RMContextImpl(dispatcher, null,
null, null, null, null, null, null, null, null);
dispatcher.register(SchedulerEventType.class,
new InlineDispatcher.EmptyEventHandler());
@ -99,6 +108,11 @@ public class TestNMReconnect {
resourceTrackerService.start();
}
/**
 * Stops the {@code resourceTrackerService} after each test.
 *
 * JUnit runs {@code @After} methods even when the {@code @Before} method
 * threw, so the service may never have been assigned; the null guard keeps
 * a setup failure from being reported together with a secondary
 * NullPointerException raised here.
 */
@After
public void tearDown() {
  if (resourceTrackerService != null) {
    resourceTrackerService.stop();
  }
}
@Test
public void testReconnect() throws Exception {
String hostname1 = "localhost1";
@ -126,4 +140,53 @@ public class TestNMReconnect {
Assert.assertEquals(RMNodeEventType.RECONNECTED,
rmNodeEvents.get(0).getType());
}
/**
 * Regression test for YARN-3802: after a NodeManager re-registers with the
 * same NodeId and HTTP port, the scheduler and the RMContext must still
 * refer to the same RMNode instance, and that instance must report the
 * newly registered capability.
 *
 * The label manager and the scheduler started by this test are stopped in
 * {@code finally} blocks so that a failing assertion does not leave running
 * services behind for the tests that follow.
 */
@Test
public void testCompareRMNodeAfterReconnect() throws Exception {
  Configuration yarnConf = new YarnConfiguration();
  CapacityScheduler scheduler = new CapacityScheduler();
  scheduler.setConf(yarnConf);
  ConfigurationProvider configurationProvider =
      ConfigurationProviderFactory.getConfigurationProvider(yarnConf);
  configurationProvider.init(yarnConf);
  context.setConfigurationProvider(configurationProvider);
  RMNodeLabelsManager nlm = new RMNodeLabelsManager();
  nlm.init(yarnConf);
  nlm.start();
  try {
    context.setNodeLabelManager(nlm);
    scheduler.setRMContext(context);
    scheduler.init(yarnConf);
    scheduler.start();
    try {
      // Route scheduler events to the real scheduler so that it tracks
      // the node registered below.
      dispatcher.register(SchedulerEventType.class, scheduler);

      String hostname1 = "localhost1";
      Resource capability = BuilderUtils.newResource(4096, 4);

      RegisterNodeManagerRequest request1 = recordFactory
          .newRecordInstance(RegisterNodeManagerRequest.class);
      NodeId nodeId1 = NodeId.newInstance(hostname1, 0);
      request1.setNodeId(nodeId1);
      request1.setHttpPort(0);
      request1.setResource(capability);
      resourceTrackerService.registerNodeManager(request1);
      Assert.assertNotNull(context.getRMNodes().get(nodeId1));
      // verify Scheduler and RMContext use same RMNode reference.
      Assert.assertSame(context.getRMNodes().get(nodeId1),
          scheduler.getSchedulerNode(nodeId1).getRMNode());
      Assert.assertEquals(capability,
          context.getRMNodes().get(nodeId1).getTotalCapability());

      // Register again with the same NodeId and HTTP port but a different
      // resource.
      Resource capability1 = BuilderUtils.newResource(2048, 2);
      request1.setResource(capability1);
      resourceTrackerService.registerNodeManager(request1);
      Assert.assertNotNull(context.getRMNodes().get(nodeId1));
      // verify Scheduler and RMContext use same RMNode reference
      // after reconnect.
      Assert.assertSame(context.getRMNodes().get(nodeId1),
          scheduler.getSchedulerNode(nodeId1).getRMNode());
      // verify RMNode's capability is changed.
      Assert.assertEquals(capability1,
          context.getRMNodes().get(nodeId1).getTotalCapability());
    } finally {
      scheduler.stop();
    }
  } finally {
    nlm.stop();
  }
}
}