From ac865de725948569932d2e054703f6ba866484f7 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Thu, 8 Oct 2015 16:39:46 +0000 Subject: [PATCH] YARN-3896. RMNode transitioned from RUNNING to REBOOTED because its response id has not been reset synchronously. (Jun Gong via rohithsharmaks) (cherry picked from commit feaf0349949e831ce3f25814c1bbff52f17bfe8f) Conflicts: hadoop-yarn-project/CHANGES.txt --- .../hadoop/yarn/sls/nodemanager/NodeInfo.java | 3 ++ .../yarn/sls/scheduler/RMNodeWrapper.java | 5 +++ hadoop-yarn-project/CHANGES.txt | 3 ++ .../ResourceTrackerService.java | 2 + .../server/resourcemanager/rmnode/RMNode.java | 7 +++- .../resourcemanager/rmnode/RMNodeImpl.java | 15 ++++--- .../server/resourcemanager/MockNodes.java | 4 ++ .../resourcetracker/TestNMReconnect.java | 39 +++++++++++++++++++ 8 files changed, 72 insertions(+), 6 deletions(-) diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java index ee6eb7b5555..dbea90fc5ad 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/nodemanager/NodeInfo.java @@ -143,6 +143,9 @@ public class NodeInfo { return null; } + public void resetLastNodeHeartBeatResponse() { + } + public List pullContainerUpdates() { ArrayList list = new ArrayList(); diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java index b64be1b61a2..356b8bd2a40 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/RMNodeWrapper.java @@ -129,6 +129,11 @@ public class RMNodeWrapper implements RMNode { return 
node.getLastNodeHeartBeatResponse(); } + @Override + public void resetLastNodeHeartBeatResponse() { + node.getLastNodeHeartBeatResponse().setResponseId(0); + } + @Override @SuppressWarnings("unchecked") public List pullContainerUpdates() { diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 5fc76b3637a..d4dd07d4360 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -42,6 +42,9 @@ Release 2.6.2 - UNRELEASED YARN-3194. RM should handle NMContainerStatuses sent by NM while registering if NM is Reconnected node (Rohith via jlowe) + YARN-3896. RMNode transitioned from RUNNING to REBOOTED because its response id + has not been reset synchronously. (Jun Gong via rohithsharmaks) + Release 2.6.1 - 2015-09-23 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java index 29a6920f8a6..1352cc50f21 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java @@ -312,6 +312,8 @@ public class ResourceTrackerService extends AbstractService implements } else { LOG.info("Reconnect from the node at: " + host); this.nmLivelinessMonitor.unregister(nodeId); + // Reset heartbeat ID since node just restarted. 
+ oldNode.resetLastNodeHeartBeatResponse(); this.rmContext .getDispatcher() .getEventHandler() diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java index 95eeaf6b83b..ed6875b130e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNode.java @@ -127,7 +127,12 @@ public interface RMNode { public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response); public NodeHeartbeatResponse getLastNodeHeartBeatResponse(); - + + /** + * Reset lastNodeHeartbeatResponse's ID to 0. + */ + void resetLastNodeHeartBeatResponse(); + /** * Get and clear the list of containerUpdates accumulated across NM * heartbeats. 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java index 694cd1a0fc8..d2fe9919804 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmnode/RMNodeImpl.java @@ -408,6 +408,16 @@ public class RMNodeImpl implements RMNode, EventHandler { } } + @Override + public void resetLastNodeHeartBeatResponse() { + this.writeLock.lock(); + try { + latestNodeHeartBeatResponse.setResponseId(0); + } finally { + this.writeLock.unlock(); + } + } + public void handle(RMNodeEvent event) { LOG.debug("Processing " + event.getNodeId() + " of type " + event.getType()); try { @@ -567,8 +577,6 @@ public class RMNodeImpl implements RMNode, EventHandler { new NodeRemovedSchedulerEvent(rmNode)); if (rmNode.getHttpPort() == newNode.getHttpPort()) { - // Reset heartbeat ID since node just restarted. - rmNode.getLastNodeHeartBeatResponse().setResponseId(0); if (!rmNode.getTotalCapability().equals( newNode.getTotalCapability())) { rmNode.totalCapability = newNode.getTotalCapability(); @@ -604,9 +612,6 @@ public class RMNodeImpl implements RMNode, EventHandler { handleNMContainerStatus(reconnectEvent.getNMContainerStatuses(), rmNode); - // Reset heartbeat ID since node just restarted. 
- rmNode.getLastNodeHeartBeatResponse().setResponseId(0); - for (ApplicationId appId : reconnectEvent.getRunningApplications()) { handleRunningAppOnNode(rmNode, rmNode.context, appId, rmNode.nodeId); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java index 278c151ff66..6c9852451d4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockNodes.java @@ -185,6 +185,10 @@ public class MockNodes { return null; } + @Override + public void resetLastNodeHeartBeatResponse() { + } + @Override public String getNodeManagerVersion() { return null; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java index b525efced6f..dce3d06c262 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java @@ -21,6 +21,11 @@ package org.apache.hadoop.yarn.server.resourcemanager.resourcetracker; import 
java.util.ArrayList; import java.util.List; +import org.apache.hadoop.yarn.api.records.NodeState; +import org.apache.hadoop.yarn.event.DrainDispatcher; +import org.apache.hadoop.yarn.server.resourcemanager.MockNM; +import org.apache.hadoop.yarn.server.resourcemanager.MockRM; +import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.junit.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.NodeId; @@ -189,4 +194,38 @@ public class TestNMReconnect { nlm.stop(); scheduler.stop(); } + + @Test(timeout = 10000) + public void testRMNodeStatusAfterReconnect() throws Exception { + // The node(127.0.0.1:1234) reconnected with RM. When it registered with + // RM, RM set its lastNodeHeartbeatResponse's id to 0 asynchronously. But + // the node's heartbeat came before RM succeeded in setting the id to 0. + final DrainDispatcher dispatcher = new DrainDispatcher(); + MockRM rm = new MockRM(){ + @Override + protected Dispatcher createDispatcher() { + return dispatcher; + } + }; + rm.start(); + MockNM nm1 = + new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService()); + nm1.registerNode(); + int i = 0; + while(i < 3) { + nm1.nodeHeartbeat(true); + dispatcher.await(); + i++; + } + + MockNM nm2 = + new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService()); + nm2.registerNode(); + RMNode rmNode = rm.getRMContext().getRMNodes().get(nm2.getNodeId()); + nm2.nodeHeartbeat(true); + dispatcher.await(); + Assert.assertEquals("Node is Not in Running state.", NodeState.RUNNING, + rmNode.getState()); + rm.stop(); + } }