YARN-3896. RMNode transitioned from RUNNING to REBOOTED because its response id has not been reset synchronously. (Jun Gong via rohithsharmaks)
(cherry picked from commit feaf034994
)
This commit is contained in:
parent
f7ee225052
commit
6dc732f2f7
|
@ -149,6 +149,9 @@ public class NodeInfo {
|
|||
return null;
|
||||
}
|
||||
|
||||
public void resetLastNodeHeartBeatResponse() {
|
||||
}
|
||||
|
||||
public List<UpdatedContainerInfo> pullContainerUpdates() {
|
||||
ArrayList<UpdatedContainerInfo> list = new ArrayList<UpdatedContainerInfo>();
|
||||
|
||||
|
|
|
@ -134,6 +134,11 @@ public class RMNodeWrapper implements RMNode {
|
|||
return node.getLastNodeHeartBeatResponse();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetLastNodeHeartBeatResponse() {
|
||||
node.getLastNodeHeartBeatResponse().setResponseId(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public List<UpdatedContainerInfo> pullContainerUpdates() {
|
||||
|
|
|
@ -740,6 +740,9 @@ Release 2.8.0 - UNRELEASED
|
|||
YARN-3986. getTransferredContainers in AbstractYarnScheduler should be present
|
||||
in YarnScheduler interface instead. (Varun Saxena via rohithsharmaks)
|
||||
|
||||
YARN-3896. RMNode transitioned from RUNNING to REBOOTED because its response id
|
||||
has not been reset synchronously. (Jun Gong via rohithsharmaks)
|
||||
|
||||
Release 2.7.2 - UNRELEASED
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -325,6 +325,8 @@ public class ResourceTrackerService extends AbstractService implements
|
|||
} else {
|
||||
LOG.info("Reconnect from the node at: " + host);
|
||||
this.nmLivelinessMonitor.unregister(nodeId);
|
||||
// Reset heartbeat ID since node just restarted.
|
||||
oldNode.resetLastNodeHeartBeatResponse();
|
||||
this.rmContext
|
||||
.getDispatcher()
|
||||
.getEventHandler()
|
||||
|
|
|
@ -129,7 +129,12 @@ public interface RMNode {
|
|||
public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response);
|
||||
|
||||
public NodeHeartbeatResponse getLastNodeHeartBeatResponse();
|
||||
|
||||
|
||||
/**
|
||||
* Reset lastNodeHeartbeatResponse's ID to 0.
|
||||
*/
|
||||
void resetLastNodeHeartBeatResponse();
|
||||
|
||||
/**
|
||||
* Get and clear the list of containerUpdates accumulated across NM
|
||||
* heartbeats.
|
||||
|
|
|
@ -443,6 +443,16 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetLastNodeHeartBeatResponse() {
|
||||
this.writeLock.lock();
|
||||
try {
|
||||
latestNodeHeartBeatResponse.setResponseId(0);
|
||||
} finally {
|
||||
this.writeLock.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
public void handle(RMNodeEvent event) {
|
||||
LOG.debug("Processing " + event.getNodeId() + " of type " + event.getType());
|
||||
try {
|
||||
|
@ -617,8 +627,6 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
|||
new NodeRemovedSchedulerEvent(rmNode));
|
||||
|
||||
if (rmNode.getHttpPort() == newNode.getHttpPort()) {
|
||||
// Reset heartbeat ID since node just restarted.
|
||||
rmNode.getLastNodeHeartBeatResponse().setResponseId(0);
|
||||
if (!rmNode.getTotalCapability().equals(
|
||||
newNode.getTotalCapability())) {
|
||||
rmNode.totalCapability = newNode.getTotalCapability();
|
||||
|
@ -656,9 +664,6 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
|||
|
||||
handleNMContainerStatus(reconnectEvent.getNMContainerStatuses(), rmNode);
|
||||
|
||||
// Reset heartbeat ID since node just restarted.
|
||||
rmNode.getLastNodeHeartBeatResponse().setResponseId(0);
|
||||
|
||||
for (ApplicationId appId : reconnectEvent.getRunningApplications()) {
|
||||
handleRunningAppOnNode(rmNode, rmNode.context, appId, rmNode.nodeId);
|
||||
}
|
||||
|
|
|
@ -200,6 +200,10 @@ public class MockNodes {
|
|||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetLastNodeHeartBeatResponse() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getNodeManagerVersion() {
|
||||
return null;
|
||||
|
|
|
@ -21,6 +21,11 @@ package org.apache.hadoop.yarn.server.resourcemanager.resourcetracker;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.hadoop.yarn.api.records.NodeState;
|
||||
import org.apache.hadoop.yarn.event.DrainDispatcher;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||
import org.junit.Assert;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||
|
@ -189,4 +194,38 @@ public class TestNMReconnect {
|
|||
nlm.stop();
|
||||
scheduler.stop();
|
||||
}
|
||||
|
||||
@Test(timeout = 10000)
|
||||
public void testRMNodeStatusAfterReconnect() throws Exception {
|
||||
// The node(127.0.0.1:1234) reconnected with RM. When it registered with
|
||||
// RM, RM set its lastNodeHeartbeatResponse's id to 0 asynchronously. But
|
||||
// the node's heartbeat come before RM succeeded setting the id to 0.
|
||||
final DrainDispatcher dispatcher = new DrainDispatcher();
|
||||
MockRM rm = new MockRM(){
|
||||
@Override
|
||||
protected Dispatcher createDispatcher() {
|
||||
return dispatcher;
|
||||
}
|
||||
};
|
||||
rm.start();
|
||||
MockNM nm1 =
|
||||
new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService());
|
||||
nm1.registerNode();
|
||||
int i = 0;
|
||||
while(i < 3) {
|
||||
nm1.nodeHeartbeat(true);
|
||||
dispatcher.await();
|
||||
i++;
|
||||
}
|
||||
|
||||
MockNM nm2 =
|
||||
new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService());
|
||||
nm2.registerNode();
|
||||
RMNode rmNode = rm.getRMContext().getRMNodes().get(nm2.getNodeId());
|
||||
nm2.nodeHeartbeat(true);
|
||||
dispatcher.await();
|
||||
Assert.assertEquals("Node is Not in Running state.", NodeState.RUNNING,
|
||||
rmNode.getState());
|
||||
rm.stop();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue