YARN-2641. Decommission nodes on -refreshNodes instead of next NM-RM heartbeat. (Zhihai Xu via kasha)
(cherry picked from commit da709a2eac
)
This commit is contained in:
parent
31a4bf7321
commit
a1116b56a4
|
@ -20,6 +20,9 @@ Release 2.7.0 - UNRELEASED
|
||||||
YARN-2635. TestRM, TestRMRestart, TestClientToAMTokens should run
|
YARN-2635. TestRM, TestRMRestart, TestClientToAMTokens should run
|
||||||
with both CS and FS. (Wei Yan and kasha via kasha)
|
with both CS and FS. (Wei Yan and kasha via kasha)
|
||||||
|
|
||||||
|
YARN-2641. Decommission nodes on -refreshNodes instead of next
|
||||||
|
NM-RM heartbeat. (Zhihai Xu via kasha)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
BUG FIXES
|
BUG FIXES
|
||||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.net.NetUtils;
|
import org.apache.hadoop.net.NetUtils;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.AbstractService;
|
||||||
import org.apache.hadoop.util.HostsFileReader;
|
import org.apache.hadoop.util.HostsFileReader;
|
||||||
|
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.event.EventHandler;
|
import org.apache.hadoop.yarn.event.EventHandler;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
@ -38,6 +39,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppNodeUpdateEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppNodeUpdateEvent;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppNodeUpdateEvent.RMAppNodeUpdateType;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppNodeUpdateEvent.RMAppNodeUpdateType;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
|
||||||
|
@ -123,6 +126,13 @@ public class NodesListManager extends AbstractService implements
|
||||||
.getConfigurationInputStream(this.conf, excludesFile));
|
.getConfigurationInputStream(this.conf, excludesFile));
|
||||||
printConfiguredHosts();
|
printConfiguredHosts();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (NodeId nodeId: rmContext.getRMNodes().keySet()) {
|
||||||
|
if (!isValidNode(nodeId.getHost())) {
|
||||||
|
this.rmContext.getDispatcher().getEventHandler().handle(
|
||||||
|
new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setDecomissionedNMsMetrics() {
|
private void setDecomissionedNMsMetrics() {
|
||||||
|
|
|
@ -349,15 +349,25 @@ public class ResourceTrackerService extends AbstractService implements
|
||||||
NodeStatus remoteNodeStatus = request.getNodeStatus();
|
NodeStatus remoteNodeStatus = request.getNodeStatus();
|
||||||
/**
|
/**
|
||||||
* Here is the node heartbeat sequence...
|
* Here is the node heartbeat sequence...
|
||||||
* 1. Check if it's a registered node
|
* 1. Check if it's a valid (i.e. not excluded) node
|
||||||
* 2. Check if it's a valid (i.e. not excluded) node
|
* 2. Check if it's a registered node
|
||||||
* 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
|
* 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
|
||||||
* 4. Send healthStatus to RMNode
|
* 4. Send healthStatus to RMNode
|
||||||
*/
|
*/
|
||||||
|
|
||||||
NodeId nodeId = remoteNodeStatus.getNodeId();
|
NodeId nodeId = remoteNodeStatus.getNodeId();
|
||||||
|
|
||||||
// 1. Check if it's a registered node
|
// 1. Check if it's a valid (i.e. not excluded) node
|
||||||
|
if (!this.nodesListManager.isValidNode(nodeId.getHost())) {
|
||||||
|
String message =
|
||||||
|
"Disallowed NodeManager nodeId: " + nodeId + " hostname: "
|
||||||
|
+ nodeId.getHost();
|
||||||
|
LOG.info(message);
|
||||||
|
shutDown.setDiagnosticsMessage(message);
|
||||||
|
return shutDown;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Check if it's a registered node
|
||||||
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
|
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
|
||||||
if (rmNode == null) {
|
if (rmNode == null) {
|
||||||
/* node does not exist */
|
/* node does not exist */
|
||||||
|
@ -370,18 +380,6 @@ public class ResourceTrackerService extends AbstractService implements
|
||||||
// Send ping
|
// Send ping
|
||||||
this.nmLivelinessMonitor.receivedPing(nodeId);
|
this.nmLivelinessMonitor.receivedPing(nodeId);
|
||||||
|
|
||||||
// 2. Check if it's a valid (i.e. not excluded) node
|
|
||||||
if (!this.nodesListManager.isValidNode(rmNode.getHostName())) {
|
|
||||||
String message =
|
|
||||||
"Disallowed NodeManager nodeId: " + nodeId + " hostname: "
|
|
||||||
+ rmNode.getNodeAddress();
|
|
||||||
LOG.info(message);
|
|
||||||
shutDown.setDiagnosticsMessage(message);
|
|
||||||
this.rmContext.getDispatcher().getEventHandler().handle(
|
|
||||||
new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION));
|
|
||||||
return shutDown;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
|
// 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
|
||||||
NodeHeartbeatResponse lastNodeHeartbeatResponse = rmNode.getLastNodeHeartBeatResponse();
|
NodeHeartbeatResponse lastNodeHeartbeatResponse = rmNode.getLastNodeHeartBeatResponse();
|
||||||
if (remoteNodeStatus.getResponseId() + 1 == lastNodeHeartbeatResponse
|
if (remoteNodeStatus.getResponseId() + 1 == lastNodeHeartbeatResponse
|
||||||
|
|
|
@ -130,17 +130,17 @@ public class TestResourceTrackerService {
|
||||||
|
|
||||||
rm.getNodesListManager().refreshNodes(conf);
|
rm.getNodesListManager().refreshNodes(conf);
|
||||||
|
|
||||||
|
checkDecommissionedNMCount(rm, ++metricCount);
|
||||||
|
|
||||||
nodeHeartbeat = nm1.nodeHeartbeat(true);
|
nodeHeartbeat = nm1.nodeHeartbeat(true);
|
||||||
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
||||||
Assert
|
Assert
|
||||||
.assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
|
.assertEquals(1, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
|
||||||
|
|
||||||
nodeHeartbeat = nm2.nodeHeartbeat(true);
|
nodeHeartbeat = nm2.nodeHeartbeat(true);
|
||||||
Assert.assertTrue("Node is not decommisioned.", NodeAction.SHUTDOWN
|
Assert.assertTrue("Node is not decommisioned.", NodeAction.SHUTDOWN
|
||||||
.equals(nodeHeartbeat.getNodeAction()));
|
.equals(nodeHeartbeat.getNodeAction()));
|
||||||
|
|
||||||
checkDecommissionedNMCount(rm, ++metricCount);
|
|
||||||
|
|
||||||
nodeHeartbeat = nm3.nodeHeartbeat(true);
|
nodeHeartbeat = nm3.nodeHeartbeat(true);
|
||||||
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
||||||
Assert.assertEquals(metricCount, ClusterMetrics.getMetrics()
|
Assert.assertEquals(metricCount, ClusterMetrics.getMetrics()
|
||||||
|
@ -185,6 +185,8 @@ public class TestResourceTrackerService {
|
||||||
|
|
||||||
rm.getNodesListManager().refreshNodes(conf);
|
rm.getNodesListManager().refreshNodes(conf);
|
||||||
|
|
||||||
|
checkDecommissionedNMCount(rm, metricCount + 2);
|
||||||
|
|
||||||
nodeHeartbeat = nm1.nodeHeartbeat(true);
|
nodeHeartbeat = nm1.nodeHeartbeat(true);
|
||||||
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
||||||
nodeHeartbeat = nm2.nodeHeartbeat(true);
|
nodeHeartbeat = nm2.nodeHeartbeat(true);
|
||||||
|
@ -195,7 +197,7 @@ public class TestResourceTrackerService {
|
||||||
Assert.assertTrue("The decommisioned metrics are not updated",
|
Assert.assertTrue("The decommisioned metrics are not updated",
|
||||||
NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
|
NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
|
||||||
dispatcher.await();
|
dispatcher.await();
|
||||||
checkDecommissionedNMCount(rm, metricCount + 2);
|
|
||||||
writeToHostsFile("");
|
writeToHostsFile("");
|
||||||
rm.getNodesListManager().refreshNodes(conf);
|
rm.getNodesListManager().refreshNodes(conf);
|
||||||
|
|
||||||
|
@ -234,6 +236,7 @@ public class TestResourceTrackerService {
|
||||||
conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile
|
conf.set(YarnConfiguration.RM_NODES_INCLUDE_FILE_PATH, hostFile
|
||||||
.getAbsolutePath());
|
.getAbsolutePath());
|
||||||
rm.getNodesListManager().refreshNodes(conf);
|
rm.getNodesListManager().refreshNodes(conf);
|
||||||
|
checkDecommissionedNMCount(rm, ++initialMetricCount);
|
||||||
nodeHeartbeat = nm1.nodeHeartbeat(true);
|
nodeHeartbeat = nm1.nodeHeartbeat(true);
|
||||||
Assert.assertEquals(
|
Assert.assertEquals(
|
||||||
"Node should not have been decomissioned.",
|
"Node should not have been decomissioned.",
|
||||||
|
@ -243,7 +246,6 @@ public class TestResourceTrackerService {
|
||||||
Assert.assertEquals("Node should have been decomissioned but is in state" +
|
Assert.assertEquals("Node should have been decomissioned but is in state" +
|
||||||
nodeHeartbeat.getNodeAction(),
|
nodeHeartbeat.getNodeAction(),
|
||||||
NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
|
NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
|
||||||
checkDecommissionedNMCount(rm, ++initialMetricCount);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -271,6 +273,7 @@ public class TestResourceTrackerService {
|
||||||
conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile
|
conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH, hostFile
|
||||||
.getAbsolutePath());
|
.getAbsolutePath());
|
||||||
rm.getNodesListManager().refreshNodes(conf);
|
rm.getNodesListManager().refreshNodes(conf);
|
||||||
|
checkDecommissionedNMCount(rm, ++initialMetricCount);
|
||||||
nodeHeartbeat = nm1.nodeHeartbeat(true);
|
nodeHeartbeat = nm1.nodeHeartbeat(true);
|
||||||
Assert.assertEquals(
|
Assert.assertEquals(
|
||||||
"Node should not have been decomissioned.",
|
"Node should not have been decomissioned.",
|
||||||
|
@ -280,7 +283,6 @@ public class TestResourceTrackerService {
|
||||||
Assert.assertEquals("Node should have been decomissioned but is in state" +
|
Assert.assertEquals("Node should have been decomissioned but is in state" +
|
||||||
nodeHeartbeat.getNodeAction(),
|
nodeHeartbeat.getNodeAction(),
|
||||||
NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
|
NodeAction.SHUTDOWN, nodeHeartbeat.getNodeAction());
|
||||||
checkDecommissionedNMCount(rm, ++initialMetricCount);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue