YARN-2523. ResourceManager UI showing negative value for "Decommissioned Nodes" field. Contributed by Rohith
(cherry picked from commit 8269bfa613)
This commit is contained in:
parent
87f07e67b1
commit
e4d46e5ff6
|
@ -417,6 +417,9 @@ Release 2.6.0 - UNRELEASED
|
|||
YARN-2546. Made REST API for application creation/submission use numeric and
|
||||
boolean types instead of the string of them. (Varun Vasudev via zjshen)
|
||||
|
||||
YARN-2523. ResourceManager UI showing negative value for "Decommissioned
|
||||
Nodes" field (Rohith via jlowe)
|
||||
|
||||
Release 2.5.1 - 2014-09-05
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -121,7 +121,6 @@ public class NodesListManager extends AbstractService implements
|
|||
this.conf, includesFile), excludesFile.isEmpty() ? null
|
||||
: this.rmContext.getConfigurationProvider()
|
||||
.getConfigurationInputStream(this.conf, excludesFile));
|
||||
setDecomissionedNMsMetrics();
|
||||
printConfiguredHosts();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -460,22 +460,9 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
|||
break;
|
||||
}
|
||||
|
||||
// Decomissioned NMs equals to the nodes missing in include list (if
|
||||
// include list not empty) or the nodes listed in excluded list.
|
||||
// DecomissionedNMs as per exclude list is set upfront when the
|
||||
// exclude list is read so that RM restart can also reflect the
|
||||
// decomissionedNMs. Note that RM is still not able to know decomissionedNMs
|
||||
// as per include list after it restarts as they are known when those nodes
|
||||
// come for registration.
|
||||
// DecomissionedNMs as per include list is incremented in this transition.
|
||||
switch (finalState) {
|
||||
case DECOMMISSIONED:
|
||||
Set<String> ecludedHosts =
|
||||
context.getNodesListManager().getHostsReader().getExcludedHosts();
|
||||
if (!ecludedHosts.contains(hostName)
|
||||
&& !ecludedHosts.contains(NetUtils.normalizeHostName(hostName))) {
|
||||
metrics.incrDecommisionedNMs();
|
||||
}
|
||||
break;
|
||||
case LOST:
|
||||
metrics.incrNumLostNMs();
|
||||
|
|
|
@ -77,6 +77,8 @@ import org.apache.hadoop.yarn.api.records.Resource;
|
|||
import org.apache.hadoop.yarn.api.records.ResourceRequest;
|
||||
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||
import org.apache.hadoop.yarn.event.DrainDispatcher;
|
||||
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.security.client.RMDelegationTokenIdentifier;
|
||||
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
|
||||
|
@ -1833,10 +1835,16 @@ public class TestRMRestart {
|
|||
conf.set(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
|
||||
hostFile.getAbsolutePath());
|
||||
writeToHostsFile("");
|
||||
MockRM rm1 = new MockRM(conf);
|
||||
final DrainDispatcher dispatcher = new DrainDispatcher();
|
||||
MockRM rm1 = new MockRM(conf) {
|
||||
@Override
|
||||
protected Dispatcher createDispatcher() {
|
||||
return dispatcher;
|
||||
}
|
||||
};
|
||||
rm1.start();
|
||||
rm1.registerNode("localhost:1234", 8000);
|
||||
rm1.registerNode("host2:1234", 8000);
|
||||
MockNM nm1 = rm1.registerNode("localhost:1234", 8000);
|
||||
MockNM nm2 = rm1.registerNode("host2:1234", 8000);
|
||||
Assert
|
||||
.assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
|
||||
String ip = NetUtils.normalizeHostName("localhost");
|
||||
|
@ -1845,15 +1853,25 @@ public class TestRMRestart {
|
|||
|
||||
// refresh nodes
|
||||
rm1.getNodesListManager().refreshNodes(conf);
|
||||
NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
|
||||
Assert
|
||||
.assertTrue(NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
|
||||
nodeHeartbeat = nm2.nodeHeartbeat(true);
|
||||
Assert.assertTrue("The decommisioned metrics are not updated",
|
||||
NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
|
||||
|
||||
dispatcher.await();
|
||||
Assert
|
||||
.assertEquals(2, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
|
||||
rm1.stop();
|
||||
Assert
|
||||
.assertEquals(0, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
|
||||
|
||||
// restart RM.
|
||||
MockRM rm2 = new MockRM(conf);
|
||||
rm2.start();
|
||||
Assert
|
||||
.assertEquals(2, ClusterMetrics.getMetrics().getNumDecommisionedNMs());
|
||||
rm1.stop();
|
||||
rm2.stop();
|
||||
}
|
||||
|
||||
|
|
|
@ -157,25 +157,33 @@ public class TestResourceTrackerService {
|
|||
.getAbsolutePath());
|
||||
|
||||
writeToHostsFile("");
|
||||
rm = new MockRM(conf);
|
||||
final DrainDispatcher dispatcher = new DrainDispatcher();
|
||||
rm = new MockRM(conf) {
|
||||
@Override
|
||||
protected Dispatcher createDispatcher() {
|
||||
return dispatcher;
|
||||
}
|
||||
};
|
||||
rm.start();
|
||||
|
||||
MockNM nm1 = rm.registerNode("host1:1234", 5120);
|
||||
MockNM nm2 = rm.registerNode("host2:5678", 10240);
|
||||
MockNM nm3 = rm.registerNode("localhost:4433", 1024);
|
||||
|
||||
dispatcher.await();
|
||||
|
||||
int metricCount = ClusterMetrics.getMetrics().getNumDecommisionedNMs();
|
||||
NodeHeartbeatResponse nodeHeartbeat = nm1.nodeHeartbeat(true);
|
||||
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
||||
nodeHeartbeat = nm2.nodeHeartbeat(true);
|
||||
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
||||
dispatcher.await();
|
||||
|
||||
// To test that IPs also work
|
||||
String ip = NetUtils.normalizeHostName("localhost");
|
||||
writeToHostsFile("host2", ip);
|
||||
|
||||
rm.getNodesListManager().refreshNodes(conf);
|
||||
checkDecommissionedNMCount(rm, metricCount + 2);
|
||||
|
||||
nodeHeartbeat = nm1.nodeHeartbeat(true);
|
||||
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
||||
|
@ -186,6 +194,19 @@ public class TestResourceTrackerService {
|
|||
nodeHeartbeat = nm3.nodeHeartbeat(true);
|
||||
Assert.assertTrue("The decommisioned metrics are not updated",
|
||||
NodeAction.SHUTDOWN.equals(nodeHeartbeat.getNodeAction()));
|
||||
dispatcher.await();
|
||||
checkDecommissionedNMCount(rm, metricCount + 2);
|
||||
writeToHostsFile("");
|
||||
rm.getNodesListManager().refreshNodes(conf);
|
||||
|
||||
nm3 = rm.registerNode("localhost:4433", 1024);
|
||||
dispatcher.await();
|
||||
nodeHeartbeat = nm3.nodeHeartbeat(true);
|
||||
dispatcher.await();
|
||||
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
|
||||
// decommissioned node count is 1 since 1 node rejoined after updating exclude
|
||||
// file
|
||||
checkDecommissionedNMCount(rm, metricCount + 1);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in New Issue