YARN-3212. RMNode State Transition Update with DECOMMISSIONING state. (Junping Du via wangda)
(cherry picked from commit 9bc913a35c
)
This commit is contained in:
parent
b4e1279217
commit
4a657e9326
|
@ -142,6 +142,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
YARN-4034. Render cluster Max Priority in scheduler metrics in RM web
|
YARN-4034. Render cluster Max Priority in scheduler metrics in RM web
|
||||||
UI. (Rohith Sharma K S via jianhe)
|
UI. (Rohith Sharma K S via jianhe)
|
||||||
|
|
||||||
|
YARN-3212. RMNode State Transition Update with DECOMMISSIONING state.
|
||||||
|
(Junping Du via wangda)
|
||||||
|
|
||||||
IMPROVEMENTS
|
IMPROVEMENTS
|
||||||
|
|
||||||
YARN-644. Basic null check is not performed on passed in arguments before
|
YARN-644. Basic null check is not performed on passed in arguments before
|
||||||
|
|
|
@ -399,7 +399,7 @@ public class NodesListManager extends CompositeService implements
|
||||||
NodeId nodeId = entry.getKey();
|
NodeId nodeId = entry.getKey();
|
||||||
if (!isValidNode(nodeId.getHost())) {
|
if (!isValidNode(nodeId.getHost())) {
|
||||||
this.rmContext.getDispatcher().getEventHandler().handle(
|
this.rmContext.getDispatcher().getEventHandler().handle(
|
||||||
new RMNodeEvent(nodeId, RMNodeEventType.DECOMMISSION_WITH_TIMEOUT));
|
new RMNodeEvent(nodeId, RMNodeEventType.GRACEFUL_DECOMMISSION));
|
||||||
} else {
|
} else {
|
||||||
// Recommissioning the nodes
|
// Recommissioning the nodes
|
||||||
if (entry.getValue().getState() == NodeState.DECOMMISSIONING
|
if (entry.getValue().getState() == NodeState.DECOMMISSIONING
|
||||||
|
|
|
@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.api.records.ContainerState;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.NodeId;
|
import org.apache.hadoop.yarn.api.records.NodeId;
|
||||||
import org.apache.hadoop.yarn.api.records.NodeLabel;
|
import org.apache.hadoop.yarn.api.records.NodeLabel;
|
||||||
|
import org.apache.hadoop.yarn.api.records.NodeState;
|
||||||
import org.apache.hadoop.yarn.api.records.Resource;
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.exceptions.YarnException;
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
||||||
|
@ -399,8 +400,10 @@ public class ResourceTrackerService extends AbstractService implements
|
||||||
|
|
||||||
NodeId nodeId = remoteNodeStatus.getNodeId();
|
NodeId nodeId = remoteNodeStatus.getNodeId();
|
||||||
|
|
||||||
// 1. Check if it's a valid (i.e. not excluded) node
|
// 1. Check if it's a valid (i.e. not excluded) node, if not, see if it is
|
||||||
if (!this.nodesListManager.isValidNode(nodeId.getHost())) {
|
// in decommissioning.
|
||||||
|
if (!this.nodesListManager.isValidNode(nodeId.getHost())
|
||||||
|
&& !isNodeInDecommissioning(nodeId)) {
|
||||||
String message =
|
String message =
|
||||||
"Disallowed NodeManager nodeId: " + nodeId + " hostname: "
|
"Disallowed NodeManager nodeId: " + nodeId + " hostname: "
|
||||||
+ nodeId.getHost();
|
+ nodeId.getHost();
|
||||||
|
@ -486,6 +489,19 @@ public class ResourceTrackerService extends AbstractService implements
|
||||||
return nodeHeartBeatResponse;
|
return nodeHeartBeatResponse;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if node in decommissioning state.
|
||||||
|
* @param nodeId
|
||||||
|
*/
|
||||||
|
private boolean isNodeInDecommissioning(NodeId nodeId) {
|
||||||
|
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
|
||||||
|
if (rmNode != null &&
|
||||||
|
rmNode.getState().equals(NodeState.DECOMMISSIONING)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
@Override
|
@Override
|
||||||
public UnRegisterNodeManagerResponse unRegisterNodeManager(
|
public UnRegisterNodeManagerResponse unRegisterNodeManager(
|
||||||
|
|
|
@ -24,7 +24,7 @@ public enum RMNodeEventType {
|
||||||
|
|
||||||
// Source: AdminService
|
// Source: AdminService
|
||||||
DECOMMISSION,
|
DECOMMISSION,
|
||||||
DECOMMISSION_WITH_TIMEOUT,
|
GRACEFUL_DECOMMISSION,
|
||||||
RECOMMISSION,
|
RECOMMISSION,
|
||||||
|
|
||||||
// Source: AdminService, ResourceTrackerService
|
// Source: AdminService, ResourceTrackerService
|
||||||
|
|
|
@ -144,101 +144,150 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
RMNodeEventType,
|
RMNodeEventType,
|
||||||
RMNodeEvent>(NodeState.NEW)
|
RMNodeEvent>(NodeState.NEW)
|
||||||
|
|
||||||
//Transitions from NEW state
|
//Transitions from NEW state
|
||||||
.addTransition(NodeState.NEW, NodeState.RUNNING,
|
.addTransition(NodeState.NEW, NodeState.RUNNING,
|
||||||
RMNodeEventType.STARTED, new AddNodeTransition())
|
RMNodeEventType.STARTED, new AddNodeTransition())
|
||||||
.addTransition(NodeState.NEW, NodeState.NEW,
|
.addTransition(NodeState.NEW, NodeState.NEW,
|
||||||
RMNodeEventType.RESOURCE_UPDATE,
|
RMNodeEventType.RESOURCE_UPDATE,
|
||||||
new UpdateNodeResourceWhenUnusableTransition())
|
new UpdateNodeResourceWhenUnusableTransition())
|
||||||
|
|
||||||
//Transitions from RUNNING state
|
//Transitions from RUNNING state
|
||||||
.addTransition(NodeState.RUNNING,
|
.addTransition(NodeState.RUNNING,
|
||||||
EnumSet.of(NodeState.RUNNING, NodeState.UNHEALTHY),
|
EnumSet.of(NodeState.RUNNING, NodeState.UNHEALTHY),
|
||||||
RMNodeEventType.STATUS_UPDATE, new StatusUpdateWhenHealthyTransition())
|
RMNodeEventType.STATUS_UPDATE,
|
||||||
.addTransition(NodeState.RUNNING, NodeState.DECOMMISSIONED,
|
new StatusUpdateWhenHealthyTransition())
|
||||||
RMNodeEventType.DECOMMISSION,
|
.addTransition(NodeState.RUNNING, NodeState.DECOMMISSIONED,
|
||||||
new DeactivateNodeTransition(NodeState.DECOMMISSIONED))
|
RMNodeEventType.DECOMMISSION,
|
||||||
.addTransition(NodeState.RUNNING, NodeState.LOST,
|
new DeactivateNodeTransition(NodeState.DECOMMISSIONED))
|
||||||
RMNodeEventType.EXPIRE,
|
.addTransition(NodeState.RUNNING, NodeState.DECOMMISSIONING,
|
||||||
new DeactivateNodeTransition(NodeState.LOST))
|
RMNodeEventType.GRACEFUL_DECOMMISSION,
|
||||||
.addTransition(NodeState.RUNNING, NodeState.REBOOTED,
|
new DecommissioningNodeTransition(NodeState.RUNNING,
|
||||||
RMNodeEventType.REBOOTING,
|
NodeState.DECOMMISSIONING))
|
||||||
new DeactivateNodeTransition(NodeState.REBOOTED))
|
.addTransition(NodeState.RUNNING, NodeState.LOST,
|
||||||
.addTransition(NodeState.RUNNING, NodeState.RUNNING,
|
RMNodeEventType.EXPIRE,
|
||||||
RMNodeEventType.CLEANUP_APP, new CleanUpAppTransition())
|
new DeactivateNodeTransition(NodeState.LOST))
|
||||||
.addTransition(NodeState.RUNNING, NodeState.RUNNING,
|
.addTransition(NodeState.RUNNING, NodeState.REBOOTED,
|
||||||
RMNodeEventType.CLEANUP_CONTAINER, new CleanUpContainerTransition())
|
RMNodeEventType.REBOOTING,
|
||||||
.addTransition(NodeState.RUNNING, NodeState.RUNNING,
|
new DeactivateNodeTransition(NodeState.REBOOTED))
|
||||||
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
.addTransition(NodeState.RUNNING, NodeState.RUNNING,
|
||||||
new AddContainersToBeRemovedFromNMTransition())
|
RMNodeEventType.CLEANUP_APP, new CleanUpAppTransition())
|
||||||
.addTransition(NodeState.RUNNING, NodeState.RUNNING,
|
.addTransition(NodeState.RUNNING, NodeState.RUNNING,
|
||||||
RMNodeEventType.RECONNECTED, new ReconnectNodeTransition())
|
RMNodeEventType.CLEANUP_CONTAINER, new CleanUpContainerTransition())
|
||||||
.addTransition(NodeState.RUNNING, NodeState.RUNNING,
|
.addTransition(NodeState.RUNNING, NodeState.RUNNING,
|
||||||
RMNodeEventType.RESOURCE_UPDATE, new UpdateNodeResourceWhenRunningTransition())
|
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
||||||
.addTransition(NodeState.RUNNING, NodeState.SHUTDOWN,
|
new AddContainersToBeRemovedFromNMTransition())
|
||||||
RMNodeEventType.SHUTDOWN,
|
.addTransition(NodeState.RUNNING, EnumSet.of(NodeState.RUNNING),
|
||||||
new DeactivateNodeTransition(NodeState.SHUTDOWN))
|
RMNodeEventType.RECONNECTED, new ReconnectNodeTransition())
|
||||||
|
.addTransition(NodeState.RUNNING, NodeState.RUNNING,
|
||||||
|
RMNodeEventType.RESOURCE_UPDATE, new UpdateNodeResourceWhenRunningTransition())
|
||||||
|
.addTransition(NodeState.RUNNING, NodeState.SHUTDOWN,
|
||||||
|
RMNodeEventType.SHUTDOWN,
|
||||||
|
new DeactivateNodeTransition(NodeState.SHUTDOWN))
|
||||||
|
|
||||||
//Transitions from REBOOTED state
|
//Transitions from REBOOTED state
|
||||||
.addTransition(NodeState.REBOOTED, NodeState.REBOOTED,
|
.addTransition(NodeState.REBOOTED, NodeState.REBOOTED,
|
||||||
RMNodeEventType.RESOURCE_UPDATE,
|
RMNodeEventType.RESOURCE_UPDATE,
|
||||||
new UpdateNodeResourceWhenUnusableTransition())
|
new UpdateNodeResourceWhenUnusableTransition())
|
||||||
|
|
||||||
//Transitions from DECOMMISSIONED state
|
|
||||||
.addTransition(NodeState.DECOMMISSIONED, NodeState.DECOMMISSIONED,
|
|
||||||
RMNodeEventType.RESOURCE_UPDATE,
|
|
||||||
new UpdateNodeResourceWhenUnusableTransition())
|
|
||||||
.addTransition(NodeState.DECOMMISSIONED, NodeState.DECOMMISSIONED,
|
|
||||||
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
|
||||||
new AddContainersToBeRemovedFromNMTransition())
|
|
||||||
|
|
||||||
//Transitions from LOST state
|
//Transitions from DECOMMISSIONED state
|
||||||
.addTransition(NodeState.LOST, NodeState.LOST,
|
.addTransition(NodeState.DECOMMISSIONED, NodeState.DECOMMISSIONED,
|
||||||
RMNodeEventType.RESOURCE_UPDATE,
|
RMNodeEventType.RESOURCE_UPDATE,
|
||||||
new UpdateNodeResourceWhenUnusableTransition())
|
new UpdateNodeResourceWhenUnusableTransition())
|
||||||
.addTransition(NodeState.LOST, NodeState.LOST,
|
.addTransition(NodeState.DECOMMISSIONED, NodeState.DECOMMISSIONED,
|
||||||
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
||||||
new AddContainersToBeRemovedFromNMTransition())
|
new AddContainersToBeRemovedFromNMTransition())
|
||||||
|
|
||||||
//Transitions from UNHEALTHY state
|
//Transitions from DECOMMISSIONING state
|
||||||
.addTransition(NodeState.UNHEALTHY,
|
.addTransition(NodeState.DECOMMISSIONING, NodeState.DECOMMISSIONED,
|
||||||
EnumSet.of(NodeState.UNHEALTHY, NodeState.RUNNING),
|
RMNodeEventType.DECOMMISSION,
|
||||||
RMNodeEventType.STATUS_UPDATE,
|
new DeactivateNodeTransition(NodeState.DECOMMISSIONED))
|
||||||
new StatusUpdateWhenUnHealthyTransition())
|
.addTransition(NodeState.DECOMMISSIONING, NodeState.RUNNING,
|
||||||
.addTransition(NodeState.UNHEALTHY, NodeState.DECOMMISSIONED,
|
RMNodeEventType.RECOMMISSION,
|
||||||
RMNodeEventType.DECOMMISSION,
|
new RecommissionNodeTransition(NodeState.RUNNING))
|
||||||
new DeactivateNodeTransition(NodeState.DECOMMISSIONED))
|
.addTransition(NodeState.DECOMMISSIONING,
|
||||||
.addTransition(NodeState.UNHEALTHY, NodeState.LOST,
|
EnumSet.of(NodeState.DECOMMISSIONING, NodeState.DECOMMISSIONED),
|
||||||
RMNodeEventType.EXPIRE,
|
RMNodeEventType.STATUS_UPDATE,
|
||||||
new DeactivateNodeTransition(NodeState.LOST))
|
new StatusUpdateWhenHealthyTransition())
|
||||||
.addTransition(NodeState.UNHEALTHY, NodeState.REBOOTED,
|
.addTransition(NodeState.DECOMMISSIONING, NodeState.DECOMMISSIONING,
|
||||||
RMNodeEventType.REBOOTING,
|
RMNodeEventType.GRACEFUL_DECOMMISSION,
|
||||||
new DeactivateNodeTransition(NodeState.REBOOTED))
|
new DecommissioningNodeTransition(NodeState.DECOMMISSIONING,
|
||||||
.addTransition(NodeState.UNHEALTHY, NodeState.UNHEALTHY,
|
NodeState.DECOMMISSIONING))
|
||||||
RMNodeEventType.RECONNECTED, new ReconnectNodeTransition())
|
.addTransition(NodeState.DECOMMISSIONING, NodeState.LOST,
|
||||||
.addTransition(NodeState.UNHEALTHY, NodeState.UNHEALTHY,
|
RMNodeEventType.EXPIRE,
|
||||||
RMNodeEventType.CLEANUP_APP, new CleanUpAppTransition())
|
new DeactivateNodeTransition(NodeState.LOST))
|
||||||
.addTransition(NodeState.UNHEALTHY, NodeState.UNHEALTHY,
|
.addTransition(NodeState.DECOMMISSIONING, NodeState.REBOOTED,
|
||||||
RMNodeEventType.CLEANUP_CONTAINER, new CleanUpContainerTransition())
|
RMNodeEventType.REBOOTING,
|
||||||
.addTransition(NodeState.UNHEALTHY, NodeState.UNHEALTHY,
|
new DeactivateNodeTransition(NodeState.REBOOTED))
|
||||||
RMNodeEventType.RESOURCE_UPDATE, new UpdateNodeResourceWhenUnusableTransition())
|
|
||||||
.addTransition(NodeState.UNHEALTHY, NodeState.UNHEALTHY,
|
|
||||||
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
|
||||||
new AddContainersToBeRemovedFromNMTransition())
|
|
||||||
.addTransition(NodeState.UNHEALTHY, NodeState.SHUTDOWN,
|
|
||||||
RMNodeEventType.SHUTDOWN,
|
|
||||||
new DeactivateNodeTransition(NodeState.SHUTDOWN))
|
|
||||||
|
|
||||||
//Transitions from SHUTDOWN state
|
.addTransition(NodeState.DECOMMISSIONING, NodeState.DECOMMISSIONING,
|
||||||
.addTransition(NodeState.SHUTDOWN, NodeState.SHUTDOWN,
|
RMNodeEventType.CLEANUP_APP, new CleanUpAppTransition())
|
||||||
RMNodeEventType.RESOURCE_UPDATE,
|
|
||||||
new UpdateNodeResourceWhenUnusableTransition())
|
|
||||||
.addTransition(NodeState.SHUTDOWN, NodeState.SHUTDOWN,
|
|
||||||
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
|
||||||
new AddContainersToBeRemovedFromNMTransition())
|
|
||||||
|
|
||||||
// create the topology tables
|
// TODO (in YARN-3223) update resource when container finished.
|
||||||
.installTopology();
|
.addTransition(NodeState.DECOMMISSIONING, NodeState.DECOMMISSIONING,
|
||||||
|
RMNodeEventType.CLEANUP_CONTAINER, new CleanUpContainerTransition())
|
||||||
|
// TODO (in YARN-3223) update resource when container finished.
|
||||||
|
.addTransition(NodeState.DECOMMISSIONING, NodeState.DECOMMISSIONING,
|
||||||
|
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
||||||
|
new AddContainersToBeRemovedFromNMTransition())
|
||||||
|
.addTransition(NodeState.DECOMMISSIONING, EnumSet.of(
|
||||||
|
NodeState.DECOMMISSIONING, NodeState.DECOMMISSIONED),
|
||||||
|
RMNodeEventType.RECONNECTED, new ReconnectNodeTransition())
|
||||||
|
.addTransition(NodeState.DECOMMISSIONING, NodeState.DECOMMISSIONING,
|
||||||
|
RMNodeEventType.RESOURCE_UPDATE,
|
||||||
|
new UpdateNodeResourceWhenRunningTransition())
|
||||||
|
|
||||||
|
//Transitions from LOST state
|
||||||
|
.addTransition(NodeState.LOST, NodeState.LOST,
|
||||||
|
RMNodeEventType.RESOURCE_UPDATE,
|
||||||
|
new UpdateNodeResourceWhenUnusableTransition())
|
||||||
|
.addTransition(NodeState.LOST, NodeState.LOST,
|
||||||
|
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
||||||
|
new AddContainersToBeRemovedFromNMTransition())
|
||||||
|
|
||||||
|
//Transitions from UNHEALTHY state
|
||||||
|
.addTransition(NodeState.UNHEALTHY,
|
||||||
|
EnumSet.of(NodeState.UNHEALTHY, NodeState.RUNNING),
|
||||||
|
RMNodeEventType.STATUS_UPDATE,
|
||||||
|
new StatusUpdateWhenUnHealthyTransition())
|
||||||
|
.addTransition(NodeState.UNHEALTHY, NodeState.DECOMMISSIONED,
|
||||||
|
RMNodeEventType.DECOMMISSION,
|
||||||
|
new DeactivateNodeTransition(NodeState.DECOMMISSIONED))
|
||||||
|
.addTransition(NodeState.UNHEALTHY, NodeState.DECOMMISSIONING,
|
||||||
|
RMNodeEventType.GRACEFUL_DECOMMISSION,
|
||||||
|
new DecommissioningNodeTransition(NodeState.UNHEALTHY,
|
||||||
|
NodeState.DECOMMISSIONING))
|
||||||
|
.addTransition(NodeState.UNHEALTHY, NodeState.LOST,
|
||||||
|
RMNodeEventType.EXPIRE,
|
||||||
|
new DeactivateNodeTransition(NodeState.LOST))
|
||||||
|
.addTransition(NodeState.UNHEALTHY, NodeState.REBOOTED,
|
||||||
|
RMNodeEventType.REBOOTING,
|
||||||
|
new DeactivateNodeTransition(NodeState.REBOOTED))
|
||||||
|
.addTransition(NodeState.UNHEALTHY, EnumSet.of(NodeState.UNHEALTHY),
|
||||||
|
RMNodeEventType.RECONNECTED, new ReconnectNodeTransition())
|
||||||
|
.addTransition(NodeState.UNHEALTHY, NodeState.UNHEALTHY,
|
||||||
|
RMNodeEventType.CLEANUP_APP, new CleanUpAppTransition())
|
||||||
|
.addTransition(NodeState.UNHEALTHY, NodeState.UNHEALTHY,
|
||||||
|
RMNodeEventType.CLEANUP_CONTAINER, new CleanUpContainerTransition())
|
||||||
|
.addTransition(NodeState.UNHEALTHY, NodeState.UNHEALTHY,
|
||||||
|
RMNodeEventType.RESOURCE_UPDATE,
|
||||||
|
new UpdateNodeResourceWhenUnusableTransition())
|
||||||
|
.addTransition(NodeState.UNHEALTHY, NodeState.UNHEALTHY,
|
||||||
|
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
||||||
|
new AddContainersToBeRemovedFromNMTransition())
|
||||||
|
.addTransition(NodeState.UNHEALTHY, NodeState.SHUTDOWN,
|
||||||
|
RMNodeEventType.SHUTDOWN,
|
||||||
|
new DeactivateNodeTransition(NodeState.SHUTDOWN))
|
||||||
|
|
||||||
|
//Transitions from SHUTDOWN state
|
||||||
|
.addTransition(NodeState.SHUTDOWN, NodeState.SHUTDOWN,
|
||||||
|
RMNodeEventType.RESOURCE_UPDATE,
|
||||||
|
new UpdateNodeResourceWhenUnusableTransition())
|
||||||
|
.addTransition(NodeState.SHUTDOWN, NodeState.SHUTDOWN,
|
||||||
|
RMNodeEventType.FINISHED_CONTAINERS_PULLED_BY_AM,
|
||||||
|
new AddContainersToBeRemovedFromNMTransition())
|
||||||
|
|
||||||
|
// create the topology tables
|
||||||
|
.installTopology();
|
||||||
|
|
||||||
private final StateMachine<NodeState, RMNodeEventType,
|
private final StateMachine<NodeState, RMNodeEventType,
|
||||||
RMNodeEvent> stateMachine;
|
RMNodeEvent> stateMachine;
|
||||||
|
@ -265,7 +314,7 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
this.writeLock = lock.writeLock();
|
this.writeLock = lock.writeLock();
|
||||||
|
|
||||||
this.stateMachine = stateMachineFactory.make(this);
|
this.stateMachine = stateMachineFactory.make(this);
|
||||||
|
|
||||||
this.nodeUpdateQueue = new ConcurrentLinkedQueue<UpdatedContainerInfo>();
|
this.nodeUpdateQueue = new ConcurrentLinkedQueue<UpdatedContainerInfo>();
|
||||||
|
|
||||||
this.containerAllocationExpirer = context.getContainerAllocationExpirer();
|
this.containerAllocationExpirer = context.getContainerAllocationExpirer();
|
||||||
|
@ -291,6 +340,11 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
return httpPort;
|
return httpPort;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test only
|
||||||
|
public void setHttpPort(int port) {
|
||||||
|
this.httpPort = port;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public NodeId getNodeID() {
|
public NodeId getNodeID() {
|
||||||
return this.nodeId;
|
return this.nodeId;
|
||||||
|
@ -497,23 +551,35 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
metrics.decrNumShutdownNMs();
|
metrics.decrNumShutdownNMs();
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
LOG.debug("Unexpected previous node state");
|
LOG.debug("Unexpected previous node state");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Treats nodes in decommissioning as active nodes
|
||||||
|
// TODO we may want to differentiate active nodes and decommissioning node in
|
||||||
|
// metrics later.
|
||||||
|
private void updateMetricsForGracefulDecommissionOnUnhealthyNode() {
|
||||||
|
ClusterMetrics metrics = ClusterMetrics.getMetrics();
|
||||||
|
metrics.incrNumActiveNodes();
|
||||||
|
metrics.decrNumUnhealthyNMs();
|
||||||
|
}
|
||||||
|
|
||||||
private void updateMetricsForDeactivatedNode(NodeState initialState,
|
private void updateMetricsForDeactivatedNode(NodeState initialState,
|
||||||
NodeState finalState) {
|
NodeState finalState) {
|
||||||
ClusterMetrics metrics = ClusterMetrics.getMetrics();
|
ClusterMetrics metrics = ClusterMetrics.getMetrics();
|
||||||
|
|
||||||
switch (initialState) {
|
switch (initialState) {
|
||||||
case RUNNING:
|
case RUNNING:
|
||||||
metrics.decrNumActiveNodes();
|
metrics.decrNumActiveNodes();
|
||||||
break;
|
break;
|
||||||
case UNHEALTHY:
|
case DECOMMISSIONING:
|
||||||
metrics.decrNumUnhealthyNMs();
|
metrics.decrNumActiveNodes();
|
||||||
break;
|
break;
|
||||||
default:
|
case UNHEALTHY:
|
||||||
LOG.debug("Unexpected inital state");
|
metrics.decrNumUnhealthyNMs();
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
LOG.debug("Unexpected inital state");
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (finalState) {
|
switch (finalState) {
|
||||||
|
@ -608,10 +674,10 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class ReconnectNodeTransition implements
|
public static class ReconnectNodeTransition implements
|
||||||
SingleArcTransition<RMNodeImpl, RMNodeEvent> {
|
MultipleArcTransition<RMNodeImpl, RMNodeEvent, NodeState> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
|
public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) {
|
||||||
RMNodeReconnectEvent reconnectEvent = (RMNodeReconnectEvent) event;
|
RMNodeReconnectEvent reconnectEvent = (RMNodeReconnectEvent) event;
|
||||||
RMNode newNode = reconnectEvent.getReconnectedNode();
|
RMNode newNode = reconnectEvent.getReconnectedNode();
|
||||||
rmNode.nodeManagerVersion = newNode.getNodeManagerVersion();
|
rmNode.nodeManagerVersion = newNode.getNodeManagerVersion();
|
||||||
|
@ -622,6 +688,12 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
// No application running on the node, so send node-removal event with
|
// No application running on the node, so send node-removal event with
|
||||||
// cleaning up old container info.
|
// cleaning up old container info.
|
||||||
if (noRunningApps) {
|
if (noRunningApps) {
|
||||||
|
if (rmNode.getState() == NodeState.DECOMMISSIONING) {
|
||||||
|
// When node in decommissioning, and no running apps on this node,
|
||||||
|
// it will return as decommissioned state.
|
||||||
|
deactivateNode(rmNode, NodeState.DECOMMISSIONED);
|
||||||
|
return NodeState.DECOMMISSIONED;
|
||||||
|
}
|
||||||
rmNode.nodeUpdateQueue.clear();
|
rmNode.nodeUpdateQueue.clear();
|
||||||
rmNode.context.getDispatcher().getEventHandler().handle(
|
rmNode.context.getDispatcher().getEventHandler().handle(
|
||||||
new NodeRemovedSchedulerEvent(rmNode));
|
new NodeRemovedSchedulerEvent(rmNode));
|
||||||
|
@ -652,6 +724,7 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
rmNode.context.getDispatcher().getEventHandler().handle(
|
rmNode.context.getDispatcher().getEventHandler().handle(
|
||||||
new RMNodeStartedEvent(newNode.getNodeID(), null, null));
|
new RMNodeStartedEvent(newNode.getNodeID(), null, null));
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
rmNode.httpPort = newNode.getHttpPort();
|
rmNode.httpPort = newNode.getHttpPort();
|
||||||
rmNode.httpAddress = newNode.getHttpAddress();
|
rmNode.httpAddress = newNode.getHttpAddress();
|
||||||
|
@ -678,17 +751,21 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
new NodeResourceUpdateSchedulerEvent(rmNode, ResourceOption
|
new NodeResourceUpdateSchedulerEvent(rmNode, ResourceOption
|
||||||
.newInstance(newNode.getTotalCapability(), -1)));
|
.newInstance(newNode.getTotalCapability(), -1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
return rmNode.getState();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void handleNMContainerStatus(
|
private void handleNMContainerStatus(
|
||||||
List<NMContainerStatus> nmContainerStatuses, RMNodeImpl rmnode) {
|
List<NMContainerStatus> nmContainerStatuses, RMNodeImpl rmnode) {
|
||||||
List<ContainerStatus> containerStatuses =
|
if (nmContainerStatuses != null) {
|
||||||
new ArrayList<ContainerStatus>();
|
List<ContainerStatus> containerStatuses =
|
||||||
for (NMContainerStatus nmContainerStatus : nmContainerStatuses) {
|
new ArrayList<ContainerStatus>();
|
||||||
containerStatuses.add(createContainerStatus(nmContainerStatus));
|
for (NMContainerStatus nmContainerStatus : nmContainerStatuses) {
|
||||||
|
containerStatuses.add(createContainerStatus(nmContainerStatus));
|
||||||
|
}
|
||||||
|
rmnode.handleContainerStatus(containerStatuses);
|
||||||
}
|
}
|
||||||
rmnode.handleContainerStatus(containerStatuses);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private ContainerStatus createContainerStatus(
|
private ContainerStatus createContainerStatus(
|
||||||
|
@ -770,31 +847,94 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
|
public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
|
||||||
// Inform the scheduler
|
RMNodeImpl.deactivateNode(rmNode, finalState);
|
||||||
rmNode.nodeUpdateQueue.clear();
|
|
||||||
// If the current state is NodeState.UNHEALTHY
|
|
||||||
// Then node is already been removed from the
|
|
||||||
// Scheduler
|
|
||||||
NodeState initialState = rmNode.getState();
|
|
||||||
if (!initialState.equals(NodeState.UNHEALTHY)) {
|
|
||||||
rmNode.context.getDispatcher().getEventHandler()
|
|
||||||
.handle(new NodeRemovedSchedulerEvent(rmNode));
|
|
||||||
}
|
|
||||||
rmNode.context.getDispatcher().getEventHandler().handle(
|
|
||||||
new NodesListManagerEvent(
|
|
||||||
NodesListManagerEventType.NODE_UNUSABLE, rmNode));
|
|
||||||
|
|
||||||
// Deactivate the node
|
|
||||||
rmNode.context.getRMNodes().remove(rmNode.nodeId);
|
|
||||||
LOG.info("Deactivating Node " + rmNode.nodeId + " as it is now "
|
|
||||||
+ finalState);
|
|
||||||
rmNode.context.getInactiveRMNodes().put(rmNode.nodeId, rmNode);
|
|
||||||
|
|
||||||
//Update the metrics
|
|
||||||
rmNode.updateMetricsForDeactivatedNode(initialState, finalState);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Put a node in deactivated (decommissioned) status.
|
||||||
|
* @param rmNode
|
||||||
|
* @param finalState
|
||||||
|
*/
|
||||||
|
public static void deactivateNode(RMNodeImpl rmNode, NodeState finalState) {
|
||||||
|
|
||||||
|
reportNodeUnusable(rmNode, finalState);
|
||||||
|
|
||||||
|
// Deactivate the node
|
||||||
|
rmNode.context.getRMNodes().remove(rmNode.nodeId);
|
||||||
|
LOG.info("Deactivating Node " + rmNode.nodeId + " as it is now "
|
||||||
|
+ finalState);
|
||||||
|
rmNode.context.getInactiveRMNodes().put(rmNode.nodeId, rmNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Report node is UNUSABLE and update metrics.
|
||||||
|
* @param rmNode
|
||||||
|
* @param finalState
|
||||||
|
*/
|
||||||
|
public static void reportNodeUnusable(RMNodeImpl rmNode,
|
||||||
|
NodeState finalState) {
|
||||||
|
// Inform the scheduler
|
||||||
|
rmNode.nodeUpdateQueue.clear();
|
||||||
|
// If the current state is NodeState.UNHEALTHY
|
||||||
|
// Then node is already been removed from the
|
||||||
|
// Scheduler
|
||||||
|
NodeState initialState = rmNode.getState();
|
||||||
|
if (!initialState.equals(NodeState.UNHEALTHY)) {
|
||||||
|
rmNode.context.getDispatcher().getEventHandler()
|
||||||
|
.handle(new NodeRemovedSchedulerEvent(rmNode));
|
||||||
|
}
|
||||||
|
rmNode.context.getDispatcher().getEventHandler().handle(
|
||||||
|
new NodesListManagerEvent(
|
||||||
|
NodesListManagerEventType.NODE_UNUSABLE, rmNode));
|
||||||
|
|
||||||
|
//Update the metrics
|
||||||
|
rmNode.updateMetricsForDeactivatedNode(initialState, finalState);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The transition to put node in decommissioning state.
|
||||||
|
*/
|
||||||
|
public static class DecommissioningNodeTransition
|
||||||
|
implements SingleArcTransition<RMNodeImpl, RMNodeEvent> {
|
||||||
|
private final NodeState initState;
|
||||||
|
private final NodeState finalState;
|
||||||
|
|
||||||
|
public DecommissioningNodeTransition(NodeState initState,
|
||||||
|
NodeState finalState) {
|
||||||
|
this.initState = initState;
|
||||||
|
this.finalState = finalState;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
|
||||||
|
LOG.info("Put Node " + rmNode.nodeId + " in DECOMMISSIONING.");
|
||||||
|
if (initState.equals(NodeState.UNHEALTHY)) {
|
||||||
|
rmNode.updateMetricsForGracefulDecommissionOnUnhealthyNode();
|
||||||
|
}
|
||||||
|
// TODO (in YARN-3223) Keep NM's available resource to be 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class RecommissionNodeTransition
|
||||||
|
implements SingleArcTransition<RMNodeImpl, RMNodeEvent> {
|
||||||
|
|
||||||
|
private final NodeState finalState;
|
||||||
|
public RecommissionNodeTransition(NodeState finalState) {
|
||||||
|
this.finalState = finalState;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
|
||||||
|
LOG.info("Node " + rmNode.nodeId + " in DECOMMISSIONING is " +
|
||||||
|
"recommissioned back to RUNNING.");
|
||||||
|
// TODO handle NM resource resume in YARN-3223.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Status update transition when node is healthy.
|
||||||
|
*/
|
||||||
public static class StatusUpdateWhenHealthyTransition implements
|
public static class StatusUpdateWhenHealthyTransition implements
|
||||||
MultipleArcTransition<RMNodeImpl, RMNodeEvent, NodeState> {
|
MultipleArcTransition<RMNodeImpl, RMNodeEvent, NodeState> {
|
||||||
@Override
|
@Override
|
||||||
|
@ -805,25 +945,44 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
// Switch the last heartbeatresponse.
|
// Switch the last heartbeatresponse.
|
||||||
rmNode.latestNodeHeartBeatResponse = statusEvent.getLatestResponse();
|
rmNode.latestNodeHeartBeatResponse = statusEvent.getLatestResponse();
|
||||||
|
|
||||||
NodeHealthStatus remoteNodeHealthStatus =
|
NodeHealthStatus remoteNodeHealthStatus =
|
||||||
statusEvent.getNodeHealthStatus();
|
statusEvent.getNodeHealthStatus();
|
||||||
rmNode.setHealthReport(remoteNodeHealthStatus.getHealthReport());
|
rmNode.setHealthReport(remoteNodeHealthStatus.getHealthReport());
|
||||||
rmNode.setLastHealthReportTime(
|
rmNode.setLastHealthReportTime(
|
||||||
remoteNodeHealthStatus.getLastHealthReportTime());
|
remoteNodeHealthStatus.getLastHealthReportTime());
|
||||||
|
NodeState initialState = rmNode.getState();
|
||||||
|
boolean isNodeDecommissioning =
|
||||||
|
initialState.equals(NodeState.DECOMMISSIONING);
|
||||||
if (!remoteNodeHealthStatus.getIsNodeHealthy()) {
|
if (!remoteNodeHealthStatus.getIsNodeHealthy()) {
|
||||||
LOG.info("Node " + rmNode.nodeId + " reported UNHEALTHY with details: "
|
LOG.info("Node " + rmNode.nodeId +
|
||||||
+ remoteNodeHealthStatus.getHealthReport());
|
" reported UNHEALTHY with details: " +
|
||||||
rmNode.nodeUpdateQueue.clear();
|
remoteNodeHealthStatus.getHealthReport());
|
||||||
// Inform the scheduler
|
// if a node in decommissioning receives an unhealthy report,
|
||||||
rmNode.context.getDispatcher().getEventHandler().handle(
|
// it will keep decommissioning.
|
||||||
new NodeRemovedSchedulerEvent(rmNode));
|
if (isNodeDecommissioning) {
|
||||||
rmNode.context.getDispatcher().getEventHandler().handle(
|
return NodeState.DECOMMISSIONING;
|
||||||
new NodesListManagerEvent(
|
} else {
|
||||||
NodesListManagerEventType.NODE_UNUSABLE, rmNode));
|
reportNodeUnusable(rmNode, NodeState.UNHEALTHY);
|
||||||
// Update metrics
|
return NodeState.UNHEALTHY;
|
||||||
rmNode.updateMetricsForDeactivatedNode(rmNode.getState(),
|
}
|
||||||
NodeState.UNHEALTHY);
|
}
|
||||||
return NodeState.UNHEALTHY;
|
if (isNodeDecommissioning) {
|
||||||
|
List<ApplicationId> runningApps = rmNode.getRunningApps();
|
||||||
|
|
||||||
|
List<ApplicationId> keepAliveApps = statusEvent.getKeepAliveAppIds();
|
||||||
|
|
||||||
|
// no running (and keeping alive) app on this node, get it
|
||||||
|
// decommissioned.
|
||||||
|
// TODO may need to check no container is being scheduled on this node
|
||||||
|
// as well.
|
||||||
|
if ((runningApps == null || runningApps.size() == 0)
|
||||||
|
&& (keepAliveApps == null || keepAliveApps.size() == 0)) {
|
||||||
|
RMNodeImpl.deactivateNode(rmNode, NodeState.DECOMMISSIONED);
|
||||||
|
return NodeState.DECOMMISSIONED;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO (in YARN-3223) if node in decommissioning, get node resource
|
||||||
|
// updated if container get finished (keep available resource to be 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
rmNode.handleContainerStatus(statusEvent.getContainers());
|
rmNode.handleContainerStatus(statusEvent.getContainers());
|
||||||
|
@ -848,7 +1007,7 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
statusEvent.getKeepAliveAppIds());
|
statusEvent.getKeepAliveAppIds());
|
||||||
}
|
}
|
||||||
|
|
||||||
return NodeState.RUNNING;
|
return initialState;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -857,11 +1016,12 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) {
|
public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) {
|
||||||
RMNodeStatusEvent statusEvent = (RMNodeStatusEvent) event;
|
RMNodeStatusEvent statusEvent = (RMNodeStatusEvent)event;
|
||||||
|
|
||||||
// Switch the last heartbeatresponse.
|
// Switch the last heartbeatresponse.
|
||||||
rmNode.latestNodeHeartBeatResponse = statusEvent.getLatestResponse();
|
rmNode.latestNodeHeartBeatResponse = statusEvent.getLatestResponse();
|
||||||
NodeHealthStatus remoteNodeHealthStatus = statusEvent.getNodeHealthStatus();
|
NodeHealthStatus remoteNodeHealthStatus =
|
||||||
|
statusEvent.getNodeHealthStatus();
|
||||||
rmNode.setHealthReport(remoteNodeHealthStatus.getHealthReport());
|
rmNode.setHealthReport(remoteNodeHealthStatus.getHealthReport());
|
||||||
rmNode.setLastHealthReportTime(
|
rmNode.setLastHealthReportTime(
|
||||||
remoteNodeHealthStatus.getLastHealthReportTime());
|
remoteNodeHealthStatus.getLastHealthReportTime());
|
||||||
|
|
|
@ -29,7 +29,9 @@ import static org.mockito.Mockito.when;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
import org.apache.hadoop.net.Node;
|
||||||
import org.apache.hadoop.util.HostsFileReader;
|
import org.apache.hadoop.util.HostsFileReader;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
@ -75,7 +77,7 @@ import org.mockito.stubbing.Answer;
|
||||||
public class TestRMNodeTransitions {
|
public class TestRMNodeTransitions {
|
||||||
|
|
||||||
RMNodeImpl node;
|
RMNodeImpl node;
|
||||||
|
|
||||||
private RMContext rmContext;
|
private RMContext rmContext;
|
||||||
private YarnScheduler scheduler;
|
private YarnScheduler scheduler;
|
||||||
|
|
||||||
|
@ -168,6 +170,42 @@ public class TestRMNodeTransitions {
|
||||||
return event;
|
return event;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private RMNodeStatusEvent getMockRMNodeStatusEventWithRunningApps() {
|
||||||
|
NodeHeartbeatResponse response = mock(NodeHeartbeatResponse.class);
|
||||||
|
|
||||||
|
NodeHealthStatus healthStatus = mock(NodeHealthStatus.class);
|
||||||
|
Boolean yes = new Boolean(true);
|
||||||
|
doReturn(yes).when(healthStatus).getIsNodeHealthy();
|
||||||
|
|
||||||
|
RMNodeStatusEvent event = mock(RMNodeStatusEvent.class);
|
||||||
|
doReturn(healthStatus).when(event).getNodeHealthStatus();
|
||||||
|
doReturn(response).when(event).getLatestResponse();
|
||||||
|
doReturn(RMNodeEventType.STATUS_UPDATE).when(event).getType();
|
||||||
|
doReturn(getAppIdList()).when(event).getKeepAliveAppIds();
|
||||||
|
return event;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<ApplicationId> getAppIdList() {
|
||||||
|
List<ApplicationId> appIdList = new ArrayList<ApplicationId>();
|
||||||
|
appIdList.add(BuilderUtils.newApplicationId(0, 0));
|
||||||
|
return appIdList;
|
||||||
|
}
|
||||||
|
|
||||||
|
private RMNodeStatusEvent getMockRMNodeStatusEventWithoutRunningApps() {
|
||||||
|
NodeHeartbeatResponse response = mock(NodeHeartbeatResponse.class);
|
||||||
|
|
||||||
|
NodeHealthStatus healthStatus = mock(NodeHealthStatus.class);
|
||||||
|
Boolean yes = new Boolean(true);
|
||||||
|
doReturn(yes).when(healthStatus).getIsNodeHealthy();
|
||||||
|
|
||||||
|
RMNodeStatusEvent event = mock(RMNodeStatusEvent.class);
|
||||||
|
doReturn(healthStatus).when(event).getNodeHealthStatus();
|
||||||
|
doReturn(response).when(event).getLatestResponse();
|
||||||
|
doReturn(RMNodeEventType.STATUS_UPDATE).when(event).getType();
|
||||||
|
doReturn(null).when(event).getKeepAliveAppIds();
|
||||||
|
return event;
|
||||||
|
}
|
||||||
|
|
||||||
@Test (timeout = 5000)
|
@Test (timeout = 5000)
|
||||||
public void testExpiredContainer() {
|
public void testExpiredContainer() {
|
||||||
// Start the node
|
// Start the node
|
||||||
|
@ -195,7 +233,33 @@ public class TestRMNodeTransitions {
|
||||||
*/
|
*/
|
||||||
verify(scheduler,times(2)).handle(any(NodeUpdateSchedulerEvent.class));
|
verify(scheduler,times(2)).handle(any(NodeUpdateSchedulerEvent.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testStatusUpdateOnDecommissioningNode(){
|
||||||
|
RMNodeImpl node = getDecommissioningNode();
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONING, node.getState());
|
||||||
|
// Verify node in DECOMMISSIONING won't be changed by status update
|
||||||
|
// with running apps
|
||||||
|
RMNodeStatusEvent statusEvent = getMockRMNodeStatusEventWithRunningApps();
|
||||||
|
node.handle(statusEvent);
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONING, node.getState());
|
||||||
|
|
||||||
|
// Verify node in DECOMMISSIONING will be changed by status update
|
||||||
|
// without running apps
|
||||||
|
statusEvent = getMockRMNodeStatusEventWithoutRunningApps();
|
||||||
|
node.handle(statusEvent);
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRecommissionNode(){
|
||||||
|
RMNodeImpl node = getDecommissioningNode();
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONING, node.getState());
|
||||||
|
node.handle(new RMNodeEvent(node.getNodeID(),
|
||||||
|
RMNodeEventType.RECOMMISSION));
|
||||||
|
Assert.assertEquals(NodeState.RUNNING, node.getState());
|
||||||
|
}
|
||||||
|
|
||||||
@Test (timeout = 5000)
|
@Test (timeout = 5000)
|
||||||
public void testContainerUpdate() throws InterruptedException{
|
public void testContainerUpdate() throws InterruptedException{
|
||||||
//Start the node
|
//Start the node
|
||||||
|
@ -253,9 +317,9 @@ public class TestRMNodeTransitions {
|
||||||
Assert.assertEquals(completedContainerIdFromNode2_1,completedContainers.get(0)
|
Assert.assertEquals(completedContainerIdFromNode2_1,completedContainers.get(0)
|
||||||
.getContainerId());
|
.getContainerId());
|
||||||
Assert.assertEquals(completedContainerIdFromNode2_2,completedContainers.get(1)
|
Assert.assertEquals(completedContainerIdFromNode2_2,completedContainers.get(1)
|
||||||
.getContainerId());
|
.getContainerId());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test (timeout = 5000)
|
@Test (timeout = 5000)
|
||||||
public void testStatusChange(){
|
public void testStatusChange(){
|
||||||
//Start the node
|
//Start the node
|
||||||
|
@ -292,7 +356,7 @@ public class TestRMNodeTransitions {
|
||||||
node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.EXPIRE));
|
node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.EXPIRE));
|
||||||
Assert.assertEquals(0, node.getQueueSize());
|
Assert.assertEquals(0, node.getQueueSize());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRunningExpire() {
|
public void testRunningExpire() {
|
||||||
RMNodeImpl node = getRunningNode();
|
RMNodeImpl node = getRunningNode();
|
||||||
|
@ -375,7 +439,7 @@ public class TestRMNodeTransitions {
|
||||||
initialRebooted, cm.getNumRebootedNMs());
|
initialRebooted, cm.getNumRebootedNMs());
|
||||||
Assert.assertEquals(NodeState.LOST, node.getState());
|
Assert.assertEquals(NodeState.LOST, node.getState());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testUnhealthyExpireForSchedulerRemove() {
|
public void testUnhealthyExpireForSchedulerRemove() {
|
||||||
RMNodeImpl node = getUnhealthyNode();
|
RMNodeImpl node = getUnhealthyNode();
|
||||||
|
@ -407,6 +471,28 @@ public class TestRMNodeTransitions {
|
||||||
Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState());
|
Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDecommissionOnDecommissioningNode() {
|
||||||
|
RMNodeImpl node = getDecommissioningNode();
|
||||||
|
ClusterMetrics cm = ClusterMetrics.getMetrics();
|
||||||
|
int initialActive = cm.getNumActiveNMs();
|
||||||
|
int initialLost = cm.getNumLostNMs();
|
||||||
|
int initialUnhealthy = cm.getUnhealthyNMs();
|
||||||
|
int initialDecommissioned = cm.getNumDecommisionedNMs();
|
||||||
|
int initialRebooted = cm.getNumRebootedNMs();
|
||||||
|
node.handle(new RMNodeEvent(node.getNodeID(),
|
||||||
|
RMNodeEventType.DECOMMISSION));
|
||||||
|
Assert.assertEquals("Active Nodes", initialActive - 1, cm.getNumActiveNMs());
|
||||||
|
Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs());
|
||||||
|
Assert.assertEquals("Unhealthy Nodes",
|
||||||
|
initialUnhealthy, cm.getUnhealthyNMs());
|
||||||
|
Assert.assertEquals("Decommissioned Nodes",
|
||||||
|
initialDecommissioned + 1, cm.getNumDecommisionedNMs());
|
||||||
|
Assert.assertEquals("Rebooted Nodes",
|
||||||
|
initialRebooted, cm.getNumRebootedNMs());
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testUnhealthyDecommission() {
|
public void testUnhealthyDecommission() {
|
||||||
RMNodeImpl node = getUnhealthyNode();
|
RMNodeImpl node = getUnhealthyNode();
|
||||||
|
@ -429,6 +515,30 @@ public class TestRMNodeTransitions {
|
||||||
Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState());
|
Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test Decommissioning on a unhealthy node will make it decommissioning.
|
||||||
|
@Test
|
||||||
|
public void testUnhealthyDecommissioning() {
|
||||||
|
RMNodeImpl node = getUnhealthyNode();
|
||||||
|
ClusterMetrics cm = ClusterMetrics.getMetrics();
|
||||||
|
int initialActive = cm.getNumActiveNMs();
|
||||||
|
int initialLost = cm.getNumLostNMs();
|
||||||
|
int initialUnhealthy = cm.getUnhealthyNMs();
|
||||||
|
int initialDecommissioned = cm.getNumDecommisionedNMs();
|
||||||
|
int initialRebooted = cm.getNumRebootedNMs();
|
||||||
|
node.handle(new RMNodeEvent(node.getNodeID(),
|
||||||
|
RMNodeEventType.GRACEFUL_DECOMMISSION));
|
||||||
|
Assert.assertEquals("Active Nodes", initialActive + 1,
|
||||||
|
cm.getNumActiveNMs());
|
||||||
|
Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs());
|
||||||
|
Assert.assertEquals("Unhealthy Nodes",
|
||||||
|
initialUnhealthy - 1, cm.getUnhealthyNMs());
|
||||||
|
Assert.assertEquals("Decommissioned Nodes", initialDecommissioned,
|
||||||
|
cm.getNumDecommisionedNMs());
|
||||||
|
Assert.assertEquals("Rebooted Nodes",
|
||||||
|
initialRebooted, cm.getNumRebootedNMs());
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONING, node.getState());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRunningRebooting() {
|
public void testRunningRebooting() {
|
||||||
RMNodeImpl node = getRunningNode();
|
RMNodeImpl node = getRunningNode();
|
||||||
|
@ -567,6 +677,14 @@ public class TestRMNodeTransitions {
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private RMNodeImpl getDecommissioningNode() {
|
||||||
|
RMNodeImpl node = getRunningNode();
|
||||||
|
node.handle(new RMNodeEvent(node.getNodeID(),
|
||||||
|
RMNodeEventType.GRACEFUL_DECOMMISSION));
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONING, node.getState());
|
||||||
|
return node;
|
||||||
|
}
|
||||||
|
|
||||||
private RMNodeImpl getUnhealthyNode() {
|
private RMNodeImpl getUnhealthyNode() {
|
||||||
RMNodeImpl node = getRunningNode();
|
RMNodeImpl node = getRunningNode();
|
||||||
NodeHealthStatus status = NodeHealthStatus.newInstance(false, "sick",
|
NodeHealthStatus status = NodeHealthStatus.newInstance(false, "sick",
|
||||||
|
@ -577,20 +695,19 @@ public class TestRMNodeTransitions {
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private RMNodeImpl getNewNode() {
|
private RMNodeImpl getNewNode() {
|
||||||
NodeId nodeId = BuilderUtils.newNodeId("localhost", 0);
|
NodeId nodeId = BuilderUtils.newNodeId("localhost", 0);
|
||||||
RMNodeImpl node = new RMNodeImpl(nodeId, rmContext, null, 0, 0, null, null, null);
|
RMNodeImpl node = new RMNodeImpl(nodeId, rmContext, null, 0, 0, null, null, null);
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
private RMNodeImpl getNewNode(Resource capability) {
|
private RMNodeImpl getNewNode(Resource capability) {
|
||||||
NodeId nodeId = BuilderUtils.newNodeId("localhost", 0);
|
NodeId nodeId = BuilderUtils.newNodeId("localhost", 0);
|
||||||
RMNodeImpl node = new RMNodeImpl(nodeId, rmContext, null, 0, 0, null,
|
RMNodeImpl node = new RMNodeImpl(nodeId, rmContext, null, 0, 0, null,
|
||||||
capability, null);
|
capability, null);
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
private RMNodeImpl getRebootedNode() {
|
private RMNodeImpl getRebootedNode() {
|
||||||
NodeId nodeId = BuilderUtils.newNodeId("localhost", 0);
|
NodeId nodeId = BuilderUtils.newNodeId("localhost", 0);
|
||||||
Resource capability = Resource.newInstance(4096, 4);
|
Resource capability = Resource.newInstance(4096, 4);
|
||||||
|
@ -650,7 +767,39 @@ public class TestRMNodeTransitions {
|
||||||
Assert.assertEquals(NodesListManagerEventType.NODE_USABLE,
|
Assert.assertEquals(NodesListManagerEventType.NODE_USABLE,
|
||||||
nodesListManagerEvent.getType());
|
nodesListManagerEvent.getType());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testReconnectOnDecommissioningNode() {
|
||||||
|
RMNodeImpl node = getDecommissioningNode();
|
||||||
|
|
||||||
|
// Reconnect event with running app
|
||||||
|
node.handle(new RMNodeReconnectEvent(node.getNodeID(), node,
|
||||||
|
getAppIdList(), null));
|
||||||
|
// still decommissioning
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONING, node.getState());
|
||||||
|
|
||||||
|
// Reconnect event without any running app
|
||||||
|
node.handle(new RMNodeReconnectEvent(node.getNodeID(), node, null, null));
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testReconnectWithNewPortOnDecommissioningNode() {
|
||||||
|
RMNodeImpl node = getDecommissioningNode();
|
||||||
|
Random r= new Random();
|
||||||
|
node.setHttpPort(r.nextInt(10000));
|
||||||
|
// Reconnect event with running app
|
||||||
|
node.handle(new RMNodeReconnectEvent(node.getNodeID(), node,
|
||||||
|
getAppIdList(), null));
|
||||||
|
// still decommissioning
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONING, node.getState());
|
||||||
|
|
||||||
|
node.setHttpPort(r.nextInt(10000));
|
||||||
|
// Reconnect event without any running app
|
||||||
|
node.handle(new RMNodeReconnectEvent(node.getNodeID(), node, null, null));
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testResourceUpdateOnRunningNode() {
|
public void testResourceUpdateOnRunningNode() {
|
||||||
RMNodeImpl node = getRunningNode();
|
RMNodeImpl node = getRunningNode();
|
||||||
|
@ -658,18 +807,23 @@ public class TestRMNodeTransitions {
|
||||||
assertEquals("Memory resource is not match.", oldCapacity.getMemory(), 4096);
|
assertEquals("Memory resource is not match.", oldCapacity.getMemory(), 4096);
|
||||||
assertEquals("CPU resource is not match.", oldCapacity.getVirtualCores(), 4);
|
assertEquals("CPU resource is not match.", oldCapacity.getVirtualCores(), 4);
|
||||||
node.handle(new RMNodeResourceUpdateEvent(node.getNodeID(),
|
node.handle(new RMNodeResourceUpdateEvent(node.getNodeID(),
|
||||||
ResourceOption.newInstance(Resource.newInstance(2048, 2),
|
ResourceOption.newInstance(Resource.newInstance(2048, 2),
|
||||||
ResourceOption.OVER_COMMIT_TIMEOUT_MILLIS_DEFAULT)));
|
ResourceOption.OVER_COMMIT_TIMEOUT_MILLIS_DEFAULT)));
|
||||||
Resource newCapacity = node.getTotalCapability();
|
Resource newCapacity = node.getTotalCapability();
|
||||||
assertEquals("Memory resource is not match.", newCapacity.getMemory(), 2048);
|
assertEquals("Memory resource is not match.", newCapacity.getMemory(), 2048);
|
||||||
assertEquals("CPU resource is not match.", newCapacity.getVirtualCores(), 2);
|
assertEquals("CPU resource is not match.", newCapacity.getVirtualCores(), 2);
|
||||||
|
|
||||||
Assert.assertEquals(NodeState.RUNNING, node.getState());
|
Assert.assertEquals(NodeState.RUNNING, node.getState());
|
||||||
Assert.assertNotNull(nodesListManagerEvent);
|
Assert.assertNotNull(nodesListManagerEvent);
|
||||||
Assert.assertEquals(NodesListManagerEventType.NODE_USABLE,
|
Assert.assertEquals(NodesListManagerEventType.NODE_USABLE,
|
||||||
nodesListManagerEvent.getType());
|
nodesListManagerEvent.getType());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDecommissioningOnRunningNode(){
|
||||||
|
getDecommissioningNode();
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testResourceUpdateOnNewNode() {
|
public void testResourceUpdateOnNewNode() {
|
||||||
RMNodeImpl node = getNewNode(Resource.newInstance(4096, 4));
|
RMNodeImpl node = getNewNode(Resource.newInstance(4096, 4));
|
||||||
|
@ -682,10 +836,10 @@ public class TestRMNodeTransitions {
|
||||||
Resource newCapacity = node.getTotalCapability();
|
Resource newCapacity = node.getTotalCapability();
|
||||||
assertEquals("Memory resource is not match.", newCapacity.getMemory(), 2048);
|
assertEquals("Memory resource is not match.", newCapacity.getMemory(), 2048);
|
||||||
assertEquals("CPU resource is not match.", newCapacity.getVirtualCores(), 2);
|
assertEquals("CPU resource is not match.", newCapacity.getVirtualCores(), 2);
|
||||||
|
|
||||||
Assert.assertEquals(NodeState.NEW, node.getState());
|
Assert.assertEquals(NodeState.NEW, node.getState());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testResourceUpdateOnRebootedNode() {
|
public void testResourceUpdateOnRebootedNode() {
|
||||||
RMNodeImpl node = getRebootedNode();
|
RMNodeImpl node = getRebootedNode();
|
||||||
|
@ -702,6 +856,18 @@ public class TestRMNodeTransitions {
|
||||||
Assert.assertEquals(NodeState.REBOOTED, node.getState());
|
Assert.assertEquals(NodeState.REBOOTED, node.getState());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test unhealthy report on a decommissioning node will make it
|
||||||
|
// keep decommissioning.
|
||||||
|
@Test
|
||||||
|
public void testDecommissioningUnhealthy() {
|
||||||
|
RMNodeImpl node = getDecommissioningNode();
|
||||||
|
NodeHealthStatus status = NodeHealthStatus.newInstance(false, "sick",
|
||||||
|
System.currentTimeMillis());
|
||||||
|
node.handle(new RMNodeStatusEvent(node.getNodeID(), status,
|
||||||
|
new ArrayList<ContainerStatus>(), null, null));
|
||||||
|
Assert.assertEquals(NodeState.DECOMMISSIONING, node.getState());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testReconnnectUpdate() {
|
public void testReconnnectUpdate() {
|
||||||
final String nmVersion1 = "nm version 1";
|
final String nmVersion1 = "nm version 1";
|
||||||
|
|
|
@ -43,8 +43,7 @@ public class TestNodesPage {
|
||||||
final int numberOfNodesPerRack = 8;
|
final int numberOfNodesPerRack = 8;
|
||||||
// The following is because of the way TestRMWebApp.mockRMContext creates
|
// The following is because of the way TestRMWebApp.mockRMContext creates
|
||||||
// nodes.
|
// nodes.
|
||||||
final int numberOfLostNodesPerRack = numberOfNodesPerRack
|
final int numberOfLostNodesPerRack = 1;
|
||||||
/ NodeState.values().length;
|
|
||||||
|
|
||||||
// Number of Actual Table Headers for NodesPage.NodesBlock might change in
|
// Number of Actual Table Headers for NodesPage.NodesBlock might change in
|
||||||
// future. In that case this value should be adjusted to the new value.
|
// future. In that case this value should be adjusted to the new value.
|
||||||
|
|
Loading…
Reference in New Issue