YARN-3445. Cache runningApps in RMNode for getting running apps on given NodeId. (Junping Du via mingma)

(cherry picked from commit 08244264c0)
This commit is contained in:
Ming Ma 2015-07-10 08:30:10 -07:00
parent 5e6bbe6031
commit b169889f01
7 changed files with 91 additions and 11 deletions

View File

@ -62,7 +62,8 @@ public class NodeInfo {
private NodeState state; private NodeState state;
private List<ContainerId> toCleanUpContainers; private List<ContainerId> toCleanUpContainers;
private List<ApplicationId> toCleanUpApplications; private List<ApplicationId> toCleanUpApplications;
private List<ApplicationId> runningApplications;
public FakeRMNodeImpl(NodeId nodeId, String nodeAddr, String httpAddress, public FakeRMNodeImpl(NodeId nodeId, String nodeAddr, String httpAddress,
Resource perNode, String rackName, String healthReport, Resource perNode, String rackName, String healthReport,
int cmdPort, String hostName, NodeState state) { int cmdPort, String hostName, NodeState state) {
@ -77,6 +78,7 @@ public class NodeInfo {
this.state = state; this.state = state;
toCleanUpApplications = new ArrayList<ApplicationId>(); toCleanUpApplications = new ArrayList<ApplicationId>();
toCleanUpContainers = new ArrayList<ContainerId>(); toCleanUpContainers = new ArrayList<ContainerId>();
runningApplications = new ArrayList<ApplicationId>();
} }
public NodeId getNodeID() { public NodeId getNodeID() {
@ -135,6 +137,10 @@ public class NodeInfo {
return toCleanUpApplications; return toCleanUpApplications;
} }
public List<ApplicationId> getRunningApps() {
return runningApplications;
}
public void updateNodeHeartbeatResponseForCleanup( public void updateNodeHeartbeatResponseForCleanup(
NodeHeartbeatResponse response) { NodeHeartbeatResponse response) {
} }

View File

@ -118,6 +118,11 @@ public class RMNodeWrapper implements RMNode {
return node.getAppsToCleanup(); return node.getAppsToCleanup();
} }
@Override
public List<ApplicationId> getRunningApps() {
return node.getRunningApps();
}
@Override @Override
public void updateNodeHeartbeatResponseForCleanup( public void updateNodeHeartbeatResponseForCleanup(
NodeHeartbeatResponse nodeHeartbeatResponse) { NodeHeartbeatResponse nodeHeartbeatResponse) {

View File

@ -1637,6 +1637,9 @@ Release 2.6.0 - 2014-11-18
YARN-2811. In Fair Scheduler, reservation fulfillments shouldn't ignore max YARN-2811. In Fair Scheduler, reservation fulfillments shouldn't ignore max
share (Siqi Li via Sandy Ryza) share (Siqi Li via Sandy Ryza)
YARN-3445. Cache runningApps in RMNode for getting running apps on given
NodeId. (Junping Du via mingma)
IMPROVEMENTS IMPROVEMENTS
YARN-2242. Improve exception information on AM launch crashes. (Li Lu YARN-2242. Improve exception information on AM launch crashes. (Li Lu

View File

@ -119,6 +119,8 @@ public interface RMNode {
public List<ApplicationId> getAppsToCleanup(); public List<ApplicationId> getAppsToCleanup();
List<ApplicationId> getRunningApps();
/** /**
* Update a {@link NodeHeartbeatResponse} with the list of containers and * Update a {@link NodeHeartbeatResponse} with the list of containers and
* applications to clean up for this node. * applications to clean up for this node.

View File

@ -123,11 +123,16 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
new HashSet<ContainerId>(); new HashSet<ContainerId>();
/* the list of applications that have finished and need to be purged */ /* the list of applications that have finished and need to be purged */
private final List<ApplicationId> finishedApplications = new ArrayList<ApplicationId>(); private final List<ApplicationId> finishedApplications =
new ArrayList<ApplicationId>();
/* the list of applications that are running on this node */
private final List<ApplicationId> runningApplications =
new ArrayList<ApplicationId>();
private NodeHeartbeatResponse latestNodeHeartBeatResponse = recordFactory private NodeHeartbeatResponse latestNodeHeartBeatResponse = recordFactory
.newRecordInstance(NodeHeartbeatResponse.class); .newRecordInstance(NodeHeartbeatResponse.class);
private static final StateMachineFactory<RMNodeImpl, private static final StateMachineFactory<RMNodeImpl,
NodeState, NodeState,
RMNodeEventType, RMNodeEventType,
@ -136,7 +141,7 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
NodeState, NodeState,
RMNodeEventType, RMNodeEventType,
RMNodeEvent>(NodeState.NEW) RMNodeEvent>(NodeState.NEW)
//Transitions from NEW state //Transitions from NEW state
.addTransition(NodeState.NEW, NodeState.RUNNING, .addTransition(NodeState.NEW, NodeState.RUNNING,
RMNodeEventType.STARTED, new AddNodeTransition()) RMNodeEventType.STARTED, new AddNodeTransition())
@ -382,6 +387,16 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
} }
@Override
public List<ApplicationId> getRunningApps() {
this.readLock.lock();
try {
return new ArrayList<ApplicationId>(this.runningApplications);
} finally {
this.readLock.unlock();
}
}
@Override @Override
public List<ContainerId> getContainersToCleanUp() { public List<ContainerId> getContainersToCleanUp() {
@ -519,9 +534,12 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
LOG.warn("Cannot get RMApp by appId=" + appId LOG.warn("Cannot get RMApp by appId=" + appId
+ ", just added it to finishedApplications list for cleanup"); + ", just added it to finishedApplications list for cleanup");
rmNode.finishedApplications.add(appId); rmNode.finishedApplications.add(appId);
rmNode.runningApplications.remove(appId);
return; return;
} }
// Add running applications back due to Node add or Node reconnection.
rmNode.runningApplications.add(appId);
context.getDispatcher().getEventHandler() context.getDispatcher().getEventHandler()
.handle(new RMAppRunningOnNodeEvent(appId, nodeId)); .handle(new RMAppRunningOnNodeEvent(appId, nodeId));
} }
@ -707,8 +725,9 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
@Override @Override
public void transition(RMNodeImpl rmNode, RMNodeEvent event) { public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
rmNode.finishedApplications.add((( ApplicationId appId = ((RMNodeCleanAppEvent) event).getAppId();
RMNodeCleanAppEvent) event).getAppId()); rmNode.finishedApplications.add(appId);
rmNode.runningApplications.remove(appId);
} }
} }
@ -910,12 +929,22 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
+ "cleanup, no further processing"); + "cleanup, no further processing");
continue; continue;
} }
if (finishedApplications.contains(containerId.getApplicationAttemptId()
.getApplicationId())) { ApplicationId containerAppId =
containerId.getApplicationAttemptId().getApplicationId();
if (finishedApplications.contains(containerAppId)) {
LOG.info("Container " + containerId LOG.info("Container " + containerId
+ " belongs to an application that is already killed," + " belongs to an application that is already killed,"
+ " no further processing"); + " no further processing");
continue; continue;
} else if (!runningApplications.contains(containerAppId)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Container " + containerId
+ " is the first container get launched for application "
+ containerAppId);
}
runningApplications.add(containerAppId);
} }
// Process running containers // Process running containers

View File

@ -186,6 +186,11 @@ public class MockNodes {
return null; return null;
} }
@Override
public List<ApplicationId> getRunningApps() {
return null;
}
@Override @Override
public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response) { public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response) {
} }

View File

@ -33,6 +33,7 @@ import java.util.List;
import org.apache.hadoop.util.HostsFileReader; import org.apache.hadoop.util.HostsFileReader;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.api.records.NodeState;
@ -485,9 +486,9 @@ public class TestRMNodeTransitions {
NodeId nodeId = node.getNodeID(); NodeId nodeId = node.getNodeID();
// Expire a container // Expire a container
ContainerId completedContainerId = BuilderUtils.newContainerId( ContainerId completedContainerId = BuilderUtils.newContainerId(
BuilderUtils.newApplicationAttemptId( BuilderUtils.newApplicationAttemptId(
BuilderUtils.newApplicationId(0, 0), 0), 0); BuilderUtils.newApplicationId(0, 0), 0), 0);
node.handle(new RMNodeCleanContainerEvent(nodeId, completedContainerId)); node.handle(new RMNodeCleanContainerEvent(nodeId, completedContainerId));
Assert.assertEquals(1, node.getContainersToCleanUp().size()); Assert.assertEquals(1, node.getContainersToCleanUp().size());
@ -512,6 +513,35 @@ public class TestRMNodeTransitions {
Assert.assertEquals(finishedAppId, hbrsp.getApplicationsToCleanup().get(0)); Assert.assertEquals(finishedAppId, hbrsp.getApplicationsToCleanup().get(0));
} }
@Test(timeout=20000)
public void testUpdateHeartbeatResponseForAppLifeCycle() {
RMNodeImpl node = getRunningNode();
NodeId nodeId = node.getNodeID();
ApplicationId runningAppId = BuilderUtils.newApplicationId(0, 1);
// Create a running container
ContainerId runningContainerId = BuilderUtils.newContainerId(
BuilderUtils.newApplicationAttemptId(
runningAppId, 0), 0);
ContainerStatus status = ContainerStatus.newInstance(runningContainerId,
ContainerState.RUNNING, "", 0);
List<ContainerStatus> statusList = new ArrayList<ContainerStatus>();
statusList.add(status);
NodeHealthStatus nodeHealth = NodeHealthStatus.newInstance(true,
"", System.currentTimeMillis());
node.handle(new RMNodeStatusEvent(nodeId, nodeHealth,
statusList, null, null));
Assert.assertEquals(1, node.getRunningApps().size());
// Finish an application
ApplicationId finishedAppId = runningAppId;
node.handle(new RMNodeCleanAppEvent(nodeId, finishedAppId));
Assert.assertEquals(1, node.getAppsToCleanup().size());
Assert.assertEquals(0, node.getRunningApps().size());
}
private RMNodeImpl getRunningNode() { private RMNodeImpl getRunningNode() {
return getRunningNode(null, 0); return getRunningNode(null, 0);
} }