YARN-3445. Cache runningApps in RMNode for getting running apps on given NodeId. (Junping Du via mingma)

(cherry picked from commit 08244264c0)
This commit is contained in:
Ming Ma 2015-07-10 08:30:10 -07:00
parent 5e6bbe6031
commit b169889f01
7 changed files with 91 additions and 11 deletions

View File

@ -62,6 +62,7 @@ public class NodeInfo {
private NodeState state; private NodeState state;
private List<ContainerId> toCleanUpContainers; private List<ContainerId> toCleanUpContainers;
private List<ApplicationId> toCleanUpApplications; private List<ApplicationId> toCleanUpApplications;
private List<ApplicationId> runningApplications;
public FakeRMNodeImpl(NodeId nodeId, String nodeAddr, String httpAddress, public FakeRMNodeImpl(NodeId nodeId, String nodeAddr, String httpAddress,
Resource perNode, String rackName, String healthReport, Resource perNode, String rackName, String healthReport,
@ -77,6 +78,7 @@ public class NodeInfo {
this.state = state; this.state = state;
toCleanUpApplications = new ArrayList<ApplicationId>(); toCleanUpApplications = new ArrayList<ApplicationId>();
toCleanUpContainers = new ArrayList<ContainerId>(); toCleanUpContainers = new ArrayList<ContainerId>();
runningApplications = new ArrayList<ApplicationId>();
} }
public NodeId getNodeID() { public NodeId getNodeID() {
@ -135,6 +137,10 @@ public class NodeInfo {
return toCleanUpApplications; return toCleanUpApplications;
} }
public List<ApplicationId> getRunningApps() {
return runningApplications;
}
public void updateNodeHeartbeatResponseForCleanup( public void updateNodeHeartbeatResponseForCleanup(
NodeHeartbeatResponse response) { NodeHeartbeatResponse response) {
} }

View File

@ -118,6 +118,11 @@ public class RMNodeWrapper implements RMNode {
return node.getAppsToCleanup(); return node.getAppsToCleanup();
} }
@Override
public List<ApplicationId> getRunningApps() {
return node.getRunningApps();
}
@Override @Override
public void updateNodeHeartbeatResponseForCleanup( public void updateNodeHeartbeatResponseForCleanup(
NodeHeartbeatResponse nodeHeartbeatResponse) { NodeHeartbeatResponse nodeHeartbeatResponse) {

View File

@ -1637,6 +1637,9 @@ Release 2.6.0 - 2014-11-18
YARN-2811. In Fair Scheduler, reservation fulfillments shouldn't ignore max YARN-2811. In Fair Scheduler, reservation fulfillments shouldn't ignore max
share (Siqi Li via Sandy Ryza) share (Siqi Li via Sandy Ryza)
YARN-3445. Cache runningApps in RMNode for getting running apps on given
NodeId. (Junping Du via mingma)
IMPROVEMENTS IMPROVEMENTS
YARN-2242. Improve exception information on AM launch crashes. (Li Lu YARN-2242. Improve exception information on AM launch crashes. (Li Lu

View File

@ -119,6 +119,8 @@ public interface RMNode {
public List<ApplicationId> getAppsToCleanup(); public List<ApplicationId> getAppsToCleanup();
List<ApplicationId> getRunningApps();
/** /**
* Update a {@link NodeHeartbeatResponse} with the list of containers and * Update a {@link NodeHeartbeatResponse} with the list of containers and
* applications to clean up for this node. * applications to clean up for this node.

View File

@ -123,7 +123,12 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
new HashSet<ContainerId>(); new HashSet<ContainerId>();
/* the list of applications that have finished and need to be purged */ /* the list of applications that have finished and need to be purged */
private final List<ApplicationId> finishedApplications = new ArrayList<ApplicationId>(); private final List<ApplicationId> finishedApplications =
new ArrayList<ApplicationId>();
/* the list of applications that are running on this node */
private final List<ApplicationId> runningApplications =
new ArrayList<ApplicationId>();
private NodeHeartbeatResponse latestNodeHeartBeatResponse = recordFactory private NodeHeartbeatResponse latestNodeHeartBeatResponse = recordFactory
.newRecordInstance(NodeHeartbeatResponse.class); .newRecordInstance(NodeHeartbeatResponse.class);
@ -382,6 +387,16 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
} }
@Override
public List<ApplicationId> getRunningApps() {
this.readLock.lock();
try {
return new ArrayList<ApplicationId>(this.runningApplications);
} finally {
this.readLock.unlock();
}
}
@Override @Override
public List<ContainerId> getContainersToCleanUp() { public List<ContainerId> getContainersToCleanUp() {
@ -519,9 +534,12 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
LOG.warn("Cannot get RMApp by appId=" + appId LOG.warn("Cannot get RMApp by appId=" + appId
+ ", just added it to finishedApplications list for cleanup"); + ", just added it to finishedApplications list for cleanup");
rmNode.finishedApplications.add(appId); rmNode.finishedApplications.add(appId);
rmNode.runningApplications.remove(appId);
return; return;
} }
// Add running applications back due to Node add or Node reconnection.
rmNode.runningApplications.add(appId);
context.getDispatcher().getEventHandler() context.getDispatcher().getEventHandler()
.handle(new RMAppRunningOnNodeEvent(appId, nodeId)); .handle(new RMAppRunningOnNodeEvent(appId, nodeId));
} }
@ -707,8 +725,9 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
@Override @Override
public void transition(RMNodeImpl rmNode, RMNodeEvent event) { public void transition(RMNodeImpl rmNode, RMNodeEvent event) {
rmNode.finishedApplications.add((( ApplicationId appId = ((RMNodeCleanAppEvent) event).getAppId();
RMNodeCleanAppEvent) event).getAppId()); rmNode.finishedApplications.add(appId);
rmNode.runningApplications.remove(appId);
} }
} }
@ -910,12 +929,22 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
+ "cleanup, no further processing"); + "cleanup, no further processing");
continue; continue;
} }
if (finishedApplications.contains(containerId.getApplicationAttemptId()
.getApplicationId())) { ApplicationId containerAppId =
containerId.getApplicationAttemptId().getApplicationId();
if (finishedApplications.contains(containerAppId)) {
LOG.info("Container " + containerId LOG.info("Container " + containerId
+ " belongs to an application that is already killed," + " belongs to an application that is already killed,"
+ " no further processing"); + " no further processing");
continue; continue;
} else if (!runningApplications.contains(containerAppId)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Container " + containerId
+ " is the first container get launched for application "
+ containerAppId);
}
runningApplications.add(containerAppId);
} }
// Process running containers // Process running containers

View File

@ -186,6 +186,11 @@ public class MockNodes {
return null; return null;
} }
@Override
public List<ApplicationId> getRunningApps() {
return null;
}
@Override @Override
public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response) { public void updateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response) {
} }

View File

@ -33,6 +33,7 @@ import java.util.List;
import org.apache.hadoop.util.HostsFileReader; import org.apache.hadoop.util.HostsFileReader;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.api.records.NodeState;
@ -485,9 +486,9 @@ public class TestRMNodeTransitions {
NodeId nodeId = node.getNodeID(); NodeId nodeId = node.getNodeID();
// Expire a container // Expire a container
ContainerId completedContainerId = BuilderUtils.newContainerId( ContainerId completedContainerId = BuilderUtils.newContainerId(
BuilderUtils.newApplicationAttemptId( BuilderUtils.newApplicationAttemptId(
BuilderUtils.newApplicationId(0, 0), 0), 0); BuilderUtils.newApplicationId(0, 0), 0), 0);
node.handle(new RMNodeCleanContainerEvent(nodeId, completedContainerId)); node.handle(new RMNodeCleanContainerEvent(nodeId, completedContainerId));
Assert.assertEquals(1, node.getContainersToCleanUp().size()); Assert.assertEquals(1, node.getContainersToCleanUp().size());
@ -512,6 +513,35 @@ public class TestRMNodeTransitions {
Assert.assertEquals(finishedAppId, hbrsp.getApplicationsToCleanup().get(0)); Assert.assertEquals(finishedAppId, hbrsp.getApplicationsToCleanup().get(0));
} }
@Test(timeout=20000)
public void testUpdateHeartbeatResponseForAppLifeCycle() {
RMNodeImpl node = getRunningNode();
NodeId nodeId = node.getNodeID();
ApplicationId runningAppId = BuilderUtils.newApplicationId(0, 1);
// Create a running container
ContainerId runningContainerId = BuilderUtils.newContainerId(
BuilderUtils.newApplicationAttemptId(
runningAppId, 0), 0);
ContainerStatus status = ContainerStatus.newInstance(runningContainerId,
ContainerState.RUNNING, "", 0);
List<ContainerStatus> statusList = new ArrayList<ContainerStatus>();
statusList.add(status);
NodeHealthStatus nodeHealth = NodeHealthStatus.newInstance(true,
"", System.currentTimeMillis());
node.handle(new RMNodeStatusEvent(nodeId, nodeHealth,
statusList, null, null));
Assert.assertEquals(1, node.getRunningApps().size());
// Finish an application
ApplicationId finishedAppId = runningAppId;
node.handle(new RMNodeCleanAppEvent(nodeId, finishedAppId));
Assert.assertEquals(1, node.getAppsToCleanup().size());
Assert.assertEquals(0, node.getRunningApps().size());
}
private RMNodeImpl getRunningNode() { private RMNodeImpl getRunningNode() {
return getRunningNode(null, 0); return getRunningNode(null, 0);
} }