merge MAPREDUCE-3596 from trunk. Fix scheduler to handle cleaned up containers, which NMs may subsequently report as running. (Contributed by Vinod Kumar Vavilapalli)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23@1231303 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Siddharth Seth 2012-01-13 21:17:59 +00:00
parent 3935e3b5a9
commit a8ba510900
14 changed files with 234 additions and 72 deletions

View File

@ -424,6 +424,9 @@ Release 0.23.1 - Unreleased
MAPREDUCE-3625. CapacityScheduler web-ui display of queue's used capacity is broken. MAPREDUCE-3625. CapacityScheduler web-ui display of queue's used capacity is broken.
(Jason Lowe via mahadev) (Jason Lowe via mahadev)
MAPREDUCE-3596. Fix scheduler to handle cleaned up containers, which NMs
may subsequently report as running. (Vinod Kumar Vavilapalli via sseth)
Release 0.23.0 - 2011-11-01 Release 0.23.0 - 2011-11-01
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -205,6 +205,17 @@ public class BuilderUtils {
return nodeId; return nodeId;
} }
public static ContainerStatus newContainerStatus(ContainerId containerId,
ContainerState containerState, String diagnostics, int exitStatus) {
ContainerStatus containerStatus = recordFactory
.newRecordInstance(ContainerStatus.class);
containerStatus.setState(containerState);
containerStatus.setContainerId(containerId);
containerStatus.setDiagnostics(diagnostics);
containerStatus.setExitStatus(exitStatus);
return containerStatus;
}
public static Container newContainer(ContainerId containerId, public static Container newContainer(ContainerId containerId,
NodeId nodeId, String nodeHttpAddress, NodeId nodeId, String nodeHttpAddress,
Resource resource, Priority priority, ContainerToken containerToken) { Resource resource, Priority priority, ContainerToken containerToken) {

View File

@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.state.MultipleArcTransition;
import org.apache.hadoop.yarn.state.SingleArcTransition; import org.apache.hadoop.yarn.state.SingleArcTransition;
import org.apache.hadoop.yarn.state.StateMachine; import org.apache.hadoop.yarn.state.StateMachine;
import org.apache.hadoop.yarn.state.StateMachineFactory; import org.apache.hadoop.yarn.state.StateMachineFactory;
import org.apache.hadoop.yarn.util.BuilderUtils;
import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.util.ConverterUtils;
public class ContainerImpl implements Container { public class ContainerImpl implements Container {
@ -370,13 +371,8 @@ public class ContainerImpl implements Container {
public ContainerStatus cloneAndGetContainerStatus() { public ContainerStatus cloneAndGetContainerStatus() {
this.readLock.lock(); this.readLock.lock();
try { try {
ContainerStatus containerStatus = return BuilderUtils.newContainerStatus(this.getContainerID(),
recordFactory.newRecordInstance(ContainerStatus.class); getCurrentState(), diagnostics.toString(), exitCode);
containerStatus.setState(getCurrentState());
containerStatus.setContainerId(this.launchContext.getContainerId());
containerStatus.setDiagnostics(diagnostics.toString());
containerStatus.setExitStatus(exitCode);
return containerStatus;
} finally { } finally {
this.readLock.unlock(); this.readLock.unlock();
} }

View File

@ -67,16 +67,16 @@ import org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRen
import org.apache.hadoop.yarn.server.resourcemanager.webapp.RMWebApp; import org.apache.hadoop.yarn.server.resourcemanager.webapp.RMWebApp;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.security.ContainerTokenSecretManager; import org.apache.hadoop.yarn.server.security.ContainerTokenSecretManager;
import org.apache.hadoop.yarn.server.webproxy.AppReportFetcher;
import org.apache.hadoop.yarn.server.webproxy.ProxyUriUtils;
import org.apache.hadoop.yarn.server.webproxy.WebAppProxy;
import org.apache.hadoop.yarn.server.webproxy.WebAppProxyServlet;
import org.apache.hadoop.yarn.service.AbstractService; import org.apache.hadoop.yarn.service.AbstractService;
import org.apache.hadoop.yarn.service.CompositeService; import org.apache.hadoop.yarn.service.CompositeService;
import org.apache.hadoop.yarn.service.Service; import org.apache.hadoop.yarn.service.Service;
import org.apache.hadoop.yarn.webapp.WebApp; import org.apache.hadoop.yarn.webapp.WebApp;
import org.apache.hadoop.yarn.webapp.WebApps; import org.apache.hadoop.yarn.webapp.WebApps;
import org.apache.hadoop.yarn.webapp.WebApps.Builder; import org.apache.hadoop.yarn.webapp.WebApps.Builder;
import org.apache.hadoop.yarn.server.webproxy.AppReportFetcher;
import org.apache.hadoop.yarn.server.webproxy.ProxyUriUtils;
import org.apache.hadoop.yarn.server.webproxy.WebAppProxyServlet;
import org.apache.hadoop.yarn.server.webproxy.WebAppProxy;
/** /**
* The ResourceManager is the main class that is a set of components. * The ResourceManager is the main class that is a set of components.
@ -256,7 +256,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
} }
@Private @Private
public static final class SchedulerEventDispatcher extends AbstractService public static class SchedulerEventDispatcher extends AbstractService
implements EventHandler<SchedulerEvent> { implements EventHandler<SchedulerEvent> {
private final ResourceScheduler scheduler; private final ResourceScheduler scheduler;

View File

@ -265,8 +265,8 @@ public class ResourceTrackerService extends AbstractService implements
HeartbeatResponse latestResponse = recordFactory HeartbeatResponse latestResponse = recordFactory
.newRecordInstance(HeartbeatResponse.class); .newRecordInstance(HeartbeatResponse.class);
latestResponse.setResponseId(lastHeartbeatResponse.getResponseId() + 1); latestResponse.setResponseId(lastHeartbeatResponse.getResponseId() + 1);
latestResponse.addAllContainersToCleanup(rmNode.pullContainersToCleanUp()); latestResponse.addAllContainersToCleanup(rmNode.getContainersToCleanUp());
latestResponse.addAllApplicationsToCleanup(rmNode.pullAppsToCleanup()); latestResponse.addAllApplicationsToCleanup(rmNode.getAppsToCleanup());
latestResponse.setNodeAction(NodeAction.NORMAL); latestResponse.setNodeAction(NodeAction.NORMAL);
// 4. Send status to RMNode, saving the latest response. // 4. Send status to RMNode, saving the latest response.

View File

@ -101,9 +101,9 @@ public interface RMNode {
public RMNodeState getState(); public RMNodeState getState();
public List<ContainerId> pullContainersToCleanUp(); public List<ContainerId> getContainersToCleanUp();
public List<ApplicationId> pullAppsToCleanup(); public List<ApplicationId> getAppsToCleanup();
public HeartbeatResponse getLastHeartBeatResponse(); public HeartbeatResponse getLastHeartBeatResponse();
} }

View File

@ -90,7 +90,6 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
private final Map<ContainerId, ContainerStatus> justLaunchedContainers = private final Map<ContainerId, ContainerStatus> justLaunchedContainers =
new HashMap<ContainerId, ContainerStatus>(); new HashMap<ContainerId, ContainerStatus>();
/* set of containers that need to be cleaned */ /* set of containers that need to be cleaned */
private final Set<ContainerId> containersToClean = new TreeSet<ContainerId>( private final Set<ContainerId> containersToClean = new TreeSet<ContainerId>(
new ContainerIdComparator()); new ContainerIdComparator());
@ -248,54 +247,38 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
} }
@Override @Override
public List<ApplicationId> pullAppsToCleanup() { public List<ApplicationId> getAppsToCleanup() {
this.writeLock.lock();
try {
List<ApplicationId> lastfinishedApplications = new ArrayList<ApplicationId>();
lastfinishedApplications.addAll(this.finishedApplications);
this.finishedApplications.clear();
return lastfinishedApplications;
} finally {
this.writeLock.unlock();
}
}
@Private
public List<ContainerId> getContainersToCleanUp() {
this.readLock.lock(); this.readLock.lock();
try { try {
return new ArrayList<ContainerId>(containersToClean); return new ArrayList<ApplicationId>(this.finishedApplications);
} finally { } finally {
this.readLock.unlock(); this.readLock.unlock();
} }
} }
@Override @Override
public List<ContainerId> pullContainersToCleanUp() { public List<ContainerId> getContainersToCleanUp() {
this.writeLock.lock(); this.readLock.lock();
try { try {
List<ContainerId> containersToCleanUp = new ArrayList<ContainerId>(); return new ArrayList<ContainerId>(this.containersToClean);
containersToCleanUp.addAll(this.containersToClean);
this.containersToClean.clear();
return containersToCleanUp;
} finally { } finally {
this.writeLock.unlock(); this.readLock.unlock();
} }
}; };
@Override @Override
public HeartbeatResponse getLastHeartBeatResponse() { public HeartbeatResponse getLastHeartBeatResponse() {
this.writeLock.lock(); this.readLock.lock();
try { try {
return this.latestHeartBeatResponse; return this.latestHeartBeatResponse;
} finally { } finally {
this.writeLock.unlock(); this.readLock.unlock();
} }
} }
@ -407,13 +390,21 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
for (ContainerStatus remoteContainer : statusEvent.getContainers()) { for (ContainerStatus remoteContainer : statusEvent.getContainers()) {
ContainerId containerId = remoteContainer.getContainerId(); ContainerId containerId = remoteContainer.getContainerId();
// Don't bother with containers already scheduled for cleanup, // Don't bother with containers already scheduled for cleanup, or for
// the scheduler doens't need to know any more about this container // applications already killed. The scheduler doens't need to know any
// more about this container
if (rmNode.containersToClean.contains(containerId)) { if (rmNode.containersToClean.contains(containerId)) {
LOG.info("Container " + containerId + " already scheduled for " + LOG.info("Container " + containerId + " already scheduled for " +
"cleanup, no further processing"); "cleanup, no further processing");
continue; continue;
} }
if (rmNode.finishedApplications.contains(containerId
.getApplicationAttemptId().getApplicationId())) {
LOG.info("Container " + containerId
+ " belongs to an application that is already killed,"
+ " no further processing");
continue;
}
// Process running containers // Process running containers
if (remoteContainer.getState() == ContainerState.RUNNING) { if (remoteContainer.getState() == ContainerState.RUNNING) {
@ -435,6 +426,12 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
rmNode.context.getDelegationTokenRenewer().updateKeepAliveApplications( rmNode.context.getDelegationTokenRenewer().updateKeepAliveApplications(
statusEvent.getKeepAliveAppIds()); statusEvent.getKeepAliveAppIds());
// HeartBeat processing from our end is done, as node pulls the following
// lists before sending status-updates. Clear data-structures
rmNode.containersToClean.clear();
rmNode.finishedApplications.clear();
return RMNodeState.RUNNING; return RMNodeState.RUNNING;
} }
} }

View File

@ -39,9 +39,9 @@ import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger; import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants; import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.ApplicationsStore.ApplicationStore; import org.apache.hadoop.yarn.server.resourcemanager.recovery.ApplicationsStore.ApplicationStore;
import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources; import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
@ -52,6 +52,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerFini
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerReservedEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerReservedEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
import com.google.common.collect.HashMultiset; import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset; import com.google.common.collect.Multiset;
@ -61,6 +62,7 @@ import com.google.common.collect.Multiset;
* Each running Application in the RM corresponds to one instance * Each running Application in the RM corresponds to one instance
* of this class. * of this class.
*/ */
@SuppressWarnings("unchecked")
public class SchedulerApp { public class SchedulerApp {
private static final Log LOG = LogFactory.getLog(SchedulerApp.class); private static final Log LOG = LogFactory.getLog(SchedulerApp.class);
@ -174,12 +176,19 @@ public class SchedulerApp {
this.appSchedulingInfo.stop(rmAppAttemptFinalState); this.appSchedulingInfo.stop(rmAppAttemptFinalState);
} }
synchronized public void containerLaunchedOnNode(ContainerId containerId) { public synchronized void containerLaunchedOnNode(ContainerId containerId,
NodeId nodeId) {
// Inform the container // Inform the container
RMContainer rmContainer = RMContainer rmContainer =
getRMContainer(containerId); getRMContainer(containerId);
rmContainer.handle( if (rmContainer == null) {
new RMContainerEvent(containerId, // Some unknown container sneaked into the system. Kill it.
this.rmContext.getDispatcher().getEventHandler()
.handle(new RMNodeCleanContainerEvent(nodeId, containerId));
return;
}
rmContainer.handle(new RMContainerEvent(containerId,
RMContainerEventType.LAUNCHED)); RMContainerEventType.LAUNCHED));
} }

View File

@ -57,6 +57,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAt
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
@ -76,6 +77,7 @@ import org.apache.hadoop.yarn.server.security.ContainerTokenSecretManager;
@LimitedPrivate("yarn") @LimitedPrivate("yarn")
@Evolving @Evolving
@SuppressWarnings("unchecked")
public class CapacityScheduler public class CapacityScheduler
implements ResourceScheduler, CapacitySchedulerContext { implements ResourceScheduler, CapacitySchedulerContext {
@ -588,10 +590,12 @@ implements ResourceScheduler, CapacitySchedulerContext {
LOG.info("Unknown application: " + applicationAttemptId + LOG.info("Unknown application: " + applicationAttemptId +
" launched container " + containerId + " launched container " + containerId +
" on node: " + node); " on node: " + node);
this.rmContext.getDispatcher().getEventHandler()
.handle(new RMNodeCleanContainerEvent(node.getNodeID(), containerId));
return; return;
} }
application.containerLaunchedOnNode(containerId); application.containerLaunchedOnNode(containerId, node.getNodeID());
} }
@Override @Override

View File

@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptS
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue;
@ -87,6 +88,7 @@ import org.apache.hadoop.yarn.util.BuilderUtils;
@LimitedPrivate("yarn") @LimitedPrivate("yarn")
@Evolving @Evolving
@SuppressWarnings("unchecked")
public class FifoScheduler implements ResourceScheduler { public class FifoScheduler implements ResourceScheduler {
private static final Log LOG = LogFactory.getLog(FifoScheduler.class); private static final Log LOG = LogFactory.getLog(FifoScheduler.class);
@ -282,7 +284,6 @@ public class FifoScheduler implements ResourceScheduler {
return nodes.get(nodeId); return nodes.get(nodeId);
} }
@SuppressWarnings("unchecked")
private synchronized void addApplication(ApplicationAttemptId appAttemptId, private synchronized void addApplication(ApplicationAttemptId appAttemptId,
String user) { String user) {
// TODO: Fix store // TODO: Fix store
@ -655,10 +656,14 @@ public class FifoScheduler implements ResourceScheduler {
LOG.info("Unknown application: " + applicationAttemptId + LOG.info("Unknown application: " + applicationAttemptId +
" launched container " + containerId + " launched container " + containerId +
" on node: " + node); " on node: " + node);
// Some unknown container sneaked into the system. Kill it.
this.rmContext.getDispatcher().getEventHandler()
.handle(new RMNodeCleanContainerEvent(node.getNodeID(), containerId));
return; return;
} }
application.containerLaunchedOnNode(containerId); application.containerLaunchedOnNode(containerId, node.getNodeID());
} }
@Lock(FifoScheduler.class) @Lock(FifoScheduler.class)

View File

@ -39,15 +39,17 @@ public class MockNM {
private int responseId; private int responseId;
private NodeId nodeId; private NodeId nodeId;
private final String nodeIdStr;
private final int memory; private final int memory;
private final ResourceTrackerService resourceTracker; private final ResourceTrackerService resourceTracker;
private final int httpPort = 2; private final int httpPort = 2;
MockNM(String nodeIdStr, int memory, ResourceTrackerService resourceTracker) { MockNM(String nodeIdStr, int memory, ResourceTrackerService resourceTracker) {
this.nodeIdStr = nodeIdStr;
this.memory = memory; this.memory = memory;
this.resourceTracker = resourceTracker; this.resourceTracker = resourceTracker;
String[] splits = nodeIdStr.split(":");
nodeId = Records.newRecord(NodeId.class);
nodeId.setHost(splits[0]);
nodeId.setPort(Integer.parseInt(splits[1]));
} }
public NodeId getNodeId() { public NodeId getNodeId() {
@ -63,14 +65,10 @@ public class MockNM {
new HashMap<ApplicationId, List<ContainerStatus>>(); new HashMap<ApplicationId, List<ContainerStatus>>();
conts.put(container.getId().getApplicationAttemptId().getApplicationId(), conts.put(container.getId().getApplicationAttemptId().getApplicationId(),
Arrays.asList(new ContainerStatus[] { container.getContainerStatus() })); Arrays.asList(new ContainerStatus[] { container.getContainerStatus() }));
nodeHeartbeat(conts, true,nodeId); nodeHeartbeat(conts, true);
} }
public NodeId registerNode() throws Exception { public NodeId registerNode() throws Exception {
String[] splits = nodeIdStr.split(":");
nodeId = Records.newRecord(NodeId.class);
nodeId.setHost(splits[0]);
nodeId.setPort(Integer.parseInt(splits[1]));
RegisterNodeManagerRequest req = Records.newRecord( RegisterNodeManagerRequest req = Records.newRecord(
RegisterNodeManagerRequest.class); RegisterNodeManagerRequest.class);
req.setNodeId(nodeId); req.setNodeId(nodeId);
@ -83,11 +81,11 @@ public class MockNM {
} }
public HeartbeatResponse nodeHeartbeat(boolean b) throws Exception { public HeartbeatResponse nodeHeartbeat(boolean b) throws Exception {
return nodeHeartbeat(new HashMap<ApplicationId, List<ContainerStatus>>(), b,nodeId); return nodeHeartbeat(new HashMap<ApplicationId, List<ContainerStatus>>(), b);
} }
public HeartbeatResponse nodeHeartbeat(Map<ApplicationId, public HeartbeatResponse nodeHeartbeat(Map<ApplicationId,
List<ContainerStatus>> conts, boolean isHealthy, NodeId nodeId) throws Exception { List<ContainerStatus>> conts, boolean isHealthy) throws Exception {
NodeHeartbeatRequest req = Records.newRecord(NodeHeartbeatRequest.class); NodeHeartbeatRequest req = Records.newRecord(NodeHeartbeatRequest.class);
NodeStatus status = Records.newRecord(NodeStatus.class); NodeStatus status = Records.newRecord(NodeStatus.class);
status.setNodeId(nodeId); status.setNodeId(nodeId);

View File

@ -152,13 +152,13 @@ public class MockNodes {
} }
@Override @Override
public List<ApplicationId> pullAppsToCleanup() { public List<ApplicationId> getAppsToCleanup() {
// TODO Auto-generated method stub // TODO Auto-generated method stub
return null; return null;
} }
@Override @Override
public List<ContainerId> pullContainersToCleanUp() { public List<ContainerId> getContainersToCleanUp() {
// TODO Auto-generated method stub // TODO Auto-generated method stub
return null; return null;
} }

View File

@ -19,26 +19,39 @@
package org.apache.hadoop.yarn.server.resourcemanager; package org.apache.hadoop.yarn.server.resourcemanager;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import junit.framework.Assert; import junit.framework.Assert;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.DrainDispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.api.records.HeartbeatResponse; import org.apache.hadoop.yarn.server.api.records.HeartbeatResponse;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
import org.apache.hadoop.yarn.util.BuilderUtils;
import org.apache.log4j.Level; import org.apache.log4j.Level;
import org.apache.log4j.LogManager; import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.junit.Test; import org.junit.Test;
import org.mortbay.log.Log;
public class TestApplicationCleanup { public class TestApplicationCleanup {
private static final Log LOG = LogFactory
.getLog(TestApplicationCleanup.class);
@Test @Test
public void testAppCleanup() throws Exception { public void testAppCleanup() throws Exception {
Logger rootLogger = LogManager.getRootLogger(); Logger rootLogger = LogManager.getRootLogger();
@ -67,11 +80,13 @@ public class TestApplicationCleanup {
List<Container> conts = am.allocate(new ArrayList<ResourceRequest>(), List<Container> conts = am.allocate(new ArrayList<ResourceRequest>(),
new ArrayList<ContainerId>()).getAllocatedContainers(); new ArrayList<ContainerId>()).getAllocatedContainers();
int contReceived = conts.size(); int contReceived = conts.size();
while (contReceived < request) { int waitCount = 0;
while (contReceived < request && waitCount++ < 20) {
conts = am.allocate(new ArrayList<ResourceRequest>(), conts = am.allocate(new ArrayList<ResourceRequest>(),
new ArrayList<ContainerId>()).getAllocatedContainers(); new ArrayList<ContainerId>()).getAllocatedContainers();
contReceived += conts.size(); contReceived += conts.size();
Log.info("Got " + contReceived + " containers. Waiting to get " + request); LOG.info("Got " + contReceived + " containers. Waiting to get "
+ request);
Thread.sleep(2000); Thread.sleep(2000);
} }
Assert.assertEquals(request, conts.size()); Assert.assertEquals(request, conts.size());
@ -86,11 +101,12 @@ public class TestApplicationCleanup {
//currently only containers are cleaned via this //currently only containers are cleaned via this
//AM container is cleaned via container launcher //AM container is cleaned via container launcher
while (cleanedConts < 2 || cleanedApps < 1) { waitCount = 0;
while ((cleanedConts < 3 || cleanedApps < 1) && waitCount++ < 20) {
HeartbeatResponse resp = nm1.nodeHeartbeat(true); HeartbeatResponse resp = nm1.nodeHeartbeat(true);
contsToClean = resp.getContainersToCleanupList(); contsToClean = resp.getContainersToCleanupList();
apps = resp.getApplicationsToCleanupList(); apps = resp.getApplicationsToCleanupList();
Log.info("Waiting to get cleanup events.. cleanedConts: " LOG.info("Waiting to get cleanup events.. cleanedConts: "
+ cleanedConts + " cleanedApps: " + cleanedApps); + cleanedConts + " cleanedApps: " + cleanedApps);
cleanedConts += contsToClean.size(); cleanedConts += contsToClean.size();
cleanedApps += apps.size(); cleanedApps += apps.size();
@ -99,6 +115,130 @@ public class TestApplicationCleanup {
Assert.assertEquals(1, apps.size()); Assert.assertEquals(1, apps.size());
Assert.assertEquals(app.getApplicationId(), apps.get(0)); Assert.assertEquals(app.getApplicationId(), apps.get(0));
Assert.assertEquals(1, cleanedApps);
Assert.assertEquals(3, cleanedConts);
rm.stop();
}
@Test
public void testContainerCleanup() throws Exception {
Logger rootLogger = LogManager.getRootLogger();
rootLogger.setLevel(Level.DEBUG);
final DrainDispatcher dispatcher = new DrainDispatcher();
MockRM rm = new MockRM() {
@Override
protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
return new SchedulerEventDispatcher(this.scheduler) {
@Override
public void handle(SchedulerEvent event) {
scheduler.handle(event);
}
};
}
@Override
protected Dispatcher createDispatcher() {
return dispatcher;
}
};
rm.start();
MockNM nm1 = rm.registerNode("h1:1234", 5000);
RMApp app = rm.submitApp(2000);
//kick the scheduling
nm1.nodeHeartbeat(true);
RMAppAttempt attempt = app.getCurrentAppAttempt();
MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId());
am.registerAppAttempt();
//request for containers
int request = 2;
am.allocate("h1" , 1000, request,
new ArrayList<ContainerId>());
dispatcher.await();
//kick the scheduler
nm1.nodeHeartbeat(true);
List<Container> conts = am.allocate(new ArrayList<ResourceRequest>(),
new ArrayList<ContainerId>()).getAllocatedContainers();
int contReceived = conts.size();
int waitCount = 0;
while (contReceived < request && waitCount++ < 20) {
conts = am.allocate(new ArrayList<ResourceRequest>(),
new ArrayList<ContainerId>()).getAllocatedContainers();
dispatcher.await();
contReceived += conts.size();
LOG.info("Got " + contReceived + " containers. Waiting to get "
+ request);
Thread.sleep(2000);
}
Assert.assertEquals(request, conts.size());
// Release a container.
ArrayList<ContainerId> release = new ArrayList<ContainerId>();
release.add(conts.get(1).getId());
am.allocate(new ArrayList<ResourceRequest>(), release);
dispatcher.await();
// Send one more heartbeat with a fake running container. This is to
// simulate the situation that can happen if the NM reports that container
// is running in the same heartbeat when the RM asks it to clean it up.
Map<ApplicationId, List<ContainerStatus>> containerStatuses =
new HashMap<ApplicationId, List<ContainerStatus>>();
ArrayList<ContainerStatus> containerStatusList =
new ArrayList<ContainerStatus>();
containerStatusList.add(BuilderUtils.newContainerStatus(conts.get(1)
.getId(), ContainerState.RUNNING, "nothing", 0));
containerStatuses.put(app.getApplicationId(), containerStatusList);
HeartbeatResponse resp = nm1.nodeHeartbeat(containerStatuses, true);
dispatcher.await();
List<ContainerId> contsToClean = resp.getContainersToCleanupList();
int cleanedConts = contsToClean.size();
waitCount = 0;
while (cleanedConts < 1 && waitCount++ < 20) {
resp = nm1.nodeHeartbeat(true);
dispatcher.await();
contsToClean = resp.getContainersToCleanupList();
LOG.info("Waiting to get cleanup events.. cleanedConts: " + cleanedConts);
cleanedConts += contsToClean.size();
Thread.sleep(1000);
}
LOG.info("Got cleanup for " + contsToClean.get(0));
Assert.assertEquals(1, cleanedConts);
// Now to test the case when RM already gave cleanup, and NM suddenly
// realizes that the container is running.
LOG.info("Testing container launch much after release and "
+ "NM getting cleanup");
containerStatuses.clear();
containerStatusList.clear();
containerStatusList.add(BuilderUtils.newContainerStatus(conts.get(1)
.getId(), ContainerState.RUNNING, "nothing", 0));
containerStatuses.put(app.getApplicationId(), containerStatusList);
resp = nm1.nodeHeartbeat(containerStatuses, true);
dispatcher.await();
contsToClean = resp.getContainersToCleanupList();
cleanedConts = contsToClean.size();
// The cleanup list won't be instantaneous as it is given out by scheduler
// and not RMNodeImpl.
waitCount = 0;
while (cleanedConts < 1 && waitCount++ < 20) {
resp = nm1.nodeHeartbeat(true);
dispatcher.await();
contsToClean = resp.getContainersToCleanupList();
LOG.info("Waiting to get cleanup events.. cleanedConts: " + cleanedConts);
cleanedConts += contsToClean.size();
Thread.sleep(1000);
}
LOG.info("Got cleanup for " + contsToClean.get(0));
Assert.assertEquals(1, cleanedConts);
rm.stop(); rm.stop();
} }

View File

@ -164,8 +164,7 @@ public class TestResourceTrackerService {
Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction())); Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
nodeHeartbeat = nm2.nodeHeartbeat( nodeHeartbeat = nm2.nodeHeartbeat(
new HashMap<ApplicationId, List<ContainerStatus>>(), true, new HashMap<ApplicationId, List<ContainerStatus>>(), true);
recordFactory.newRecordInstance(NodeId.class));
Assert.assertTrue(NodeAction.REBOOT.equals(nodeHeartbeat.getNodeAction())); Assert.assertTrue(NodeAction.REBOOT.equals(nodeHeartbeat.getNodeAction()));
checkRebootedNMCount(rm, ++initialMetricCount); checkRebootedNMCount(rm, ++initialMetricCount);
} }