merge MAPREDUCE-3596 from trunk. Fix scheduler to handle cleaned up containers, which NMs may subsequently report as running. (Contributed by Vinod Kumar Vavilapalli)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23@1231303 13f79535-47bb-0310-9956-ffa450edef68
commit a8ba510900
parent 3935e3b5a9
CHANGES.txt
@@ -424,6 +424,9 @@ Release 0.23.1 - Unreleased
     MAPREDUCE-3625. CapacityScheduler web-ui display of queue's used capacity is broken.
     (Jason Lowe via mahadev)
 
+    MAPREDUCE-3596. Fix scheduler to handle cleaned up containers, which NMs
+    may subsequently report as running. (Vinod Kumar Vavilapalli via sseth)
+
 Release 0.23.0 - 2011-11-01
 
   INCOMPATIBLE CHANGES
BuilderUtils.java
@@ -205,6 +205,17 @@ public class BuilderUtils {
     return nodeId;
   }
 
+  public static ContainerStatus newContainerStatus(ContainerId containerId,
+      ContainerState containerState, String diagnostics, int exitStatus) {
+    ContainerStatus containerStatus = recordFactory
+        .newRecordInstance(ContainerStatus.class);
+    containerStatus.setState(containerState);
+    containerStatus.setContainerId(containerId);
+    containerStatus.setDiagnostics(diagnostics);
+    containerStatus.setExitStatus(exitStatus);
+    return containerStatus;
+  }
+
   public static Container newContainer(ContainerId containerId,
       NodeId nodeId, String nodeHttpAddress,
       Resource resource, Priority priority, ContainerToken containerToken) {
ContainerImpl.java
@@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.state.MultipleArcTransition;
 import org.apache.hadoop.yarn.state.SingleArcTransition;
 import org.apache.hadoop.yarn.state.StateMachine;
 import org.apache.hadoop.yarn.state.StateMachineFactory;
+import org.apache.hadoop.yarn.util.BuilderUtils;
 import org.apache.hadoop.yarn.util.ConverterUtils;
 
 public class ContainerImpl implements Container {
@@ -370,13 +371,8 @@ public class ContainerImpl implements Container {
   public ContainerStatus cloneAndGetContainerStatus() {
     this.readLock.lock();
     try {
-      ContainerStatus containerStatus =
-          recordFactory.newRecordInstance(ContainerStatus.class);
-      containerStatus.setState(getCurrentState());
-      containerStatus.setContainerId(this.launchContext.getContainerId());
-      containerStatus.setDiagnostics(diagnostics.toString());
-      containerStatus.setExitStatus(exitCode);
-      return containerStatus;
+      return BuilderUtils.newContainerStatus(this.getContainerID(),
+          getCurrentState(), diagnostics.toString(), exitCode);
     } finally {
       this.readLock.unlock();
     }
ResourceManager.java
@@ -67,16 +67,16 @@ import org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRen
 import org.apache.hadoop.yarn.server.resourcemanager.webapp.RMWebApp;
 import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
 import org.apache.hadoop.yarn.server.security.ContainerTokenSecretManager;
+import org.apache.hadoop.yarn.server.webproxy.AppReportFetcher;
+import org.apache.hadoop.yarn.server.webproxy.ProxyUriUtils;
+import org.apache.hadoop.yarn.server.webproxy.WebAppProxy;
+import org.apache.hadoop.yarn.server.webproxy.WebAppProxyServlet;
 import org.apache.hadoop.yarn.service.AbstractService;
 import org.apache.hadoop.yarn.service.CompositeService;
 import org.apache.hadoop.yarn.service.Service;
 import org.apache.hadoop.yarn.webapp.WebApp;
 import org.apache.hadoop.yarn.webapp.WebApps;
 import org.apache.hadoop.yarn.webapp.WebApps.Builder;
-import org.apache.hadoop.yarn.server.webproxy.AppReportFetcher;
-import org.apache.hadoop.yarn.server.webproxy.ProxyUriUtils;
-import org.apache.hadoop.yarn.server.webproxy.WebAppProxyServlet;
-import org.apache.hadoop.yarn.server.webproxy.WebAppProxy;
 
 /**
  * The ResourceManager is the main class that is a set of components.
@@ -256,7 +256,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
   }
 
   @Private
-  public static final class SchedulerEventDispatcher extends AbstractService
+  public static class SchedulerEventDispatcher extends AbstractService
     implements EventHandler<SchedulerEvent> {
 
     private final ResourceScheduler scheduler;
ResourceTrackerService.java
@@ -265,8 +265,8 @@ public class ResourceTrackerService extends AbstractService implements
     HeartbeatResponse latestResponse = recordFactory
         .newRecordInstance(HeartbeatResponse.class);
     latestResponse.setResponseId(lastHeartbeatResponse.getResponseId() + 1);
-    latestResponse.addAllContainersToCleanup(rmNode.pullContainersToCleanUp());
-    latestResponse.addAllApplicationsToCleanup(rmNode.pullAppsToCleanup());
+    latestResponse.addAllContainersToCleanup(rmNode.getContainersToCleanUp());
+    latestResponse.addAllApplicationsToCleanup(rmNode.getAppsToCleanup());
     latestResponse.setNodeAction(NodeAction.NORMAL);
 
     // 4. Send status to RMNode, saving the latest response.
RMNode.java
@@ -101,9 +101,9 @@ public interface RMNode {
 
   public RMNodeState getState();
 
-  public List<ContainerId> pullContainersToCleanUp();
+  public List<ContainerId> getContainersToCleanUp();
 
-  public List<ApplicationId> pullAppsToCleanup();
+  public List<ApplicationId> getAppsToCleanup();
 
   public HeartbeatResponse getLastHeartBeatResponse();
 }
RMNodeImpl.java
@@ -90,7 +90,6 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
   private final Map<ContainerId, ContainerStatus> justLaunchedContainers =
     new HashMap<ContainerId, ContainerStatus>();
 
-
   /* set of containers that need to be cleaned */
   private final Set<ContainerId> containersToClean = new TreeSet<ContainerId>(
       new ContainerIdComparator());
@@ -248,54 +247,38 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
   }
 
   @Override
-  public List<ApplicationId> pullAppsToCleanup() {
-    this.writeLock.lock();
-
-    try {
-      List<ApplicationId> lastfinishedApplications = new ArrayList<ApplicationId>();
-      lastfinishedApplications.addAll(this.finishedApplications);
-      this.finishedApplications.clear();
-      return lastfinishedApplications;
-    } finally {
-      this.writeLock.unlock();
-    }
-
-  }
-
-  @Private
-  public List<ContainerId> getContainersToCleanUp() {
+  public List<ApplicationId> getAppsToCleanup() {
     this.readLock.lock();
-
     try {
-      return new ArrayList<ContainerId>(containersToClean);
+      return new ArrayList<ApplicationId>(this.finishedApplications);
     } finally {
       this.readLock.unlock();
     }
-
   }
 
   @Override
-  public List<ContainerId> pullContainersToCleanUp() {
-
-    this.writeLock.lock();
-
+  public List<ContainerId> getContainersToCleanUp() {
+    this.readLock.lock();
     try {
-      List<ContainerId> containersToCleanUp = new ArrayList<ContainerId>();
-      containersToCleanUp.addAll(this.containersToClean);
-      this.containersToClean.clear();
-      return containersToCleanUp;
+      return new ArrayList<ContainerId>(this.containersToClean);
     } finally {
-      this.writeLock.unlock();
+      this.readLock.unlock();
     }
   };
 
   @Override
   public HeartbeatResponse getLastHeartBeatResponse() {
-
-    this.writeLock.lock();
-
+    this.readLock.lock();
     try {
       return this.latestHeartBeatResponse;
     } finally {
-      this.writeLock.unlock();
+      this.readLock.unlock();
     }
   }
@@ -407,13 +390,21 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
       for (ContainerStatus remoteContainer : statusEvent.getContainers()) {
         ContainerId containerId = remoteContainer.getContainerId();
 
-        // Don't bother with containers already scheduled for cleanup,
-        // the scheduler doens't need to know any more about this container
+        // Don't bother with containers already scheduled for cleanup, or for
+        // applications already killed. The scheduler doesn't need to know any
+        // more about this container
         if (rmNode.containersToClean.contains(containerId)) {
           LOG.info("Container " + containerId + " already scheduled for " +
               "cleanup, no further processing");
           continue;
         }
+        if (rmNode.finishedApplications.contains(containerId
+            .getApplicationAttemptId().getApplicationId())) {
+          LOG.info("Container " + containerId
+              + " belongs to an application that is already killed,"
+              + " no further processing");
+          continue;
+        }
 
         // Process running containers
         if (remoteContainer.getState() == ContainerState.RUNNING) {
@@ -435,6 +426,12 @@ public class RMNodeImpl implements RMNode, EventHandler<RMNodeEvent> {
       rmNode.context.getDelegationTokenRenewer().updateKeepAliveApplications(
          statusEvent.getKeepAliveAppIds());
 
+      // HeartBeat processing from our end is done, as node pulls the following
+      // lists before sending status-updates. Clear data-structures
+      rmNode.containersToClean.clear();
+      rmNode.finishedApplications.clear();
+
       return RMNodeState.RUNNING;
     }
   }
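The RMNodeImpl hunks above also change the lifecycle of the node's cleanup lists. The old pullAppsToCleanup()/pullContainersToCleanUp() copied and cleared the collections in one write-locked step, so by the time a status update was processed the node no longer knew which containers were pending cleanup. The new getAppsToCleanup()/getContainersToCleanUp() only snapshot under the read lock, and the collections are cleared once the whole heartbeat has been processed, which is what lets the status-update code filter out containers already scheduled for cleanup. A minimal sketch of that pattern, assuming a plain Integer in place of ContainerId and with illustrative locking that does not mirror the real class:

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

// Simplified stand-in for the RMNodeImpl change: snapshot on read,
// clear only after heartbeat processing has finished.
class CleanupListSketch {
  private final ReadWriteLock lock = new ReentrantReadWriteLock();
  private final Set<Integer> containersToClean = new TreeSet<Integer>();

  void markForCleanup(int containerId) {
    lock.writeLock().lock();
    try {
      containersToClean.add(containerId);
    } finally {
      lock.writeLock().unlock();
    }
  }

  // New style: non-destructive snapshot, so later status updates can still
  // ask "is this container already scheduled for cleanup?".
  List<Integer> getContainersToCleanUp() {
    lock.readLock().lock();
    try {
      return new ArrayList<Integer>(containersToClean);
    } finally {
      lock.readLock().unlock();
    }
  }

  boolean isScheduledForCleanup(int containerId) {
    lock.readLock().lock();
    try {
      return containersToClean.contains(containerId);
    } finally {
      lock.readLock().unlock();
    }
  }

  // Called once heartbeat processing is done, mirroring the clear() calls
  // the patch adds at the end of the status-update transition.
  void onHeartbeatProcessed() {
    lock.writeLock().lock();
    try {
      containersToClean.clear();
    } finally {
      lock.writeLock().unlock();
    }
  }
}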
SchedulerApp.java
@@ -39,9 +39,9 @@ import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
 import org.apache.hadoop.yarn.factories.RecordFactory;
 import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
-import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
 import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.recovery.ApplicationsStore.ApplicationStore;
 import org.apache.hadoop.yarn.server.resourcemanager.resource.Resources;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
@@ -52,6 +52,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerFini
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerReservedEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
 
 import com.google.common.collect.HashMultiset;
 import com.google.common.collect.Multiset;
@@ -61,6 +62,7 @@ import com.google.common.collect.Multiset;
  * Each running Application in the RM corresponds to one instance
  * of this class.
  */
+@SuppressWarnings("unchecked")
 public class SchedulerApp {
 
   private static final Log LOG = LogFactory.getLog(SchedulerApp.class);
@@ -174,12 +176,19 @@ public class SchedulerApp {
     this.appSchedulingInfo.stop(rmAppAttemptFinalState);
   }
 
-  synchronized public void containerLaunchedOnNode(ContainerId containerId) {
+  public synchronized void containerLaunchedOnNode(ContainerId containerId,
+      NodeId nodeId) {
     // Inform the container
     RMContainer rmContainer =
         getRMContainer(containerId);
-    rmContainer.handle(
-        new RMContainerEvent(containerId,
+    if (rmContainer == null) {
+      // Some unknown container sneaked into the system. Kill it.
+      this.rmContext.getDispatcher().getEventHandler()
+        .handle(new RMNodeCleanContainerEvent(nodeId, containerId));
+      return;
+    }
+
+    rmContainer.handle(new RMContainerEvent(containerId,
         RMContainerEventType.LAUNCHED));
   }
CapacityScheduler.java
@@ -57,6 +57,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAt
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
@@ -76,6 +77,7 @@ import org.apache.hadoop.yarn.server.security.ContainerTokenSecretManager;
 
 @LimitedPrivate("yarn")
 @Evolving
+@SuppressWarnings("unchecked")
 public class CapacityScheduler
 implements ResourceScheduler, CapacitySchedulerContext {
 
@@ -588,10 +590,12 @@ implements ResourceScheduler, CapacitySchedulerContext {
       LOG.info("Unknown application: " + applicationAttemptId +
           " launched container " + containerId +
           " on node: " + node);
+      this.rmContext.getDispatcher().getEventHandler()
+        .handle(new RMNodeCleanContainerEvent(node.getNodeID(), containerId));
       return;
     }
 
-    application.containerLaunchedOnNode(containerId);
+    application.containerLaunchedOnNode(containerId, node.getNodeID());
   }
 
   @Override
FifoScheduler.java
@@ -65,6 +65,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptS
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue;
@@ -87,6 +88,7 @@ import org.apache.hadoop.yarn.util.BuilderUtils;
 
 @LimitedPrivate("yarn")
 @Evolving
+@SuppressWarnings("unchecked")
 public class FifoScheduler implements ResourceScheduler {
 
   private static final Log LOG = LogFactory.getLog(FifoScheduler.class);
@@ -282,7 +284,6 @@ public class FifoScheduler implements ResourceScheduler {
     return nodes.get(nodeId);
   }
 
-  @SuppressWarnings("unchecked")
   private synchronized void addApplication(ApplicationAttemptId appAttemptId,
       String user) {
     // TODO: Fix store
@@ -655,10 +656,14 @@ public class FifoScheduler implements ResourceScheduler {
       LOG.info("Unknown application: " + applicationAttemptId +
           " launched container " + containerId +
           " on node: " + node);
+      // Some unknown container sneaked into the system. Kill it.
+      this.rmContext.getDispatcher().getEventHandler()
+        .handle(new RMNodeCleanContainerEvent(node.getNodeID(), containerId));
+
       return;
     }
 
-    application.containerLaunchedOnNode(containerId);
+    application.containerLaunchedOnNode(containerId, node.getNodeID());
   }
 
   @Lock(FifoScheduler.class)
MockNM.java
@@ -39,15 +39,17 @@ public class MockNM {
 
   private int responseId;
   private NodeId nodeId;
-  private final String nodeIdStr;
   private final int memory;
   private final ResourceTrackerService resourceTracker;
   private final int httpPort = 2;
 
   MockNM(String nodeIdStr, int memory, ResourceTrackerService resourceTracker) {
-    this.nodeIdStr = nodeIdStr;
     this.memory = memory;
     this.resourceTracker = resourceTracker;
+    String[] splits = nodeIdStr.split(":");
+    nodeId = Records.newRecord(NodeId.class);
+    nodeId.setHost(splits[0]);
+    nodeId.setPort(Integer.parseInt(splits[1]));
   }
 
   public NodeId getNodeId() {
@@ -63,14 +65,10 @@ public class MockNM {
         new HashMap<ApplicationId, List<ContainerStatus>>();
     conts.put(container.getId().getApplicationAttemptId().getApplicationId(),
         Arrays.asList(new ContainerStatus[] { container.getContainerStatus() }));
-    nodeHeartbeat(conts, true,nodeId);
+    nodeHeartbeat(conts, true);
   }
 
   public NodeId registerNode() throws Exception {
-    String[] splits = nodeIdStr.split(":");
-    nodeId = Records.newRecord(NodeId.class);
-    nodeId.setHost(splits[0]);
-    nodeId.setPort(Integer.parseInt(splits[1]));
     RegisterNodeManagerRequest req = Records.newRecord(
         RegisterNodeManagerRequest.class);
     req.setNodeId(nodeId);
@@ -83,11 +81,11 @@ public class MockNM {
   }
 
   public HeartbeatResponse nodeHeartbeat(boolean b) throws Exception {
-    return nodeHeartbeat(new HashMap<ApplicationId, List<ContainerStatus>>(), b,nodeId);
+    return nodeHeartbeat(new HashMap<ApplicationId, List<ContainerStatus>>(), b);
   }
 
   public HeartbeatResponse nodeHeartbeat(Map<ApplicationId,
-      List<ContainerStatus>> conts, boolean isHealthy, NodeId nodeId) throws Exception {
+      List<ContainerStatus>> conts, boolean isHealthy) throws Exception {
     NodeHeartbeatRequest req = Records.newRecord(NodeHeartbeatRequest.class);
     NodeStatus status = Records.newRecord(NodeStatus.class);
     status.setNodeId(nodeId);
MockNodes.java
@@ -152,13 +152,13 @@ public class MockNodes {
     }
 
     @Override
-    public List<ApplicationId> pullAppsToCleanup() {
+    public List<ApplicationId> getAppsToCleanup() {
       // TODO Auto-generated method stub
       return null;
     }
 
     @Override
-    public List<ContainerId> pullContainersToCleanUp() {
+    public List<ContainerId> getContainersToCleanUp() {
       // TODO Auto-generated method stub
       return null;
     }
TestApplicationCleanup.java
@@ -19,26 +19,39 @@
 package org.apache.hadoop.yarn.server.resourcemanager;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import junit.framework.Assert;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.Container;
 import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ContainerState;
+import org.apache.hadoop.yarn.api.records.ContainerStatus;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
+import org.apache.hadoop.yarn.event.Dispatcher;
+import org.apache.hadoop.yarn.event.DrainDispatcher;
+import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.server.api.records.HeartbeatResponse;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
+import org.apache.hadoop.yarn.util.BuilderUtils;
 import org.apache.log4j.Level;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.junit.Test;
-import org.mortbay.log.Log;
 
 public class TestApplicationCleanup {
 
+  private static final Log LOG = LogFactory
+      .getLog(TestApplicationCleanup.class);
+
   @Test
   public void testAppCleanup() throws Exception {
     Logger rootLogger = LogManager.getRootLogger();
@@ -67,11 +80,13 @@ public class TestApplicationCleanup {
     List<Container> conts = am.allocate(new ArrayList<ResourceRequest>(),
         new ArrayList<ContainerId>()).getAllocatedContainers();
     int contReceived = conts.size();
-    while (contReceived < request) {
+    int waitCount = 0;
+    while (contReceived < request && waitCount++ < 20) {
       conts = am.allocate(new ArrayList<ResourceRequest>(),
           new ArrayList<ContainerId>()).getAllocatedContainers();
       contReceived += conts.size();
-      Log.info("Got " + contReceived + " containers. Waiting to get " + request);
+      LOG.info("Got " + contReceived + " containers. Waiting to get "
+          + request);
       Thread.sleep(2000);
     }
     Assert.assertEquals(request, conts.size());
@@ -86,11 +101,12 @@ public class TestApplicationCleanup {
 
     //currently only containers are cleaned via this
     //AM container is cleaned via container launcher
-    while (cleanedConts < 2 || cleanedApps < 1) {
+    waitCount = 0;
+    while ((cleanedConts < 3 || cleanedApps < 1) && waitCount++ < 20) {
       HeartbeatResponse resp = nm1.nodeHeartbeat(true);
       contsToClean = resp.getContainersToCleanupList();
       apps = resp.getApplicationsToCleanupList();
-      Log.info("Waiting to get cleanup events.. cleanedConts: "
+      LOG.info("Waiting to get cleanup events.. cleanedConts: "
           + cleanedConts + " cleanedApps: " + cleanedApps);
       cleanedConts += contsToClean.size();
       cleanedApps += apps.size();
@@ -99,6 +115,130 @@ public class TestApplicationCleanup {
 
     Assert.assertEquals(1, apps.size());
     Assert.assertEquals(app.getApplicationId(), apps.get(0));
+    Assert.assertEquals(1, cleanedApps);
+    Assert.assertEquals(3, cleanedConts);
+
+    rm.stop();
+  }
+
+  @Test
+  public void testContainerCleanup() throws Exception {
+
+    Logger rootLogger = LogManager.getRootLogger();
+    rootLogger.setLevel(Level.DEBUG);
+    final DrainDispatcher dispatcher = new DrainDispatcher();
+    MockRM rm = new MockRM() {
+      @Override
+      protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
+        return new SchedulerEventDispatcher(this.scheduler) {
+          @Override
+          public void handle(SchedulerEvent event) {
+            scheduler.handle(event);
+          }
+        };
+      }
+
+      @Override
+      protected Dispatcher createDispatcher() {
+        return dispatcher;
+      }
+    };
+    rm.start();
+
+    MockNM nm1 = rm.registerNode("h1:1234", 5000);
+
+    RMApp app = rm.submitApp(2000);
+
+    //kick the scheduling
+    nm1.nodeHeartbeat(true);
+
+    RMAppAttempt attempt = app.getCurrentAppAttempt();
+    MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId());
+    am.registerAppAttempt();
+
+    //request for containers
+    int request = 2;
+    am.allocate("h1" , 1000, request,
+        new ArrayList<ContainerId>());
+    dispatcher.await();
+
+    //kick the scheduler
+    nm1.nodeHeartbeat(true);
+    List<Container> conts = am.allocate(new ArrayList<ResourceRequest>(),
+        new ArrayList<ContainerId>()).getAllocatedContainers();
+    int contReceived = conts.size();
+    int waitCount = 0;
+    while (contReceived < request && waitCount++ < 20) {
+      conts = am.allocate(new ArrayList<ResourceRequest>(),
+          new ArrayList<ContainerId>()).getAllocatedContainers();
+      dispatcher.await();
+      contReceived += conts.size();
+      LOG.info("Got " + contReceived + " containers. Waiting to get "
+          + request);
+      Thread.sleep(2000);
+    }
+    Assert.assertEquals(request, conts.size());
+
+    // Release a container.
+    ArrayList<ContainerId> release = new ArrayList<ContainerId>();
+    release.add(conts.get(1).getId());
+    am.allocate(new ArrayList<ResourceRequest>(), release);
+    dispatcher.await();
+
+    // Send one more heartbeat with a fake running container. This is to
+    // simulate the situation that can happen if the NM reports that container
+    // is running in the same heartbeat when the RM asks it to clean it up.
+    Map<ApplicationId, List<ContainerStatus>> containerStatuses =
+        new HashMap<ApplicationId, List<ContainerStatus>>();
+    ArrayList<ContainerStatus> containerStatusList =
+        new ArrayList<ContainerStatus>();
+    containerStatusList.add(BuilderUtils.newContainerStatus(conts.get(1)
+        .getId(), ContainerState.RUNNING, "nothing", 0));
+    containerStatuses.put(app.getApplicationId(), containerStatusList);
+
+    HeartbeatResponse resp = nm1.nodeHeartbeat(containerStatuses, true);
+    dispatcher.await();
+    List<ContainerId> contsToClean = resp.getContainersToCleanupList();
+    int cleanedConts = contsToClean.size();
+    waitCount = 0;
+    while (cleanedConts < 1 && waitCount++ < 20) {
+      resp = nm1.nodeHeartbeat(true);
+      dispatcher.await();
+      contsToClean = resp.getContainersToCleanupList();
+      LOG.info("Waiting to get cleanup events.. cleanedConts: " + cleanedConts);
+      cleanedConts += contsToClean.size();
+      Thread.sleep(1000);
+    }
+    LOG.info("Got cleanup for " + contsToClean.get(0));
+    Assert.assertEquals(1, cleanedConts);
+
+    // Now to test the case when RM already gave cleanup, and NM suddenly
+    // realizes that the container is running.
+    LOG.info("Testing container launch much after release and "
+        + "NM getting cleanup");
+    containerStatuses.clear();
+    containerStatusList.clear();
+    containerStatusList.add(BuilderUtils.newContainerStatus(conts.get(1)
+        .getId(), ContainerState.RUNNING, "nothing", 0));
+    containerStatuses.put(app.getApplicationId(), containerStatusList);
+
+    resp = nm1.nodeHeartbeat(containerStatuses, true);
+    dispatcher.await();
+    contsToClean = resp.getContainersToCleanupList();
+    cleanedConts = contsToClean.size();
+    // The cleanup list won't be instantaneous as it is given out by scheduler
+    // and not RMNodeImpl.
+    waitCount = 0;
+    while (cleanedConts < 1 && waitCount++ < 20) {
+      resp = nm1.nodeHeartbeat(true);
+      dispatcher.await();
+      contsToClean = resp.getContainersToCleanupList();
+      LOG.info("Waiting to get cleanup events.. cleanedConts: " + cleanedConts);
+      cleanedConts += contsToClean.size();
+      Thread.sleep(1000);
+    }
+    LOG.info("Got cleanup for " + contsToClean.get(0));
+    Assert.assertEquals(1, cleanedConts);
+
     rm.stop();
   }
TestResourceTrackerService.java
@@ -164,8 +164,7 @@ public class TestResourceTrackerService {
     Assert.assertTrue(NodeAction.NORMAL.equals(nodeHeartbeat.getNodeAction()));
 
     nodeHeartbeat = nm2.nodeHeartbeat(
-      new HashMap<ApplicationId, List<ContainerStatus>>(), true,
-      recordFactory.newRecordInstance(NodeId.class));
+      new HashMap<ApplicationId, List<ContainerStatus>>(), true);
     Assert.assertTrue(NodeAction.REBOOT.equals(nodeHeartbeat.getNodeAction()));
     checkRebootedNMCount(rm, ++initialMetricCount);
   }