YARN-2249. Avoided AM release requests being lost on work preserving RM restart. Contributed by Jian He.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1618972 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent bf9aa34dea
commit f6a778c372

ResourceSchedulerWrapper.java

@@ -36,8 +36,8 @@ import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
 
-import org.apache.hadoop.classification.InterfaceAudience.Private;
 import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate;
+import org.apache.hadoop.classification.InterfaceAudience.Private;
 import org.apache.hadoop.classification.InterfaceStability.Unstable;
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
@@ -61,6 +61,7 @@ import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.UpdatedContainerInfo;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation;
@@ -933,4 +934,10 @@ final public class ResourceSchedulerWrapper
     return new HashMap<ApplicationId,
         SchedulerApplication<SchedulerApplicationAttempt>>();
   }
+
+  @Override
+  protected void completedContainer(RMContainer rmContainer,
+      ContainerStatus containerStatus, RMContainerEventType event) {
+    // do nothing
+  }
 }

CHANGES.txt

@@ -214,6 +214,9 @@ Release 2.6.0 - UNRELEASED
     YARN-2409. RM ActiveToStandBy transition missing stoping previous rmDispatcher.
     (Rohith via jianhe)
 
+    YARN-2249. Avoided AM release requests being lost on work preserving RM
+    restart. (Jian He via zjshen)
+
 Release 2.5.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES

AbstractYarnScheduler.java

@@ -23,10 +23,14 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.Timer;
+import java.util.TimerTask;
 import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.service.AbstractService;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
@@ -34,18 +38,25 @@ import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport;
 import org.apache.hadoop.yarn.api.records.Container;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerState;
+import org.apache.hadoop.yarn.api.records.ContainerStatus;
 import org.apache.hadoop.yarn.api.records.NodeId;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
+import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
+import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
 import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
+import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppMoveEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
+import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerFinishedEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerRecoverEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
@@ -54,6 +65,7 @@ import org.apache.hadoop.yarn.util.resource.Resources;
 
 import com.google.common.util.concurrent.SettableFuture;
 
+
 @SuppressWarnings("unchecked")
 public abstract class AbstractYarnScheduler
     <T extends SchedulerApplicationAttempt, N extends SchedulerNode>
@@ -72,6 +84,7 @@ public abstract class AbstractYarnScheduler
 
   protected RMContext rmContext;
   protected Map<ApplicationId, SchedulerApplication<T>> applications;
+  protected int nmExpireInterval;
 
   protected final static List<Container> EMPTY_CONTAINER_LIST =
       new ArrayList<Container>();
@@ -87,6 +100,15 @@ public abstract class AbstractYarnScheduler
     super(name);
   }
 
+  @Override
+  public void serviceInit(Configuration conf) throws Exception {
+    nmExpireInterval =
+        conf.getInt(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS,
+          YarnConfiguration.DEFAULT_RM_NM_EXPIRY_INTERVAL_MS);
+    createReleaseCache();
+    super.serviceInit(conf);
+  }
+
   public synchronized List<Container> getTransferredContainers(
       ApplicationAttemptId currentAttempt) {
     ApplicationId appId = currentAttempt.getApplicationId();
@@ -281,6 +303,19 @@ public abstract class AbstractYarnScheduler
         ((RMContainerImpl)rmContainer).setAMContainer(true);
       }
     }
+
+    synchronized (schedulerAttempt) {
+      Set<ContainerId> releases = schedulerAttempt.getPendingRelease();
+      if (releases.contains(container.getContainerId())) {
+        // release the container
+        rmContainer.handle(new RMContainerFinishedEvent(container
+          .getContainerId(), SchedulerUtils.createAbnormalContainerStatus(
+          container.getContainerId(), SchedulerUtils.RELEASED_CONTAINER),
+          RMContainerEventType.RELEASED));
+        releases.remove(container.getContainerId());
+        LOG.info(container.getContainerId() + " is released by application.");
+      }
+    }
   }
 
@@ -320,6 +355,62 @@ public abstract class AbstractYarnScheduler
     }
   }
 
+  protected void createReleaseCache() {
+    // Cleanup the cache after nm expire interval.
+    new Timer().schedule(new TimerTask() {
+      @Override
+      public void run() {
+        for (SchedulerApplication<T> app : applications.values()) {
+
+          T attempt = app.getCurrentAppAttempt();
+          synchronized (attempt) {
+            for (ContainerId containerId : attempt.getPendingRelease()) {
+              RMAuditLogger.logFailure(
+                app.getUser(),
+                AuditConstants.RELEASE_CONTAINER,
+                "Unauthorized access or invalid container",
+                "Scheduler",
+                "Trying to release container not owned by app or with invalid id.",
+                attempt.getApplicationId(), containerId);
+            }
+            attempt.getPendingRelease().clear();
+          }
+        }
+        LOG.info("Release request cache is cleaned up");
+      }
+    }, nmExpireInterval);
+  }
+
+  // clean up a completed container
+  protected abstract void completedContainer(RMContainer rmContainer,
+      ContainerStatus containerStatus, RMContainerEventType event);
+
+  protected void releaseContainers(List<ContainerId> containers,
+      SchedulerApplicationAttempt attempt) {
+    for (ContainerId containerId : containers) {
+      RMContainer rmContainer = getRMContainer(containerId);
+      if (rmContainer == null) {
+        if (System.currentTimeMillis() - ResourceManager.getClusterTimeStamp()
+            < nmExpireInterval) {
+          LOG.info(containerId + " doesn't exist. Add the container"
+              + " to the release request cache as it maybe on recovery.");
+          synchronized (attempt) {
+            attempt.getPendingRelease().add(containerId);
+          }
+        } else {
+          RMAuditLogger.logFailure(attempt.getUser(),
+            AuditConstants.RELEASE_CONTAINER,
+            "Unauthorized access or invalid container", "Scheduler",
+            "Trying to release container not owned by app or with invalid id.",
+            attempt.getApplicationId(), containerId);
+        }
+      }
+      completedContainer(rmContainer,
+        SchedulerUtils.createAbnormalContainerStatus(containerId,
+          SchedulerUtils.RELEASED_CONTAINER), RMContainerEventType.RELEASED);
+    }
+  }
+
   public SchedulerNode getSchedulerNode(NodeId nodeId) {
     return nodes.get(nodeId);
   }

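The createReleaseCache/releaseContainers pair above forms a time-bounded parking lot for release requests: a release aimed at a container the scheduler cannot find yet is held in the attempt's pendingRelease set (but only while the RM is younger than the NM-expiry interval), and a one-shot timer audits and clears whatever is still unclaimed once that interval has passed. A minimal, self-contained sketch of the same pattern follows; the class and member names are illustrative, not part of the patch.

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;

// Illustrative sketch of the time-bounded release cache used above:
// unknown release requests are parked, then audited and dropped after
// one expiry interval. Names here are hypothetical, not YARN's.
public class ReleaseCacheSketch {
  private final Set<String> pendingRelease =
      Collections.synchronizedSet(new HashSet<String>());
  private final long expiryIntervalMs;
  private final long startTime = System.currentTimeMillis();

  public ReleaseCacheSketch(long expiryIntervalMs) {
    this.expiryIntervalMs = expiryIntervalMs;
    // One-shot cleanup, mirroring createReleaseCache() in the patch.
    new Timer(true).schedule(new TimerTask() {
      @Override
      public void run() {
        // Anything still pending was never reported by a node: drop it.
        pendingRelease.clear();
        System.out.println("Release request cache is cleaned up");
      }
    }, expiryIntervalMs);
  }

  public void release(String containerId, boolean knownToScheduler) {
    if (knownToScheduler) {
      System.out.println(containerId + " released immediately");
    } else if (System.currentTimeMillis() - startTime < expiryIntervalMs) {
      // Container may still be mid-recovery: remember the request.
      pendingRelease.add(containerId);
    } else {
      System.out.println("Invalid release of " + containerId);
    }
  }

  public static void main(String[] args) throws InterruptedException {
    ReleaseCacheSketch cache = new ReleaseCacheSketch(1000);
    cache.release("container_1", false); // parked
    Thread.sleep(1500);                  // cache cleaned by now
  }
}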
SchedulerApplicationAttempt.java

@@ -17,13 +17,14 @@
  */
 package org.apache.hadoop.yarn.server.resourcemanager.scheduler;
 
-import com.google.common.base.Preconditions;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -41,7 +42,6 @@ import org.apache.hadoop.yarn.api.records.NodeId;
 import org.apache.hadoop.yarn.api.records.Priority;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.ResourceRequest;
-import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
 import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
@@ -53,6 +53,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerStat
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeCleanContainerEvent;
 import org.apache.hadoop.yarn.util.resource.Resources;
 
+import com.google.common.base.Preconditions;
 import com.google.common.collect.HashMultiset;
 import com.google.common.collect.Multiset;
 
@@ -87,6 +88,13 @@ public class SchedulerApplicationAttempt {
   protected List<RMContainer> newlyAllocatedContainers =
       new ArrayList<RMContainer>();
 
+  // This pendingRelease is used in work-preserving recovery scenario to keep
+  // track of the AM's outstanding release requests. RM on recovery could
+  // receive the release request form AM before it receives the container status
+  // from NM for recovery. In this case, the to-be-recovered containers reported
+  // by NM should not be recovered.
+  private Set<ContainerId> pendingRelease = null;
+
   /**
    * Count how many times the application has been given an opportunity
    * to schedule a task at each priority. Each time the scheduler
@@ -114,7 +122,7 @@ public class SchedulerApplicationAttempt {
         new AppSchedulingInfo(applicationAttemptId, user, queue,
             activeUsersManager, rmContext.getEpoch());
     this.queue = queue;
+    this.pendingRelease = new HashSet<ContainerId>();
     if (rmContext.getRMApps() != null &&
         rmContext.getRMApps()
             .containsKey(applicationAttemptId.getApplicationId())) {
@@ -163,6 +171,10 @@ public class SchedulerApplicationAttempt {
     return appSchedulingInfo.getResourceRequests(priority);
   }
 
+  public Set<ContainerId> getPendingRelease() {
+    return this.pendingRelease;
+  }
+
   public int getNewContainerId() {
     return appSchedulingInfo.getNewContainerId();
   }

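The comment on pendingRelease above names the race this field closes: after a restart, an AM's release request can reach the RM before the NM has reported the container it refers to, so the recovery path (the synchronized block added to AbstractYarnScheduler above) consults the set when a container is reported and immediately finishes any container the AM has already let go. A condensed, illustrative sketch of that consult-and-drop check; the class name and simplified String types are hypothetical, not the patch's.

import java.util.HashSet;
import java.util.Set;

// Condensed sketch of the recovery-time check pendingRelease enables:
// a reported container only stays recovered if the AM has not already
// asked for its release. Hypothetical class, simplified types.
public class RecoveryCheckSketch {
  private final Set<String> pendingRelease = new HashSet<String>();

  // An AM release that arrived before the node reported the container.
  public void releaseBeforeReport(String containerId) {
    pendingRelease.add(containerId);
  }

  // Returns true if the reported container should remain live; a pending
  // release means it is finished right away instead.
  public boolean onContainerReported(String containerId) {
    if (pendingRelease.remove(containerId)) {
      System.out.println(containerId + " is released by application.");
      return false;
    }
    return true;
  }

  public static void main(String[] args) {
    RecoveryCheckSketch s = new RecoveryCheckSketch();
    s.releaseBeforeReport("container_2");
    System.out.println(s.onContainerReported("container_2")); // false
    System.out.println(s.onContainerReported("container_3")); // true
  }
}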
CapacityScheduler.java

@@ -54,8 +54,6 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
-import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
-import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
 import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.*;
@@ -199,7 +197,7 @@ public class CapacityScheduler extends
   private static final long DEFAULT_ASYNC_SCHEDULER_INTERVAL = 5;
 
   private boolean overrideWithQueueMappings = false;
-  private List<QueueMapping> mappings = new ArrayList<QueueMapping>();
+  private List<QueueMapping> mappings = null;
   private Groups groups;
 
   @VisibleForTesting
@@ -789,21 +787,7 @@ public class CapacityScheduler extends
         getMinimumResourceCapability(), maximumAllocation);
 
     // Release containers
-    for (ContainerId releasedContainerId : release) {
-      RMContainer rmContainer = getRMContainer(releasedContainerId);
-      if (rmContainer == null) {
-        RMAuditLogger.logFailure(application.getUser(),
-          AuditConstants.RELEASE_CONTAINER,
-          "Unauthorized access or invalid container", "CapacityScheduler",
-          "Trying to release container not owned by app or with invalid id",
-          application.getApplicationId(), releasedContainerId);
-      }
-      completedContainer(rmContainer,
-        SchedulerUtils.createAbnormalContainerStatus(
-          releasedContainerId,
-          SchedulerUtils.RELEASED_CONTAINER),
-        RMContainerEventType.RELEASED);
-    }
+    releaseContainers(release, application);
 
     synchronized (application) {
@@ -1098,7 +1082,8 @@ public class CapacityScheduler extends
   }
 
   @Lock(CapacityScheduler.class)
-  private synchronized void completedContainer(RMContainer rmContainer,
+  @Override
+  protected synchronized void completedContainer(RMContainer rmContainer,
       ContainerStatus containerStatus, RMContainerEventType event) {
     if (rmContainer == null) {
       LOG.info("Null container completed...");

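This hunk, and the matching ones in FairScheduler and FifoScheduler below, replace three near-identical release loops with the single releaseContainers() added to AbstractYarnScheduler, turning completedContainer() into a protected abstract hook that each scheduler overrides — a template method. A stripped-down sketch of the resulting shape; the signatures are simplified and the class names are illustrative, not the patch's.

import java.util.Arrays;

// Stripped-down shape of the refactor: the shared release loop lives in
// the abstract base, and each concrete scheduler supplies only its own
// completion logic by overriding the protected hook.
abstract class SchedulerBase {
  // Hook each scheduler overrides, mirroring completedContainer() above.
  protected abstract void completedContainer(String containerId);

  // Shared loop, mirroring releaseContainers() in AbstractYarnScheduler.
  protected void releaseContainers(Iterable<String> containers) {
    for (String id : containers) {
      completedContainer(id); // common audit/release handling lives here
    }
  }
}

class FifoLikeScheduler extends SchedulerBase {
  @Override
  protected void completedContainer(String containerId) {
    System.out.println(containerId + " completed (RELEASED)");
  }

  public static void main(String[] args) {
    new FifoLikeScheduler().releaseContainers(Arrays.asList("c1", "c2"));
  }
}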
FairScheduler.java

@@ -49,8 +49,6 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
-import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
-import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
 import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
 import org.apache.hadoop.yarn.server.resourcemanager.resource.ResourceWeights;
@@ -810,7 +808,8 @@ public class FairScheduler extends
   /**
    * Clean up a completed container.
    */
-  private synchronized void completedContainer(RMContainer rmContainer,
+  @Override
+  protected synchronized void completedContainer(RMContainer rmContainer,
       ContainerStatus containerStatus, RMContainerEventType event) {
     if (rmContainer == null) {
       LOG.info("Null container completed...");
@@ -913,21 +912,7 @@ public class FairScheduler extends
   }
 
   // Release containers
-    for (ContainerId releasedContainerId : release) {
-      RMContainer rmContainer = getRMContainer(releasedContainerId);
-      if (rmContainer == null) {
-        RMAuditLogger.logFailure(application.getUser(),
-          AuditConstants.RELEASE_CONTAINER,
-          "Unauthorized access or invalid container", "FairScheduler",
-          "Trying to release container not owned by app or with invalid id",
-          application.getApplicationId(), releasedContainerId);
-      }
-      completedContainer(rmContainer,
-        SchedulerUtils.createAbnormalContainerStatus(
-          releasedContainerId,
-          SchedulerUtils.RELEASED_CONTAINER),
-        RMContainerEventType.RELEASED);
-    }
+    releaseContainers(release, application);
 
     synchronized (application) {
       if (!ask.isEmpty()) {

FifoScheduler.java

@@ -52,8 +52,6 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
 import org.apache.hadoop.yarn.factories.RecordFactory;
 import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
-import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger;
-import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
 import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
@@ -89,7 +87,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSc
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
-
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 import org.apache.hadoop.yarn.server.utils.Lock;
 import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
@@ -295,21 +292,7 @@ public class FifoScheduler extends
         clusterResource, minimumAllocation, maximumAllocation);
 
     // Release containers
-    for (ContainerId releasedContainer : release) {
-      RMContainer rmContainer = getRMContainer(releasedContainer);
-      if (rmContainer == null) {
-        RMAuditLogger.logFailure(application.getUser(),
-          AuditConstants.RELEASE_CONTAINER,
-          "Unauthorized access or invalid container", "FifoScheduler",
-          "Trying to release container not owned by app or with invalid id",
-          application.getApplicationId(), releasedContainer);
-      }
-      containerCompleted(rmContainer,
-        SchedulerUtils.createAbnormalContainerStatus(
-          releasedContainer,
-          SchedulerUtils.RELEASED_CONTAINER),
-        RMContainerEventType.RELEASED);
-    }
+    releaseContainers(release, application);
 
     synchronized (application) {
@@ -443,7 +426,7 @@ public class FifoScheduler extends
         LOG.info("Skip killing " + container.getContainerId());
         continue;
       }
-      containerCompleted(container,
+      completedContainer(container,
         SchedulerUtils.createAbnormalContainerStatus(
           container.getContainerId(), SchedulerUtils.COMPLETED_APPLICATION),
         RMContainerEventType.KILL);
@@ -717,7 +700,7 @@ public class FifoScheduler extends
     for (ContainerStatus completedContainer : completedContainers) {
       ContainerId containerId = completedContainer.getContainerId();
       LOG.debug("Container FINISHED: " + containerId);
-      containerCompleted(getRMContainer(containerId),
+      completedContainer(getRMContainer(containerId),
           completedContainer, RMContainerEventType.FINISHED);
     }
 
@@ -818,7 +801,7 @@ public class FifoScheduler extends
     ContainerExpiredSchedulerEvent containerExpiredEvent =
         (ContainerExpiredSchedulerEvent) event;
     ContainerId containerid = containerExpiredEvent.getContainerId();
-    containerCompleted(getRMContainer(containerid),
+    completedContainer(getRMContainer(containerid),
         SchedulerUtils.createAbnormalContainerStatus(
             containerid,
             SchedulerUtils.EXPIRED_CONTAINER),
@@ -831,7 +814,8 @@ public class FifoScheduler extends
   }
 
   @Lock(FifoScheduler.class)
-  private synchronized void containerCompleted(RMContainer rmContainer,
+  @Override
+  protected synchronized void completedContainer(RMContainer rmContainer,
       ContainerStatus containerStatus, RMContainerEventType event) {
     if (rmContainer == null) {
       LOG.info("Null container completed...");
@@ -881,7 +865,7 @@ public class FifoScheduler extends
     }
     // Kill running containers
     for(RMContainer container : node.getRunningContainers()) {
-      containerCompleted(container,
+      completedContainer(container,
         SchedulerUtils.createAbnormalContainerStatus(
           container.getContainerId(),
           SchedulerUtils.LOST_CONTAINER),

MockAM.java

@@ -49,7 +49,7 @@ public class MockAM {
 
   private volatile int responseId = 0;
   private final ApplicationAttemptId attemptId;
-  private final RMContext context;
+  private RMContext context;
   private ApplicationMasterProtocol amRMProtocol;
 
   private final List<ResourceRequest> requests = new ArrayList<ResourceRequest>();
@@ -62,7 +62,9 @@ public class MockAM {
     this.attemptId = attemptId;
   }
 
-  void setAMRMProtocol(ApplicationMasterProtocol amRMProtocol) {
+  public void setAMRMProtocol(ApplicationMasterProtocol amRMProtocol,
+      RMContext context) {
+    this.context = context;
     this.amRMProtocol = amRMProtocol;
   }
 

TestApplicationMasterService.java

@@ -171,7 +171,6 @@ public class TestApplicationMasterService {
     RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
     MockAM am1 = rm.sendAMLaunched(attempt1.getAppAttemptId());
     am1.registerAppAttempt();
-    am1.setAMRMProtocol(rm.getApplicationMasterService());
 
     AllocateRequestPBImpl allocateRequest = new AllocateRequestPBImpl();
     List<ContainerId> release = new ArrayList<ContainerId>();

TestRMRestart.java

@@ -289,7 +289,7 @@ public class TestRMRestart {
 
     // verify old AM is not accepted
     // change running AM to talk to new RM
-    am1.setAMRMProtocol(rm2.getApplicationMasterService());
+    am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
     AllocateResponse allocResponse = am1.allocate(
         new ArrayList<ResourceRequest>(),
         new ArrayList<ContainerId>());
@@ -1663,7 +1663,7 @@ public class TestRMRestart {
     nm1.setResourceTrackerService(rm2.getResourceTrackerService());
     // recover app
     RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
-    am1.setAMRMProtocol(rm2.getApplicationMasterService());
+    am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
     am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>());
     nm1.nodeHeartbeat(true);
     nm1 = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());

TestWorkPreservingRMRestart.java

@@ -33,10 +33,13 @@ import java.util.Set;
 
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerState;
+import org.apache.hadoop.yarn.api.records.ContainerStatus;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
@@ -72,6 +75,9 @@ import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
+import com.google.common.base.Supplier;
+
+
 @SuppressWarnings({"rawtypes", "unchecked"})
 @RunWith(value = Parameterized.class)
 public class TestWorkPreservingRMRestart {
@@ -572,8 +578,8 @@ public class TestWorkPreservingRMRestart {
     rm2.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
     rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.LAUNCHED);
 
-    am0.setAMRMProtocol(rm2.getApplicationMasterService());
-    am0.registerAppAttempt(false);
+    am0.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
+    am0.registerAppAttempt(true);
 
     rm2.waitForState(app0.getApplicationId(), RMAppState.RUNNING);
     rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
@@ -646,6 +652,69 @@ public class TestWorkPreservingRMRestart {
     waitForNumContainersToRecover(2, rm2, am0.getApplicationAttemptId());
   }
 
+  // Test if RM on recovery receives the container release request from AM
+  // before it receives the container status reported by NM for recovery. this
+  // container should not be recovered.
+  @Test (timeout = 30000)
+  public void testReleasedContainerNotRecovered() throws Exception {
+    MemoryRMStateStore memStore = new MemoryRMStateStore();
+    memStore.init(conf);
+    rm1 = new MockRM(conf, memStore);
+    MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
+    nm1.registerNode();
+    rm1.start();
+
+    RMApp app1 = rm1.submitApp(1024);
+    final MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+    // Re-start RM
+    conf.setInt(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS, 8000);
+    rm2 = new MockRM(conf, memStore);
+    rm2.start();
+    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
+    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
+    am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
+    am1.registerAppAttempt(true);
+
+    // try to release a container before the container is actually recovered.
+    final ContainerId runningContainer =
+        ContainerId.newInstance(am1.getApplicationAttemptId(), 2);
+    am1.allocate(null, Arrays.asList(runningContainer));
+
+    // send container statuses to recover the containers
+    List<NMContainerStatus> containerStatuses =
+        createNMContainerStatusForApp(am1);
+    nm1.registerNode(containerStatuses, null);
+
+    // only the am container should be recovered.
+    waitForNumContainersToRecover(1, rm2, am1.getApplicationAttemptId());
+
+    final AbstractYarnScheduler scheduler =
+        (AbstractYarnScheduler) rm2.getResourceScheduler();
+    // cached release request is cleaned.
+    // assertFalse(scheduler.getPendingRelease().contains(runningContainer));
+
+    AllocateResponse response = am1.allocate(null, null);
+    // AM gets notified of the completed container.
+    boolean receivedCompletedContainer = false;
+    for (ContainerStatus status : response.getCompletedContainersStatuses()) {
+      if (status.getContainerId().equals(runningContainer)) {
+        receivedCompletedContainer = true;
+      }
+    }
+    assertTrue(receivedCompletedContainer);
+
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      public Boolean get() {
+        // release cache is cleaned up and previous running container is not
+        // recovered
+        return scheduler.getApplicationAttempt(am1.getApplicationAttemptId())
+          .getPendingRelease().isEmpty()
+          && scheduler.getRMContainer(runningContainer) == null;
+      }
+    }, 1000, 20000);
+  }
+
   private void asserteMetrics(QueueMetrics qm, int appsSubmitted,
       int appsPending, int appsRunning, int appsCompleted,
       int allocatedContainers, int availableMB, int availableVirtualCores,
@@ -661,7 +730,7 @@ public class TestWorkPreservingRMRestart {
     assertEquals(allocatedVirtualCores, qm.getAllocatedVirtualCores());
   }
 
-  private void waitForNumContainersToRecover(int num, MockRM rm,
+  public static void waitForNumContainersToRecover(int num, MockRM rm,
       ApplicationAttemptId attemptId) throws Exception {
     AbstractYarnScheduler scheduler =
         (AbstractYarnScheduler) rm.getResourceScheduler();
@@ -674,7 +743,9 @@ public class TestWorkPreservingRMRestart {
       attempt = scheduler.getApplicationAttempt(attemptId);
     }
     while (attempt.getLiveContainers().size() < num) {
-      System.out.println("Wait for " + num + " containers to recover.");
+      System.out.println("Wait for " + num
+        + " containers to recover. currently: "
+        + attempt.getLiveContainers().size());
      Thread.sleep(200);
     }
   }

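The new test relies on GenericTestUtils.waitFor to poll until the release cache drains and the released container stays unrecovered. Assuming only that the helper repeatedly evaluates a boolean-returning check at a fixed interval until it holds or a deadline passes, a dependency-free equivalent of that polling loop looks like the sketch below; the names and timings are illustrative.

import java.util.concurrent.Callable;

// Dependency-free stand-in for the polling helper used by the test above:
// evaluate the check every checkEveryMs until it holds, failing if the
// overall deadline passes first.
public class WaitForSketch {
  static void waitFor(Callable<Boolean> check, long checkEveryMs,
      long waitForMs) throws Exception {
    long deadline = System.currentTimeMillis() + waitForMs;
    while (!check.call()) {
      if (System.currentTimeMillis() > deadline) {
        throw new AssertionError("Timed out waiting for condition");
      }
      Thread.sleep(checkEveryMs);
    }
  }

  public static void main(String[] args) throws Exception {
    final long start = System.currentTimeMillis();
    // Becomes true after roughly two seconds, like a draining release cache.
    waitFor(new Callable<Boolean>() {
      public Boolean call() {
        return System.currentTimeMillis() - start > 2000;
      }
    }, 200, 10000);
    System.out.println("condition met");
  }
}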