YARN-5216. Expose configurable preemption policy for OPPORTUNISTIC containers running on the NM. (Hitesh Sharma via asuresh)
(cherry picked from commit 4f8194430f
)
This commit is contained in:
parent
1af9f962c9
commit
52bf458616
|
@ -977,6 +977,15 @@ public class YarnConfiguration extends Configuration {
|
||||||
NM_PREFIX + "container-retry-minimum-interval-ms";
|
NM_PREFIX + "container-retry-minimum-interval-ms";
|
||||||
public static final int DEFAULT_NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS = 1000;
|
public static final int DEFAULT_NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS = 1000;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use container pause as the preemption policy over kill in the container
|
||||||
|
* queue at a NodeManager.
|
||||||
|
**/
|
||||||
|
public static final String NM_CONTAINER_QUEUING_USE_PAUSE_FOR_PREEMPTION =
|
||||||
|
NM_PREFIX + "opportunistic-containers-use-pause-for-preemption";
|
||||||
|
public static final boolean
|
||||||
|
DEFAULT_NM_CONTAINER_QUEUING_USE_PAUSE_FOR_PREEMPTION = false;
|
||||||
|
|
||||||
/** Interval at which the delayed token removal thread runs */
|
/** Interval at which the delayed token removal thread runs */
|
||||||
public static final String RM_DELAYED_DELEGATION_TOKEN_REMOVAL_INTERVAL_MS =
|
public static final String RM_DELAYED_DELEGATION_TOKEN_REMOVAL_INTERVAL_MS =
|
||||||
RM_PREFIX + "delayed.delegation-token.removal-interval-ms";
|
RM_PREFIX + "delayed.delegation-token.removal-interval-ms";
|
||||||
|
|
|
@ -2836,6 +2836,15 @@
|
||||||
<value>100</value>
|
<value>100</value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>
|
||||||
|
Use container pause as the preemption policy over kill in the container
|
||||||
|
queue at a NodeManager.
|
||||||
|
</description>
|
||||||
|
<name>yarn.nodemanager.opportunistic-containers-use-pause-for-preemption</name>
|
||||||
|
<value>false</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>
|
<description>
|
||||||
Error filename pattern, to identify the file in the container's
|
Error filename pattern, to identify the file in the container's
|
||||||
|
|
|
@ -91,4 +91,6 @@ public interface Container extends EventHandler<ContainerEvent> {
|
||||||
void sendKillEvent(int exitStatus, String description);
|
void sendKillEvent(int exitStatus, String description);
|
||||||
|
|
||||||
boolean isRecovering();
|
boolean isRecovering();
|
||||||
|
|
||||||
|
void sendPauseEvent(String description);
|
||||||
}
|
}
|
||||||
|
|
|
@ -793,15 +793,22 @@ public class ContainerImpl implements Container {
|
||||||
@SuppressWarnings("unchecked") // dispatcher not typed
|
@SuppressWarnings("unchecked") // dispatcher not typed
|
||||||
@Override
|
@Override
|
||||||
public void sendLaunchEvent() {
|
public void sendLaunchEvent() {
|
||||||
ContainersLauncherEventType launcherEvent =
|
if (ContainerState.PAUSED == getContainerState()) {
|
||||||
ContainersLauncherEventType.LAUNCH_CONTAINER;
|
dispatcher.getEventHandler().handle(
|
||||||
if (recoveredStatus == RecoveredContainerStatus.LAUNCHED) {
|
new ContainerResumeEvent(containerId,
|
||||||
// try to recover a container that was previously launched
|
"Container Resumed as some resources freed up"));
|
||||||
launcherEvent = ContainersLauncherEventType.RECOVER_CONTAINER;
|
} else {
|
||||||
|
ContainersLauncherEventType launcherEvent =
|
||||||
|
ContainersLauncherEventType.LAUNCH_CONTAINER;
|
||||||
|
if (recoveredStatus == RecoveredContainerStatus.LAUNCHED) {
|
||||||
|
// try to recover a container that was previously launched
|
||||||
|
launcherEvent = ContainersLauncherEventType.RECOVER_CONTAINER;
|
||||||
|
}
|
||||||
|
containerLaunchStartTime = clock.getTime();
|
||||||
|
dispatcher.getEventHandler().handle(
|
||||||
|
new ContainersLauncherEvent(this, launcherEvent));
|
||||||
}
|
}
|
||||||
containerLaunchStartTime = clock.getTime();
|
|
||||||
dispatcher.getEventHandler().handle(
|
|
||||||
new ContainersLauncherEvent(this, launcherEvent));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("unchecked") // dispatcher not typed
|
@SuppressWarnings("unchecked") // dispatcher not typed
|
||||||
|
@ -820,6 +827,13 @@ public class ContainerImpl implements Container {
|
||||||
new ContainerKillEvent(containerId, exitStatus, description));
|
new ContainerKillEvent(containerId, exitStatus, description));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked") // dispatcher not typed
|
||||||
|
@Override
|
||||||
|
public void sendPauseEvent(String description) {
|
||||||
|
dispatcher.getEventHandler().handle(
|
||||||
|
new ContainerPauseEvent(containerId, description));
|
||||||
|
}
|
||||||
|
|
||||||
@SuppressWarnings("unchecked") // dispatcher not typed
|
@SuppressWarnings("unchecked") // dispatcher not typed
|
||||||
private void sendRelaunchEvent() {
|
private void sendRelaunchEvent() {
|
||||||
ContainersLauncherEventType launcherEvent =
|
ContainersLauncherEventType launcherEvent =
|
||||||
|
@ -1775,7 +1789,7 @@ public class ContainerImpl implements Container {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Transitions upon receiving PAUSE_CONTAINER.
|
* Transitions upon receiving PAUSE_CONTAINER.
|
||||||
* - RUNNING -> PAUSED
|
* - RUNNING -> PAUSING
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("unchecked") // dispatcher not typed
|
@SuppressWarnings("unchecked") // dispatcher not typed
|
||||||
static class PauseContainerTransition implements
|
static class PauseContainerTransition implements
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
package org.apache.hadoop.yarn.server.nodemanager.containermanager.scheduler;
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.scheduler;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.AbstractService;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
|
@ -34,6 +35,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
|
||||||
|
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor
|
||||||
.ChangeMonitoringContainerResourceEvent;
|
.ChangeMonitoringContainerResourceEvent;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
|
||||||
|
|
||||||
|
|
||||||
|
@ -74,7 +76,7 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
queuedOpportunisticContainers = new LinkedHashMap<>();
|
queuedOpportunisticContainers = new LinkedHashMap<>();
|
||||||
|
|
||||||
// Used to keep track of containers that have been marked to be killed
|
// Used to keep track of containers that have been marked to be killed
|
||||||
// to make room for a guaranteed container.
|
// or paused to make room for a guaranteed container.
|
||||||
private final Map<ContainerId, Container> oppContainersToKill =
|
private final Map<ContainerId, Container> oppContainersToKill =
|
||||||
new HashMap<>();
|
new HashMap<>();
|
||||||
|
|
||||||
|
@ -98,6 +100,8 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
private final AsyncDispatcher dispatcher;
|
private final AsyncDispatcher dispatcher;
|
||||||
private final NodeManagerMetrics metrics;
|
private final NodeManagerMetrics metrics;
|
||||||
|
|
||||||
|
private Boolean usePauseEventForPreemption = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiate a Container Scheduler.
|
* Instantiate a Container Scheduler.
|
||||||
* @param context NodeManager Context.
|
* @param context NodeManager Context.
|
||||||
|
@ -112,6 +116,17 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
DEFAULT_NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH));
|
DEFAULT_NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void serviceInit(Configuration conf) throws Exception {
|
||||||
|
super.serviceInit(conf);
|
||||||
|
this.usePauseEventForPreemption =
|
||||||
|
conf.getBoolean(
|
||||||
|
YarnConfiguration.NM_CONTAINER_QUEUING_USE_PAUSE_FOR_PREEMPTION,
|
||||||
|
YarnConfiguration.
|
||||||
|
DEFAULT_NM_CONTAINER_QUEUING_USE_PAUSE_FOR_PREEMPTION);
|
||||||
|
}
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public ContainerScheduler(Context context, AsyncDispatcher dispatcher,
|
public ContainerScheduler(Context context, AsyncDispatcher dispatcher,
|
||||||
NodeManagerMetrics metrics, int qLength) {
|
NodeManagerMetrics metrics, int qLength) {
|
||||||
|
@ -136,8 +151,9 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
case SCHEDULE_CONTAINER:
|
case SCHEDULE_CONTAINER:
|
||||||
scheduleContainer(event.getContainer());
|
scheduleContainer(event.getContainer());
|
||||||
break;
|
break;
|
||||||
|
case CONTAINER_PAUSED:
|
||||||
case CONTAINER_COMPLETED:
|
case CONTAINER_COMPLETED:
|
||||||
onContainerCompleted(event.getContainer());
|
onResourcesReclaimed(event.getContainer());
|
||||||
break;
|
break;
|
||||||
case UPDATE_CONTAINER:
|
case UPDATE_CONTAINER:
|
||||||
if (event instanceof UpdateContainerSchedulerEvent) {
|
if (event instanceof UpdateContainerSchedulerEvent) {
|
||||||
|
@ -203,9 +219,9 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
queuedGuaranteedContainers.put(containerId,
|
queuedGuaranteedContainers.put(containerId,
|
||||||
updateEvent.getContainer());
|
updateEvent.getContainer());
|
||||||
}
|
}
|
||||||
//Kill opportunistic containers if any to make room for
|
//Kill/pause opportunistic containers if any to make room for
|
||||||
// promotion request
|
// promotion request
|
||||||
killOpportunisticContainers(updateEvent.getContainer());
|
reclaimOpportunisticContainerResources(updateEvent.getContainer());
|
||||||
} else {
|
} else {
|
||||||
// Demotion of queued container.. Should not happen too often
|
// Demotion of queued container.. Should not happen too often
|
||||||
// since you should not find too many queued guaranteed
|
// since you should not find too many queued guaranteed
|
||||||
|
@ -243,6 +259,12 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
return this.runningContainers.size();
|
return this.runningContainers.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public void setUsePauseEventForPreemption(
|
||||||
|
boolean usePauseEventForPreemption) {
|
||||||
|
this.usePauseEventForPreemption = usePauseEventForPreemption;
|
||||||
|
}
|
||||||
|
|
||||||
public OpportunisticContainersStatus getOpportunisticContainersStatus() {
|
public OpportunisticContainersStatus getOpportunisticContainersStatus() {
|
||||||
this.opportunisticContainersStatus.setQueuedOpportContainers(
|
this.opportunisticContainersStatus.setQueuedOpportContainers(
|
||||||
getNumQueuedOpportunisticContainers());
|
getNumQueuedOpportunisticContainers());
|
||||||
|
@ -257,7 +279,7 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
return this.opportunisticContainersStatus;
|
return this.opportunisticContainersStatus;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void onContainerCompleted(Container container) {
|
private void onResourcesReclaimed(Container container) {
|
||||||
oppContainersToKill.remove(container.getContainerId());
|
oppContainersToKill.remove(container.getContainerId());
|
||||||
|
|
||||||
// This could be killed externally for eg. by the ContainerManager,
|
// This could be killed externally for eg. by the ContainerManager,
|
||||||
|
@ -292,6 +314,23 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
// Start pending guaranteed containers, if resources available.
|
// Start pending guaranteed containers, if resources available.
|
||||||
boolean resourcesAvailable = startContainers(
|
boolean resourcesAvailable = startContainers(
|
||||||
queuedGuaranteedContainers.values(), forceStartGuaranteedContaieners);
|
queuedGuaranteedContainers.values(), forceStartGuaranteedContaieners);
|
||||||
|
// Resume opportunistic containers, if resource available.
|
||||||
|
if (resourcesAvailable) {
|
||||||
|
List<Container> pausedContainers = new ArrayList<Container>();
|
||||||
|
Map<ContainerId, Container> containers =
|
||||||
|
context.getContainers();
|
||||||
|
for (Map.Entry<ContainerId, Container>entry : containers.entrySet()) {
|
||||||
|
ContainerId contId = entry.getKey();
|
||||||
|
// Find containers that were not already started and are in paused state
|
||||||
|
if(false == runningContainers.containsKey(contId)) {
|
||||||
|
if(containers.get(contId).getContainerState()
|
||||||
|
== ContainerState.PAUSED) {
|
||||||
|
pausedContainers.add(containers.get(contId));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
resourcesAvailable = startContainers(pausedContainers, false);
|
||||||
|
}
|
||||||
// Start opportunistic containers, if resources available.
|
// Start opportunistic containers, if resources available.
|
||||||
if (resourcesAvailable) {
|
if (resourcesAvailable) {
|
||||||
startContainers(queuedOpportunisticContainers.values(), false);
|
startContainers(queuedOpportunisticContainers.values(), false);
|
||||||
|
@ -395,7 +434,7 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
// if the guaranteed container is queued, we need to preempt opportunistic
|
// if the guaranteed container is queued, we need to preempt opportunistic
|
||||||
// containers for make room for it
|
// containers for make room for it
|
||||||
if (queuedGuaranteedContainers.containsKey(container.getContainerId())) {
|
if (queuedGuaranteedContainers.containsKey(container.getContainerId())) {
|
||||||
killOpportunisticContainers(container);
|
reclaimOpportunisticContainerResources(container);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Given an opportunistic container, we first try to start as many queuing
|
// Given an opportunistic container, we first try to start as many queuing
|
||||||
|
@ -413,19 +452,30 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void killOpportunisticContainers(Container container) {
|
@SuppressWarnings("unchecked")
|
||||||
List<Container> extraOpportContainersToKill =
|
private void reclaimOpportunisticContainerResources(Container container) {
|
||||||
pickOpportunisticContainersToKill(container.getContainerId());
|
List<Container> extraOppContainersToReclaim =
|
||||||
|
pickOpportunisticContainersToReclaimResources(
|
||||||
|
container.getContainerId());
|
||||||
// Kill the opportunistic containers that were chosen.
|
// Kill the opportunistic containers that were chosen.
|
||||||
for (Container contToKill : extraOpportContainersToKill) {
|
for (Container contToReclaim : extraOppContainersToReclaim) {
|
||||||
contToKill.sendKillEvent(
|
String preemptionAction = usePauseEventForPreemption == true ? "paused" :
|
||||||
ContainerExitStatus.KILLED_BY_CONTAINER_SCHEDULER,
|
"resumed";
|
||||||
"Container Killed to make room for Guaranteed Container.");
|
|
||||||
oppContainersToKill.put(contToKill.getContainerId(), contToKill);
|
|
||||||
LOG.info(
|
LOG.info(
|
||||||
"Opportunistic container {} will be killed in order to start the "
|
"Container {} will be {} to start the "
|
||||||
+ "execution of guaranteed container {}.",
|
+ "execution of guaranteed container {}.",
|
||||||
contToKill.getContainerId(), container.getContainerId());
|
contToReclaim.getContainerId(), preemptionAction,
|
||||||
|
container.getContainerId());
|
||||||
|
|
||||||
|
if (usePauseEventForPreemption) {
|
||||||
|
contToReclaim.sendPauseEvent(
|
||||||
|
"Container Paused to make room for Guaranteed Container");
|
||||||
|
} else {
|
||||||
|
contToReclaim.sendKillEvent(
|
||||||
|
ContainerExitStatus.KILLED_BY_CONTAINER_SCHEDULER,
|
||||||
|
"Container Killed to make room for Guaranteed Container.");
|
||||||
|
}
|
||||||
|
oppContainersToKill.put(contToReclaim.getContainerId(), contToReclaim);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -440,7 +490,7 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
container.sendLaunchEvent();
|
container.sendLaunchEvent();
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Container> pickOpportunisticContainersToKill(
|
private List<Container> pickOpportunisticContainersToReclaimResources(
|
||||||
ContainerId containerToStartId) {
|
ContainerId containerToStartId) {
|
||||||
// The opportunistic containers that need to be killed for the
|
// The opportunistic containers that need to be killed for the
|
||||||
// given container to start.
|
// given container to start.
|
||||||
|
|
|
@ -23,6 +23,8 @@ import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.ConcurrentMap;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||||
import org.apache.hadoop.security.UserGroupInformation;
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
|
@ -49,6 +51,7 @@ import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
|
import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
|
||||||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl;
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl;
|
||||||
|
@ -124,18 +127,38 @@ public class TestContainerSchedulerQueuing extends BaseContainerManagerTest {
|
||||||
@Override
|
@Override
|
||||||
protected ContainerExecutor createContainerExecutor() {
|
protected ContainerExecutor createContainerExecutor() {
|
||||||
DefaultContainerExecutor exec = new DefaultContainerExecutor() {
|
DefaultContainerExecutor exec = new DefaultContainerExecutor() {
|
||||||
|
ConcurrentMap<String, Boolean> oversleepMap =
|
||||||
|
new ConcurrentHashMap<String, Boolean>();
|
||||||
@Override
|
@Override
|
||||||
public int launchContainer(ContainerStartContext ctx)
|
public int launchContainer(ContainerStartContext ctx)
|
||||||
throws IOException, ConfigurationException {
|
throws IOException, ConfigurationException {
|
||||||
|
oversleepMap.put(ctx.getContainer().getContainerId().toString(), false);
|
||||||
if (delayContainers) {
|
if (delayContainers) {
|
||||||
try {
|
try {
|
||||||
Thread.sleep(10000);
|
Thread.sleep(10000);
|
||||||
|
if(oversleepMap.get(ctx.getContainer().getContainerId().toString())
|
||||||
|
== true) {
|
||||||
|
Thread.sleep(10000);
|
||||||
|
}
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
// Nothing..
|
// Nothing..
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return super.launchContainer(ctx);
|
return super.launchContainer(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void pauseContainer(Container container) {
|
||||||
|
// To mimic pausing we force the container to be in the PAUSED state
|
||||||
|
// a little longer by oversleeping.
|
||||||
|
oversleepMap.put(container.getContainerId().toString(), true);
|
||||||
|
LOG.info("Container was paused");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void resumeContainer(Container container) {
|
||||||
|
LOG.info("Container was resumed");
|
||||||
|
}
|
||||||
};
|
};
|
||||||
exec.setConf(conf);
|
exec.setConf(conf);
|
||||||
return spy(exec);
|
return spy(exec);
|
||||||
|
@ -505,6 +528,86 @@ public class TestContainerSchedulerQueuing extends BaseContainerManagerTest {
|
||||||
contStatus1.getState());
|
contStatus1.getState());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Submit two OPPORTUNISTIC and one GUARANTEED containers. The resources
|
||||||
|
* requests by each container as such that only one can run in parallel.
|
||||||
|
* Thus, the OPPORTUNISTIC container that started running, will be
|
||||||
|
* paused for the GUARANTEED container to start.
|
||||||
|
* Once the GUARANTEED container finishes its execution, the remaining
|
||||||
|
* OPPORTUNISTIC container will be executed.
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testPauseOpportunisticForGuaranteedContainer() throws Exception {
|
||||||
|
containerManager.start();
|
||||||
|
containerManager.getContainerScheduler().
|
||||||
|
setUsePauseEventForPreemption(true);
|
||||||
|
ContainerLaunchContext containerLaunchContext =
|
||||||
|
recordFactory.newRecordInstance(ContainerLaunchContext.class);
|
||||||
|
|
||||||
|
List<StartContainerRequest> list = new ArrayList<>();
|
||||||
|
list.add(StartContainerRequest.newInstance(
|
||||||
|
containerLaunchContext,
|
||||||
|
createContainerToken(createContainerId(0), DUMMY_RM_IDENTIFIER,
|
||||||
|
context.getNodeId(),
|
||||||
|
user, BuilderUtils.newResource(2048, 1),
|
||||||
|
context.getContainerTokenSecretManager(), null,
|
||||||
|
ExecutionType.OPPORTUNISTIC)));
|
||||||
|
|
||||||
|
StartContainersRequest allRequests =
|
||||||
|
StartContainersRequest.newInstance(list);
|
||||||
|
containerManager.startContainers(allRequests);
|
||||||
|
|
||||||
|
BaseContainerManagerTest.waitForNMContainerState(containerManager,
|
||||||
|
createContainerId(0), ContainerState.RUNNING, 40);
|
||||||
|
|
||||||
|
list = new ArrayList<>();
|
||||||
|
list.add(StartContainerRequest.newInstance(
|
||||||
|
containerLaunchContext,
|
||||||
|
createContainerToken(createContainerId(1), DUMMY_RM_IDENTIFIER,
|
||||||
|
context.getNodeId(),
|
||||||
|
user, BuilderUtils.newResource(2048, 1),
|
||||||
|
context.getContainerTokenSecretManager(), null,
|
||||||
|
ExecutionType.GUARANTEED)));
|
||||||
|
allRequests =
|
||||||
|
StartContainersRequest.newInstance(list);
|
||||||
|
|
||||||
|
containerManager.startContainers(allRequests);
|
||||||
|
|
||||||
|
BaseContainerManagerTest.waitForNMContainerState(containerManager,
|
||||||
|
createContainerId(1), ContainerState.RUNNING, 40);
|
||||||
|
|
||||||
|
// Get container statuses. Container 0 should be paused, container 1
|
||||||
|
// should be running.
|
||||||
|
List<ContainerId> statList = new ArrayList<ContainerId>();
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
statList.add(createContainerId(i));
|
||||||
|
}
|
||||||
|
GetContainerStatusesRequest statRequest =
|
||||||
|
GetContainerStatusesRequest.newInstance(statList);
|
||||||
|
List<ContainerStatus> containerStatuses = containerManager
|
||||||
|
.getContainerStatuses(statRequest).getContainerStatuses();
|
||||||
|
for (ContainerStatus status : containerStatuses) {
|
||||||
|
if (status.getContainerId().equals(createContainerId(0))) {
|
||||||
|
Assert.assertTrue(status.getDiagnostics().contains(
|
||||||
|
"Container Paused to make room for Guaranteed Container"));
|
||||||
|
} else if (status.getContainerId().equals(createContainerId(1))) {
|
||||||
|
Assert.assertEquals(
|
||||||
|
org.apache.hadoop.yarn.api.records.ContainerState.RUNNING,
|
||||||
|
status.getState());
|
||||||
|
}
|
||||||
|
System.out.println("\nStatus : [" + status + "]\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure that the GUARANTEED container completes
|
||||||
|
BaseContainerManagerTest.waitForNMContainerState(containerManager,
|
||||||
|
createContainerId(1), ContainerState.DONE, 40);
|
||||||
|
// Make sure that the PAUSED opportunistic container resumes and
|
||||||
|
// starts running
|
||||||
|
BaseContainerManagerTest.waitForNMContainerState(containerManager,
|
||||||
|
createContainerId(0), ContainerState.DONE, 40);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 1. Submit a long running GUARANTEED container to hog all NM resources.
|
* 1. Submit a long running GUARANTEED container to hog all NM resources.
|
||||||
* 2. Submit 6 OPPORTUNISTIC containers, all of which will be queued.
|
* 2. Submit 6 OPPORTUNISTIC containers, all of which will be queued.
|
||||||
|
|
|
@ -229,4 +229,9 @@ public class MockContainer implements Container {
|
||||||
public boolean isRecovering() {
|
public boolean isRecovering() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void sendPauseEvent(String description) {
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue