YARN-7185. ContainerScheduler should only look at availableResource for GUARANTEED containers when OPPORTUNISTIC container queuing is enabled. (Wangda Tan via asuresh)
(cherry picked from commit 2ae72692fc
)
This commit is contained in:
parent
e5a65ebc28
commit
1230b60fa6
|
@ -238,6 +238,11 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
return this.queuedOpportunisticContainers.size();
|
return this.queuedOpportunisticContainers.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
public int getNumRunningContainers() {
|
||||||
|
return this.runningContainers.size();
|
||||||
|
}
|
||||||
|
|
||||||
public OpportunisticContainersStatus getOpportunisticContainersStatus() {
|
public OpportunisticContainersStatus getOpportunisticContainersStatus() {
|
||||||
this.opportunisticContainersStatus.setQueuedOpportContainers(
|
this.opportunisticContainersStatus.setQueuedOpportContainers(
|
||||||
getNumQueuedOpportunisticContainers());
|
getNumQueuedOpportunisticContainers());
|
||||||
|
@ -274,27 +279,32 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
ExecutionType.OPPORTUNISTIC) {
|
ExecutionType.OPPORTUNISTIC) {
|
||||||
this.metrics.completeOpportunisticContainer(container.getResource());
|
this.metrics.completeOpportunisticContainer(container.getResource());
|
||||||
}
|
}
|
||||||
startPendingContainers();
|
startPendingContainers(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void startPendingContainers() {
|
/**
|
||||||
|
* Start pending containers in the queue.
|
||||||
|
* @param forceStartGuaranteedContaieners When this is true, start guaranteed
|
||||||
|
* container without looking at available resource
|
||||||
|
*/
|
||||||
|
private void startPendingContainers(boolean forceStartGuaranteedContaieners) {
|
||||||
// Start pending guaranteed containers, if resources available.
|
// Start pending guaranteed containers, if resources available.
|
||||||
boolean resourcesAvailable =
|
boolean resourcesAvailable = startContainers(
|
||||||
startContainersFromQueue(queuedGuaranteedContainers.values());
|
queuedGuaranteedContainers.values(), forceStartGuaranteedContaieners);
|
||||||
// Start opportunistic containers, if resources available.
|
// Start opportunistic containers, if resources available.
|
||||||
if (resourcesAvailable) {
|
if (resourcesAvailable) {
|
||||||
startContainersFromQueue(queuedOpportunisticContainers.values());
|
startContainers(queuedOpportunisticContainers.values(), false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean startContainersFromQueue(
|
private boolean startContainers(
|
||||||
Collection<Container> queuedContainers) {
|
Collection<Container> containersToBeStarted, boolean force) {
|
||||||
Iterator<Container> cIter = queuedContainers.iterator();
|
Iterator<Container> cIter = containersToBeStarted.iterator();
|
||||||
boolean resourcesAvailable = true;
|
boolean resourcesAvailable = true;
|
||||||
while (cIter.hasNext() && resourcesAvailable) {
|
while (cIter.hasNext() && resourcesAvailable) {
|
||||||
Container container = cIter.next();
|
Container container = cIter.next();
|
||||||
if (tryStartContainer(container)) {
|
if (tryStartContainer(container, force)) {
|
||||||
cIter.remove();
|
cIter.remove();
|
||||||
} else {
|
} else {
|
||||||
resourcesAvailable = false;
|
resourcesAvailable = false;
|
||||||
|
@ -303,9 +313,11 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
return resourcesAvailable;
|
return resourcesAvailable;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean tryStartContainer(Container container) {
|
private boolean tryStartContainer(Container container, boolean force) {
|
||||||
boolean containerStarted = false;
|
boolean containerStarted = false;
|
||||||
if (resourceAvailableToStartContainer(container)) {
|
// call startContainer without checking available resource when force==true
|
||||||
|
if (force || resourceAvailableToStartContainer(
|
||||||
|
container)) {
|
||||||
startContainer(container);
|
startContainer(container);
|
||||||
containerStarted = true;
|
containerStarted = true;
|
||||||
}
|
}
|
||||||
|
@ -373,7 +385,12 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
// enough number of opportunistic containers.
|
// enough number of opportunistic containers.
|
||||||
if (isGuaranteedContainer) {
|
if (isGuaranteedContainer) {
|
||||||
enqueueContainer(container);
|
enqueueContainer(container);
|
||||||
startPendingContainers();
|
|
||||||
|
// When opportunistic container not allowed (which is determined by
|
||||||
|
// max-queue length of pending opportunistic containers <= 0), start
|
||||||
|
// guaranteed containers without looking at available resources.
|
||||||
|
boolean forceStartGuaranteedContainers = (maxOppQueueLength <= 0);
|
||||||
|
startPendingContainers(forceStartGuaranteedContainers);
|
||||||
|
|
||||||
// if the guaranteed container is queued, we need to preempt opportunistic
|
// if the guaranteed container is queued, we need to preempt opportunistic
|
||||||
// containers for make room for it
|
// containers for make room for it
|
||||||
|
@ -386,12 +403,12 @@ public class ContainerScheduler extends AbstractService implements
|
||||||
// containers based on remaining resource available, then enqueue the
|
// containers based on remaining resource available, then enqueue the
|
||||||
// opportunistic container. If the container is enqueued, we do another
|
// opportunistic container. If the container is enqueued, we do another
|
||||||
// pass to try to start the newly enqueued opportunistic container.
|
// pass to try to start the newly enqueued opportunistic container.
|
||||||
startPendingContainers();
|
startPendingContainers(false);
|
||||||
boolean containerQueued = enqueueContainer(container);
|
boolean containerQueued = enqueueContainer(container);
|
||||||
// container may not get queued because the max opportunistic container
|
// container may not get queued because the max opportunistic container
|
||||||
// queue length is reached. If so, there is no point doing another pass
|
// queue length is reached. If so, there is no point doing another pass
|
||||||
if (containerQueued) {
|
if (containerQueued) {
|
||||||
startPendingContainers();
|
startPendingContainers(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -147,7 +147,7 @@ public class TestContainerManager extends BaseContainerManagerTest {
|
||||||
@Before
|
@Before
|
||||||
public void setup() throws IOException {
|
public void setup() throws IOException {
|
||||||
conf.setInt(
|
conf.setInt(
|
||||||
YarnConfiguration.NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH, 10);
|
YarnConfiguration.NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH, 0);
|
||||||
super.setup();
|
super.setup();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,98 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.scheduler;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.UnsupportedFileSystemException;
|
||||||
|
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
|
||||||
|
import org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
|
||||||
|
import org.apache.hadoop.yarn.api.records.ExecutionType;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
|
||||||
|
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Make sure CantainerScheduler related changes are compatible to old behaviors
|
||||||
|
*/
|
||||||
|
public class TestContainerSchedulerBehaviorCompatibility
|
||||||
|
extends BaseContainerManagerTest {
|
||||||
|
public TestContainerSchedulerBehaviorCompatibility()
|
||||||
|
throws UnsupportedFileSystemException {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setup() throws IOException {
|
||||||
|
conf.setInt(YarnConfiguration.NM_VCORES, 1);
|
||||||
|
conf.setInt(YarnConfiguration.NM_OPPORTUNISTIC_CONTAINERS_MAX_QUEUE_LENGTH,
|
||||||
|
0);
|
||||||
|
super.setup();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testForceStartGuaranteedContainersWhenOppContainerDisabled()
|
||||||
|
throws Exception {
|
||||||
|
containerManager.start();
|
||||||
|
|
||||||
|
ContainerLaunchContext containerLaunchContext =
|
||||||
|
recordFactory.newRecordInstance(ContainerLaunchContext.class);
|
||||||
|
containerLaunchContext.setCommands(Arrays.asList("echo"));
|
||||||
|
|
||||||
|
List<StartContainerRequest> list = new ArrayList<>();
|
||||||
|
|
||||||
|
// Add a container start request with #vcores > available (1).
|
||||||
|
// This could happen when DefaultContainerCalculator configured because
|
||||||
|
// on the RM side it won't check vcores at all.
|
||||||
|
list.add(StartContainerRequest.newInstance(containerLaunchContext,
|
||||||
|
createContainerToken(createContainerId(0), DUMMY_RM_IDENTIFIER,
|
||||||
|
context.getNodeId(), user, BuilderUtils.newResource(2048, 4),
|
||||||
|
context.getContainerTokenSecretManager(), null,
|
||||||
|
ExecutionType.GUARANTEED)));
|
||||||
|
|
||||||
|
StartContainersRequest allRequests = StartContainersRequest.newInstance(list);
|
||||||
|
containerManager.startContainers(allRequests);
|
||||||
|
|
||||||
|
ContainerScheduler cs = containerManager.getContainerScheduler();
|
||||||
|
int nQueuedContainers = cs.getNumQueuedContainers();
|
||||||
|
int nRunningContainers = cs.getNumRunningContainers();
|
||||||
|
|
||||||
|
// Wait at most 10 secs and we expect all containers finished.
|
||||||
|
int maxTry = 100;
|
||||||
|
int nTried = 1;
|
||||||
|
while (nQueuedContainers != 0 || nRunningContainers != 0) {
|
||||||
|
Thread.sleep(100);
|
||||||
|
nQueuedContainers = cs.getNumQueuedContainers();
|
||||||
|
nRunningContainers = cs.getNumRunningContainers();
|
||||||
|
nTried++;
|
||||||
|
if (nTried > maxTry) {
|
||||||
|
Assert.fail("Failed to get either number of queuing containers to 0 or "
|
||||||
|
+ "number of running containers to 0, #queued=" + nQueuedContainers
|
||||||
|
+ ", #running=" + nRunningContainers);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue