YARN-6163. FS Preemption is a trickle for severely starved applications. (kasha)

(cherry picked from commit 6c25dbcdc0)
This commit is contained in:
Karthik Kambatla 2017-02-15 23:16:01 -08:00
parent 0f8b99fd34
commit 46b6c95e0a
12 changed files with 584 additions and 139 deletions

View File

@ -182,6 +182,24 @@ public static Resource subtract(Resource lhs, Resource rhs) {
return subtractFrom(clone(lhs), rhs); return subtractFrom(clone(lhs), rhs);
} }
/**
 * Subtract <code>rhs</code> from <code>lhs</code> in place, clamping each
 * resulting component (memory, vcores) to a minimum of zero.
 *
 * @param lhs {@link Resource} to subtract from; modified by this call
 * @param rhs {@link Resource} to subtract
 * @return <code>lhs</code> after the clamped subtraction
 */
public static Resource subtractFromNonNegative(Resource lhs, Resource rhs) {
  subtractFrom(lhs, rhs);
  // Reset any component driven negative by the subtraction back to zero.
  lhs.setMemorySize(Math.max(0, lhs.getMemorySize()));
  lhs.setVirtualCores(Math.max(0, lhs.getVirtualCores()));
  return lhs;
}
public static Resource negate(Resource resource) { public static Resource negate(Resource resource) {
return subtract(NONE, resource); return subtract(NONE, resource);
} }

View File

@ -120,6 +120,7 @@ public abstract class AbstractYarnScheduler
*/ */
protected ConcurrentMap<ApplicationId, SchedulerApplication<T>> applications; protected ConcurrentMap<ApplicationId, SchedulerApplication<T>> applications;
protected int nmExpireInterval; protected int nmExpireInterval;
protected long nmHeartbeatInterval;
protected final static List<Container> EMPTY_CONTAINER_LIST = protected final static List<Container> EMPTY_CONTAINER_LIST =
new ArrayList<Container>(); new ArrayList<Container>();
@ -156,6 +157,9 @@ public void serviceInit(Configuration conf) throws Exception {
nmExpireInterval = nmExpireInterval =
conf.getInt(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS, conf.getInt(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS,
YarnConfiguration.DEFAULT_RM_NM_EXPIRY_INTERVAL_MS); YarnConfiguration.DEFAULT_RM_NM_EXPIRY_INTERVAL_MS);
nmHeartbeatInterval =
conf.getLong(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS,
YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS);
long configuredMaximumAllocationWaitTime = long configuredMaximumAllocationWaitTime =
conf.getLong(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS, conf.getLong(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS,
YarnConfiguration.DEFAULT_RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS); YarnConfiguration.DEFAULT_RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS);

View File

@ -86,6 +86,7 @@ public class FSAppAttempt extends SchedulerApplicationAttempt
private final Set<RMContainer> containersToPreempt = new HashSet<>(); private final Set<RMContainer> containersToPreempt = new HashSet<>();
private Resource fairshareStarvation = Resources.none(); private Resource fairshareStarvation = Resources.none();
private long lastTimeAtFairShare; private long lastTimeAtFairShare;
private long nextStarvationCheck;
// minShareStarvation attributed to this application by the leaf queue // minShareStarvation attributed to this application by the leaf queue
private Resource minshareStarvation = Resources.none(); private Resource minshareStarvation = Resources.none();
@ -210,15 +211,9 @@ private void subtractResourcesOnBlacklistedNodes(
blacklistNodeIds.addAll(scheduler.getBlacklistedNodes(this)); blacklistNodeIds.addAll(scheduler.getBlacklistedNodes(this));
} }
for (FSSchedulerNode node: blacklistNodeIds) { for (FSSchedulerNode node: blacklistNodeIds) {
Resources.subtractFrom(availableResources, Resources.subtractFromNonNegative(availableResources,
node.getUnallocatedResource()); node.getUnallocatedResource());
} }
if (availableResources.getMemorySize() < 0) {
availableResources.setMemorySize(0);
}
if (availableResources.getVirtualCores() < 0) {
availableResources.setVirtualCores(0);
}
} }
/** /**
@ -528,6 +523,15 @@ Resource getStarvation() {
return Resources.add(fairshareStarvation, minshareStarvation); return Resources.add(fairshareStarvation, minshareStarvation);
} }
/**
* Get last computed fairshare starvation.
*
* @return last computed fairshare starvation
*/
Resource getFairshareStarvation() {
return fairshareStarvation;
}
/** /**
* Set the minshare attributed to this application. To be called only from * Set the minshare attributed to this application. To be called only from
* {@link FSLeafQueue#updateStarvedApps}. * {@link FSLeafQueue#updateStarvedApps}.
@ -1066,17 +1070,17 @@ boolean assignReservedContainer(FSSchedulerNode node) {
} }
/** /**
* Helper method that computes the extent of fairshare fairshareStarvation. * Helper method that computes the extent of fairshare starvation.
* @return freshly computed fairshare starvation
*/ */
Resource fairShareStarvation() { Resource fairShareStarvation() {
Resource threshold = Resources.multiply( Resource threshold = Resources.multiply(
getFairShare(), fsQueue.getFairSharePreemptionThreshold()); getFairShare(), fsQueue.getFairSharePreemptionThreshold());
Resource starvation = Resources.subtractFrom(threshold, getResourceUsage()); Resource starvation = Resources.componentwiseMin(threshold, demand);
Resources.subtractFromNonNegative(starvation, getResourceUsage());
long now = scheduler.getClock().getTime(); long now = scheduler.getClock().getTime();
boolean starved = Resources.greaterThan( boolean starved = !Resources.isNone(starvation);
fsQueue.getPolicy().getResourceCalculator(),
scheduler.getClusterResource(), starvation, Resources.none());
if (!starved) { if (!starved) {
lastTimeAtFairShare = now; lastTimeAtFairShare = now;
@ -1104,6 +1108,81 @@ boolean isStarvedForFairShare() {
return !Resources.isNone(fairshareStarvation); return !Resources.isNone(fairshareStarvation);
} }
/**
 * Fetch a list of RRs corresponding to the extent the app is starved
 * (fairshare and minshare). Each RR's container count is taken into
 * account, and only one locality-level (the first resourceName
 * encountered) is considered per request.
 *
 * @return list of {@link ResourceRequest}s corresponding to the amount of
 * starvation.
 */
List<ResourceRequest> getStarvedResourceRequests() {
  // Accumulates the RRs selected to cover the starvation
  List<ResourceRequest> starvedRequests = new ArrayList<>();

  // Tracks RRs already seen so the same request is not counted once per
  // locality level.
  VisitedResourceRequestTracker visitedRRs =
      new VisitedResourceRequestTracker(scheduler.getNodeTracker());

  // Starvation still left to cover as we walk the requests
  Resource unmetStarvation = getStarvation();
  for (ResourceRequest rr : appSchedulingInfo.getAllResourceRequests()) {
    if (Resources.isNone(unmetStarvation)) {
      // Collected enough RRs to account for all starvation
      break;
    }

    if (!visitedRRs.visit(rr)) {
      // Same RR already considered at another locality level
      continue;
    }

    // An RR may ask for multiple containers of one capability; compute how
    // many of them fit in the remaining starvation.
    int numContainersThatFit = (int) Math.floor(
        Resources.ratio(scheduler.getResourceCalculator(),
            unmetStarvation, rr.getCapability()));
    if (numContainersThatFit == 0) {
      // Even one container of this capability exceeds what is left
      continue;
    }

    if (numContainersThatFit < rr.getNumContainers()) {
      // Only partially satisfiable: clone the RR with the reduced count
      rr = ResourceRequest.newInstance(rr.getPriority(),
          rr.getResourceName(), rr.getCapability(), numContainersThatFit);
    }

    starvedRequests.add(rr);
    Resources.subtractFromNonNegative(unmetStarvation,
        Resources.multiply(rr.getCapability(), rr.getNumContainers()));
  }
  return starvedRequests;
}
/**
 * Notify this app that preemption has been triggered on behalf of its
 * outstanding demand. Starvation checks for the app are suppressed until
 * the given delay elapses, giving the scheduler time to hand it the
 * freed-up resources.
 *
 * @param delayBeforeNextStarvationCheck how long to wait (ms, measured on
 *        the scheduler clock) before considering this app starved again
 */
void preemptionTriggered(long delayBeforeNextStarvationCheck) {
  long now = scheduler.getClock().getTime();
  nextStarvationCheck = now + delayBeforeNextStarvationCheck;
}
/**
 * Whether this app's starvation should be considered, i.e. whether the
 * post-preemption grace period set by {@code preemptionTriggered} has
 * elapsed.
 *
 * @return true if the app may be checked for starvation
 */
boolean shouldCheckForStarvation() {
  long now = scheduler.getClock().getTime();
  return now >= nextStarvationCheck;
}
/* Schedulable methods implementation */ /* Schedulable methods implementation */
@Override @Override
@ -1116,6 +1195,13 @@ public Resource getDemand() {
return demand; return demand;
} }
/**
 * Get the app's unsatisfied demand.
 *
 * @return total demand minus current resource usage
 */
Resource getPendingDemand() {
  Resource unmet = Resources.subtract(demand, getResourceUsage());
  return unmet;
}
@Override @Override
public long getStartTime() { public long getStartTime() {
return startTime; return startTime;

View File

@ -219,6 +219,63 @@ public void updateInternal(boolean checkStarvation) {
} }
} }
/**
 * Compute the extent of fairshare starvation for a set of apps, recording
 * each starved app with the preemption context along the way.
 *
 * @param appsWithDemand apps with unmet demand, sorted by the queue's
 *        policy (most starved first)
 * @return aggregate fairshare starvation across the starved apps
 */
private Resource updateStarvedAppsFairshare(
    TreeSet<FSAppAttempt> appsWithDemand) {
  Resource totalStarvation = Resources.clone(none());
  for (FSAppAttempt app : appsWithDemand) {
    Resource appStarvation = app.fairShareStarvation();
    if (Resources.isNone(appStarvation)) {
      // Apps are sorted by starvation; none of the remaining apps can be
      // starved either.
      break;
    }
    context.getStarvedApps().addStarvedApp(app);
    Resources.addTo(totalStarvation, appStarvation);
  }
  return totalStarvation;
}
/**
 * Distribute minshare starvation across a set of apps, charging each app
 * its pending demand beyond what fairshare starvation already accounts
 * for, until the minshare starvation is exhausted.
 *
 * @param appsWithDemand apps with unmet demand, sorted by the queue's
 *        policy
 * @param minShareStarvation minshare starvation to distribute
 */
private void updateStarvedAppsMinshare(
    final TreeSet<FSAppAttempt> appsWithDemand,
    final Resource minShareStarvation) {
  Resource remaining = Resources.clone(minShareStarvation);
  for (FSAppAttempt app : appsWithDemand) {
    if (Resources.isNone(remaining)) {
      // Nothing left to distribute: clear any minshare starvation
      // attributed to this app in a previous iteration.
      app.resetMinshareStarvation();
      continue;
    }

    // The app's unmet demand not already covered by fairshare starvation
    Resource appMinShare = app.getPendingDemand();
    Resources.subtractFromNonNegative(
        appMinShare, app.getFairshareStarvation());

    if (Resources.greaterThan(policy.getResourceCalculator(),
        scheduler.getClusterResource(), appMinShare, remaining)) {
      // NOTE(review): the app is charged the excess (appMinShare minus
      // remaining) rather than being capped at remaining — confirm this
      // is the intended accounting.
      Resources.subtractFromNonNegative(appMinShare, remaining);
      remaining = none();
    } else {
      Resources.subtractFromNonNegative(remaining, appMinShare);
    }
    app.setMinshareStarvation(appMinShare);
    context.getStarvedApps().addStarvedApp(app);
  }
}
/** /**
* Helper method to identify starved applications. This needs to be called * Helper method to identify starved applications. This needs to be called
* ONLY from {@link #updateInternal}, after the application shares * ONLY from {@link #updateInternal}, after the application shares
@ -237,44 +294,20 @@ public void updateInternal(boolean checkStarvation) {
* starved due to fairshare, there might still be starved applications. * starved due to fairshare, there might still be starved applications.
*/ */
private void updateStarvedApps() { private void updateStarvedApps() {
// First identify starved applications and track total amount of // Fetch apps with pending demand
// starvation (in resources) TreeSet<FSAppAttempt> appsWithDemand = fetchAppsWithDemand(false);
Resource fairShareStarvation = Resources.clone(none());
// Fetch apps with unmet demand sorted by fairshare starvation // Process apps with fairshare starvation
TreeSet<FSAppAttempt> appsWithDemand = fetchAppsWithDemand(); Resource fairShareStarvation = updateStarvedAppsFairshare(appsWithDemand);
for (FSAppAttempt app : appsWithDemand) {
Resource appStarvation = app.fairShareStarvation();
if (!Resources.equals(Resources.none(), appStarvation)) {
context.getStarvedApps().addStarvedApp(app);
Resources.addTo(fairShareStarvation, appStarvation);
} else {
break;
}
}
// Compute extent of minshare starvation // Compute extent of minshare starvation
Resource minShareStarvation = minShareStarvation(); Resource minShareStarvation = minShareStarvation();
// Compute minshare starvation that is not subsumed by fairshare starvation // Compute minshare starvation that is not subsumed by fairshare starvation
Resources.subtractFrom(minShareStarvation, fairShareStarvation); Resources.subtractFromNonNegative(minShareStarvation, fairShareStarvation);
// Keep adding apps to the starved list until the unmet demand goes over // Assign this minshare to apps with pending demand over fairshare
// the remaining minshare updateStarvedAppsMinshare(appsWithDemand, minShareStarvation);
for (FSAppAttempt app : appsWithDemand) {
if (Resources.greaterThan(policy.getResourceCalculator(),
scheduler.getClusterResource(), minShareStarvation, none())) {
Resource appPendingDemand =
Resources.subtract(app.getDemand(), app.getResourceUsage());
Resources.subtractFrom(minShareStarvation, appPendingDemand);
app.setMinshareStarvation(appPendingDemand);
context.getStarvedApps().addStarvedApp(app);
} else {
// Reset minshare starvation in case we had set it in a previous
// iteration
app.resetMinshareStarvation();
}
}
} }
@Override @Override
@ -352,7 +385,7 @@ public Resource assignContainer(FSSchedulerNode node) {
return assigned; return assigned;
} }
for (FSAppAttempt sched : fetchAppsWithDemand()) { for (FSAppAttempt sched : fetchAppsWithDemand(true)) {
if (SchedulerAppUtils.isPlaceBlacklisted(sched, node, LOG)) { if (SchedulerAppUtils.isPlaceBlacklisted(sched, node, LOG)) {
continue; continue;
} }
@ -368,14 +401,24 @@ public Resource assignContainer(FSSchedulerNode node) {
return assigned; return assigned;
} }
private TreeSet<FSAppAttempt> fetchAppsWithDemand() { /**
* Fetch the subset of apps that have unmet demand. When used for
* preemption-related code (as opposed to allocation), omits apps that
* should not be checked for starvation.
*
* @param assignment whether the apps are for allocation containers, as
* opposed to preemption calculations
* @return Set of apps with unmet demand
*/
private TreeSet<FSAppAttempt> fetchAppsWithDemand(boolean assignment) {
TreeSet<FSAppAttempt> pendingForResourceApps = TreeSet<FSAppAttempt> pendingForResourceApps =
new TreeSet<>(policy.getComparator()); new TreeSet<>(policy.getComparator());
readLock.lock(); readLock.lock();
try { try {
for (FSAppAttempt app : runnableApps) { for (FSAppAttempt app : runnableApps) {
Resource pending = app.getAppAttemptResourceUsage().getPending(); Resource pending = app.getAppAttemptResourceUsage().getPending();
if (!pending.equals(none())) { if (!Resources.isNone(pending) &&
(assignment || app.shouldCheckForStarvation())) {
pendingForResourceApps.add(app); pendingForResourceApps.add(app);
} }
} }

View File

@ -41,20 +41,26 @@ class FSPreemptionThread extends Thread {
protected final FSContext context; protected final FSContext context;
private final FairScheduler scheduler; private final FairScheduler scheduler;
private final long warnTimeBeforeKill; private final long warnTimeBeforeKill;
private final long delayBeforeNextStarvationCheck;
private final Timer preemptionTimer; private final Timer preemptionTimer;
FSPreemptionThread(FairScheduler scheduler) { FSPreemptionThread(FairScheduler scheduler) {
setDaemon(true);
setName("FSPreemptionThread");
this.scheduler = scheduler; this.scheduler = scheduler;
this.context = scheduler.getContext(); this.context = scheduler.getContext();
FairSchedulerConfiguration fsConf = scheduler.getConf(); FairSchedulerConfiguration fsConf = scheduler.getConf();
context.setPreemptionEnabled(); context.setPreemptionEnabled();
context.setPreemptionUtilizationThreshold( context.setPreemptionUtilizationThreshold(
fsConf.getPreemptionUtilizationThreshold()); fsConf.getPreemptionUtilizationThreshold());
warnTimeBeforeKill = fsConf.getWaitTimeBeforeKill();
preemptionTimer = new Timer("Preemption Timer", true); preemptionTimer = new Timer("Preemption Timer", true);
setDaemon(true); warnTimeBeforeKill = fsConf.getWaitTimeBeforeKill();
setName("FSPreemptionThread"); long allocDelay = (fsConf.isContinuousSchedulingEnabled()
? 10 * fsConf.getContinuousSchedulingSleepMs() // 10 runs
: 4 * scheduler.getNMHeartbeatInterval()); // 4 heartbeats
delayBeforeNextStarvationCheck = warnTimeBeforeKill + allocDelay +
fsConf.getWaitTimeBeforeNextStarvationCheck();
} }
public void run() { public void run() {
@ -62,13 +68,8 @@ public void run() {
FSAppAttempt starvedApp; FSAppAttempt starvedApp;
try{ try{
starvedApp = context.getStarvedApps().take(); starvedApp = context.getStarvedApps().take();
if (!Resources.isNone(starvedApp.getStarvation())) { preemptContainers(identifyContainersToPreempt(starvedApp));
PreemptableContainers containers = starvedApp.preemptionTriggered(delayBeforeNextStarvationCheck);
identifyContainersToPreempt(starvedApp);
if (containers != null) {
preemptContainers(containers.containers);
}
}
} catch (InterruptedException e) { } catch (InterruptedException e) {
LOG.info("Preemption thread interrupted! Exiting."); LOG.info("Preemption thread interrupted! Exiting.");
return; return;
@ -77,55 +78,57 @@ public void run() {
} }
/** /**
* Given an app, identify containers to preempt to satisfy the app's next * Given an app, identify containers to preempt to satisfy the app's
* resource request. * starvation.
*
* Mechanics:
* 1. Fetch all {@link ResourceRequest}s corresponding to the amount of
* starvation.
* 2. For each {@link ResourceRequest}, iterate through matching
* nodes and identify containers to preempt all on one node, also
* optimizing for least number of AM container preemptions.
* *
* @param starvedApp starved application for which we are identifying * @param starvedApp starved application for which we are identifying
* preemption targets * preemption targets
* @return list of containers to preempt to satisfy starvedApp, null if the * @return list of containers to preempt to satisfy starvedApp
* app cannot be satisfied by preempting any running containers
*/ */
private PreemptableContainers identifyContainersToPreempt( private List<RMContainer> identifyContainersToPreempt(
FSAppAttempt starvedApp) { FSAppAttempt starvedApp) {
List<RMContainer> containersToPreempt = new ArrayList<>();
// Iterate through enough RRs to address app's starvation
for (ResourceRequest rr : starvedApp.getStarvedResourceRequests()) {
for (int i = 0; i < rr.getNumContainers(); i++) {
PreemptableContainers bestContainers = null; PreemptableContainers bestContainers = null;
List<FSSchedulerNode> potentialNodes = scheduler.getNodeTracker()
// Find the nodes that match the next resource request .getNodesByResourceName(rr.getResourceName());
ResourceRequest request = starvedApp.getNextResourceRequest();
// TODO (KK): Should we check other resource requests if we can't match
// the first one?
Resource requestCapability = request.getCapability();
List<FSSchedulerNode> potentialNodes =
scheduler.getNodeTracker().getNodesByResourceName(
request.getResourceName());
// From the potential nodes, pick a node that has enough containers
// from apps over their fairshare
for (FSSchedulerNode node : potentialNodes) { for (FSSchedulerNode node : potentialNodes) {
// TODO (YARN-5829): Attempt to reserve the node for starved app. The // TODO (YARN-5829): Attempt to reserve the node for starved app.
// subsequent if-check needs to be reworked accordingly. if (isNodeAlreadyReserved(node, starvedApp)) {
FSAppAttempt nodeReservedApp = node.getReservedAppSchedulable();
if (nodeReservedApp != null && !nodeReservedApp.equals(starvedApp)) {
// This node is already reserved by another app. Let us not consider
// this for preemption.
continue; continue;
} }
int maxAMContainers = bestContainers == null ? int maxAMContainers = bestContainers == null ?
Integer.MAX_VALUE : bestContainers.numAMContainers; Integer.MAX_VALUE : bestContainers.numAMContainers;
PreemptableContainers preemptableContainers = PreemptableContainers preemptableContainers =
identifyContainersToPreemptOnNode(requestCapability, node, identifyContainersToPreemptOnNode(
maxAMContainers); rr.getCapability(), node, maxAMContainers);
if (preemptableContainers != null) { if (preemptableContainers != null) {
if (preemptableContainers.numAMContainers == 0) { // This set is better than any previously identified set.
return preemptableContainers;
} else {
bestContainers = preemptableContainers; bestContainers = preemptableContainers;
if (preemptableContainers.numAMContainers == 0) {
break;
} }
} }
} } // End of iteration through nodes for one RR
return bestContainers; if (bestContainers != null && bestContainers.containers.size() > 0) {
containersToPreempt.addAll(bestContainers.containers);
trackPreemptionsAgainstNode(bestContainers.containers);
}
}
} // End of iteration over RRs
return containersToPreempt;
} }
/** /**
@ -176,23 +179,25 @@ private PreemptableContainers identifyContainersToPreemptOnNode(
return null; return null;
} }
private void preemptContainers(List<RMContainer> containers) { private boolean isNodeAlreadyReserved(
// Mark the containers as being considered for preemption on the node. FSSchedulerNode node, FSAppAttempt app) {
// Make sure the containers are subsequently removed by calling FSAppAttempt nodeReservedApp = node.getReservedAppSchedulable();
// FSSchedulerNode#removeContainerForPreemption. return nodeReservedApp != null && !nodeReservedApp.equals(app);
if (containers.size() > 0) { }
private void trackPreemptionsAgainstNode(List<RMContainer> containers) {
FSSchedulerNode node = (FSSchedulerNode) scheduler.getNodeTracker() FSSchedulerNode node = (FSSchedulerNode) scheduler.getNodeTracker()
.getNode(containers.get(0).getNodeId()); .getNode(containers.get(0).getNodeId());
node.addContainersForPreemption(containers); node.addContainersForPreemption(containers);
} }
private void preemptContainers(List<RMContainer> containers) {
// Warn application about containers to be killed // Warn application about containers to be killed
for (RMContainer container : containers) { for (RMContainer container : containers) {
ApplicationAttemptId appAttemptId = container.getApplicationAttemptId(); ApplicationAttemptId appAttemptId = container.getApplicationAttemptId();
FSAppAttempt app = scheduler.getSchedulerApp(appAttemptId); FSAppAttempt app = scheduler.getSchedulerApp(appAttemptId);
FSLeafQueue queue = app.getQueue();
LOG.info("Preempting container " + container + LOG.info("Preempting container " + container +
" from queue " + queue.getName()); " from queue " + app.getQueueName());
app.trackContainerForPreemption(container); app.trackContainerForPreemption(container);
} }

View File

@ -1772,4 +1772,8 @@ protected void decreaseContainer(
public float getReservableNodesRatio() { public float getReservableNodesRatio() {
return reservableNodesRatio; return reservableNodesRatio;
} }
/**
 * Get the nodemanager heartbeat interval in milliseconds, as read from
 * RM_NM_HEARTBEAT_INTERVAL_MS at service init. Used by the preemption
 * thread to size the delay before an app's starvation is re-checked.
 */
long getNMHeartbeatInterval() {
return nmHeartbeatInterval;
}
} }

View File

@ -115,11 +115,23 @@ public class FairSchedulerConfiguration extends Configuration {
CONF_PREFIX + "preemption.cluster-utilization-threshold"; CONF_PREFIX + "preemption.cluster-utilization-threshold";
protected static final float DEFAULT_PREEMPTION_THRESHOLD = 0.8f; protected static final float DEFAULT_PREEMPTION_THRESHOLD = 0.8f;
protected static final String PREEMPTION_INTERVAL = CONF_PREFIX + "preemptionInterval";
protected static final int DEFAULT_PREEMPTION_INTERVAL = 5000;
protected static final String WAIT_TIME_BEFORE_KILL = CONF_PREFIX + "waitTimeBeforeKill"; protected static final String WAIT_TIME_BEFORE_KILL = CONF_PREFIX + "waitTimeBeforeKill";
protected static final int DEFAULT_WAIT_TIME_BEFORE_KILL = 15000; protected static final int DEFAULT_WAIT_TIME_BEFORE_KILL = 15000;
/**
* Configurable delay (ms) before an app's starvation is considered after
* it is identified. This is to give the scheduler enough time to
* allocate containers post preemption. This delay is added to the
* {@link #WAIT_TIME_BEFORE_KILL} and enough heartbeats.
*
* This is intended to be a backdoor on production clusters, and hence
* intentionally not documented.
*/
protected static final String WAIT_TIME_BEFORE_NEXT_STARVATION_CHECK_MS =
CONF_PREFIX + "waitTimeBeforeNextStarvationCheck";
protected static final long
DEFAULT_WAIT_TIME_BEFORE_NEXT_STARVATION_CHECK_MS = 10000;
/** Whether to assign multiple containers in one check-in. */ /** Whether to assign multiple containers in one check-in. */
public static final String ASSIGN_MULTIPLE = CONF_PREFIX + "assignmultiple"; public static final String ASSIGN_MULTIPLE = CONF_PREFIX + "assignmultiple";
protected static final boolean DEFAULT_ASSIGN_MULTIPLE = false; protected static final boolean DEFAULT_ASSIGN_MULTIPLE = false;
@ -251,8 +263,9 @@ public String getEventlogDir() {
"/tmp/")).getAbsolutePath() + File.separator + "fairscheduler"); "/tmp/")).getAbsolutePath() + File.separator + "fairscheduler");
} }
public int getPreemptionInterval() { public long getWaitTimeBeforeNextStarvationCheck() {
return getInt(PREEMPTION_INTERVAL, DEFAULT_PREEMPTION_INTERVAL); return getLong(WAIT_TIME_BEFORE_NEXT_STARVATION_CHECK_MS,
DEFAULT_WAIT_TIME_BEFORE_NEXT_STARVATION_CHECK_MS);
} }
public int getWaitTimeBeforeKill() { public int getWaitTimeBeforeKill() {

View File

@ -0,0 +1,146 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ClusterNodeTracker;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Applications place {@link ResourceRequest}s at multiple levels. This is a
 * helper class that allows tracking if a {@link ResourceRequest} has been
 * visited at a different locality level.
 *
 * This is implemented for {@link FSAppAttempt#getStarvedResourceRequests()}.
 * The implementation is not thread-safe.
 */
class VisitedResourceRequestTracker {
  private static final Log LOG =
      LogFactory.getLog(VisitedResourceRequestTracker.class);

  // (priority -> (capability -> locality-level visit state))
  private final Map<Priority, Map<Resource, TrackerPerPriorityResource>> map =
      new HashMap<>();
  private final ClusterNodeTracker<FSSchedulerNode> nodeTracker;

  VisitedResourceRequestTracker(
      ClusterNodeTracker<FSSchedulerNode> nodeTracker) {
    this.nodeTracker = nodeTracker;
  }

  /**
   * Check if the {@link ResourceRequest} is visited before, and track it.
   * @param rr {@link ResourceRequest} to visit
   * @return true if <code>rr</code> is the first visit across all
   * locality levels, false otherwise
   */
  boolean visit(ResourceRequest rr) {
    Map<Resource, TrackerPerPriorityResource> perPriority =
        map.computeIfAbsent(rr.getPriority(), k -> new HashMap<>());
    TrackerPerPriorityResource tracker = perPriority.computeIfAbsent(
        rr.getCapability(), k -> new TrackerPerPriorityResource());
    return tracker.visit(rr.getResourceName());
  }

  private class TrackerPerPriorityResource {
    // Racks for which at least one individual node has been visited
    private final Set<String> racksWithNodesVisited = new HashSet<>();
    // Racks visited directly via a rack-level resourceName
    private final Set<String> racksVisited = new HashSet<>();
    // Whether the ANY (off-switch) level has been visited
    private boolean anyVisited;

    private boolean visitAny() {
      if (racksVisited.isEmpty() && racksWithNodesVisited.isEmpty()) {
        anyVisited = true;
      }
      return anyVisited;
    }

    private boolean visitRack(String rackName) {
      if (anyVisited || racksWithNodesVisited.contains(rackName)) {
        return false;
      }
      racksVisited.add(rackName);
      return true;
    }

    private boolean visitNode(String rackName) {
      if (anyVisited || racksVisited.contains(rackName)) {
        return false;
      }
      racksWithNodesVisited.add(rackName);
      return true;
    }

    /**
     * Based on whether <code>resourceName</code> is a node, rack or ANY,
     * check if this has been visited earlier.
     *
     * A node is considered visited if its rack or ANY have been visited.
     * A rack is considered visited if any nodes or ANY have been visited.
     * Any is considered visited if any of the nodes/racks have been visited.
     *
     * @param resourceName nodename or rackname or ANY
     * @return true if this is the first visit, false otherwise
     */
    private boolean visit(String resourceName) {
      if (resourceName.equals(ResourceRequest.ANY)) {
        return visitAny();
      }

      List<FSSchedulerNode> matches =
          nodeTracker.getNodesByResourceName(resourceName);
      switch (matches.size()) {
      case 0:
        LOG.error("Found ResourceRequest for a non-existent node/rack named " +
            resourceName);
        return false;
      case 1:
        // A single match could still be a rack containing exactly one node;
        // only treat it as a node visit if the names actually match.
        FSSchedulerNode node = matches.get(0);
        if (node.getNodeName().equals(resourceName)) {
          return visitNode(node.getRackName());
        }
        // Falls through: a rack with a single node
      default:
        // Not ANY and not a node: must be a rack
        return visitRack(resourceName);
      }
    }
  }
}

View File

@ -21,6 +21,8 @@
import java.util.Set; import java.util.Set;
public class FairSchedulerWithMockPreemption extends FairScheduler { public class FairSchedulerWithMockPreemption extends FairScheduler {
static final long DELAY_FOR_NEXT_STARVATION_CHECK_MS = 10 * 60 * 1000;
@Override @Override
protected void createPreemptionThread() { protected void createPreemptionThread() {
preemptionThread = new MockPreemptionThread(this); preemptionThread = new MockPreemptionThread(this);
@ -30,7 +32,7 @@ static class MockPreemptionThread extends FSPreemptionThread {
private Set<FSAppAttempt> appsAdded = new HashSet<>(); private Set<FSAppAttempt> appsAdded = new HashSet<>();
private int totalAppsAdded = 0; private int totalAppsAdded = 0;
MockPreemptionThread(FairScheduler scheduler) { private MockPreemptionThread(FairScheduler scheduler) {
super(scheduler); super(scheduler);
} }
@ -41,6 +43,7 @@ public void run() {
FSAppAttempt app = context.getStarvedApps().take(); FSAppAttempt app = context.getStarvedApps().take();
appsAdded.add(app); appsAdded.add(app);
totalAppsAdded++; totalAppsAdded++;
app.preemptionTriggered(DELAY_FOR_NEXT_STARVATION_CHECK_MS);
} catch (InterruptedException e) { } catch (InterruptedException e) {
return; return;
} }

View File

@ -22,6 +22,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.MockRM; import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
import org.apache.hadoop.yarn.util.ControlledClock;
import org.junit.After; import org.junit.After;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
@ -43,6 +44,8 @@ public class TestFSAppStarvation extends FairSchedulerTestBase {
private static final File ALLOC_FILE = new File(TEST_DIR, "test-QUEUES"); private static final File ALLOC_FILE = new File(TEST_DIR, "test-QUEUES");
private final ControlledClock clock = new ControlledClock();
// Node Capacity = NODE_CAPACITY_MULTIPLE * (1 GB or 1 vcore) // Node Capacity = NODE_CAPACITY_MULTIPLE * (1 GB or 1 vcore)
private static final int NODE_CAPACITY_MULTIPLE = 4; private static final int NODE_CAPACITY_MULTIPLE = 4;
private static final String[] QUEUES = private static final String[] QUEUES =
@ -99,11 +102,17 @@ public void testPreemptionEnabled() throws Exception {
+ "minshare and fairshare queues", + "minshare and fairshare queues",
3, preemptionThread.uniqueAppsAdded()); 3, preemptionThread.uniqueAppsAdded());
// Verify the apps get added again on a subsequent update // Verify apps are added again only after the set delay for starvation has
// passed.
clock.tickSec(1);
scheduler.update(); scheduler.update();
Thread.yield(); assertEquals("Apps re-added even before starvation delay passed",
preemptionThread.totalAppsAdded(), preemptionThread.uniqueAppsAdded());
verifyLeafQueueStarvation(); verifyLeafQueueStarvation();
clock.tickMsec(
FairSchedulerWithMockPreemption.DELAY_FOR_NEXT_STARVATION_CHECK_MS);
scheduler.update();
assertTrue("Each app is marked as starved exactly once", assertTrue("Each app is marked as starved exactly once",
preemptionThread.totalAppsAdded() > preemptionThread.uniqueAppsAdded()); preemptionThread.totalAppsAdded() > preemptionThread.uniqueAppsAdded());
} }
@ -141,7 +150,7 @@ private void setupClusterAndSubmitJobs() throws Exception {
sendEnoughNodeUpdatesToAssignFully(); sendEnoughNodeUpdatesToAssignFully();
// Sleep to hit the preemption timeouts // Sleep to hit the preemption timeouts
Thread.sleep(10); clock.tickMsec(10);
// Scheduler update to populate starved apps // Scheduler update to populate starved apps
scheduler.update(); scheduler.update();
@ -208,8 +217,9 @@ private void setupStarvedCluster() throws IOException {
ALLOC_FILE.exists()); ALLOC_FILE.exists());
resourceManager = new MockRM(conf); resourceManager = new MockRM(conf);
resourceManager.start();
scheduler = (FairScheduler) resourceManager.getResourceScheduler(); scheduler = (FairScheduler) resourceManager.getResourceScheduler();
scheduler.setClock(clock);
resourceManager.start();
preemptionThread = (FairSchedulerWithMockPreemption.MockPreemptionThread) preemptionThread = (FairSchedulerWithMockPreemption.MockPreemptionThread)
scheduler.preemptionThread; scheduler.preemptionThread;

View File

@ -23,6 +23,7 @@
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
import org.apache.hadoop.yarn.util.ControlledClock;
import org.junit.After; import org.junit.After;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
@ -49,6 +50,9 @@ public class TestFairSchedulerPreemption extends FairSchedulerTestBase {
private static final File ALLOC_FILE = new File(TEST_DIR, "test-queues"); private static final File ALLOC_FILE = new File(TEST_DIR, "test-queues");
private static final int GB = 1024; private static final int GB = 1024;
// Scheduler clock
private final ControlledClock clock = new ControlledClock();
// Node Capacity = NODE_CAPACITY_MULTIPLE * (1 GB or 1 vcore) // Node Capacity = NODE_CAPACITY_MULTIPLE * (1 GB or 1 vcore)
private static final int NODE_CAPACITY_MULTIPLE = 4; private static final int NODE_CAPACITY_MULTIPLE = 4;
@ -60,25 +64,28 @@ public class TestFairSchedulerPreemption extends FairSchedulerTestBase {
// Starving app that is expected to instigate preemption // Starving app that is expected to instigate preemption
private FSAppAttempt starvingApp; private FSAppAttempt starvingApp;
@Parameterized.Parameters @Parameterized.Parameters(name = "{0}")
public static Collection<Boolean[]> getParameters() { public static Collection<Object[]> getParameters() {
return Arrays.asList(new Boolean[][] { return Arrays.asList(new Object[][] {
{true}, {false}}); {"FairSharePreemption", true},
{"MinSharePreemption", false}});
} }
public TestFairSchedulerPreemption(Boolean fairshare) throws IOException { public TestFairSchedulerPreemption(String name, boolean fairshare)
throws IOException {
fairsharePreemption = fairshare; fairsharePreemption = fairshare;
writeAllocFile(); writeAllocFile();
} }
@Before @Before
public void setup() { public void setup() throws IOException {
createConfiguration(); createConfiguration();
conf.set(FairSchedulerConfiguration.ALLOCATION_FILE, conf.set(FairSchedulerConfiguration.ALLOCATION_FILE,
ALLOC_FILE.getAbsolutePath()); ALLOC_FILE.getAbsolutePath());
conf.setBoolean(FairSchedulerConfiguration.PREEMPTION, true); conf.setBoolean(FairSchedulerConfiguration.PREEMPTION, true);
conf.setFloat(FairSchedulerConfiguration.PREEMPTION_THRESHOLD, 0f); conf.setFloat(FairSchedulerConfiguration.PREEMPTION_THRESHOLD, 0f);
conf.setInt(FairSchedulerConfiguration.WAIT_TIME_BEFORE_KILL, 0); conf.setInt(FairSchedulerConfiguration.WAIT_TIME_BEFORE_KILL, 0);
setupCluster();
} }
@After @After
@ -166,8 +173,9 @@ private void writeResourceParams(PrintWriter out) {
private void setupCluster() throws IOException { private void setupCluster() throws IOException {
resourceManager = new MockRM(conf); resourceManager = new MockRM(conf);
resourceManager.start();
scheduler = (FairScheduler) resourceManager.getResourceScheduler(); scheduler = (FairScheduler) resourceManager.getResourceScheduler();
scheduler.setClock(clock);
resourceManager.start();
// Create and add two nodes to the cluster // Create and add two nodes to the cluster
addNode(NODE_CAPACITY_MULTIPLE * GB, NODE_CAPACITY_MULTIPLE); addNode(NODE_CAPACITY_MULTIPLE * GB, NODE_CAPACITY_MULTIPLE);
@ -197,7 +205,7 @@ private void sendEnoughNodeUpdatesToAssignFully() {
* *
* @param queueName queue name * @param queueName queue name
*/ */
private void takeAllResource(String queueName) { private void takeAllResources(String queueName) {
// Create an app that takes up all the resources on the cluster // Create an app that takes up all the resources on the cluster
ApplicationAttemptId appAttemptId ApplicationAttemptId appAttemptId
= createSchedulingRequest(GB, 1, queueName, "default", = createSchedulingRequest(GB, 1, queueName, "default",
@ -227,8 +235,8 @@ private void preemptHalfResources(String queueName)
NODE_CAPACITY_MULTIPLE * rmNodes.size() / 2); NODE_CAPACITY_MULTIPLE * rmNodes.size() / 2);
starvingApp = scheduler.getSchedulerApp(appAttemptId); starvingApp = scheduler.getSchedulerApp(appAttemptId);
// Sleep long enough to pass // Move clock enough to identify starvation
Thread.sleep(10); clock.tickSec(1);
scheduler.update(); scheduler.update();
} }
@ -243,14 +251,13 @@ private void preemptHalfResources(String queueName)
*/ */
private void submitApps(String queue1, String queue2) private void submitApps(String queue1, String queue2)
throws InterruptedException { throws InterruptedException {
takeAllResource(queue1); takeAllResources(queue1);
preemptHalfResources(queue2); preemptHalfResources(queue2);
} }
private void verifyPreemption() throws InterruptedException { private void verifyPreemption() throws InterruptedException {
// Sleep long enough for four containers to be preempted. Note that the // Sleep long enough for four containers to be preempted.
// starved app must be queued four times for containers to be preempted. for (int i = 0; i < 100; i++) {
for (int i = 0; i < 10000; i++) {
if (greedyApp.getLiveContainers().size() == 4) { if (greedyApp.getLiveContainers().size() == 4) {
break; break;
} }
@ -268,7 +275,7 @@ private void verifyPreemption() throws InterruptedException {
private void verifyNoPreemption() throws InterruptedException { private void verifyNoPreemption() throws InterruptedException {
// Sleep long enough to ensure not even one container is preempted. // Sleep long enough to ensure not even one container is preempted.
for (int i = 0; i < 600; i++) { for (int i = 0; i < 100; i++) {
if (greedyApp.getLiveContainers().size() != 8) { if (greedyApp.getLiveContainers().size() != 8) {
break; break;
} }
@ -279,7 +286,6 @@ private void verifyNoPreemption() throws InterruptedException {
@Test @Test
public void testPreemptionWithinSameLeafQueue() throws Exception { public void testPreemptionWithinSameLeafQueue() throws Exception {
setupCluster();
String queue = "root.preemptable.child-1"; String queue = "root.preemptable.child-1";
submitApps(queue, queue); submitApps(queue, queue);
if (fairsharePreemption) { if (fairsharePreemption) {
@ -291,21 +297,18 @@ public void testPreemptionWithinSameLeafQueue() throws Exception {
@Test @Test
public void testPreemptionBetweenTwoSiblingLeafQueues() throws Exception { public void testPreemptionBetweenTwoSiblingLeafQueues() throws Exception {
setupCluster();
submitApps("root.preemptable.child-1", "root.preemptable.child-2"); submitApps("root.preemptable.child-1", "root.preemptable.child-2");
verifyPreemption(); verifyPreemption();
} }
@Test @Test
public void testPreemptionBetweenNonSiblingQueues() throws Exception { public void testPreemptionBetweenNonSiblingQueues() throws Exception {
setupCluster();
submitApps("root.preemptable.child-1", "root.nonpreemptable.child-1"); submitApps("root.preemptable.child-1", "root.nonpreemptable.child-1");
verifyPreemption(); verifyPreemption();
} }
@Test @Test
public void testNoPreemptionFromDisallowedQueue() throws Exception { public void testNoPreemptionFromDisallowedQueue() throws Exception {
setupCluster();
submitApps("root.nonpreemptable.child-1", "root.preemptable.child-1"); submitApps("root.nonpreemptable.child-1", "root.preemptable.child-1");
verifyNoPreemption(); verifyNoPreemption();
} }
@ -331,9 +334,7 @@ private void setNumAMContainersPerNode(int numAMContainersPerNode) {
@Test @Test
public void testPreemptionSelectNonAMContainer() throws Exception { public void testPreemptionSelectNonAMContainer() throws Exception {
setupCluster(); takeAllResources("root.preemptable.child-1");
takeAllResource("root.preemptable.child-1");
setNumAMContainersPerNode(2); setNumAMContainersPerNode(2);
preemptHalfResources("root.preemptable.child-2"); preemptHalfResources("root.preemptable.child-2");

View File

@ -0,0 +1,112 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.server.resourcemanager.MockNodes;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ClusterNodeTracker;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.junit.Assert;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import org.junit.Test;
import java.util.List;
/**
 * Tests for {@link VisitedResourceRequestTracker}: verifies that visiting a
 * request at one locality level (node, rack, or ANY) marks the overlapping
 * broader/narrower requests as already visited, while requests for an
 * unrelated node may still be visited.
 */
public class TestVisitedResourceRequestTracker {
  // Assertion-message prefixes describing the state set up before the
  // failing visit. Declared static: they are true constants, not per-test
  // instance state.
  private static final String NODE_VISITED = "The node is already visited. ";
  private static final String RACK_VISITED = "The rack is already visited. ";
  private static final String ANY_VISITED = "ANY is already visited. ";
  // Assertion messages describing the failed expectation itself.
  private static final String NODE_FAILURE = "The node is visited again.";
  private static final String RACK_FAILURE = "The rack is visited again.";
  private static final String ANY_FAILURE = "ANY is visited again.";
  private static final String FIRST_CALL_FAILURE = "First call to visit failed.";

  // Tracks the two mock nodes that the requests below refer to.
  private final ClusterNodeTracker<FSSchedulerNode> nodeTracker =
      new ClusterNodeTracker<>();

  // One request per locality level: cluster-wide (ANY), the shared rack,
  // and each of the two individual nodes.
  private final ResourceRequest anyRequest;
  private final ResourceRequest rackRequest;
  private final ResourceRequest node1Request;
  private final ResourceRequest node2Request;

  public TestVisitedResourceRequestTracker() {
    // Two mock nodes, each with 8 GB memory and 8 vcores.
    List<RMNode> rmNodes =
        MockNodes.newNodes(1, 2, Resources.createResource(8192, 8));

    FSSchedulerNode node1 = new FSSchedulerNode(rmNodes.get(0), false);
    nodeTracker.addNode(node1);
    node1Request = createRR(node1.getNodeName(), 1);

    FSSchedulerNode node2 = new FSSchedulerNode(rmNodes.get(1), false);
    nodeTracker.addNode(node2);
    node2Request = createRR(node2.getNodeName(), 1);

    anyRequest = createRR(ResourceRequest.ANY, 2);
    // Both mock nodes share node1's rack in this setup.
    rackRequest = createRR(node1.getRackName(), 2);
  }

  /**
   * Creates a request for {@code count} containers at the given resource
   * name (a node name, rack name, or {@link ResourceRequest#ANY}).
   */
  private ResourceRequest createRR(String resourceName, int count) {
    return ResourceRequest.newInstance(
        Priority.UNDEFINED, resourceName, Resources.none(), count);
  }

  @Test
  public void testVisitAnyRequestFirst() {
    VisitedResourceRequestTracker tracker =
        new VisitedResourceRequestTracker(nodeTracker);

    // Visiting ANY first should succeed...
    assertTrue(FIRST_CALL_FAILURE, tracker.visit(anyRequest));

    // ...and mark every narrower (rack and node) request as visited.
    assertFalse(ANY_VISITED + RACK_FAILURE, tracker.visit(rackRequest));
    assertFalse(ANY_VISITED + NODE_FAILURE, tracker.visit(node1Request));
    assertFalse(ANY_VISITED + NODE_FAILURE, tracker.visit(node2Request));
  }

  @Test
  public void testVisitRackRequestFirst() {
    VisitedResourceRequestTracker tracker =
        new VisitedResourceRequestTracker(nodeTracker);

    // Visiting the rack first should succeed...
    assertTrue(FIRST_CALL_FAILURE, tracker.visit(rackRequest));

    // ...and mark ANY plus both nodes on that rack as visited.
    assertFalse(RACK_VISITED + ANY_FAILURE, tracker.visit(anyRequest));
    assertFalse(RACK_VISITED + NODE_FAILURE, tracker.visit(node1Request));
    assertFalse(RACK_VISITED + NODE_FAILURE, tracker.visit(node2Request));
  }

  @Test
  public void testVisitNodeRequestFirst() {
    VisitedResourceRequestTracker tracker =
        new VisitedResourceRequestTracker(nodeTracker);

    // Visiting node1 first should succeed...
    assertTrue(FIRST_CALL_FAILURE, tracker.visit(node1Request));

    // ...and mark the enclosing rack and ANY as visited...
    assertFalse(NODE_VISITED + ANY_FAILURE, tracker.visit(anyRequest));
    assertFalse(NODE_VISITED + RACK_FAILURE, tracker.visit(rackRequest));

    // ...while the other, not-yet-visited node remains visitable.
    assertTrue(NODE_VISITED + "Different node visit failed",
        tracker.visit(node2Request));
  }
}