Merge branch 'trunk' into HDDS-48

2018-06-28 14:57:23 -07:00 · 2018-06-28 14:57:23 -07:00 · 84ac6bb1b1
parent c104525168 291194302c
commit 84ac6bb1b1
18 changed files with 663 additions and 91 deletions
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java
@ -329,9 +329,12 @@ public synchronized boolean parentZNodeExists()
   * This recursively creates the znode as well as all of its parents.
   */
  public synchronized void ensureParentZNode()
-      throws IOException, InterruptedException {
+      throws IOException, InterruptedException, KeeperException {
    Preconditions.checkState(!wantToBeInElection,
        "ensureParentZNode() may not be called while in the election");
    if (zkClient == null) {
      createConnection();
    }
    String pathParts[] = znodeWorkingDir.split("/");
    Preconditions.checkArgument(pathParts.length >= 1 &&
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ZKFailoverController.java
@ -269,7 +269,7 @@ private void printUsage() {
  }
  private int formatZK(boolean force, boolean interactive)
-      throws IOException, InterruptedException {
+      throws IOException, InterruptedException, KeeperException {
    if (elector.parentZNodeExists()) {
      if (!force && (!interactive || !confirmFormat())) {
        return ERR_CODE_FORMAT_DENIED;
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/AbstractPreemptableResourceCalculator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/AbstractPreemptableResourceCalculator.java
@ -42,6 +42,7 @@ public class AbstractPreemptableResourceCalculator {
  protected final ResourceCalculator rc;
  protected boolean isReservedPreemptionCandidatesSelector;
  private Resource stepFactor;
  private boolean allowQueuesBalanceAfterAllQueuesSatisfied;
  static class TQComparator implements Comparator<TempQueuePerPartition> {
    private ResourceCalculator rc;
@ -83,15 +84,28 @@ private double getIdealPctOfGuaranteed(TempQueuePerPartition q) {
   *          this will be set by different implementation of candidate
   *          selectors, please refer to TempQueuePerPartition#offer for
   *          details.
   * @param allowQueuesBalanceAfterAllQueuesSatisfied
   *          Should resources be preempted from an over-served queue when the
   *          requesting queues are all at or over their guarantees?
   *          An example is, there're 10 queues under root, guaranteed resource
   *          of them are all 10%.
   *          Assume there're two queues are using resources, queueA uses 10%
   *          queueB uses 90%. For all queues are guaranteed, but it's not fair
   *          for queueA.
   *          We wanna make this behavior can be configured. By default it is
   *          not allowed.
   *
   */
  public AbstractPreemptableResourceCalculator(
      CapacitySchedulerPreemptionContext preemptionContext,
-      boolean isReservedPreemptionCandidatesSelector) {
+      boolean isReservedPreemptionCandidatesSelector,
      boolean allowQueuesBalanceAfterAllQueuesSatisfied) {
    context = preemptionContext;
    rc = preemptionContext.getResourceCalculator();
    this.isReservedPreemptionCandidatesSelector =
        isReservedPreemptionCandidatesSelector;
-
+    this.allowQueuesBalanceAfterAllQueuesSatisfied =
        allowQueuesBalanceAfterAllQueuesSatisfied;
    stepFactor = Resource.newInstance(0, 0);
    for (ResourceInformation ri : stepFactor.getResources()) {
      ri.setValue(1);
@ -193,7 +207,8 @@ protected void computeFixpointAllocation(Resource totGuarant,
        wQavail = Resources.componentwiseMin(wQavail, unassigned);
        Resource wQidle = sub.offer(wQavail, rc, totGuarant,
-            isReservedPreemptionCandidatesSelector);
+            isReservedPreemptionCandidatesSelector,
            allowQueuesBalanceAfterAllQueuesSatisfied);
        Resource wQdone = Resources.subtract(wQavail, wQidle);
        if (Resources.greaterThan(rc, totGuarant, wQdone, Resources.none())) {
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionContext.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionContext.java
@ -70,6 +70,8 @@ TempQueuePerPartition getQueueByPartition(String queueName,
  float getMaxAllowableLimitForIntraQueuePreemption();
  long getDefaultMaximumKillWaitTimeout();
  @Unstable
  IntraQueuePreemptionOrderPolicy getIntraQueuePreemptionOrderPolicy();
 }
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionUtils.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/CapacitySchedulerPreemptionUtils.java
@ -151,6 +151,7 @@ public static boolean tryPreemptContainerAndDeductResToObtain(
      Map<String, Resource> resourceToObtainByPartitions,
      RMContainer rmContainer, Resource clusterResource,
      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
      Map<ApplicationAttemptId, Set<RMContainer>> curCandidates,
      Resource totalPreemptionAllowed, boolean conservativeDRF) {
    ApplicationAttemptId attemptId = rmContainer.getApplicationAttemptId();
@ -218,7 +219,7 @@ public static boolean tryPreemptContainerAndDeductResToObtain(
      }
      // Add to preemptMap
-      addToPreemptMap(preemptMap, attemptId, rmContainer);
+      addToPreemptMap(preemptMap, curCandidates, attemptId, rmContainer);
      return true;
    }
@ -230,15 +231,23 @@ private static String getPartitionByNodeId(
    return context.getScheduler().getSchedulerNode(nodeId).getPartition();
  }
-  private static void addToPreemptMap(
+  protected static void addToPreemptMap(
      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
      Map<ApplicationAttemptId, Set<RMContainer>> curCandidates,
      ApplicationAttemptId appAttemptId, RMContainer containerToPreempt) {
-    Set<RMContainer> set = preemptMap.get(appAttemptId);
+    Set<RMContainer> setForToPreempt = preemptMap.get(appAttemptId);
-    if (null == set) {
+    Set<RMContainer> setForCurCandidates = curCandidates.get(appAttemptId);
-      set = new HashSet<>();
+    if (null == setForToPreempt) {
-      preemptMap.put(appAttemptId, set);
+      setForToPreempt = new HashSet<>();
      preemptMap.put(appAttemptId, setForToPreempt);
    }
-    set.add(containerToPreempt);
+    setForToPreempt.add(containerToPreempt);
    if (null == setForCurCandidates) {
      setForCurCandidates = new HashSet<>();
      curCandidates.put(appAttemptId, setForCurCandidates);
    }
    setForCurCandidates.add(containerToPreempt);
  }
  private static boolean preemptMapContains(
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/FifoCandidatesSelector.java
@ -31,6 +31,7 @@
 import org.apache.hadoop.yarn.util.resource.Resources;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@ -42,19 +43,25 @@ public class FifoCandidatesSelector
  private static final Log LOG =
      LogFactory.getLog(FifoCandidatesSelector.class);
  private PreemptableResourceCalculator preemptableAmountCalculator;
  private boolean allowQueuesBalanceAfterAllQueuesSatisfied;
  FifoCandidatesSelector(CapacitySchedulerPreemptionContext preemptionContext,
-      boolean includeReservedResource) {
+      boolean includeReservedResource,
      boolean allowQueuesBalanceAfterAllQueuesSatisfied) {
    super(preemptionContext);
    this.allowQueuesBalanceAfterAllQueuesSatisfied =
        allowQueuesBalanceAfterAllQueuesSatisfied;
    preemptableAmountCalculator = new PreemptableResourceCalculator(
-        preemptionContext, includeReservedResource);
+        preemptionContext, includeReservedResource,
        allowQueuesBalanceAfterAllQueuesSatisfied);
  }
  @Override
  public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
      Resource clusterResource, Resource totalPreemptionAllowed) {
    Map<ApplicationAttemptId, Set<RMContainer>> curCandidates = new HashMap<>();
    // Calculate how much resources we need to preempt
    preemptableAmountCalculator.computeIdealAllocation(clusterResource,
        totalPreemptionAllowed);
@ -110,7 +117,7 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
              boolean preempted = CapacitySchedulerPreemptionUtils
                  .tryPreemptContainerAndDeductResToObtain(rc,
                      preemptionContext, resToObtainByPartition, c,
-                      clusterResource, selectedCandidates,
+                      clusterResource, selectedCandidates, curCandidates,
                      totalPreemptionAllowed, false);
              if (!preempted) {
                continue;
@ -134,7 +141,7 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
          preemptFrom(fc, clusterResource, resToObtainByPartition,
              skippedAMContainerlist, skippedAMSize, selectedCandidates,
-              totalPreemptionAllowed);
+              curCandidates, totalPreemptionAllowed);
        }
        // Can try preempting AMContainers (still saving atmost
@ -145,15 +152,15 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
                leafQueue.getEffectiveCapacity(RMNodeLabelsManager.NO_LABEL),
                leafQueue.getMaxAMResourcePerQueuePercent());
-        preemptAMContainers(clusterResource, selectedCandidates, skippedAMContainerlist,
+        preemptAMContainers(clusterResource, selectedCandidates, curCandidates,
-            resToObtainByPartition, skippedAMSize, maxAMCapacityForThisQueue,
+            skippedAMContainerlist, resToObtainByPartition, skippedAMSize,
-            totalPreemptionAllowed);
+            maxAMCapacityForThisQueue, totalPreemptionAllowed);
      } finally {
        leafQueue.getReadLock().unlock();
      }
    }
-    return selectedCandidates;
+    return curCandidates;
  }
  /**
@ -169,6 +176,7 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
   */
  private void preemptAMContainers(Resource clusterResource,
      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
      Map<ApplicationAttemptId, Set<RMContainer>> curCandidates,
      List<RMContainer> skippedAMContainerlist,
      Map<String, Resource> resToObtainByPartition, Resource skippedAMSize,
      Resource maxAMCapacityForThisQueue, Resource totalPreemptionAllowed) {
@ -187,7 +195,7 @@ private void preemptAMContainers(Resource clusterResource,
      boolean preempted = CapacitySchedulerPreemptionUtils
          .tryPreemptContainerAndDeductResToObtain(rc, preemptionContext,
              resToObtainByPartition, c, clusterResource, preemptMap,
-              totalPreemptionAllowed, false);
+              curCandidates, totalPreemptionAllowed, false);
      if (preempted) {
        Resources.subtractFrom(skippedAMSize, c.getAllocatedResource());
      }
@ -203,6 +211,7 @@ private void preemptFrom(FiCaSchedulerApp app,
      Resource clusterResource, Map<String, Resource> resToObtainByPartition,
      List<RMContainer> skippedAMContainerlist, Resource skippedAMSize,
      Map<ApplicationAttemptId, Set<RMContainer>> selectedContainers,
      Map<ApplicationAttemptId, Set<RMContainer>> curCandidates,
      Resource totalPreemptionAllowed) {
    ApplicationAttemptId appId = app.getApplicationAttemptId();
@ -219,9 +228,10 @@ private void preemptFrom(FiCaSchedulerApp app,
      }
      // Try to preempt this container
-      CapacitySchedulerPreemptionUtils.tryPreemptContainerAndDeductResToObtain(
+     CapacitySchedulerPreemptionUtils
-          rc, preemptionContext, resToObtainByPartition, c, clusterResource,
+          .tryPreemptContainerAndDeductResToObtain(rc, preemptionContext,
-          selectedContainers, totalPreemptionAllowed, false);
+              resToObtainByPartition, c, clusterResource, selectedContainers,
              curCandidates, totalPreemptionAllowed, false);
      if (!preemptionContext.isObserveOnly()) {
        preemptionContext.getRMContext().getDispatcher().getEventHandler()
@ -262,9 +272,14 @@ private void preemptFrom(FiCaSchedulerApp app,
      }
      // Try to preempt this container
-      CapacitySchedulerPreemptionUtils.tryPreemptContainerAndDeductResToObtain(
+      CapacitySchedulerPreemptionUtils
-          rc, preemptionContext, resToObtainByPartition, c, clusterResource,
+          .tryPreemptContainerAndDeductResToObtain(rc, preemptionContext,
-          selectedContainers, totalPreemptionAllowed, false);
+              resToObtainByPartition, c, clusterResource, selectedContainers,
              curCandidates, totalPreemptionAllowed, false);
    }
  }
  public boolean getAllowQueuesBalanceAfterAllQueuesSatisfied() {
    return allowQueuesBalanceAfterAllQueuesSatisfied;
  }
 }
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueueCandidatesSelector.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/IntraQueueCandidatesSelector.java
@ -122,7 +122,7 @@ public int compare(TempAppPerPartition ta1, TempAppPerPartition ta2) {
  public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
      Resource clusterResource, Resource totalPreemptedResourceAllowed) {
-
+    Map<ApplicationAttemptId, Set<RMContainer>> curCandidates = new HashMap<>();
    // 1. Calculate the abnormality within each queue one by one.
    computeIntraQueuePreemptionDemand(
        clusterResource, totalPreemptedResourceAllowed, selectedCandidates);
@ -182,7 +182,7 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
          leafQueue.getReadLock().lock();
          for (FiCaSchedulerApp app : apps) {
            preemptFromLeastStarvedApp(leafQueue, app, selectedCandidates,
-                clusterResource, totalPreemptedResourceAllowed,
+                curCandidates, clusterResource, totalPreemptedResourceAllowed,
                resToObtainByPartition, rollingResourceUsagePerUser);
          }
        } finally {
@ -191,7 +191,7 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
      }
    }
-    return selectedCandidates;
+    return curCandidates;
  }
  private void initializeUsageAndUserLimitForCompute(Resource clusterResource,
@ -211,6 +211,7 @@ private void initializeUsageAndUserLimitForCompute(Resource clusterResource,
  private void preemptFromLeastStarvedApp(LeafQueue leafQueue,
      FiCaSchedulerApp app,
      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
      Map<ApplicationAttemptId, Set<RMContainer>> curCandidates,
      Resource clusterResource, Resource totalPreemptedResourceAllowed,
      Map<String, Resource> resToObtainByPartition,
      Map<String, Resource> rollingResourceUsagePerUser) {
@ -270,7 +271,7 @@ private void preemptFromLeastStarvedApp(LeafQueue leafQueue,
      boolean ret = CapacitySchedulerPreemptionUtils
          .tryPreemptContainerAndDeductResToObtain(rc, preemptionContext,
              resToObtainByPartition, c, clusterResource, selectedCandidates,
-              totalPreemptedResourceAllowed, true);
+              curCandidates, totalPreemptedResourceAllowed, true);
      // Subtract from respective user's resource usage once a container is
      // selected for preemption.
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptableResourceCalculator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptableResourceCalculator.java
@ -48,11 +48,14 @@ public class PreemptableResourceCalculator
   * @param isReservedPreemptionCandidatesSelector this will be set by
   * different implementation of candidate selectors, please refer to
   * TempQueuePerPartition#offer for details.
   * @param allowQueuesBalanceAfterAllQueuesSatisfied
   */
  public PreemptableResourceCalculator(
      CapacitySchedulerPreemptionContext preemptionContext,
-      boolean isReservedPreemptionCandidatesSelector) {
+      boolean isReservedPreemptionCandidatesSelector,
-    super(preemptionContext, isReservedPreemptionCandidatesSelector);
+      boolean allowQueuesBalanceAfterAllQueuesSatisfied) {
    super(preemptionContext, isReservedPreemptionCandidatesSelector,
        allowQueuesBalanceAfterAllQueuesSatisfied);
  }
  /**
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptionCandidatesSelector.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/PreemptionCandidatesSelector.java
@ -34,6 +34,7 @@
 public abstract class PreemptionCandidatesSelector {
  protected CapacitySchedulerPreemptionContext preemptionContext;
  protected ResourceCalculator rc;
  private long maximumKillWaitTime = -1;
  PreemptionCandidatesSelector(
      CapacitySchedulerPreemptionContext preemptionContext) {
@ -77,4 +78,14 @@ public int compare(RMContainer a, RMContainer b) {
    });
  }
  public long getMaximumKillWaitTimeMs() {
    if (maximumKillWaitTime > 0) {
      return maximumKillWaitTime;
    }
    return preemptionContext.getDefaultMaximumKillWaitTimeout();
  }
  public void setMaximumKillWaitTime(long maximumKillWaitTime) {
    this.maximumKillWaitTime = maximumKillWaitTime;
  }
 }
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ProportionalCapacityPreemptionPolicy.java
@ -131,6 +131,8 @@ public enum IntraQueuePreemptionOrderPolicy {
  private List<PreemptionCandidatesSelector> candidatesSelectionPolicies;
  private Set<String> allPartitions;
  private Set<String> leafQueueNames;
  Map<PreemptionCandidatesSelector, Map<ApplicationAttemptId,
      Set<RMContainer>>> pcsMap;
  // Preemptable Entities, synced from scheduler at every run
  private Map<String, PreemptableQueue> preemptableQueues;
@ -249,7 +251,21 @@ private void updateConfigIfNeeded() {
    // initialize candidates preemption selection policies
    candidatesSelectionPolicies.add(new FifoCandidatesSelector(this,
-        additionalPreemptionBasedOnReservedResource));
+        additionalPreemptionBasedOnReservedResource, false));
    // Do we need to do preemption to balance queue even after queues get satisfied?
    boolean isPreemptionToBalanceRequired = config.getBoolean(
        CapacitySchedulerConfiguration.PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED,
        CapacitySchedulerConfiguration.DEFAULT_PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED);
    long maximumKillWaitTimeForPreemptionToQueueBalance = config.getLong(
        CapacitySchedulerConfiguration.MAX_WAIT_BEFORE_KILL_FOR_QUEUE_BALANCE_PREEMPTION,
        CapacitySchedulerConfiguration.DEFAULT_MAX_WAIT_BEFORE_KILL_FOR_QUEUE_BALANCE_PREEMPTION);
    if (isPreemptionToBalanceRequired) {
      PreemptionCandidatesSelector selector = new FifoCandidatesSelector(this,
          false, true);
      selector.setMaximumKillWaitTime(maximumKillWaitTimeForPreemptionToQueueBalance);
      candidatesSelectionPolicies.add(selector);
    }
    // Do we need to specially consider intra queue
    boolean isIntraQueuePreemptionEnabled = config.getBoolean(
@ -282,7 +298,8 @@ private void updateConfigIfNeeded() {
        "select_based_on_reserved_containers = " +
          selectCandidatesForResevedContainers + "\n" +
        "additional_res_balance_based_on_reserved_containers = " +
-          additionalPreemptionBasedOnReservedResource);
+          additionalPreemptionBasedOnReservedResource + "\n" +
        "Preemption-to-balance-queue-enabled = " + isPreemptionToBalanceRequired);
    csConfig = config;
  }
@ -308,44 +325,60 @@ public synchronized void editSchedule() {
  }
  private void preemptOrkillSelectedContainerAfterWait(
-      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
+      Map<PreemptionCandidatesSelector, Map<ApplicationAttemptId,
-      long currentTime) {
+          Set<RMContainer>>> toPreemptPerSelector, long currentTime) {
    int toPreemptCount = 0;
    for (Map<ApplicationAttemptId, Set<RMContainer>> containers :
        toPreemptPerSelector.values()) {
      toPreemptCount += containers.size();
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug(
          "Starting to preempt containers for selectedCandidates and size:"
-              + selectedCandidates.size());
+              + toPreemptCount);
    }
    // preempt (or kill) the selected containers
-    for (Map.Entry<ApplicationAttemptId, Set<RMContainer>> e : selectedCandidates
+    // We need toPreemptPerSelector here to match list of containers to
    // its selector so that we can get custom timeout per selector when
    // checking if current container should be killed or not
    for (Map.Entry<PreemptionCandidatesSelector, Map<ApplicationAttemptId,
        Set<RMContainer>>> pc : toPreemptPerSelector
        .entrySet()) {
-      ApplicationAttemptId appAttemptId = e.getKey();
+      Map<ApplicationAttemptId, Set<RMContainer>> cMap = pc.getValue();
-      if (LOG.isDebugEnabled()) {
+      if (cMap.size() > 0) {
-        LOG.debug("Send to scheduler: in app=" + appAttemptId
+        for (Map.Entry<ApplicationAttemptId,
-            + " #containers-to-be-preemptionCandidates=" + e.getValue().size());
+            Set<RMContainer>> e : cMap.entrySet()) {
-      }
+          ApplicationAttemptId appAttemptId = e.getKey();
-      for (RMContainer container : e.getValue()) {
+          if (LOG.isDebugEnabled()) {
-        // if we tried to preempt this for more than maxWaitTime
+            LOG.debug("Send to scheduler: in app=" + appAttemptId
-        if (preemptionCandidates.get(container) != null
+                + " #containers-to-be-preemptionCandidates=" + e.getValue().size());
            && preemptionCandidates.get(container)
                + maxWaitTime <= currentTime) {
          // kill it
          rmContext.getDispatcher().getEventHandler().handle(
              new ContainerPreemptEvent(appAttemptId, container,
                  SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE));
          preemptionCandidates.remove(container);
        } else {
          if (preemptionCandidates.get(container) != null) {
            // We already updated the information to scheduler earlier, we need
            // not have to raise another event.
            continue;
          }
          for (RMContainer container : e.getValue()) {
            // if we tried to preempt this for more than maxWaitTime, this
            // should be based on custom timeout per container per selector
            if (preemptionCandidates.get(container) != null
                && preemptionCandidates.get(container)
                + pc.getKey().getMaximumKillWaitTimeMs() <= currentTime) {
              // kill it
              rmContext.getDispatcher().getEventHandler().handle(
                  new ContainerPreemptEvent(appAttemptId, container,
                      SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE));
              preemptionCandidates.remove(container);
            } else {
              if (preemptionCandidates.get(container) != null) {
                // We already updated the information to scheduler earlier, we need
                // not have to raise another event.
                continue;
              }
-          //otherwise just send preemption events
+              //otherwise just send preemption events
-          rmContext.getDispatcher().getEventHandler().handle(
+              rmContext.getDispatcher().getEventHandler().handle(
-              new ContainerPreemptEvent(appAttemptId, container,
+                  new ContainerPreemptEvent(appAttemptId, container,
-                  SchedulerEventType.MARK_CONTAINER_FOR_PREEMPTION));
+                      SchedulerEventType.MARK_CONTAINER_FOR_PREEMPTION));
-          preemptionCandidates.put(container, currentTime);
+              preemptionCandidates.put(container, currentTime);
            }
          }
        }
      }
    }
@ -438,6 +471,8 @@ private void containerBasedPreemptOrKill(CSQueue root,
    // queue and each application
    Map<ApplicationAttemptId, Set<RMContainer>> toPreempt =
        new HashMap<>();
    Map<PreemptionCandidatesSelector, Map<ApplicationAttemptId,
        Set<RMContainer>>> toPreemptPerSelector =  new HashMap<>();;
    for (PreemptionCandidatesSelector selector :
        candidatesSelectionPolicies) {
      long startTime = 0;
@ -447,20 +482,27 @@ private void containerBasedPreemptOrKill(CSQueue root,
                selector.getClass().getName()));
        startTime = clock.getTime();
      }
-      toPreempt = selector.selectCandidates(toPreempt,
+      Map<ApplicationAttemptId, Set<RMContainer>> curCandidates =
-          clusterResources, totalPreemptionAllowed);
+          selector.selectCandidates(toPreempt, clusterResources,
              totalPreemptionAllowed);
      toPreemptPerSelector.putIfAbsent(selector, curCandidates);
      if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat
            .format("{0} uses {1} millisecond to run",
                selector.getClass().getName(), clock.getTime() - startTime));
        int totalSelected = 0;
        int curSelected = 0;
        for (Set<RMContainer> set : toPreempt.values()) {
          totalSelected += set.size();
        }
        for (Set<RMContainer> set : curCandidates.values()) {
          curSelected += set.size();
        }
        LOG.debug(MessageFormat
-            .format("So far, total {0} containers selected to be preempted",
+            .format("So far, total {0} containers selected to be preempted, {1}"
-                totalSelected));
+                    + " containers selected this round\n",
                totalSelected, curSelected));
      }
    }
@ -483,8 +525,10 @@ private void containerBasedPreemptOrKill(CSQueue root,
    long currentTime = clock.getTime();
    pcsMap = toPreemptPerSelector;
    // preempt (or kill) the selected containers
-    preemptOrkillSelectedContainerAfterWait(toPreempt, currentTime);
+    preemptOrkillSelectedContainerAfterWait(toPreemptPerSelector, currentTime);
    // cleanup staled preemption candidates
    cleanupStaledPreemptionCandidates(currentTime);
@ -689,6 +733,12 @@ Map<String, Map<String, TempQueuePerPartition>> getQueuePartitions() {
    return queueToPartitions;
  }
  @VisibleForTesting
  Map<PreemptionCandidatesSelector, Map<ApplicationAttemptId,
      Set<RMContainer>>> getToPreemptCandidatesPerSelector() {
    return pcsMap;
  }
  @Override
  public int getClusterMaxApplicationPriority() {
    return scheduler.getMaxClusterLevelAppPriority().getPriority();
@ -730,4 +780,9 @@ public void addPartitionToUnderServedQueues(String queueName,
  public IntraQueuePreemptionOrderPolicy getIntraQueuePreemptionOrderPolicy() {
    return intraQueuePreemptionOrderPolicy;
  }
  @Override
  public long getDefaultMaximumKillWaitTimeout() {
    return maxWaitTime;
  }
 }
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/QueuePriorityContainerCandidateSelector.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/QueuePriorityContainerCandidateSelector.java
@ -380,6 +380,7 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
      Resource clusterResource,
      Resource totalPreemptedResourceAllowed) {
    Map<ApplicationAttemptId, Set<RMContainer>> curCandidates = new HashMap<>();
    // Initialize digraph from queues
    // TODO (wangda): only do this when queue refreshed.
    priorityDigraph.clear();
@ -388,7 +389,7 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
    // When all queues are set to same priority, or priority is not respected,
    // direct return.
    if (priorityDigraph.isEmpty()) {
-      return selectedCandidates;
+      return curCandidates;
    }
    // Save parameters to be shared by other methods
@ -478,13 +479,9 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
                .getReservedResource());
          }
-          Set<RMContainer> containers = selectedCandidates.get(
+          // Add to preemptMap
-              c.getApplicationAttemptId());
+          CapacitySchedulerPreemptionUtils.addToPreemptMap(selectedCandidates,
-          if (null == containers) {
+              curCandidates, c.getApplicationAttemptId(), c);
            containers = new HashSet<>();
            selectedCandidates.put(c.getApplicationAttemptId(), containers);
          }
          containers.add(c);
          // Update totalPreemptionResourceAllowed
          Resources.subtractFrom(totalPreemptedResourceAllowed,
@ -504,7 +501,6 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
        }
      }
    }
-
+    return curCandidates;
    return selectedCandidates;
  }
 }
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ReservedContainerCandidatesSelector.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/ReservedContainerCandidatesSelector.java
@ -31,7 +31,6 @@
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@ -63,7 +62,7 @@ public NodeForPreemption(float preemptionCost,
      CapacitySchedulerPreemptionContext preemptionContext) {
    super(preemptionContext);
    preemptableAmountCalculator = new PreemptableResourceCalculator(
-        preemptionContext, true);
+        preemptionContext, true, false);
  }
  @Override
@ -71,6 +70,7 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
      Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
      Resource clusterResource,
      Resource totalPreemptedResourceAllowed) {
    Map<ApplicationAttemptId, Set<RMContainer>> curCandidates = new HashMap<>();
    // Calculate how much resources we need to preempt
    preemptableAmountCalculator.computeIdealAllocation(clusterResource,
        totalPreemptedResourceAllowed);
@ -101,14 +101,10 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
          selectedCandidates, totalPreemptedResourceAllowed, false);
      if (null != preemptionResult) {
        for (RMContainer c : preemptionResult.selectedContainers) {
-          ApplicationAttemptId appId = c.getApplicationAttemptId();
+          // Add to preemptMap
-          Set<RMContainer> containers = selectedCandidates.get(appId);
+          CapacitySchedulerPreemptionUtils.addToPreemptMap(selectedCandidates,
-          if (null == containers) {
+              curCandidates, c.getApplicationAttemptId(), c);
            containers = new HashSet<>();
            selectedCandidates.put(appId, containers);
          }
          containers.add(c);
          if (LOG.isDebugEnabled()) {
            LOG.debug(this.getClass().getName() + " Marked container=" + c
                .getContainerId() + " from queue=" + c.getQueueName()
@ -118,7 +114,7 @@ public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(
      }
    }
-    return selectedCandidates;
+    return curCandidates;
  }
  private Resource getPreemptableResource(String queueName,
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempQueuePerPartition.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TempQueuePerPartition.java
@ -138,7 +138,8 @@ public ArrayList<TempQueuePerPartition> getChildren() {
  // This function "accepts" all the resources it can (pending) and return
  // the unused ones
  Resource offer(Resource avail, ResourceCalculator rc,
-      Resource clusterResource, boolean considersReservedResource) {
+      Resource clusterResource, boolean considersReservedResource,
      boolean allowQueueBalanceAfterAllSafisfied) {
    Resource absMaxCapIdealAssignedDelta = Resources.componentwiseMax(
        Resources.subtract(getMax(), idealAssigned),
        Resource.newInstance(0, 0));
@ -179,7 +180,10 @@ Resource offer(Resource avail, ResourceCalculator rc,
    // leaf queues. Such under-utilized leaf queue could preemption resources
    // from over-utilized leaf queue located at other hierarchies.
-    accepted = filterByMaxDeductAssigned(rc, clusterResource, accepted);
+    // Allow queues can continue grow and balance even if all queues are satisfied.
    if (!allowQueueBalanceAfterAllSafisfied) {
      accepted = filterByMaxDeductAssigned(rc, clusterResource, accepted);
    }
    // accepted so far contains the "quota acceptable" amount, we now filter by
    // locality acceptable
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerConfiguration.java
@ -1459,6 +1459,23 @@ public boolean getLazyPreemptionEnabled() {
      + INTRA_QUEUE_PREEMPTION_CONFIG_PREFIX + "preemption-order-policy";
  public static final String DEFAULT_INTRAQUEUE_PREEMPTION_ORDER_POLICY = "userlimit_first";
  /**
   * Should we allow queues continue grow after all queue reaches their
   * guaranteed capacity.
   */
  public static final String PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED =
      PREEMPTION_CONFIG_PREFIX + "preemption-to-balance-queue-after-satisfied.enabled";
  public static final boolean DEFAULT_PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED = false;
  /**
   * How long we will wait to balance queues, by default it is 5 mins.
   */
  public static final String MAX_WAIT_BEFORE_KILL_FOR_QUEUE_BALANCE_PREEMPTION =
      PREEMPTION_CONFIG_PREFIX + "preemption-to-balance-queue-after-satisfied.max-wait-before-kill";
  public static final long
      DEFAULT_MAX_WAIT_BEFORE_KILL_FOR_QUEUE_BALANCE_PREEMPTION =
      300 * 1000;
  /**
   * Maximum application for a queue to be used when application per queue is
   * not defined.To be consistent with previous version the default value is set
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMEmbeddedElector.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMEmbeddedElector.java
@ -22,8 +22,10 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.ha.ClientBaseWithFixes;
 import org.apache.hadoop.ha.ServiceFailedException;
 import org.apache.hadoop.service.ServiceStateException;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
@ -304,6 +306,26 @@ private void testCallbackSynchronizationTimingStandby(AdminService as,
    verify(as, times(1)).transitionToStandby(any());
  }
  /**
   * Test that active elector service triggers a fatal RM Event when connection
   * to ZK fails. YARN-8409
   */
  @Test
  public void testFailureToConnectToZookeeper() throws Exception {
    stopServer();
    Configuration myConf = new Configuration(conf);
    ResourceManager rm = new MockRM(conf);
    ActiveStandbyElectorBasedElectorService ees =
        new ActiveStandbyElectorBasedElectorService(rm);
    try {
      ees.init(myConf);
      Assert.fail("expect failure to connect to Zookeeper");
    } catch (ServiceStateException sse) {
      Assert.assertTrue(sse.getMessage().contains("ConnectionLoss"));
    }
  }
  private class MockRMWithElector extends MockRM {
    private long delayMs = 0;
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestPreemptionForQueueWithPriorities.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestPreemptionForQueueWithPriorities.java
@ -20,6 +20,7 @@
 import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes;
 import org.apache.hadoop.yarn.api.records.ResourceInformation;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;
 import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator;
 import org.apache.hadoop.yarn.util.resource.ResourceUtils;
@ -538,4 +539,61 @@ public void test3ResourceTypesInterQueuePreemption() throws IOException {
        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
            getAppAttemptId(1))));
  }
  @Test
  public void testPriorityPreemptionForBalanceBetweenSatisfiedQueues()
      throws IOException {
    /**
     * All queues are beyond guarantee, c has higher priority than b.
     * c ask for more resource, and there is no idle left, c should preempt
     * some resource from b but won’t let b under its guarantee.
     *
     * Queue structure is:
     *
     * <pre>
     *        root
     *       / |  \
     *      a  b   c
     * </pre>
     *
     * For priorities
     * - a=1
     * - b=1
     * - c=2
     *
     */
    String labelsConfig = "=100,true"; // default partition
    String nodesConfig = "n1="; // only one node
    String queuesConfig =
        // guaranteed,max,used,pending
        "root(=[100 100 100 100]);" + //root
            "-a(=[30 100 0 0]){priority=1};" + // a
            "-b(=[30 100 40 50]){priority=1};" + // b
            "-c(=[40 100 60 25]){priority=2}";   // c
    String appsConfig =
        //queueName\t(priority,resource,host,expression,#repeat,reserved)
        "b\t(1,1,n1,,40,false);" + // app1 in b
            "c\t(1,1,n1,,60,false)"; // app2 in c
    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
    CapacitySchedulerConfiguration newConf =
        new CapacitySchedulerConfiguration(conf);
    boolean isPreemptionToBalanceRequired = true;
    newConf.setBoolean(
        CapacitySchedulerConfiguration.PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED,
        isPreemptionToBalanceRequired);
    when(cs.getConfiguration()).thenReturn(newConf);
    policy.editSchedule();
    // IdealAssigned b: 30 c: 70. initIdealAssigned: b: 30 c: 40, even though
    // b and c has same relativeAssigned=1.0f(idealAssigned / guaranteed),
    // since c has higher priority, c will be put in mostUnderServedQueue and
    // get all remain 30 capacity.
    verify(mDisp, times(10)).handle(argThat(
        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
            getAppAttemptId(1))));
    verify(mDisp, never()).handle(argThat(
        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
            getAppAttemptId(2))));
  }
 }
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyPreemptToBalance.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/capacity/TestProportionalCapacityPreemptionPolicyPreemptToBalance.java
@ -0,0 +1,254 @@
 /**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.junit.Test;
 import java.io.IOException;
 import java.util.Map;
 import java.util.Set;
 import static org.junit.Assert.assertEquals;
 import static org.mockito.Matchers.argThat;
 import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 public class TestProportionalCapacityPreemptionPolicyPreemptToBalance
    extends ProportionalCapacityPreemptionPolicyMockFramework {
  @Test
  public void testPreemptionToBalanceDisabled() throws IOException {
    String labelsConfig = "=100,true"; // default partition
    String nodesConfig = "n1="; // only one node
    String queuesConfig =
        // guaranteed,max,used,pending
        "root(=[100 100 100 100]);" + //root
            "-a(=[30 100 10 30]);" + // a
            "-b(=[30 100 40 30]);" + // b
            "-c(=[30 100 50 30]);" + // c
            "-d(=[10 100 0 0])";   // d
    String appsConfig =
        //queueName\t(priority,resource,host,expression,#repeat,reserved)
        "a\t(1,1,n1,,10,false);" + // app1 in a
            "b\t(1,1,n1,,40,false);" + // app2 in b
            "c\t(1,1,n1,,50,false)"; // app3 in c
    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
    policy.editSchedule();
    // I_A: A:30 B:35 C:35, preempt 5 from B and 15 from C to A
    verify(mDisp, times(5)).handle(argThat(
        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
            getAppAttemptId(2))));
    verify(mDisp, times(15)).handle(argThat(
        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
            getAppAttemptId(3))));
    assertEquals(30, policy.getQueuePartitions().get("a")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(35, policy.getQueuePartitions().get("b")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(35, policy.getQueuePartitions().get("c")
        .get("").getIdealAssigned().getMemorySize());
  }
  @Test
  public void testPreemptionToBalanceEnabled() throws IOException {
    String labelsConfig = "=100,true"; // default partition
    String nodesConfig = "n1="; // only one node
    String queuesConfig =
        // guaranteed,max,used,pending
        "root(=[100 100 100 100]);" + //root
            "-a(=[30 100 10 30]);" + // a
            "-b(=[30 100 40 30]);" + // b
            "-c(=[30 100 50 30]);" + // c
            "-d(=[10 100 0 0])";   // d
    String appsConfig =
        //queueName\t(priority,resource,host,expression,#repeat,reserved)
        "a\t(1,1,n1,,10,false);" + // app1 in a
            "b\t(1,1,n1,,40,false);" + // app2 in b
            "c\t(1,1,n1,,50,false)"; // app3 in c
    // enable preempt to balance and ideal assignment will change.
    boolean isPreemptionToBalanceEnabled = true;
    conf.setBoolean(
        CapacitySchedulerConfiguration.PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED,
        isPreemptionToBalanceEnabled);
    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
    policy.editSchedule();
    // I_A: A:33 B:33 C:33, preempt 7 from B and 17 from C to A
    verify(mDisp, times(7)).handle(argThat(
        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
            getAppAttemptId(2))));
    verify(mDisp, times(17)).handle(argThat(
        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
            getAppAttemptId(3))));
    assertEquals(33, policy.getQueuePartitions().get("a")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(33, policy.getQueuePartitions().get("b")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(33, policy.getQueuePartitions().get("c")
        .get("").getIdealAssigned().getMemorySize());
  }
  @Test
  public void testPreemptionToBalanceUsedPlusPendingLessThanGuaranteed()
      throws IOException{
    String labelsConfig = "=100,true"; // default partition
    String nodesConfig = "n1="; // only one node
    String queuesConfig =
        // guaranteed,max,used,pending
        "root(=[100 100 100 100]);" + //root
            "-a(=[30 100 10 6]);" + // a
            "-b(=[30 100 40 30]);" + // b
            "-c(=[30 100 50 30]);" + // c
            "-d(=[10 100 0 0])";   // d
    String appsConfig =
        //queueName\t(priority,resource,host,expression,#repeat,reserved)
        "a\t(1,1,n1,,10,false);" + // app1 in a
            "b\t(1,1,n1,,40,false);" + // app2 in b
            "c\t(1,1,n1,,50,false)"; // app3 in c
    boolean isPreemptionToBalanceEnabled = true;
    conf.setBoolean(
        CapacitySchedulerConfiguration.PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED,
        isPreemptionToBalanceEnabled);
    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig);
    policy.editSchedule();
    // I_A: A:15 B:42 C:43, preempt 7 from B and 17 from C to A
    verify(mDisp, times(8)).handle(argThat(
        new TestProportionalCapacityPreemptionPolicy.IsPreemptionRequestFor(
            getAppAttemptId(3))));
    assertEquals(16, policy.getQueuePartitions().get("a")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(42, policy.getQueuePartitions().get("b")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(42, policy.getQueuePartitions().get("c")
        .get("").getIdealAssigned().getMemorySize());
  }
  @Test
  public void testPreemptionToBalanceWithVcoreResource() throws IOException {
    Logger.getRootLogger().setLevel(Level.DEBUG);
    String labelsConfig = "=100:100,true"; // default partition
    String nodesConfig = "n1="; // only one node
    String queuesConfig =
        // guaranteed,max,used,pending
        "root(=[100:100 100:100 100:100 120:140]);" + //root
            "-a(=[60:60 100:100 40:40 70:40]);" + // a
            "-b(=[40:40 100:100 60:60 50:100])";   // b
    String appsConfig =
        //queueName\t(priority,resource,host,expression,#repeat,reserved)
        "a\t(1,1:1,n1,,40,false);" + // app1 in a
            "b\t(1,1:1,n1,,60,false)"; // app2 in b
    boolean isPreemptionToBalanceEnabled = true;
    conf.setBoolean(
        CapacitySchedulerConfiguration.PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED,
        isPreemptionToBalanceEnabled);
    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig, true);
    policy.editSchedule();
    // 21 containers will be preempted here
    verify(mDisp, times(21)).handle(argThat(
        new TestProportionalCapacityPreemptionPolicy.
            IsPreemptionRequestFor(getAppAttemptId(2))));
    assertEquals(60, policy.getQueuePartitions().get("a")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(60, policy.getQueuePartitions().get("a")
        .get("").getIdealAssigned().getVirtualCores());
    assertEquals(40, policy.getQueuePartitions().get("b")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(40, policy.getQueuePartitions().get("b")
        .get("").getIdealAssigned().getVirtualCores());
  }
  @Test
  public void testPreemptionToBalanceWithConfiguredTimeout() throws IOException {
    Logger.getRootLogger().setLevel(Level.DEBUG);
    String labelsConfig = "=100:100,true"; // default partition
    String nodesConfig = "n1="; // only one node
    String queuesConfig =
        // guaranteed,max,used,pending
        "root(=[100:100 100:100 100:100 120:140]);" + //root
            "-a(=[60:60 100:100 40:40 70:40]);" + // a
            "-b(=[40:40 100:100 60:60 50:100])";   // b
    String appsConfig =
        //queueName\t(priority,resource,host,expression,#repeat,reserved)
        "a\t(1,1:1,n1,,40,false);" + // app1 in a
            "b\t(1,1:1,n1,,60,false)"; // app2 in b
    boolean isPreemptionToBalanceEnabled = true;
    conf.setBoolean(
        CapacitySchedulerConfiguration.PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED,
        isPreemptionToBalanceEnabled);
    final long FB_MAX_BEFORE_KILL = 60 *1000;
    conf.setLong(
        CapacitySchedulerConfiguration.MAX_WAIT_BEFORE_KILL_FOR_QUEUE_BALANCE_PREEMPTION,
        FB_MAX_BEFORE_KILL);
    buildEnv(labelsConfig, nodesConfig, queuesConfig, appsConfig, true);
    policy.editSchedule();
    Map<PreemptionCandidatesSelector, Map<ApplicationAttemptId,
        Set<RMContainer>>> pcps= policy.getToPreemptCandidatesPerSelector();
    String FIFO_CANDIDATE_SELECTOR = "FifoCandidatesSelector";
    boolean hasFifoSelector = false;
    for (Map.Entry<PreemptionCandidatesSelector, Map<ApplicationAttemptId,
        Set<RMContainer>>> pc : pcps.entrySet()) {
      if (pc.getKey().getClass().getSimpleName().equals(FIFO_CANDIDATE_SELECTOR)) {
        FifoCandidatesSelector pcs = (FifoCandidatesSelector) pc.getKey();
        if (pcs.getAllowQueuesBalanceAfterAllQueuesSatisfied() == true) {
          hasFifoSelector = true;
          assertEquals(pcs.getMaximumKillWaitTimeMs(), FB_MAX_BEFORE_KILL);
        }
      }
    }
    assertEquals(hasFifoSelector, true);
    // 21 containers will be preempted here
    verify(mDisp, times(21)).handle(argThat(
        new TestProportionalCapacityPreemptionPolicy.
            IsPreemptionRequestFor(getAppAttemptId(2))));
    assertEquals(60, policy.getQueuePartitions().get("a")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(60, policy.getQueuePartitions().get("a")
        .get("").getIdealAssigned().getVirtualCores());
    assertEquals(40, policy.getQueuePartitions().get("b")
        .get("").getIdealAssigned().getMemorySize());
    assertEquals(40, policy.getQueuePartitions().get("b")
        .get("").getIdealAssigned().getVirtualCores());
  }
 }
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerSurgicalPreemption.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerSurgicalPreemption.java
@ -1111,5 +1111,116 @@ public void testPreemptionForFragmentatedCluster() throws Exception {
    rm1.close();
  }
  @Test(timeout = 600000)
  public void testPreemptionToBalanceWithCustomTimeout() throws Exception {
    /**
     * Test case: Submit two application (app1/app2) to different queues, queue
     * structure:
     *
     * <pre>
     *             Root
     *            /  |  \
     *           a   b   c
     *          10   20  70
     * </pre>
     *
     * 1) Two nodes (n1/n2) in the cluster, each of them has 20G.
     *
     * 2) app1 submit to queue-b, asks for 1G * 5
     *
     * 3) app2 submit to queue-c, ask for one 4G container (for AM)
     *
     * After preemption, we should expect:
     * 1. Preempt 4 containers from app1
     * 2. the selected containers will be killed after configured timeout.
     * 3. AM of app2 successfully allocated.
     */
    conf.setBoolean(
        CapacitySchedulerConfiguration.PREEMPTION_TO_BALANCE_QUEUES_BEYOND_GUARANTEED,
        true);
    conf.setLong(
        CapacitySchedulerConfiguration.MAX_WAIT_BEFORE_KILL_FOR_QUEUE_BALANCE_PREEMPTION,
        20*1000);
    CapacitySchedulerConfiguration conf = new CapacitySchedulerConfiguration(
        this.conf);
    MockRM rm1 = new MockRM(conf);
    rm1.getRMContext().setNodeLabelManager(mgr);
    rm1.start();
    MockNM nm1 = rm1.registerNode("h1:1234", 20 * GB);
    MockNM nm2 = rm1.registerNode("h2:1234", 20 * GB);
    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
    RMNode rmNode2 = rm1.getRMContext().getRMNodes().get(nm2.getNodeId());
    // launch an app to queue, AM container should be launched in nm1
    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "a");
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    am1.allocate("*", 1 * GB, 38, new ArrayList<ContainerId>());
    // Do allocation for node1/node2
    for (int i = 0; i < 38; i++) {
      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
    }
    // App1 should have 39 containers now
    FiCaSchedulerApp schedulerApp1 = cs.getApplicationAttempt(
        am1.getApplicationAttemptId());
    Assert.assertEquals(39, schedulerApp1.getLiveContainers().size());
    // 20 from n1 and 19 from n2
    waitNumberOfLiveContainersOnNodeFromApp(cs.getNode(rmNode1.getNodeID()),
        am1.getApplicationAttemptId(), 20);
    waitNumberOfLiveContainersOnNodeFromApp(cs.getNode(rmNode2.getNodeID()),
        am1.getApplicationAttemptId(), 19);
    // Submit app2 to queue-c and asks for a 4G container for AM
    RMApp app2 = rm1.submitApp(4 * GB, "app", "user", null, "c");
    FiCaSchedulerApp schedulerApp2 = cs.getApplicationAttempt(
        ApplicationAttemptId.newInstance(app2.getApplicationId(), 1));
    // Call editSchedule: containers are selected to be preemption candidate
    SchedulingMonitorManager smm = ((CapacityScheduler) rm1.
        getResourceScheduler()).getSchedulingMonitorManager();
    SchedulingMonitor smon = smm.getAvailableSchedulingMonitor();
    ProportionalCapacityPreemptionPolicy editPolicy =
        (ProportionalCapacityPreemptionPolicy) smon.getSchedulingEditPolicy();
    editPolicy.editSchedule();
    Assert.assertEquals(4, editPolicy.getToPreemptContainers().size());
    // check live containers immediately, nothing happen
    Assert.assertEquals(39, schedulerApp1.getLiveContainers().size());
    Thread.sleep(20*1000);
    // Call editSchedule again: selected containers are killed
    editPolicy.editSchedule();
    waitNumberOfLiveContainersFromApp(schedulerApp1, 35);
    // Call allocation, containers are reserved
    cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
    cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
    waitNumberOfReservedContainersFromApp(schedulerApp2, 1);
    // Call editSchedule twice and allocation once, container should get allocated
    editPolicy.editSchedule();
    editPolicy.editSchedule();
    int tick = 0;
    while (schedulerApp2.getLiveContainers().size() != 1 && tick < 10) {
      cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
      cs.handle(new NodeUpdateSchedulerEvent(rmNode2));
      tick++;
      Thread.sleep(100);
    }
    waitNumberOfReservedContainersFromApp(schedulerApp2, 0);
    rm1.close();
  }
 }