YARN-10590. Consider legacy auto queue creation absolute resource template to avoid rounding errors. Contributed by Andras Gyori

This commit is contained in:
Szilard Nemeth 2022-02-22 12:26:35 +01:00
parent 0463498adc
commit 365375412f
6 changed files with 83 additions and 42 deletions

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueResourceQuotas;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerDynamicEditException; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerDynamicEditException;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.QueueEntitlement; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.QueueEntitlement;
@ -30,6 +31,8 @@ import java.io.IOException;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.AbstractCSQueue.CapacityConfigType.ABSOLUTE_RESOURCE;
/** /**
* Leaf queues which are auto created by an underlying implementation of * Leaf queues which are auto created by an underlying implementation of
* AbstractManagedParentQueue. Eg: PlanQueue for reservations or * AbstractManagedParentQueue. Eg: PlanQueue for reservations or
@ -81,14 +84,14 @@ public class AutoCreatedLeafQueue extends AbstractAutoCreatedLeafQueue {
QueueCapacities capacities = leafQueueTemplate.getQueueCapacities(); QueueCapacities capacities = leafQueueTemplate.getQueueCapacities();
//reset capacities for the leaf queue //reset capacities for the leaf queue
mergeCapacities(capacities); mergeCapacities(capacities, leafQueueTemplate.getResourceQuotas());
} finally { } finally {
writeLock.unlock(); writeLock.unlock();
} }
} }
public void mergeCapacities(QueueCapacities capacities) { public void mergeCapacities(QueueCapacities capacities, QueueResourceQuotas resourceQuotas) {
for ( String nodeLabel : capacities.getExistingNodeLabels()) { for ( String nodeLabel : capacities.getExistingNodeLabels()) {
queueCapacities.setCapacity(nodeLabel, queueCapacities.setCapacity(nodeLabel,
capacities.getCapacity(nodeLabel)); capacities.getCapacity(nodeLabel));
@ -101,9 +104,19 @@ public class AutoCreatedLeafQueue extends AbstractAutoCreatedLeafQueue {
Resource resourceByLabel = labelManager.getResourceByLabel(nodeLabel, Resource resourceByLabel = labelManager.getResourceByLabel(nodeLabel,
queueContext.getClusterResource()); queueContext.getClusterResource());
getQueueResourceQuotas().setEffectiveMinResource(nodeLabel, // Update effective resource from template due to rounding errors.
Resources.multiply(resourceByLabel, // However, we need to consider deactivation as well, in which case we fall back to
queueCapacities.getAbsoluteCapacity(nodeLabel))); // Percentage calculation (as absolute capacity will be 0, resource will be zero as well).
if (getCapacityConfigType().equals(ABSOLUTE_RESOURCE)
&& queueCapacities.getAbsoluteCapacity(nodeLabel) > 0) {
getQueueResourceQuotas().setEffectiveMinResource(nodeLabel,
resourceQuotas.getConfiguredMinResource(nodeLabel));
} else {
getQueueResourceQuotas().setEffectiveMinResource(nodeLabel,
Resources.multiply(resourceByLabel,
queueCapacities.getAbsoluteCapacity(nodeLabel)));
}
getQueueResourceQuotas().setEffectiveMaxResource(nodeLabel, getQueueResourceQuotas().setEffectiveMaxResource(nodeLabel,
Resources.multiply(resourceByLabel, queueCapacities Resources.multiply(resourceByLabel, queueCapacities
.getAbsoluteMaximumCapacity(nodeLabel))); .getAbsoluteMaximumCapacity(nodeLabel)));

View File

@ -17,6 +17,8 @@
*/ */
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueResourceQuotas;
/** /**
* Auto Created Leaf queue configurations, capacity * Auto Created Leaf queue configurations, capacity
*/ */
@ -31,15 +33,19 @@ public class AutoCreatedLeafQueueConfig {
private CapacitySchedulerConfiguration leafQueueConfigs; private CapacitySchedulerConfiguration leafQueueConfigs;
private final QueueResourceQuotas resourceQuotas;
public AutoCreatedLeafQueueConfig(Builder builder) { public AutoCreatedLeafQueueConfig(Builder builder) {
this.queueCapacities = builder.queueCapacities; this.queueCapacities = builder.queueCapacities;
this.leafQueueConfigs = builder.leafQueueConfigs; this.leafQueueConfigs = builder.leafQueueConfigs;
this.resourceQuotas = builder.queueResourceQuotas;
} }
public static class Builder { public static class Builder {
private QueueCapacities queueCapacities; private QueueCapacities queueCapacities;
private CapacitySchedulerConfiguration leafQueueConfigs; private CapacitySchedulerConfiguration leafQueueConfigs;
private QueueResourceQuotas queueResourceQuotas;
public Builder capacities(QueueCapacities capacities) { public Builder capacities(QueueCapacities capacities) {
this.queueCapacities = capacities; this.queueCapacities = capacities;
@ -54,6 +60,11 @@ public class AutoCreatedLeafQueueConfig {
public AutoCreatedLeafQueueConfig build() { public AutoCreatedLeafQueueConfig build() {
return new AutoCreatedLeafQueueConfig(this); return new AutoCreatedLeafQueueConfig(this);
} }
/**
 * Sets the resource quotas that the built {@link AutoCreatedLeafQueueConfig}
 * will expose through {@code getResourceQuotas()}.
 *
 * @param queueResourceQuotas quotas to store on this builder; stored as-is,
 *                            without copying or validation
 * @return this builder, to allow call chaining
 */
public Builder resourceQuotas(QueueResourceQuotas queueResourceQuotas) {
this.queueResourceQuotas = queueResourceQuotas;
return this;
}
} }
public QueueCapacities getQueueCapacities() { public QueueCapacities getQueueCapacities() {
@ -64,6 +75,10 @@ public class AutoCreatedLeafQueueConfig {
return leafQueueConfigs; return leafQueueConfigs;
} }
/**
 * Returns the resource quotas that were handed to the builder.
 *
 * @return the quotas taken from the builder at construction time; {@code null}
 *         if {@code Builder#resourceQuotas} was never called, since the
 *         builder field has no default value
 */
public QueueResourceQuotas getResourceQuotas() {
return resourceQuotas;
}
@Override public String toString() { @Override public String toString() {
return "AutoCreatedLeafQueueConfig{" + "queueCapacities=" + queueCapacities return "AutoCreatedLeafQueueConfig{" + "queueCapacities=" + queueCapacities
+ ", leafQueueConfigs=" + leafQueueConfigs + '}'; + ", leafQueueConfigs=" + leafQueueConfigs + '}';

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueResourceQuotas;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler import org.apache.hadoop.yarn.server.resourcemanager.scheduler
.SchedulerDynamicEditException; .SchedulerDynamicEditException;
@ -161,25 +162,14 @@ public class ManagedParentQueue extends AbstractManagedParentQueue {
CapacitySchedulerConfiguration autoCreatedTemplateConfig = CapacitySchedulerConfiguration autoCreatedTemplateConfig =
super.initializeLeafQueueConfigs(leafQueueTemplateConfPrefix); super.initializeLeafQueueConfigs(leafQueueTemplateConfPrefix);
builder.configuration(autoCreatedTemplateConfig); builder.configuration(autoCreatedTemplateConfig);
QueueResourceQuotas queueResourceQuotas = new QueueResourceQuotas();
setAbsoluteResourceTemplates(configuration, queueResourceQuotas);
QueuePath templateQueuePath = configuration QueuePath templateQueuePath = configuration
.getAutoCreatedQueueObjectTemplateConfPrefix(getQueuePath()); .getAutoCreatedQueueObjectTemplateConfPrefix(getQueuePath());
Set<String> templateConfiguredNodeLabels = queueContext Set<String> templateConfiguredNodeLabels = queueContext
.getQueueManager().getConfiguredNodeLabelsForAllQueues() .getQueueManager().getConfiguredNodeLabelsForAllQueues()
.getLabelsByQueue(templateQueuePath.getFullPath()); .getLabelsByQueue(templateQueuePath.getFullPath());
for (String nodeLabel : templateConfiguredNodeLabels) {
Resource templateMinResource = autoCreatedTemplateConfig.getMinimumResourceRequirement(
nodeLabel, configuration
.getAutoCreatedQueueTemplateConfPrefix(getQueuePath()),
resourceTypes);
if (this.capacityConfigType.equals(CapacityConfigType.PERCENTAGE)
&& !templateMinResource.equals(Resources.none())) {
throw new IOException("Managed Parent Queue " + this.getQueuePath()
+ " config type is different from leaf queue template config type");
}
}
//Load template capacities //Load template capacities
QueueCapacities queueCapacities = new QueueCapacities(false); QueueCapacities queueCapacities = new QueueCapacities(false);
CSQueueUtils.loadCapacitiesByLabelsFromConf(templateQueuePath, CSQueueUtils.loadCapacitiesByLabelsFromConf(templateQueuePath,
@ -187,7 +177,6 @@ public class ManagedParentQueue extends AbstractManagedParentQueue {
configuration, configuration,
templateConfiguredNodeLabels); templateConfiguredNodeLabels);
/** /**
* Populate leaf queue template (of Parent resources configured in * Populate leaf queue template (of Parent resources configured in
* ABSOLUTE_RESOURCE) capacities with actual values for which configured has * ABSOLUTE_RESOURCE) capacities with actual values for which configured has
@ -198,9 +187,31 @@ public class ManagedParentQueue extends AbstractManagedParentQueue {
updateQueueCapacities(queueCapacities); updateQueueCapacities(queueCapacities);
} }
builder.capacities(queueCapacities); builder.capacities(queueCapacities);
builder.resourceQuotas(queueResourceQuotas);
return builder; return builder;
} }
/**
 * Copies the leaf queue template's minimum resource requirement, per node
 * label, into the given quotas as the configured min resource.
 *
 * <p>The labels iterated are the ones configured for the template queue path
 * of this parent queue. For each label the quota is recorded first and the
 * config-type consistency check runs afterwards, so when the check fails the
 * offending label's value has already been stored in
 * {@code queueResourceQuotas}.
 *
 * @param configuration scheduler configuration the template values are read from
 * @param queueResourceQuotas quotas object that receives the configured min
 *                            resource for every template label
 * @throws IOException if this parent queue's capacity config type is
 *                     {@code PERCENTAGE} while the template declares a
 *                     minimum resource other than {@code Resources.none()}
 */
private void setAbsoluteResourceTemplates(CapacitySchedulerConfiguration configuration,
    QueueResourceQuotas queueResourceQuotas) throws IOException {
  final QueuePath templatePath =
      configuration.getAutoCreatedQueueObjectTemplateConfPrefix(getQueuePath());
  // Full path of the template queue; used both for the label lookup and the resource lookup.
  final String templateFullPath = templatePath.getFullPath();
  final Set<String> templateLabels = queueContext
      .getQueueManager()
      .getConfiguredNodeLabelsForAllQueues()
      .getLabelsByQueue(templateFullPath);

  for (String label : templateLabels) {
    final Resource minResource =
        configuration.getMinimumResourceRequirement(label, templateFullPath, resourceTypes);
    queueResourceQuotas.setConfiguredMinResource(label, minResource);

    // A percentage-based parent must not carry an absolute-resource template.
    final boolean parentIsPercentage =
        this.capacityConfigType.equals(CapacityConfigType.PERCENTAGE);
    if (parentIsPercentage && !minResource.equals(Resources.none())) {
      throw new IOException("Managed Parent Queue " + this.getQueuePath()
          + " config type is different from leaf queue template config type");
    }
  }
}
private void updateQueueCapacities(QueueCapacities queueCapacities) { private void updateQueueCapacities(QueueCapacities queueCapacities) {
CapacitySchedulerConfiguration configuration = CapacitySchedulerConfiguration configuration =
queueContext.getConfiguration(); queueContext.getConfiguration();

View File

@ -592,19 +592,18 @@ public class GuaranteedOrZeroCapacityOverTimePolicy
for (String nodeLabel : updatedQueueTemplate.getQueueCapacities() for (String nodeLabel : updatedQueueTemplate.getQueueCapacities()
.getExistingNodeLabels()) { .getExistingNodeLabels()) {
if (updatedQueueTemplate.getQueueCapacities(). if (updatedQueueTemplate.getQueueCapacities().getCapacity(nodeLabel) > 0) {
getCapacity(nodeLabel) > 0) {
if (isActive(leafQueue, nodeLabel)) { if (isActive(leafQueue, nodeLabel)) {
LOG.debug("Queue is already active. Skipping activation : {}", LOG.debug("Queue is already active. Skipping activation : {}",
leafQueue.getQueuePath()); leafQueue.getQueuePath());
} else{ } else{
activate(leafQueue, nodeLabel); activate(leafQueue, nodeLabel);
} }
} else{ } else {
if (!isActive(leafQueue, nodeLabel)) { if (!isActive(leafQueue, nodeLabel)) {
LOG.debug("Queue is already de-activated. Skipping " LOG.debug("Queue is already de-activated. Skipping "
+ "de-activation : {}", leafQueue.getQueuePath()); + "de-activation : {}", leafQueue.getQueuePath());
} else{ } else {
/** /**
* While deactivating queues of type ABSOLUTE_RESOURCE, configured * While deactivating queues of type ABSOLUTE_RESOURCE, configured
* min resource has to be set based on updated capacity (which is * min resource has to be set based on updated capacity (which is
@ -613,7 +612,7 @@ public class GuaranteedOrZeroCapacityOverTimePolicy
* leads to incorrect results. * leads to incorrect results.
*/ */
leafQueue leafQueue
.mergeCapacities(updatedQueueTemplate.getQueueCapacities()); .mergeCapacities(updatedQueueTemplate.getQueueCapacities(), leafQueueTemplate.getResourceQuotas());
leafQueue.getQueueResourceQuotas() leafQueue.getQueueResourceQuotas()
.setConfiguredMinResource(Resources.multiply( .setConfiguredMinResource(Resources.multiply(
managedParentQueue.getQueueContext().getClusterResource(), managedParentQueue.getQueueContext().getClusterResource(),
@ -787,6 +786,7 @@ public class GuaranteedOrZeroCapacityOverTimePolicy
AutoCreatedLeafQueueConfig.Builder templateBuilder = AutoCreatedLeafQueueConfig.Builder templateBuilder =
new AutoCreatedLeafQueueConfig.Builder(); new AutoCreatedLeafQueueConfig.Builder();
templateBuilder.capacities(capacities); templateBuilder.capacities(capacities);
templateBuilder.resourceQuotas(managedParentQueue.getLeafQueueTemplate().getResourceQuotas());
return new AutoCreatedLeafQueueConfig(templateBuilder); return new AutoCreatedLeafQueueConfig(templateBuilder);
} }
} }

View File

@ -148,8 +148,6 @@ public class TestAbsoluteResourceWithAutoQueue
return csConf; return csConf;
} }
// TODO: Wangda: I think this test case is not correct, Sunil could help look
// into details.
@Test(timeout = 20000) @Test(timeout = 20000)
public void testAutoCreateLeafQueueCreation() throws Exception { public void testAutoCreateLeafQueueCreation() throws Exception {
@ -182,10 +180,8 @@ public class TestAbsoluteResourceWithAutoQueue
ManagedParentQueue parentQueue = (ManagedParentQueue) cs.getQueue(QUEUED); ManagedParentQueue parentQueue = (ManagedParentQueue) cs.getQueue(QUEUED);
assertEquals(parentQueue, autoCreatedLeafQueue.getParent()); assertEquals(parentQueue, autoCreatedLeafQueue.getParent());
validateCapacities((AutoCreatedLeafQueue) autoCreatedLeafQueue, 0.4f, validateCapacities(autoCreatedLeafQueue, 0.4f, 0.04f, 1f, 0.6f);
0.04f, 1f, 0.6f); validateCapacitiesByLabel(parentQueue, autoCreatedLeafQueue, NO_LABEL);
validateCapacitiesByLabel((ManagedParentQueue) parentQueue,
(AutoCreatedLeafQueue) autoCreatedLeafQueue, NO_LABEL);
Map<String, Float> expectedChildQueueAbsCapacity = Map<String, Float> expectedChildQueueAbsCapacity =
new HashMap<String, Float>() { new HashMap<String, Float>() {

View File

@ -89,6 +89,7 @@ import java.util.concurrent.TimeUnit;
import static org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager import static org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager
.NO_LABEL; .NO_LABEL;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.AbstractCSQueue.CapacityConfigType.ABSOLUTE_RESOURCE;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler import static org.apache.hadoop.yarn.server.resourcemanager.scheduler
.capacity.CSQueueUtils.EPSILON; .capacity.CSQueueUtils.EPSILON;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler import static org.apache.hadoop.yarn.server.resourcemanager.scheduler
@ -166,6 +167,7 @@ public class TestCapacitySchedulerAutoCreatedQueueBase {
public static final float NODE_LABEL_GPU_TEMPLATE_CAPACITY = 30.0f; public static final float NODE_LABEL_GPU_TEMPLATE_CAPACITY = 30.0f;
public static final float NODEL_LABEL_SSD_TEMPLATE_CAPACITY = 40.0f; public static final float NODEL_LABEL_SSD_TEMPLATE_CAPACITY = 40.0f;
public static final ImmutableSet<String> RESOURCE_TYPES = ImmutableSet.of("memory", "vcores");
protected MockRM mockRM = null; protected MockRM mockRM = null;
protected MockNM nm1 = null; protected MockNM nm1 = null;
@ -777,17 +779,21 @@ public class TestCapacitySchedulerAutoCreatedQueueBase {
* parentQueue.getQueueCapacities().getAbsoluteCapacity(label)); * parentQueue.getQueueCapacities().getAbsoluteCapacity(label));
assertEquals(effMinCapacity, Resources.multiply(resourceByLabel, assertEquals(effMinCapacity, Resources.multiply(resourceByLabel,
leafQueue.getQueueCapacities().getAbsoluteCapacity(label))); leafQueue.getQueueCapacities().getAbsoluteCapacity(label)));
// TODO: Wangda, I think this is a wrong test, it doesn't consider rounding
// loss of multiplication, the right value should be <10240, 2>, but the if (expectedQueueEntitlements.get(label).getCapacity() > EPSILON) {
// test expects <10240, 1> if (leafQueue.getCapacityConfigType().equals(ABSOLUTE_RESOURCE)) {
// fixme, address this in the future patch (auto queue creation). String templatePrefix = cs.getConfiguration().getAutoCreatedQueueTemplateConfPrefix(
// if (expectedQueueEntitlements.get(label).getCapacity() > EPSILON) { parentQueue.getQueuePath());
// assertEquals(Resource.newInstance(10 * GB, 2), Resource resourceTemplate = parentQueue.getLeafQueueTemplate().getLeafQueueConfigs()
// leafQueue.getEffectiveCapacity(label)); .getMinimumResourceRequirement(label, templatePrefix, RESOURCE_TYPES);
// } else { assertEquals(resourceTemplate, leafQueue.getEffectiveCapacity(label));
// assertEquals(Resource.newInstance(0, 0), } else {
// leafQueue.getEffectiveCapacity(label)); assertEquals(effMinCapacity, leafQueue.getEffectiveCapacity(label));
// } }
} else {
assertEquals(Resource.newInstance(0, 0),
leafQueue.getEffectiveCapacity(label));
}
if (leafQueue.getQueueCapacities().getAbsoluteCapacity(label) > 0) { if (leafQueue.getQueueCapacities().getAbsoluteCapacity(label) > 0) {
assertTrue(Resources.greaterThan(cs.getResourceCalculator(), assertTrue(Resources.greaterThan(cs.getResourceCalculator(),