YARN-10590. Consider legacy auto queue creation absolute resource template to avoid rounding errors. Contributed by Andras Gyori

This commit is contained in:
Szilard Nemeth 2022-02-22 12:26:35 +01:00
parent 0463498adc
commit 365375412f
6 changed files with 83 additions and 42 deletions

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueResourceQuotas;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerDynamicEditException;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.QueueEntitlement;
@ -30,6 +31,8 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.AbstractCSQueue.CapacityConfigType.ABSOLUTE_RESOURCE;
/**
* Leaf queues which are auto created by an underlying implementation of
* AbstractManagedParentQueue. Eg: PlanQueue for reservations or
@ -81,14 +84,14 @@ public class AutoCreatedLeafQueue extends AbstractAutoCreatedLeafQueue {
QueueCapacities capacities = leafQueueTemplate.getQueueCapacities();
//reset capacities for the leaf queue
mergeCapacities(capacities);
mergeCapacities(capacities, leafQueueTemplate.getResourceQuotas());
} finally {
writeLock.unlock();
}
}
public void mergeCapacities(QueueCapacities capacities) {
public void mergeCapacities(QueueCapacities capacities, QueueResourceQuotas resourceQuotas) {
for ( String nodeLabel : capacities.getExistingNodeLabels()) {
queueCapacities.setCapacity(nodeLabel,
capacities.getCapacity(nodeLabel));
@ -101,9 +104,19 @@ public class AutoCreatedLeafQueue extends AbstractAutoCreatedLeafQueue {
Resource resourceByLabel = labelManager.getResourceByLabel(nodeLabel,
queueContext.getClusterResource());
getQueueResourceQuotas().setEffectiveMinResource(nodeLabel,
Resources.multiply(resourceByLabel,
queueCapacities.getAbsoluteCapacity(nodeLabel)));
// Update effective resource from template due to rounding errors.
// However, we need to consider deactivation as well, in which case we fall back to
// Percentage calculation (as absolute capacity will be 0, resource will be zero as well).
if (getCapacityConfigType().equals(ABSOLUTE_RESOURCE)
&& queueCapacities.getAbsoluteCapacity(nodeLabel) > 0) {
getQueueResourceQuotas().setEffectiveMinResource(nodeLabel,
resourceQuotas.getConfiguredMinResource(nodeLabel));
} else {
getQueueResourceQuotas().setEffectiveMinResource(nodeLabel,
Resources.multiply(resourceByLabel,
queueCapacities.getAbsoluteCapacity(nodeLabel)));
}
getQueueResourceQuotas().setEffectiveMaxResource(nodeLabel,
Resources.multiply(resourceByLabel, queueCapacities
.getAbsoluteMaximumCapacity(nodeLabel)));

View File

@ -17,6 +17,8 @@
*/
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueResourceQuotas;
/**
* Auto Created Leaf queue configurations, capacity
*/
@ -31,15 +33,19 @@ public class AutoCreatedLeafQueueConfig {
private CapacitySchedulerConfiguration leafQueueConfigs;
private final QueueResourceQuotas resourceQuotas;
/**
 * Creates a leaf queue template configuration from the values held by the
 * given builder. The builder's objects are stored by reference; no copies
 * are made.
 *
 * @param builder builder supplying the queue capacities, the leaf queue
 *                configuration and the resource quotas
 */
public AutoCreatedLeafQueueConfig(Builder builder) {
  this.resourceQuotas = builder.queueResourceQuotas;
  this.leafQueueConfigs = builder.leafQueueConfigs;
  this.queueCapacities = builder.queueCapacities;
}
public static class Builder {
private QueueCapacities queueCapacities;
private CapacitySchedulerConfiguration leafQueueConfigs;
private QueueResourceQuotas queueResourceQuotas;
public Builder capacities(QueueCapacities capacities) {
this.queueCapacities = capacities;
@ -54,6 +60,11 @@ public class AutoCreatedLeafQueueConfig {
/**
 * Builds the template configuration from the values set on this builder.
 *
 * @return a new AutoCreatedLeafQueueConfig constructed from this builder
 */
public AutoCreatedLeafQueueConfig build() {
return new AutoCreatedLeafQueueConfig(this);
}
/**
 * Sets the resource quotas that the built configuration will expose.
 * The object is stored by reference.
 *
 * @param queueResourceQuotas resource quotas to carry in the configuration
 * @return this builder, so calls can be chained
 */
public Builder resourceQuotas(QueueResourceQuotas queueResourceQuotas) {
this.queueResourceQuotas = queueResourceQuotas;
return this;
}
}
public QueueCapacities getQueueCapacities() {
@ -64,6 +75,10 @@ public class AutoCreatedLeafQueueConfig {
return leafQueueConfigs;
}
/**
 * Returns the resource quotas this configuration was constructed with.
 *
 * @return the resource quotas taken from the builder; presumably null when
 *         the builder was never given quotas (verify against callers)
 */
public QueueResourceQuotas getResourceQuotas() {
return resourceQuotas;
}
@Override public String toString() {
return "AutoCreatedLeafQueueConfig{" + "queueCapacities=" + queueCapacities
+ ", leafQueueConfigs=" + leafQueueConfigs + '}';

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueResourceQuotas;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler
.SchedulerDynamicEditException;
@ -161,25 +162,14 @@ public class ManagedParentQueue extends AbstractManagedParentQueue {
CapacitySchedulerConfiguration autoCreatedTemplateConfig =
super.initializeLeafQueueConfigs(leafQueueTemplateConfPrefix);
builder.configuration(autoCreatedTemplateConfig);
QueueResourceQuotas queueResourceQuotas = new QueueResourceQuotas();
setAbsoluteResourceTemplates(configuration, queueResourceQuotas);
QueuePath templateQueuePath = configuration
.getAutoCreatedQueueObjectTemplateConfPrefix(getQueuePath());
Set<String> templateConfiguredNodeLabels = queueContext
.getQueueManager().getConfiguredNodeLabelsForAllQueues()
.getLabelsByQueue(templateQueuePath.getFullPath());
for (String nodeLabel : templateConfiguredNodeLabels) {
Resource templateMinResource = autoCreatedTemplateConfig.getMinimumResourceRequirement(
nodeLabel, configuration
.getAutoCreatedQueueTemplateConfPrefix(getQueuePath()),
resourceTypes);
if (this.capacityConfigType.equals(CapacityConfigType.PERCENTAGE)
&& !templateMinResource.equals(Resources.none())) {
throw new IOException("Managed Parent Queue " + this.getQueuePath()
+ " config type is different from leaf queue template config type");
}
}
//Load template capacities
QueueCapacities queueCapacities = new QueueCapacities(false);
CSQueueUtils.loadCapacitiesByLabelsFromConf(templateQueuePath,
@ -187,7 +177,6 @@ public class ManagedParentQueue extends AbstractManagedParentQueue {
configuration,
templateConfiguredNodeLabels);
/**
* Populate leaf queue template (of Parent resources configured in
* ABSOLUTE_RESOURCE) capacities with actual values for which configured has
@ -198,9 +187,31 @@ public class ManagedParentQueue extends AbstractManagedParentQueue {
updateQueueCapacities(queueCapacities);
}
builder.capacities(queueCapacities);
builder.resourceQuotas(queueResourceQuotas);
return builder;
}
/**
 * Reads the leaf queue template's minimum resource requirement for every node
 * label configured on the template path and stores it in the given quotas
 * object as the configured minimum resource of that label.
 *
 * @param configuration scheduler configuration the template values are read
 *                      from
 * @param queueResourceQuotas quotas object that receives the per-label
 *                            configured minimum resource
 * @throws IOException if this parent queue's capacity config type is
 *                     PERCENTAGE while the template yields a minimum resource
 *                     other than {@code Resources.none()} for some label
 */
private void setAbsoluteResourceTemplates(CapacitySchedulerConfiguration configuration,
    QueueResourceQuotas queueResourceQuotas) throws IOException {
  QueuePath templatePath =
      configuration.getAutoCreatedQueueObjectTemplateConfPrefix(getQueuePath());
  Set<String> labelsOfTemplate = queueContext.getQueueManager()
      .getConfiguredNodeLabelsForAllQueues()
      .getLabelsByQueue(templatePath.getFullPath());

  for (String label : labelsOfTemplate) {
    Resource minResourceOfTemplate = configuration.getMinimumResourceRequirement(
        label, templatePath.getFullPath(), resourceTypes);
    // The value is recorded before the consistency check, so the entry for
    // this label is already written when a mismatch is reported below.
    queueResourceQuotas.setConfiguredMinResource(label, minResourceOfTemplate);

    if (!this.capacityConfigType.equals(CapacityConfigType.PERCENTAGE)) {
      continue;
    }
    if (minResourceOfTemplate.equals(Resources.none())) {
      continue;
    }
    throw new IOException("Managed Parent Queue " + this.getQueuePath()
        + " config type is different from leaf queue template config type");
  }
}
private void updateQueueCapacities(QueueCapacities queueCapacities) {
CapacitySchedulerConfiguration configuration =
queueContext.getConfiguration();

View File

@ -592,19 +592,18 @@ public class GuaranteedOrZeroCapacityOverTimePolicy
for (String nodeLabel : updatedQueueTemplate.getQueueCapacities()
.getExistingNodeLabels()) {
if (updatedQueueTemplate.getQueueCapacities().
getCapacity(nodeLabel) > 0) {
if (updatedQueueTemplate.getQueueCapacities().getCapacity(nodeLabel) > 0) {
if (isActive(leafQueue, nodeLabel)) {
LOG.debug("Queue is already active. Skipping activation : {}",
leafQueue.getQueuePath());
} else{
activate(leafQueue, nodeLabel);
}
} else{
} else {
if (!isActive(leafQueue, nodeLabel)) {
LOG.debug("Queue is already de-activated. Skipping "
+ "de-activation : {}", leafQueue.getQueuePath());
} else{
} else {
/**
* While deactivating queues of type ABSOLUTE_RESOURCE, configured
* min resource has to be set based on updated capacity (which is
@ -613,7 +612,7 @@ public class GuaranteedOrZeroCapacityOverTimePolicy
* leads to incorrect results.
*/
leafQueue
.mergeCapacities(updatedQueueTemplate.getQueueCapacities());
.mergeCapacities(updatedQueueTemplate.getQueueCapacities(), leafQueueTemplate.getResourceQuotas());
leafQueue.getQueueResourceQuotas()
.setConfiguredMinResource(Resources.multiply(
managedParentQueue.getQueueContext().getClusterResource(),
@ -787,6 +786,7 @@ public class GuaranteedOrZeroCapacityOverTimePolicy
AutoCreatedLeafQueueConfig.Builder templateBuilder =
new AutoCreatedLeafQueueConfig.Builder();
templateBuilder.capacities(capacities);
templateBuilder.resourceQuotas(managedParentQueue.getLeafQueueTemplate().getResourceQuotas());
return new AutoCreatedLeafQueueConfig(templateBuilder);
}
}

View File

@ -148,8 +148,6 @@ public class TestAbsoluteResourceWithAutoQueue
return csConf;
}
// TODO: Wangda: I think this test case is not correct, Sunil could help look
// into details.
@Test(timeout = 20000)
public void testAutoCreateLeafQueueCreation() throws Exception {
@ -182,10 +180,8 @@ public class TestAbsoluteResourceWithAutoQueue
ManagedParentQueue parentQueue = (ManagedParentQueue) cs.getQueue(QUEUED);
assertEquals(parentQueue, autoCreatedLeafQueue.getParent());
validateCapacities((AutoCreatedLeafQueue) autoCreatedLeafQueue, 0.4f,
0.04f, 1f, 0.6f);
validateCapacitiesByLabel((ManagedParentQueue) parentQueue,
(AutoCreatedLeafQueue) autoCreatedLeafQueue, NO_LABEL);
validateCapacities(autoCreatedLeafQueue, 0.4f, 0.04f, 1f, 0.6f);
validateCapacitiesByLabel(parentQueue, autoCreatedLeafQueue, NO_LABEL);
Map<String, Float> expectedChildQueueAbsCapacity =
new HashMap<String, Float>() {

View File

@ -89,6 +89,7 @@ import java.util.concurrent.TimeUnit;
import static org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager
.NO_LABEL;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.AbstractCSQueue.CapacityConfigType.ABSOLUTE_RESOURCE;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler
.capacity.CSQueueUtils.EPSILON;
import static org.apache.hadoop.yarn.server.resourcemanager.scheduler
@ -166,6 +167,7 @@ public class TestCapacitySchedulerAutoCreatedQueueBase {
public static final float NODE_LABEL_GPU_TEMPLATE_CAPACITY = 30.0f;
public static final float NODEL_LABEL_SSD_TEMPLATE_CAPACITY = 40.0f;
public static final ImmutableSet<String> RESOURCE_TYPES = ImmutableSet.of("memory", "vcores");
protected MockRM mockRM = null;
protected MockNM nm1 = null;
@ -777,17 +779,21 @@ public class TestCapacitySchedulerAutoCreatedQueueBase {
* parentQueue.getQueueCapacities().getAbsoluteCapacity(label));
assertEquals(effMinCapacity, Resources.multiply(resourceByLabel,
leafQueue.getQueueCapacities().getAbsoluteCapacity(label)));
// TODO: Wangda, I think this is a wrong test, it doesn't consider rounding
// loss of multiplication, the right value should be <10240, 2>, but the
// test expects <10240, 1>
// fixme, address this in the future patch (auto queue creation).
// if (expectedQueueEntitlements.get(label).getCapacity() > EPSILON) {
// assertEquals(Resource.newInstance(10 * GB, 2),
// leafQueue.getEffectiveCapacity(label));
// } else {
// assertEquals(Resource.newInstance(0, 0),
// leafQueue.getEffectiveCapacity(label));
// }
if (expectedQueueEntitlements.get(label).getCapacity() > EPSILON) {
if (leafQueue.getCapacityConfigType().equals(ABSOLUTE_RESOURCE)) {
String templatePrefix = cs.getConfiguration().getAutoCreatedQueueTemplateConfPrefix(
parentQueue.getQueuePath());
Resource resourceTemplate = parentQueue.getLeafQueueTemplate().getLeafQueueConfigs()
.getMinimumResourceRequirement(label, templatePrefix, RESOURCE_TYPES);
assertEquals(resourceTemplate, leafQueue.getEffectiveCapacity(label));
} else {
assertEquals(effMinCapacity, leafQueue.getEffectiveCapacity(label));
}
} else {
assertEquals(Resource.newInstance(0, 0),
leafQueue.getEffectiveCapacity(label));
}
if (leafQueue.getQueueCapacities().getAbsoluteCapacity(label) > 0) {
assertTrue(Resources.greaterThan(cs.getResourceCalculator(),