YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. Contributed by Rohith Sharmaks
This commit is contained in:
parent
e71d0d87d9
commit
bdd90110e6
|
@ -151,6 +151,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
YARN-3451. Display attempt start time and elapsed time on the web UI.
|
YARN-3451. Display attempt start time and elapsed time on the web UI.
|
||||||
(Rohith Sharmaks via jianhe)
|
(Rohith Sharmaks via jianhe)
|
||||||
|
|
||||||
|
YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. (Rohith
|
||||||
|
Sharmaks via jianhe)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
YARN-3339. TestDockerContainerExecutor should pull a single image and not
|
||||||
|
|
|
@ -83,16 +83,17 @@ public class QueueMetrics implements MetricsSource {
|
||||||
static final MetricsInfo RECORD_INFO = info("QueueMetrics",
|
static final MetricsInfo RECORD_INFO = info("QueueMetrics",
|
||||||
"Metrics for the resource scheduler");
|
"Metrics for the resource scheduler");
|
||||||
protected static final MetricsInfo QUEUE_INFO = info("Queue", "Metrics by queue");
|
protected static final MetricsInfo QUEUE_INFO = info("Queue", "Metrics by queue");
|
||||||
static final MetricsInfo USER_INFO = info("User", "Metrics by user");
|
protected static final MetricsInfo USER_INFO =
|
||||||
|
info("User", "Metrics by user");
|
||||||
static final Splitter Q_SPLITTER =
|
static final Splitter Q_SPLITTER =
|
||||||
Splitter.on('.').omitEmptyStrings().trimResults();
|
Splitter.on('.').omitEmptyStrings().trimResults();
|
||||||
|
|
||||||
final MetricsRegistry registry;
|
protected final MetricsRegistry registry;
|
||||||
final String queueName;
|
protected final String queueName;
|
||||||
final QueueMetrics parent;
|
protected final QueueMetrics parent;
|
||||||
final MetricsSystem metricsSystem;
|
protected final MetricsSystem metricsSystem;
|
||||||
private final Map<String, QueueMetrics> users;
|
protected final Map<String, QueueMetrics> users;
|
||||||
private final Configuration conf;
|
protected final Configuration conf;
|
||||||
|
|
||||||
protected QueueMetrics(MetricsSystem ms, String queueName, Queue parent,
|
protected QueueMetrics(MetricsSystem ms, String queueName, Queue parent,
|
||||||
boolean enableUserMetrics, Configuration conf) {
|
boolean enableUserMetrics, Configuration conf) {
|
||||||
|
|
|
@ -43,7 +43,6 @@ import org.apache.hadoop.yarn.security.PrivilegedEntity.EntityType;
|
||||||
import org.apache.hadoop.yarn.security.YarnAuthorizationProvider;
|
import org.apache.hadoop.yarn.security.YarnAuthorizationProvider;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
|
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
|
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
|
||||||
|
@ -67,7 +66,7 @@ public abstract class AbstractCSQueue implements CSQueue {
|
||||||
final Resource minimumAllocation;
|
final Resource minimumAllocation;
|
||||||
Resource maximumAllocation;
|
Resource maximumAllocation;
|
||||||
QueueState state;
|
QueueState state;
|
||||||
final QueueMetrics metrics;
|
final CSQueueMetrics metrics;
|
||||||
protected final PrivilegedEntity queueEntity;
|
protected final PrivilegedEntity queueEntity;
|
||||||
|
|
||||||
final ResourceCalculator resourceCalculator;
|
final ResourceCalculator resourceCalculator;
|
||||||
|
@ -100,10 +99,10 @@ public abstract class AbstractCSQueue implements CSQueue {
|
||||||
this.resourceCalculator = cs.getResourceCalculator();
|
this.resourceCalculator = cs.getResourceCalculator();
|
||||||
|
|
||||||
// must be called after parent and queueName is set
|
// must be called after parent and queueName is set
|
||||||
this.metrics = old != null ? old.getMetrics() :
|
this.metrics =
|
||||||
QueueMetrics.forQueue(getQueuePath(), parent,
|
old != null ? (CSQueueMetrics) old.getMetrics() : CSQueueMetrics
|
||||||
cs.getConfiguration().getEnableUserMetrics(),
|
.forQueue(getQueuePath(), parent, cs.getConfiguration()
|
||||||
cs.getConf());
|
.getEnableUserMetrics(), cs.getConf());
|
||||||
|
|
||||||
this.csContext = cs;
|
this.csContext = cs;
|
||||||
this.minimumAllocation = csContext.getMinimumResourceCapability();
|
this.minimumAllocation = csContext.getMinimumResourceCapability();
|
||||||
|
@ -171,7 +170,7 @@ public abstract class AbstractCSQueue implements CSQueue {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public QueueMetrics getMetrics() {
|
public CSQueueMetrics getMetrics() {
|
||||||
return metrics;
|
return metrics;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,133 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.metrics2.MetricsSystem;
|
||||||
|
import org.apache.hadoop.metrics2.annotation.Metric;
|
||||||
|
import org.apache.hadoop.metrics2.annotation.Metrics;
|
||||||
|
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||||
|
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
|
||||||
|
import org.apache.hadoop.yarn.api.records.Resource;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
|
||||||
|
|
||||||
|
@Metrics(context = "yarn")
|
||||||
|
public class CSQueueMetrics extends QueueMetrics {
|
||||||
|
|
||||||
|
@Metric("AM memory limit in MB")
|
||||||
|
MutableGaugeInt AMResourceLimitMB;
|
||||||
|
@Metric("AM CPU limit in virtual cores")
|
||||||
|
MutableGaugeInt AMResourceLimitVCores;
|
||||||
|
@Metric("Used AM memory limit in MB")
|
||||||
|
MutableGaugeInt usedAMResourceMB;
|
||||||
|
@Metric("Used AM CPU limit in virtual cores")
|
||||||
|
MutableGaugeInt usedAMResourceVCores;
|
||||||
|
|
||||||
|
CSQueueMetrics(MetricsSystem ms, String queueName, Queue parent,
|
||||||
|
boolean enableUserMetrics, Configuration conf) {
|
||||||
|
super(ms, queueName, parent, enableUserMetrics, conf);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getAMResourceLimitMB() {
|
||||||
|
return AMResourceLimitMB.value();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getAMResourceLimitVCores() {
|
||||||
|
return AMResourceLimitVCores.value();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getUsedAMResourceMB() {
|
||||||
|
return usedAMResourceMB.value();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getUsedAMResourceVCores() {
|
||||||
|
return usedAMResourceVCores.value();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAMResouceLimit(Resource res) {
|
||||||
|
AMResourceLimitMB.set(res.getMemory());
|
||||||
|
AMResourceLimitVCores.set(res.getVirtualCores());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAMResouceLimitForUser(String user, Resource res) {
|
||||||
|
CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user);
|
||||||
|
if (userMetrics != null) {
|
||||||
|
userMetrics.setAMResouceLimit(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void incAMUsed(String user, Resource res) {
|
||||||
|
usedAMResourceMB.incr(res.getMemory());
|
||||||
|
usedAMResourceVCores.incr(res.getVirtualCores());
|
||||||
|
CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user);
|
||||||
|
if (userMetrics != null) {
|
||||||
|
userMetrics.incAMUsed(user, res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void decAMUsed(String user, Resource res) {
|
||||||
|
usedAMResourceMB.decr(res.getMemory());
|
||||||
|
usedAMResourceVCores.decr(res.getVirtualCores());
|
||||||
|
CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user);
|
||||||
|
if (userMetrics != null) {
|
||||||
|
userMetrics.decAMUsed(user, res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized static CSQueueMetrics forQueue(String queueName,
|
||||||
|
Queue parent, boolean enableUserMetrics, Configuration conf) {
|
||||||
|
MetricsSystem ms = DefaultMetricsSystem.instance();
|
||||||
|
QueueMetrics metrics = queueMetrics.get(queueName);
|
||||||
|
if (metrics == null) {
|
||||||
|
metrics =
|
||||||
|
new CSQueueMetrics(ms, queueName, parent, enableUserMetrics, conf)
|
||||||
|
.tag(QUEUE_INFO, queueName);
|
||||||
|
|
||||||
|
// Register with the MetricsSystems
|
||||||
|
if (ms != null) {
|
||||||
|
metrics =
|
||||||
|
ms.register(sourceName(queueName).toString(), "Metrics for queue: "
|
||||||
|
+ queueName, metrics);
|
||||||
|
}
|
||||||
|
queueMetrics.put(queueName, metrics);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (CSQueueMetrics) metrics;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized QueueMetrics getUserMetrics(String userName) {
|
||||||
|
if (users == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
CSQueueMetrics metrics = (CSQueueMetrics) users.get(userName);
|
||||||
|
if (metrics == null) {
|
||||||
|
metrics = new CSQueueMetrics(metricsSystem, queueName, null, false, conf);
|
||||||
|
users.put(userName, metrics);
|
||||||
|
metricsSystem.register(
|
||||||
|
sourceName(queueName).append(",user=").append(userName).toString(),
|
||||||
|
"Metrics for user '" + userName + "' in queue '" + queueName + "'",
|
||||||
|
((CSQueueMetrics) metrics.tag(QUEUE_INFO, queueName)).tag(USER_INFO,
|
||||||
|
userName));
|
||||||
|
}
|
||||||
|
return metrics;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -563,10 +563,12 @@ public class LeafQueue extends AbstractCSQueue {
|
||||||
}
|
}
|
||||||
Resource queueCap = Resources.max(resourceCalculator, lastClusterResource,
|
Resource queueCap = Resources.max(resourceCalculator, lastClusterResource,
|
||||||
absoluteCapacityResource, queueCurrentLimit);
|
absoluteCapacityResource, queueCurrentLimit);
|
||||||
return Resources.multiplyAndNormalizeUp(
|
Resource amResouceLimit =
|
||||||
resourceCalculator,
|
Resources.multiplyAndNormalizeUp(resourceCalculator, queueCap,
|
||||||
queueCap,
|
|
||||||
maxAMResourcePerQueuePercent, minimumAllocation);
|
maxAMResourcePerQueuePercent, minimumAllocation);
|
||||||
|
|
||||||
|
metrics.setAMResouceLimit(amResouceLimit);
|
||||||
|
return amResouceLimit;
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized Resource getUserAMResourceLimit() {
|
public synchronized Resource getUserAMResourceLimit() {
|
||||||
|
@ -645,6 +647,8 @@ public class LeafQueue extends AbstractCSQueue {
|
||||||
orderingPolicy.addSchedulableEntity(application);
|
orderingPolicy.addSchedulableEntity(application);
|
||||||
queueUsage.incAMUsed(application.getAMResource());
|
queueUsage.incAMUsed(application.getAMResource());
|
||||||
user.getResourceUsage().incAMUsed(application.getAMResource());
|
user.getResourceUsage().incAMUsed(application.getAMResource());
|
||||||
|
metrics.incAMUsed(application.getUser(), application.getAMResource());
|
||||||
|
metrics.setAMResouceLimitForUser(application.getUser(), userAMLimit);
|
||||||
i.remove();
|
i.remove();
|
||||||
LOG.info("Application " + application.getApplicationId() +
|
LOG.info("Application " + application.getApplicationId() +
|
||||||
" from user: " + application.getUser() +
|
" from user: " + application.getUser() +
|
||||||
|
@ -698,6 +702,7 @@ public class LeafQueue extends AbstractCSQueue {
|
||||||
} else {
|
} else {
|
||||||
queueUsage.decAMUsed(application.getAMResource());
|
queueUsage.decAMUsed(application.getAMResource());
|
||||||
user.getResourceUsage().decAMUsed(application.getAMResource());
|
user.getResourceUsage().decAMUsed(application.getAMResource());
|
||||||
|
metrics.decAMUsed(application.getUser(), application.getAMResource());
|
||||||
}
|
}
|
||||||
applicationAttemptMap.remove(application.getApplicationAttemptId());
|
applicationAttemptMap.remove(application.getApplicationAttemptId());
|
||||||
|
|
||||||
|
|
|
@ -278,10 +278,18 @@ public class TestApplicationLimits {
|
||||||
" UserAMResourceLimit=" +
|
" UserAMResourceLimit=" +
|
||||||
queue.getUserAMResourceLimit());
|
queue.getUserAMResourceLimit());
|
||||||
|
|
||||||
assertEquals(queue.getAMResourceLimit(), Resource.newInstance(160*GB, 1));
|
Resource amResourceLimit = Resource.newInstance(160 * GB, 1);
|
||||||
|
assertEquals(queue.getAMResourceLimit(), amResourceLimit);
|
||||||
|
assertEquals(queue.getAMResourceLimit(), amResourceLimit);
|
||||||
assertEquals(queue.getUserAMResourceLimit(),
|
assertEquals(queue.getUserAMResourceLimit(),
|
||||||
Resource.newInstance(80*GB, 1));
|
Resource.newInstance(80*GB, 1));
|
||||||
|
|
||||||
|
// Assert in metrics
|
||||||
|
assertEquals(queue.getMetrics().getAMResourceLimitMB(),
|
||||||
|
amResourceLimit.getMemory());
|
||||||
|
assertEquals(queue.getMetrics().getAMResourceLimitVCores(),
|
||||||
|
amResourceLimit.getVirtualCores());
|
||||||
|
|
||||||
assertEquals(
|
assertEquals(
|
||||||
(int)(clusterResource.getMemory() * queue.getAbsoluteCapacity()),
|
(int)(clusterResource.getMemory() * queue.getAbsoluteCapacity()),
|
||||||
queue.getMetrics().getAvailableMB()
|
queue.getMetrics().getAvailableMB()
|
||||||
|
|
|
@ -432,10 +432,16 @@ public class TestLeafQueue {
|
||||||
.getMockApplicationAttemptId(0, 2);
|
.getMockApplicationAttemptId(0, 2);
|
||||||
FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, null,
|
FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, null,
|
||||||
spyRMContext);
|
spyRMContext);
|
||||||
|
app_1.setAMResource(Resource.newInstance(100, 1));
|
||||||
a.submitApplicationAttempt(app_1, user_0); // same user
|
a.submitApplicationAttempt(app_1, user_0); // same user
|
||||||
|
|
||||||
assertEquals(1, a.getMetrics().getAppsSubmitted());
|
assertEquals(1, a.getMetrics().getAppsSubmitted());
|
||||||
assertEquals(1, a.getMetrics().getAppsPending());
|
assertEquals(1, a.getMetrics().getAppsPending());
|
||||||
|
assertEquals(1, a.getUser(user_0).getActiveApplications());
|
||||||
|
assertEquals(app_1.getAMResource().getMemory(), a.getMetrics()
|
||||||
|
.getUsedAMResourceMB());
|
||||||
|
assertEquals(app_1.getAMResource().getVirtualCores(), a.getMetrics()
|
||||||
|
.getUsedAMResourceVCores());
|
||||||
|
|
||||||
event = new AppAttemptRemovedSchedulerEvent(appAttemptId_0,
|
event = new AppAttemptRemovedSchedulerEvent(appAttemptId_0,
|
||||||
RMAppAttemptState.FINISHED, false);
|
RMAppAttemptState.FINISHED, false);
|
||||||
|
|
Loading…
Reference in New Issue