From dc4698bb3345870df3afdc5aaeea4d66c094bd2b Mon Sep 17 00:00:00 2001 From: Jian He Date: Tue, 21 Apr 2015 20:06:20 -0700 Subject: [PATCH] YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. Contributed by Rohith Sharmaks (cherry picked from commit bdd90110e6904b59746812d9a093924a65e72280) --- hadoop-yarn-project/CHANGES.txt | 3 + .../scheduler/QueueMetrics.java | 15 +- .../scheduler/capacity/AbstractCSQueue.java | 13 +- .../scheduler/capacity/CSQueueMetrics.java | 133 ++++++++++++++++++ .../scheduler/capacity/LeafQueue.java | 13 +- .../capacity/TestApplicationLimits.java | 10 +- .../scheduler/capacity/TestLeafQueue.java | 6 + 7 files changed, 174 insertions(+), 19 deletions(-) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueMetrics.java diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index ef78ad0d52f..73a398cd2ec 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -103,6 +103,9 @@ Release 2.8.0 - UNRELEASED YARN-3451. Display attempt start time and elapsed time on the web UI. (Rohith Sharmaks via jianhe) + YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. (Rohith + Sharmaks via jianhe) + OPTIMIZATIONS YARN-3339. TestDockerContainerExecutor should pull a single image and not diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java index 30c11132bd8..09fd73ee6de 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java @@ -83,16 +83,17 @@ public class QueueMetrics implements MetricsSource { static final MetricsInfo RECORD_INFO = info("QueueMetrics", "Metrics for the resource scheduler"); protected static final MetricsInfo QUEUE_INFO = info("Queue", "Metrics by queue"); - static final MetricsInfo USER_INFO = info("User", "Metrics by user"); + protected static final MetricsInfo USER_INFO = + info("User", "Metrics by user"); static final Splitter Q_SPLITTER = Splitter.on('.').omitEmptyStrings().trimResults(); - final MetricsRegistry registry; - final String queueName; - final QueueMetrics parent; - final MetricsSystem metricsSystem; - private final Map users; - private final Configuration conf; + protected final MetricsRegistry registry; + protected final String queueName; + protected final QueueMetrics parent; + protected final MetricsSystem metricsSystem; + protected final Map users; + protected final Configuration conf; protected QueueMetrics(MetricsSystem ms, String queueName, Queue parent, boolean enableUserMetrics, Configuration conf) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java index 550c6aa2fdb..9233e015ac1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java @@ -43,7 +43,6 @@ import org.apache.hadoop.yarn.security.PrivilegedEntity.EntityType; import org.apache.hadoop.yarn.security.YarnAuthorizationProvider; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType; -import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; @@ -67,7 +66,7 @@ public abstract class AbstractCSQueue implements CSQueue { final Resource minimumAllocation; Resource maximumAllocation; QueueState state; - final QueueMetrics metrics; + final CSQueueMetrics metrics; protected final PrivilegedEntity queueEntity; final ResourceCalculator resourceCalculator; @@ -100,10 +99,10 @@ public abstract class AbstractCSQueue implements CSQueue { this.resourceCalculator = cs.getResourceCalculator(); // must be called after parent and queueName is set - this.metrics = old != null ? old.getMetrics() : - QueueMetrics.forQueue(getQueuePath(), parent, - cs.getConfiguration().getEnableUserMetrics(), - cs.getConf()); + this.metrics = + old != null ? (CSQueueMetrics) old.getMetrics() : CSQueueMetrics + .forQueue(getQueuePath(), parent, cs.getConfiguration() + .getEnableUserMetrics(), cs.getConf()); this.csContext = cs; this.minimumAllocation = csContext.getMinimumResourceCapability(); @@ -171,7 +170,7 @@ public abstract class AbstractCSQueue implements CSQueue { } @Override - public QueueMetrics getMetrics() { + public CSQueueMetrics getMetrics() { return metrics; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueMetrics.java new file mode 100644 index 00000000000..51d39ff6c45 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueMetrics.java @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.metrics2.MetricsSystem; +import org.apache.hadoop.metrics2.annotation.Metric; +import org.apache.hadoop.metrics2.annotation.Metrics; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.MutableGaugeInt; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; + +@Metrics(context = "yarn") +public class CSQueueMetrics extends QueueMetrics { + + @Metric("AM memory limit in MB") + MutableGaugeInt AMResourceLimitMB; + @Metric("AM CPU limit in virtual cores") + MutableGaugeInt AMResourceLimitVCores; + @Metric("Used AM memory limit in MB") + MutableGaugeInt usedAMResourceMB; + @Metric("Used AM CPU limit in virtual cores") + MutableGaugeInt usedAMResourceVCores; + + CSQueueMetrics(MetricsSystem ms, String queueName, Queue parent, + boolean enableUserMetrics, Configuration conf) { + super(ms, queueName, parent, enableUserMetrics, conf); + } + + public int getAMResourceLimitMB() { + return AMResourceLimitMB.value(); + } + + public int getAMResourceLimitVCores() { + return AMResourceLimitVCores.value(); + } + + public int getUsedAMResourceMB() { + return usedAMResourceMB.value(); + } + + public int getUsedAMResourceVCores() { + return usedAMResourceVCores.value(); + } + + public void setAMResouceLimit(Resource res) { + AMResourceLimitMB.set(res.getMemory()); + AMResourceLimitVCores.set(res.getVirtualCores()); + } + + public void setAMResouceLimitForUser(String user, Resource res) { + CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user); + if (userMetrics != null) { + userMetrics.setAMResouceLimit(res); + } + } + + public void incAMUsed(String user, Resource res) { + usedAMResourceMB.incr(res.getMemory()); + usedAMResourceVCores.incr(res.getVirtualCores()); + CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user); + if (userMetrics != null) { + userMetrics.incAMUsed(user, res); + } + } + + public void decAMUsed(String user, Resource res) { + usedAMResourceMB.decr(res.getMemory()); + usedAMResourceVCores.decr(res.getVirtualCores()); + CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user); + if (userMetrics != null) { + userMetrics.decAMUsed(user, res); + } + } + + public synchronized static CSQueueMetrics forQueue(String queueName, + Queue parent, boolean enableUserMetrics, Configuration conf) { + MetricsSystem ms = DefaultMetricsSystem.instance(); + QueueMetrics metrics = queueMetrics.get(queueName); + if (metrics == null) { + metrics = + new CSQueueMetrics(ms, queueName, parent, enableUserMetrics, conf) + .tag(QUEUE_INFO, queueName); + + // Register with the MetricsSystems + if (ms != null) { + metrics = + ms.register(sourceName(queueName).toString(), "Metrics for queue: " + + queueName, metrics); + } + queueMetrics.put(queueName, metrics); + } + + return (CSQueueMetrics) metrics; + } + + @Override + public synchronized QueueMetrics getUserMetrics(String userName) { + if (users == null) { + return null; + } + CSQueueMetrics metrics = (CSQueueMetrics) users.get(userName); + if (metrics == null) { + metrics = new CSQueueMetrics(metricsSystem, queueName, null, false, conf); + users.put(userName, metrics); + metricsSystem.register( + sourceName(queueName).append(",user=").append(userName).toString(), + "Metrics for user '" + userName + "' in queue '" + queueName + "'", + ((CSQueueMetrics) metrics.tag(QUEUE_INFO, queueName)).tag(USER_INFO, + userName)); + } + return metrics; + } + +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java index 452d4a8ba8b..69e7e535b70 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java @@ -563,10 +563,12 @@ public class LeafQueue extends AbstractCSQueue { } Resource queueCap = Resources.max(resourceCalculator, lastClusterResource, absoluteCapacityResource, queueCurrentLimit); - return Resources.multiplyAndNormalizeUp( - resourceCalculator, - queueCap, - maxAMResourcePerQueuePercent, minimumAllocation); + Resource amResouceLimit = + Resources.multiplyAndNormalizeUp(resourceCalculator, queueCap, + maxAMResourcePerQueuePercent, minimumAllocation); + + metrics.setAMResouceLimit(amResouceLimit); + return amResouceLimit; } public synchronized Resource getUserAMResourceLimit() { @@ -645,6 +647,8 @@ public class LeafQueue extends AbstractCSQueue { orderingPolicy.addSchedulableEntity(application); queueUsage.incAMUsed(application.getAMResource()); user.getResourceUsage().incAMUsed(application.getAMResource()); + metrics.incAMUsed(application.getUser(), application.getAMResource()); + metrics.setAMResouceLimitForUser(application.getUser(), userAMLimit); i.remove(); LOG.info("Application " + application.getApplicationId() + " from user: " + application.getUser() + @@ -698,6 +702,7 @@ public class LeafQueue extends AbstractCSQueue { } else { queueUsage.decAMUsed(application.getAMResource()); user.getResourceUsage().decAMUsed(application.getAMResource()); + metrics.decAMUsed(application.getUser(), application.getAMResource()); } applicationAttemptMap.remove(application.getApplicationAttemptId()); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java index 929b3e1d2ec..484090daeae 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java @@ -278,10 +278,18 @@ public class TestApplicationLimits { " UserAMResourceLimit=" + queue.getUserAMResourceLimit()); - assertEquals(queue.getAMResourceLimit(), Resource.newInstance(160*GB, 1)); + Resource amResourceLimit = Resource.newInstance(160 * GB, 1); + assertEquals(queue.getAMResourceLimit(), amResourceLimit); + assertEquals(queue.getAMResourceLimit(), amResourceLimit); assertEquals(queue.getUserAMResourceLimit(), Resource.newInstance(80*GB, 1)); + // Assert in metrics + assertEquals(queue.getMetrics().getAMResourceLimitMB(), + amResourceLimit.getMemory()); + assertEquals(queue.getMetrics().getAMResourceLimitVCores(), + amResourceLimit.getVirtualCores()); + assertEquals( (int)(clusterResource.getMemory() * queue.getAbsoluteCapacity()), queue.getMetrics().getAvailableMB() diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java index 33b8f568f84..3ba80364a43 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java @@ -432,10 +432,16 @@ public class TestLeafQueue { .getMockApplicationAttemptId(0, 2); FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, null, spyRMContext); + app_1.setAMResource(Resource.newInstance(100, 1)); a.submitApplicationAttempt(app_1, user_0); // same user assertEquals(1, a.getMetrics().getAppsSubmitted()); assertEquals(1, a.getMetrics().getAppsPending()); + assertEquals(1, a.getUser(user_0).getActiveApplications()); + assertEquals(app_1.getAMResource().getMemory(), a.getMetrics() + .getUsedAMResourceMB()); + assertEquals(app_1.getAMResource().getVirtualCores(), a.getMetrics() + .getUsedAMResourceVCores()); event = new AppAttemptRemovedSchedulerEvent(appAttemptId_0, RMAppAttemptState.FINISHED, false);