YARN-10628. Add node usage metrics in SLS. Contributed by Vadaga Ananyo Rao

Szilard Nemeth 2021-07-29 13:43:40 +02:00
parent d78b300ed4
commit 54f9fff218
6 changed files with 215 additions and 3 deletions


@@ -100,6 +100,10 @@
<div class="divborder span8 " style="margin-left:50px" id="area7"></div>
<div class="span7 chart-area" id="area8"></div>
</div>
<div class="row">
<div class="divborder span8 chart-area" style="margin-left:50px" id="area9"></div>
<div class="divborder span8 chart-area" id="area10"></div>
</div>
</div>
<p>&nbsp;</p>
<script>
@@ -162,6 +166,23 @@
"scheduler.handle-APP_REMOVED.timecost", "scheduler.handle-CONTAINER_EXPIRED.timecost"
];
drawEachChart("#area7", data, legends, "Scheduler allocate & handle operations timecost", "Timecost (ms)", 0, 210);
// Node usage stats
legends = [
"nodes.memory.unused", "nodes.memory.1to19pctUsed",
"nodes.memory.20to39pctUsed", "nodes.memory.40to59pctUsed",
"nodes.memory.60to79pctUsed","nodes.memory.80to99pctUsed",
"nodes.memory.full"
];
drawEachChart("#area9", data, legends, "Cluster nodes memory usage", "Node count", 1, 0);
legends = [
"nodes.vcores.unused", "nodes.vcores.1to19pctUsed",
"nodes.vcores.20to39pctUsed", "nodes.vcores.40to59pctUsed",
"nodes.vcores.60to79pctUsed", "nodes.vcores.80to99pctUsed",
"nodes.vcores.full"
];
drawEachChart("#area10", data, legends, "Cluster nodes vcores usage", "Node count", 1, 0);
});
}).done(function() {
$("#data").css("display", "block");


@@ -49,6 +49,10 @@
<div class="row">
<div class="divborder span8" style="margin-left:50px" id="area7"></div>
<div class="divborder span8" style="margin-left:50px" id="area8"></div>
</div>
<div class="row">
<div class="divborder span8" style="margin-left:50px" id="area9"></div>
<div class="divborder span8" style="margin-left:50px" id="area10"></div>
</div><br/><br/>
<script>
@@ -87,6 +91,20 @@
''scheduler.commit.failure.timecost''];
legends[7] = [''scheduler.commit.success.throughput'',
''scheduler.commit.failure.throughput''];
legends[8] = [''nodes.memory.unused'',
''nodes.memory.1to19pctUsed'',
''nodes.memory.20to39pctUsed'',
''nodes.memory.40to59pctUsed'',
''nodes.memory.60to79pctUsed'',
''nodes.memory.80to99pctUsed'',
''nodes.memory.full''];
legends[9] = [''nodes.vcores.unused'',
''nodes.vcores.1to19pctUsed'',
''nodes.vcores.20to39pctUsed'',
''nodes.vcores.40to59pctUsed'',
''nodes.vcores.60to79pctUsed'',
''nodes.vcores.80to99pctUsed'',
''nodes.vcores.full''];
// title
titles[0] = ''Cluster running applications & containers'';
@@ -97,6 +115,8 @@
titles[5] = ''Queue allocated vcores'';
titles[6] = ''Scheduler allocate & handle & commit operation timecost'';
titles[7] = ''Scheduler commit success/failure operation throughput'';
titles[8] = ''Cluster nodes memory usage'';
titles[9] = ''Cluster nodes vcores usage'';
// ylabels
yLabels[0] = ''Number'';
@@ -107,12 +127,14 @@
yLabels[5] = ''Number'';
yLabels[6] = ''Timecost (ms)'';
yLabels[7] = ''Number'';
yLabels[8] = ''Number'';
yLabels[9] = ''Number'';
// is area?
isAreas = [0, 0, 0, 0, 1, 1, 0, 0];
isAreas = [0, 0, 0, 0, 1, 1, 0, 0, 1, 1];
// draw all charts
for (var i = 0; i < 8; i ++) '{'
for (var i = 0; i < 10; i ++) '{'
drawEachChart(i);
'}'
@@ -180,7 +202,7 @@
data.push(point);
// clear old
for (var i = 0; i < 8; i ++) '{'
for (var i = 0; i < 10; i ++) '{'
svgs[i].selectAll(''g.tick'').remove();
svgs[i].selectAll(''g'').remove();
var color = d3.scale.category10();


@@ -54,11 +54,13 @@
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler;
import org.apache.hadoop.yarn.sls.conf.SLSConfiguration;
import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;
import org.apache.hadoop.yarn.sls.web.SLSWebApp;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -157,6 +159,8 @@ void init(ResourceScheduler resourceScheduler, Configuration config)
    registerClusterResourceMetrics();
    registerContainerAppNumMetrics();
    registerSchedulerMetrics();
    registerNodesUsageMetrics("memory");
    registerNodesUsageMetrics("vcores");

    // .csv output
    initMetricsCSVOutput();
@@ -463,6 +467,49 @@ private void registerSchedulerMetrics() {
    }
  }

  private void registerNodesUsageMetrics(String resourceType) {
    samplerLock.lock();
    try {
      // Register one gauge per usage range; each gauge reports how many
      // nodes currently fall into that range for the given resource type.
      for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
        String metricName = "nodes." + resourceType + "." + range.getKeyword();
        metrics.register(metricName,
            new Gauge<Integer>() {
              @Override
              public Integer getValue() {
                if (!(scheduler instanceof AbstractYarnScheduler)) {
                  return 0;
                } else {
                  int count = 0;
                  AbstractYarnScheduler sch = (AbstractYarnScheduler) scheduler;
                  for (Object node : sch.getNodeTracker().getAllNodes()) {
                    SchedulerNode sNode = (SchedulerNode) node;
                    long allocated = 0, total = 0;
                    if (resourceType.equals("memory")) {
                      allocated = sNode.getAllocatedResource().getMemorySize();
                      total = sNode.getTotalResource().getMemorySize();
                    } else if (resourceType.equals("vcores")) {
                      allocated = sNode.getAllocatedResource().getVirtualCores();
                      total = sNode.getTotalResource().getVirtualCores();
                    }
                    float usedPct = allocated * 100f / total;
                    if (range.getLowerLimit() <= usedPct
                        && usedPct <= range.getUpperLimit()) {
                      count++;
                    }
                  }
                  return count;
                }
              }
            });
      }
    } finally {
      samplerLock.unlock();
    }
  }
private void initMetricsCSVOutput() {
int timeIntervalMS = conf.getInt(
SLSConfiguration.METRICS_RECORD_INTERVAL_MS,

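One caveat in the gauge above: usedPct is computed as allocated * 100f / total without guarding against total == 0, so a node reporting zero total resource yields NaN (or Infinity) and is counted in no range. A minimal guarded variant of the computation, purely as an illustrative sketch (the usedPct helper is hypothetical, not part of this patch):

    // Hypothetical helper, not part of this commit: guards the percentage
    // computation the gauge performs against nodes with zero total resource.
    static float usedPct(long allocated, long total) {
      if (total <= 0) {
        return 0f; // classify capacity-less nodes as "unused" rather than dropping them
      }
      return allocated * 100f / total;
    }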

@@ -0,0 +1,68 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.sls.utils;

import java.util.LinkedHashSet;
import java.util.Set;

public final class NodeUsageRanges {
  private NodeUsageRanges() {}

  /**
   * Class to store a resource-usage range: a keyword plus the range's
   * lower and upper limits, both inclusive.
   */
  public static class Range {
    private String keyword;
    private float lowerLimit, upperLimit;

    public Range(String keyword, float lowerLimit, float upperLimit) {
      this.keyword = keyword;
      this.lowerLimit = lowerLimit;
      this.upperLimit = upperLimit;
    }

    public String getKeyword() {
      return keyword;
    }

    public float getLowerLimit() {
      return lowerLimit;
    }

    public float getUpperLimit() {
      return upperLimit;
    }
  }

  private static final Set<Range> RANGES;
  static {
    RANGES = new LinkedHashSet<>();
    RANGES.add(new Range("unused", 0, 0));
    RANGES.add(new Range("1to19pctUsed", 1, 19));
    RANGES.add(new Range("20to39pctUsed", 20, 39));
    RANGES.add(new Range("40to59pctUsed", 40, 59));
    RANGES.add(new Range("60to79pctUsed", 60, 79));
    RANGES.add(new Range("80to99pctUsed", 80, 99));
    RANGES.add(new Range("full", 100, 100));
  }

  public static Set<Range> getRanges() {
    return RANGES;
  }
}
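Since the range limits are whole numbers and both ends are inclusive, fractional usage values such as 0.5 or 99.5 fall between buckets and match no range. A small lookup sketch to illustrate the semantics (keywordFor is a hypothetical helper, not part of this class):

    // Hypothetical lookup helper built on NodeUsageRanges; illustrative only.
    static String keywordFor(float usedPct) {
      for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
        if (range.getLowerLimit() <= usedPct
            && usedPct <= range.getUpperLimit()) {
          return range.getKeyword();
        }
      }
      return null; // e.g. 0.5f or 99.5f matches no bucket
    }

    // keywordFor(0f)    -> "unused"
    // keywordFor(12.5f) -> "1to19pctUsed"
    // keywordFor(100f)  -> "full"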


@@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Utility classes for SLS.
*/
package org.apache.hadoop.yarn.sls.utils;


@@ -40,6 +40,7 @@
import org.apache.hadoop.yarn.sls.scheduler.SchedulerMetrics;
import org.apache.hadoop.yarn.sls.scheduler.SchedulerWrapper;
import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;
import org.eclipse.jetty.http.MimeTypes;
import org.eclipse.jetty.server.Handler;
import org.eclipse.jetty.server.Request;
@@ -73,6 +74,7 @@ public class SLSWebApp extends HttpServlet {
  private transient Gauge allocatedVCoresGauge;
  private transient Gauge availableMemoryGauge;
  private transient Gauge availableVCoresGauge;
  private transient Map<String, Gauge> perNodeUsageGaugeMap;
  private transient Histogram allocateTimecostHistogram;
  private transient Histogram commitSuccessTimecostHistogram;
  private transient Histogram commitFailureTimecostHistogram;
@@ -122,6 +124,7 @@ public SLSWebApp(SchedulerWrapper wrapper, int metricsAddressPort) {
    handleOperTimecostHistogramMap = new HashMap<>();
    queueAllocatedMemoryCounterMap = new HashMap<>();
    queueAllocatedVCoresCounterMap = new HashMap<>();
    perNodeUsageGaugeMap = new HashMap<>();
    schedulerMetrics = wrapper.getSchedulerMetrics();
    metrics = schedulerMetrics.getMetrics();
    port = metricsAddressPort;
@@ -547,10 +550,40 @@ public String generateRealTimeTrackingMetrics() {
      sb.append(",\"scheduler.handle-").append(e).append(".timecost\":")
          .append(handleOperTimecostMap.get(e));
    }

    sb.append(generateNodeUsageMetrics("memory"));
    sb.append(generateNodeUsageMetrics("vcores"));

    sb.append("}");
    return sb.toString();
  }

  private String generateNodeUsageMetrics(String resourceType) {
    StringBuilder sb = new StringBuilder();
    Map<String, Integer> perNodeUsageMap = new HashMap<>();
    for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
      String metricName = "nodes." + resourceType + "." + range.getKeyword();
      if (!perNodeUsageGaugeMap.containsKey(metricName) &&
          metrics.getGauges().containsKey(metricName)) {
        perNodeUsageGaugeMap.put(metricName,
            metrics.getGauges().get(metricName));
      }
      int perNodeUsageCount =
          perNodeUsageGaugeMap.containsKey(metricName) ?
          Integer.parseInt(
              perNodeUsageGaugeMap.get(metricName).getValue().toString()) : 0;
      perNodeUsageMap.put(metricName, perNodeUsageCount);
    }

    // append the node counts for each memory/vcores usage range
    for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
      String metricName = "nodes." + resourceType + "." + range.getKeyword();
      sb.append(",\"").append(metricName).append("\":")
          .append(perNodeUsageMap.get(metricName));
    }
    return sb.toString();
  }
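For illustration, with the seven ranges registered, the fragment this method appends for resourceType "memory" would look like the following (the counts shown are hypothetical):

    ,"nodes.memory.unused":3,"nodes.memory.1to19pctUsed":5,"nodes.memory.20to39pctUsed":0,"nodes.memory.40to59pctUsed":2,"nodes.memory.60to79pctUsed":0,"nodes.memory.80to99pctUsed":1,"nodes.memory.full":4

The leading comma is intentional: the caller splices this fragment into the middle of the JSON object built by generateRealTimeTrackingMetrics().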
/**
 * Package metrics information for one tracked queue/app;
 * only FairScheduler is currently supported