YARN-10628. Add node usage metrics in SLS. Contributed by Vadaga Ananyo Rao
Commit 54f9fff218 (parent d78b300ed4)
@@ -100,6 +100,10 @@
      <div class="divborder span8 " style="margin-left:50px" id="area7"></div>
      <div class="span7 chart-area" id="area8"></div>
    </div>
    <div class="row">
      <div class="divborder span8 chart-area" style="margin-left:50px" id="area9"></div>
      <div class="divborder span8 chart-area" id="area10"></div>
    </div>
  </div>
  <p> </p>
  <script>
@@ -162,6 +166,23 @@ function drawCharts(filepath) {
        "scheduler.handle-APP_REMOVED.timecost", "scheduler.handle-CONTAINER_EXPIRED.timecost"
      ];
      drawEachChart("#area7", data, legends, "Scheduler allocate & handle operations timecost", "Timecost (ms)", 0, 210);

      // Node usage stats
      legends = [
        "nodes.memory.unused", "nodes.memory.1to19pctUsed",
        "nodes.memory.20to39pctUsed", "nodes.memory.40to59pctUsed",
        "nodes.memory.60to79pctUsed", "nodes.memory.80to99pctUsed",
        "nodes.memory.full"
      ];
      drawEachChart("#area9", data, legends, "Cluster nodes memory usage", "Node count", 1, 0);

      legends = [
        "nodes.vcores.unused", "nodes.vcores.1to19pctUsed",
        "nodes.vcores.20to39pctUsed", "nodes.vcores.40to59pctUsed",
        "nodes.vcores.60to79pctUsed", "nodes.vcores.80to99pctUsed",
        "nodes.vcores.full"
      ];
      drawEachChart("#area10", data, legends, "Cluster nodes vcores usage", "Node count", 1, 0);
    });
  }).done(function() {
    $("#data").css("display", "block");

@@ -49,6 +49,10 @@
    <div class="row">
      <div class="divborder span8" style="margin-left:50px" id="area7"></div>
      <div class="divborder span8" style="margin-left:50px" id="area8"></div>
    </div>
    <div class="row">
      <div class="divborder span8" style="margin-left:50px" id="area9"></div>
      <div class="divborder span8" style="margin-left:50px" id="area10"></div>
    </div><br/><br/>

    <script>
@@ -87,6 +91,20 @@
                  ''scheduler.commit.failure.timecost''];
    legends[7] = [''scheduler.commit.success.throughput'',
                  ''scheduler.commit.failure.throughput''];
    legends[8] = [''nodes.memory.unused'',
                  ''nodes.memory.1to19pctUsed'',
                  ''nodes.memory.20to39pctUsed'',
                  ''nodes.memory.40to59pctUsed'',
                  ''nodes.memory.60to79pctUsed'',
                  ''nodes.memory.80to99pctUsed'',
                  ''nodes.memory.full''];
    legends[9] = [''nodes.vcores.unused'',
                  ''nodes.vcores.1to19pctUsed'',
                  ''nodes.vcores.20to39pctUsed'',
                  ''nodes.vcores.40to59pctUsed'',
                  ''nodes.vcores.60to79pctUsed'',
                  ''nodes.vcores.80to99pctUsed'',
                  ''nodes.vcores.full''];

    // title
    titles[0] = ''Cluster running applications & containers'';
@@ -97,6 +115,8 @@
    titles[5] = ''Queue allocated vcores'';
    titles[6] = ''Scheduler allocate & handle & commit operation timecost'';
    titles[7] = ''Scheduler commit success/failure operation throughput'';
    titles[8] = ''Cluster nodes memory usage'';
    titles[9] = ''Cluster nodes vcores usage'';

    // ylabels
    yLabels[0] = ''Number'';
@@ -107,12 +127,14 @@
    yLabels[5] = ''Number'';
    yLabels[6] = ''Timecost (ms)'';
    yLabels[7] = ''Number'';
    yLabels[8] = ''Number'';
    yLabels[9] = ''Number'';

    // is area?
    isAreas = [0, 0, 0, 0, 1, 1, 0, 0];
    isAreas = [0, 0, 0, 0, 1, 1, 0, 0, 1, 1];

    // draw all charts
    for (var i = 0; i < 8; i ++) '{'
    for (var i = 0; i < 10; i ++) '{'
      drawEachChart(i);
    '}'

@@ -180,7 +202,7 @@
    data.push(point);

    // clear old
    for (var i = 0; i < 8; i ++) '{'
    for (var i = 0; i < 10; i ++) '{'
      svgs[i].selectAll(''g.tick'').remove();
      svgs[i].selectAll(''g'').remove();
      var color = d3.scale.category10();

@@ -54,11 +54,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler;
import org.apache.hadoop.yarn.sls.conf.SLSConfiguration;
import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;
import org.apache.hadoop.yarn.sls.web.SLSWebApp;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -157,6 +159,8 @@ public abstract class SchedulerMetrics {
    registerClusterResourceMetrics();
    registerContainerAppNumMetrics();
    registerSchedulerMetrics();
    registerNodesUsageMetrics("memory");
    registerNodesUsageMetrics("vcores");

    // .csv output
    initMetricsCSVOutput();
@@ -463,6 +467,49 @@ public abstract class SchedulerMetrics {
      }
    }

  private void registerNodesUsageMetrics(String resourceType) {
    samplerLock.lock();
    try {
      for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
        String metricName = "nodes." + resourceType + "." + range.getKeyword();
        metrics.register(metricName,
            new Gauge<Integer>() {
              @Override
              public Integer getValue() {
                if (!(scheduler instanceof AbstractYarnScheduler)) {
                  return 0;
                } else {
                  int count = 0;
                  AbstractYarnScheduler sch = (AbstractYarnScheduler) scheduler;
                  for (Object node : sch.getNodeTracker().getAllNodes()) {
                    SchedulerNode sNode = (SchedulerNode) node;
                    long allocated = 0, total = 0;
                    if (resourceType.equals("memory")) {
                      allocated = sNode.getAllocatedResource().getMemorySize();
                      total = sNode.getTotalResource().getMemorySize();
                    } else if (resourceType.equals("vcores")) {
                      allocated =
                          sNode.getAllocatedResource().getVirtualCores();
                      total =
                          sNode.getTotalResource().getVirtualCores();
                    }
                    float usedPct = allocated * 100f / total;
                    if (range.getLowerLimit() <= usedPct
                        && usedPct <= range.getUpperLimit()) {
                      count++;
                    }
                  }
                  return count;
                }
              }
            }
        );
      }
    } finally {
      samplerLock.unlock();
    }
  }

  private void initMetricsCSVOutput() {
    int timeIntervalMS = conf.getInt(
        SLSConfiguration.METRICS_RECORD_INTERVAL_MS,

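Note: the gauges registered above are standard Dropwizard/Codahale gauges, re-evaluated each time the registry is read. A minimal, self-contained sketch of that register-then-poll pattern, assuming only metrics-core on the classpath; the class name, the fixed bucket count, and the mutation at the end are illustrative and not part of the patch, though the metric name follows the patch's nodes.<resourceType>.<keyword> scheme:

import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;

// Sketch of the pattern used by registerNodesUsageMetrics(): a Gauge is
// registered once and recomputed every time the registry is polled.
public final class GaugeRegistrationSketch {
  public static void main(String[] args) {
    MetricRegistry metrics = new MetricRegistry();
    final int[] nodesInBucket = {3};  // stands in for the live per-bucket count

    metrics.register("nodes.memory.unused",
        (Gauge<Integer>) () -> nodesInBucket[0]);

    System.out.println(
        metrics.getGauges().get("nodes.memory.unused").getValue()); // prints 3

    nodesInBucket[0] = 1;  // cluster state changes ...
    System.out.println(
        metrics.getGauges().get("nodes.memory.unused").getValue()); // ... prints 1
  }
}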
@@ -0,0 +1,68 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.sls.utils;

import java.util.LinkedHashSet;
import java.util.Set;

public final class NodeUsageRanges {
  private NodeUsageRanges() {}

  /**
   * Class to store the keyword, lower limit (inclusive) and upper limit
   * (inclusive) of a resource usage range.
   */
  public static class Range {
    private String keyword;
    private float lowerLimit, upperLimit;

    public Range(String keyword, float lowerLimit, float upperLimit) {
      this.keyword = keyword;
      this.lowerLimit = lowerLimit;
      this.upperLimit = upperLimit;
    }

    public String getKeyword() {
      return keyword;
    }

    public float getLowerLimit() {
      return lowerLimit;
    }

    public float getUpperLimit() {
      return upperLimit;
    }
  }

  private static final Set<Range> RANGES;
  static {
    RANGES = new LinkedHashSet<>();
    RANGES.add(new Range("unused", 0, 0));
    RANGES.add(new Range("1to19pctUsed", 1, 19));
    RANGES.add(new Range("20to39pctUsed", 20, 39));
    RANGES.add(new Range("40to59pctUsed", 40, 59));
    RANGES.add(new Range("60to79pctUsed", 60, 79));
    RANGES.add(new Range("80to99pctUsed", 80, 99));
    RANGES.add(new Range("full", 100, 100));
  }

  public static Set<Range> getRanges() {
    return RANGES;
  }
}

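A small usage sketch for the class above, mapping a node's used percentage onto the range keyword that the metrics and chart legends carry. The helper class and method names are hypothetical; only NodeUsageRanges itself comes from this patch:

import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;

// Hypothetical helper: find which NodeUsageRanges bucket a usage percentage
// falls into, using the same inclusive bounds as the SchedulerMetrics gauges.
public final class RangeLookupSketch {
  static String keywordFor(float usedPct) {
    for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
      if (range.getLowerLimit() <= usedPct && usedPct <= range.getUpperLimit()) {
        return range.getKeyword();
      }
    }
    // values strictly between buckets (e.g. 99.5) match no range
    return "uncovered";
  }

  public static void main(String[] args) {
    System.out.println(keywordFor(0f));     // unused
    System.out.println(keywordFor(12.5f));  // 1to19pctUsed
    System.out.println(keywordFor(100f));   // full
  }
}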
@@ -0,0 +1,21 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Utility classes for SLS.
 */
package org.apache.hadoop.yarn.sls.utils;

@@ -40,6 +40,7 @@ import org.apache.hadoop.yarn.sls.scheduler.FairSchedulerMetrics;
import org.apache.hadoop.yarn.sls.scheduler.SchedulerMetrics;
import org.apache.hadoop.yarn.sls.scheduler.SchedulerWrapper;

import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;
import org.eclipse.jetty.http.MimeTypes;
import org.eclipse.jetty.server.Handler;
import org.eclipse.jetty.server.Request;
@@ -73,6 +74,7 @@ public class SLSWebApp extends HttpServlet {
  private transient Gauge allocatedVCoresGauge;
  private transient Gauge availableMemoryGauge;
  private transient Gauge availableVCoresGauge;
  private transient Map<String, Gauge> perNodeUsageGaugeMap;
  private transient Histogram allocateTimecostHistogram;
  private transient Histogram commitSuccessTimecostHistogram;
  private transient Histogram commitFailureTimecostHistogram;
@@ -122,6 +124,7 @@ public class SLSWebApp extends HttpServlet {
    handleOperTimecostHistogramMap = new HashMap<>();
    queueAllocatedMemoryCounterMap = new HashMap<>();
    queueAllocatedVCoresCounterMap = new HashMap<>();
    perNodeUsageGaugeMap = new HashMap<>();
    schedulerMetrics = wrapper.getSchedulerMetrics();
    metrics = schedulerMetrics.getMetrics();
    port = metricsAddressPort;
@@ -547,10 +550,40 @@ public class SLSWebApp extends HttpServlet {
      sb.append(",\"scheduler.handle-").append(e).append(".timecost\":")
          .append(handleOperTimecostMap.get(e));
    }
    sb.append(generateNodeUsageMetrics("memory"));
    sb.append(generateNodeUsageMetrics("vcores"));
    sb.append("}");
    return sb.toString();
  }

  private String generateNodeUsageMetrics(String resourceType) {
    StringBuilder sb = new StringBuilder();
    Map<String, Integer> perNodeUsageMap = new HashMap<>();
    for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
      String metricName = "nodes." + resourceType + "." + range.getKeyword();
      if (!perNodeUsageGaugeMap.containsKey(metricName) &&
          metrics.getGauges().containsKey(metricName)) {
        perNodeUsageGaugeMap.put(metricName,
            metrics.getGauges().get(metricName));
      }

      int perNodeUsageCount =
          perNodeUsageGaugeMap.containsKey(metricName) ?
          Integer.parseInt(
              perNodeUsageGaugeMap.get(metricName).getValue().toString()) : 0;

      perNodeUsageMap.put(metricName, perNodeUsageCount);
    }

    // per node memory and vcores used
    for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
      String metricName = "nodes." + resourceType + "." + range.getKeyword();
      sb.append(",\"").append(metricName).append("\":")
          .append(perNodeUsageMap.get(metricName));
    }
    return sb.toString();
  }

  /**
   * package metrics information for one tracked queue/app
   * only support FairScheduler currently
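For reference, generateNodeUsageMetrics() appends one comma-prefixed key/value pair per bucket to the same flat JSON object that already carries the scheduler timecost entries, which is how the new chart legends find their series by name. A rough, self-contained sketch of that fragment shape; the class name and bucket counts are made up for illustration:

import java.util.LinkedHashMap;
import java.util.Map;

// Builds a fragment with the same shape as generateNodeUsageMetrics("memory"),
// using hard-coded bucket counts instead of live gauges.
public final class NodeUsageFragmentSketch {
  public static void main(String[] args) {
    Map<String, Integer> buckets = new LinkedHashMap<>();
    buckets.put("nodes.memory.unused", 2);
    buckets.put("nodes.memory.1to19pctUsed", 5);
    buckets.put("nodes.memory.full", 1);

    StringBuilder sb = new StringBuilder();
    for (Map.Entry<String, Integer> e : buckets.entrySet()) {
      sb.append(",\"").append(e.getKey()).append("\":").append(e.getValue());
    }
    // ,"nodes.memory.unused":2,"nodes.memory.1to19pctUsed":5,"nodes.memory.full":1
    System.out.println(sb);
  }
}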