YARN-10628. Add node usage metrics in SLS. Contributed by Vadaga Ananyo Rao
parent d78b300ed4
commit 54f9fff218
@@ -100,6 +100,10 @@
       <div class="divborder span8 " style="margin-left:50px" id="area7"></div>
       <div class="span7 chart-area" id="area8"></div>
     </div>
+    <div class="row">
+      <div class="divborder span8 chart-area" style="margin-left:50px" id="area9"></div>
+      <div class="divborder span8 chart-area" id="area10"></div>
+    </div>
   </div>
   <p> </p>
   <script>
@@ -162,6 +166,23 @@ function drawCharts(filepath) {
     "scheduler.handle-APP_REMOVED.timecost", "scheduler.handle-CONTAINER_EXPIRED.timecost"
   ];
   drawEachChart("#area7", data, legends, "Scheduler allocate & handle operations timecost", "Timecost (ms)", 0, 210);
+
+  // Node usage stats
+  legends = [
+    "nodes.memory.unused", "nodes.memory.1to19pctUsed",
+    "nodes.memory.20to39pctUsed", "nodes.memory.40to59pctUsed",
+    "nodes.memory.60to79pctUsed", "nodes.memory.80to99pctUsed",
+    "nodes.memory.full"
+  ];
+  drawEachChart("#area9", data, legends, "Cluster nodes memory usage", "Node count", 1, 0);
+
+  legends = [
+    "nodes.vcores.unused", "nodes.vcores.1to19pctUsed",
+    "nodes.vcores.20to39pctUsed", "nodes.vcores.40to59pctUsed",
+    "nodes.vcores.60to79pctUsed", "nodes.vcores.80to99pctUsed",
+    "nodes.vcores.full"
+  ];
+  drawEachChart("#area10", data, legends, "Cluster nodes vcores usage", "Node count", 1, 0);
   });
 }).done(function() {
   $("#data").css("display", "block");
@@ -49,6 +49,10 @@
   <div class="row">
     <div class="divborder span8" style="margin-left:50px" id="area7"></div>
     <div class="divborder span8" style="margin-left:50px" id="area8"></div>
+  </div>
+  <div class="row">
+    <div class="divborder span8" style="margin-left:50px" id="area9"></div>
+    <div class="divborder span8" style="margin-left:50px" id="area10"></div>
   </div><br/><br/>

   <script>
@@ -87,6 +91,20 @@
       ''scheduler.commit.failure.timecost''];
   legends[7] = [''scheduler.commit.success.throughput'',
       ''scheduler.commit.failure.throughput''];
+  legends[8] = [''nodes.memory.unused'',
+      ''nodes.memory.1to19pctUsed'',
+      ''nodes.memory.20to39pctUsed'',
+      ''nodes.memory.40to59pctUsed'',
+      ''nodes.memory.60to79pctUsed'',
+      ''nodes.memory.80to99pctUsed'',
+      ''nodes.memory.full''];
+  legends[9] = [''nodes.vcores.unused'',
+      ''nodes.vcores.1to19pctUsed'',
+      ''nodes.vcores.20to39pctUsed'',
+      ''nodes.vcores.40to59pctUsed'',
+      ''nodes.vcores.60to79pctUsed'',
+      ''nodes.vcores.80to99pctUsed'',
+      ''nodes.vcores.full''];

   // title
   titles[0] = ''Cluster running applications & containers'';
@@ -97,6 +115,8 @@
   titles[5] = ''Queue allocated vcores'';
   titles[6] = ''Scheduler allocate & handle & commit operation timecost'';
   titles[7] = ''Scheduler commit success/failure operation throughput'';
+  titles[8] = ''Cluster nodes memory usage'';
+  titles[9] = ''Cluster nodes vcores usage'';

   // ylabels
   yLabels[0] = ''Number'';
@@ -107,12 +127,14 @@
   yLabels[5] = ''Number'';
   yLabels[6] = ''Timecost (ms)'';
   yLabels[7] = ''Number'';
+  yLabels[8] = ''Number'';
+  yLabels[9] = ''Number'';

   // is area?
-  isAreas = [0, 0, 0, 0, 1, 1, 0, 0];
+  isAreas = [0, 0, 0, 0, 1, 1, 0, 0, 1, 1];

   // draw all charts
-  for (var i = 0; i < 8; i ++) '{'
+  for (var i = 0; i < 10; i ++) '{'
     drawEachChart(i);
   '}'

@@ -180,7 +202,7 @@
     data.push(point);

     // clear old
-    for (var i = 0; i < 8; i ++) '{'
+    for (var i = 0; i < 10; i ++) '{'
       svgs[i].selectAll(''g.tick'').remove();
       svgs[i].selectAll(''g'').remove();
       var color = d3.scale.category10();
@@ -54,11 +54,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler;
 import org.apache.hadoop.yarn.sls.conf.SLSConfiguration;
+import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;
 import org.apache.hadoop.yarn.sls.web.SLSWebApp;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -157,6 +159,8 @@ public abstract class SchedulerMetrics {
     registerClusterResourceMetrics();
     registerContainerAppNumMetrics();
     registerSchedulerMetrics();
+    registerNodesUsageMetrics("memory");
+    registerNodesUsageMetrics("vcores");

     // .csv output
     initMetricsCSVOutput();
@@ -463,6 +467,49 @@ public abstract class SchedulerMetrics {
     }
   }
+
+  private void registerNodesUsageMetrics(String resourceType) {
+    samplerLock.lock();
+    try {
+      for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
+        String metricName = "nodes." + resourceType + "." + range.getKeyword();
+        metrics.register(metricName,
+            new Gauge<Integer>() {
+              @Override
+              public Integer getValue() {
+                if (!(scheduler instanceof AbstractYarnScheduler)) {
+                  return 0;
+                } else {
+                  int count = 0;
+                  AbstractYarnScheduler sch = (AbstractYarnScheduler) scheduler;
+                  for (Object node : sch.getNodeTracker().getAllNodes()) {
+                    SchedulerNode sNode = (SchedulerNode) node;
+                    long allocated = 0, total = 0;
+                    if (resourceType.equals("memory")) {
+                      allocated = sNode.getAllocatedResource().getMemorySize();
+                      total = sNode.getTotalResource().getMemorySize();
+                    } else if (resourceType.equals("vcores")) {
+                      allocated =
+                          sNode.getAllocatedResource().getVirtualCores();
+                      total =
+                          sNode.getTotalResource().getVirtualCores();
+                    }
+                    float usedPct = allocated * 100f / total;
+                    if (range.getLowerLimit() <= usedPct
+                        && usedPct <= range.getUpperLimit()) {
+                      count++;
+                    }
+                  }
+                  return count;
+                }
+              }
+            }
+        );
+      }
+    } finally {
+      samplerLock.unlock();
+    }
+  }

   private void initMetricsCSVOutput() {
     int timeIntervalMS = conf.getInt(
         SLSConfiguration.METRICS_RECORD_INTERVAL_MS,
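Each gauge registered above corresponds to one (resourceType, range) pair and recomputes its count whenever the metrics sampler reads it: it walks every tracked node, converts allocated/total into a usage percentage, and counts nodes whose percentage falls inside the range. Below is a minimal, self-contained sketch of that bucketing rule; the SimpleNode type and the sample capacities are hypothetical stand-ins for SchedulerNode, and only the percentage arithmetic and the inclusive range check mirror the patch.

import java.util.Arrays;
import java.util.List;

// Standalone sketch of the bucketing rule behind the node-usage gauges.
// SimpleNode and the sample values are made up for illustration.
public class NodeBucketSketch {
  static final class SimpleNode {
    final long allocatedMB, totalMB;
    SimpleNode(long allocatedMB, long totalMB) {
      this.allocatedMB = allocatedMB;
      this.totalMB = totalMB;
    }
  }

  // Count nodes whose memory usage percentage falls inside [lower, upper].
  static int countInRange(List<SimpleNode> nodes, float lower, float upper) {
    int count = 0;
    for (SimpleNode n : nodes) {
      float usedPct = n.allocatedMB * 100f / n.totalMB;
      if (lower <= usedPct && usedPct <= upper) {
        count++;
      }
    }
    return count;
  }

  public static void main(String[] args) {
    List<SimpleNode> nodes = Arrays.asList(
        new SimpleNode(0, 8192),      // unused
        new SimpleNode(4096, 8192),   // 50% -> 40to59pctUsed
        new SimpleNode(8192, 8192));  // full
    System.out.println("40-59% bucket: " + countInRange(nodes, 40, 59));   // 1
    System.out.println("full bucket:   " + countInRange(nodes, 100, 100)); // 1
  }
}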
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.sls.utils;
+
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+public final class NodeUsageRanges {
+  private NodeUsageRanges() {}
+
+  /**
+   * Class to store the keyword along with the lower limit (inclusive) and
+   * upper limit (inclusive) of a resource usage range.
+   */
+  public static class Range {
+    private String keyword;
+    private float lowerLimit, upperLimit;
+    public Range(String keyword, float lowerLimit, float upperLimit) {
+      this.keyword = keyword;
+      this.lowerLimit = lowerLimit;
+      this.upperLimit = upperLimit;
+    }
+
+    public String getKeyword() {
+      return keyword;
+    }
+
+    public float getLowerLimit() {
+      return lowerLimit;
+    }
+
+    public float getUpperLimit() {
+      return upperLimit;
+    }
+  }
+
+  private static final Set<Range> RANGES;
+  static {
+    RANGES = new LinkedHashSet<>();
+    RANGES.add(new Range("unused", 0, 0));
+    RANGES.add(new Range("1to19pctUsed", 1, 19));
+    RANGES.add(new Range("20to39pctUsed", 20, 39));
+    RANGES.add(new Range("40to59pctUsed", 40, 59));
+    RANGES.add(new Range("60to79pctUsed", 60, 79));
+    RANGES.add(new Range("80to99pctUsed", 80, 99));
+    RANGES.add(new Range("full", 100, 100));
+  }
+
+  public static Set<Range> getRanges() {
+    return RANGES;
+  }
+}
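NodeUsageRanges is shared by SchedulerMetrics and SLSWebApp: both iterate getRanges() and build metric names of the form nodes.<resourceType>.<keyword>. A short hedged sketch of looking up the keyword for a given usage percentage follows; it assumes the class above is on the classpath, and the keywordFor helper is purely illustrative (note the ranges are inclusive on both ends, so a fractional percentage such as 0.5 matches none of them as defined).

import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;

// Illustrative lookup of the range keyword for a usage percentage.
public class RangeLookupSketch {
  static String keywordFor(float usedPct) {
    for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
      if (range.getLowerLimit() <= usedPct
          && usedPct <= range.getUpperLimit()) {
        return range.getKeyword();
      }
    }
    return "unknown";  // e.g. 0 < usedPct < 1 falls between "unused" and "1to19pctUsed"
  }

  public static void main(String[] args) {
    System.out.println(keywordFor(0));    // unused
    System.out.println(keywordFor(35));   // 20to39pctUsed
    System.out.println(keywordFor(100));  // full
  }
}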
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Utility classes for SLS.
+ */
+package org.apache.hadoop.yarn.sls.utils;
@@ -40,6 +40,7 @@ import org.apache.hadoop.yarn.sls.scheduler.FairSchedulerMetrics;
 import org.apache.hadoop.yarn.sls.scheduler.SchedulerMetrics;
 import org.apache.hadoop.yarn.sls.scheduler.SchedulerWrapper;

+import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;
 import org.eclipse.jetty.http.MimeTypes;
 import org.eclipse.jetty.server.Handler;
 import org.eclipse.jetty.server.Request;
@@ -73,6 +74,7 @@ public class SLSWebApp extends HttpServlet {
   private transient Gauge allocatedVCoresGauge;
   private transient Gauge availableMemoryGauge;
   private transient Gauge availableVCoresGauge;
+  private transient Map<String, Gauge> perNodeUsageGaugeMap;
   private transient Histogram allocateTimecostHistogram;
   private transient Histogram commitSuccessTimecostHistogram;
   private transient Histogram commitFailureTimecostHistogram;
@@ -122,6 +124,7 @@ public class SLSWebApp extends HttpServlet {
     handleOperTimecostHistogramMap = new HashMap<>();
     queueAllocatedMemoryCounterMap = new HashMap<>();
     queueAllocatedVCoresCounterMap = new HashMap<>();
+    perNodeUsageGaugeMap = new HashMap<>();
     schedulerMetrics = wrapper.getSchedulerMetrics();
     metrics = schedulerMetrics.getMetrics();
     port = metricsAddressPort;
@@ -547,10 +550,40 @@
       sb.append(",\"scheduler.handle-").append(e).append(".timecost\":")
           .append(handleOperTimecostMap.get(e));
     }
+    sb.append(generateNodeUsageMetrics("memory"));
+    sb.append(generateNodeUsageMetrics("vcores"));
     sb.append("}");
     return sb.toString();
   }
+
+  private String generateNodeUsageMetrics(String resourceType) {
+    StringBuilder sb = new StringBuilder();
+    Map<String, Integer> perNodeUsageMap = new HashMap<>();
+    for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
+      String metricName = "nodes." + resourceType + "." + range.getKeyword();
+      if (!perNodeUsageGaugeMap.containsKey(metricName) &&
+          metrics.getGauges().containsKey(metricName)) {
+        perNodeUsageGaugeMap.put(metricName,
+            metrics.getGauges().get(metricName));
+      }
+
+      int perNodeUsageCount =
+          perNodeUsageGaugeMap.containsKey(metricName) ?
+          Integer.parseInt(
+              perNodeUsageGaugeMap.get(metricName).getValue().toString()) : 0;
+
+      perNodeUsageMap.put(metricName, perNodeUsageCount);
+    }
+
+    // per node memory and vcores used
+    for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
+      String metricName = "nodes." + resourceType + "." + range.getKeyword();
+      sb.append(",\"").append(metricName).append("\":")
+          .append(perNodeUsageMap.get(metricName));
+    }
+    return sb.toString();
+  }

   /**
    * package metrics information for one tracked queue/app
    * only support FairScheduler currently
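generateNodeUsageMetrics only appends key/value pairs of the form ,"nodes.<resourceType>.<keyword>":<count> to the realtime-tracking JSON that the node-usage charts above read. The sketch below reproduces that fragment shape; the counts and the timestamp are made-up sample values, and the real object assembled by SLSWebApp also carries the cluster, queue, and scheduler metrics.

import java.util.LinkedHashMap;
import java.util.Map;

// Illustrative only: shape of the node-usage portion of the tracking JSON.
public class NodeUsageJsonSketch {
  public static void main(String[] args) {
    Map<String, Integer> sample = new LinkedHashMap<>();
    sample.put("nodes.memory.unused", 2);          // hypothetical counts
    sample.put("nodes.memory.40to59pctUsed", 5);
    sample.put("nodes.memory.full", 1);

    StringBuilder sb = new StringBuilder("{\"time\":1612345678000");
    for (Map.Entry<String, Integer> e : sample.entrySet()) {
      sb.append(",\"").append(e.getKey()).append("\":").append(e.getValue());
    }
    sb.append("}");
    System.out.println(sb);
    // {"time":1612345678000,"nodes.memory.unused":2,"nodes.memory.40to59pctUsed":5,"nodes.memory.full":1}
  }
}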