YARN-8152. Add chart in SLS to illustrate the throughput of the scheduler. Contributed by Tao Yang.
This commit is contained in:
parent
71220d218d
commit
ba2db39c65
|
@ -36,19 +36,19 @@
|
|||
</div>
|
||||
<div class="row">
|
||||
<div class="divborder span8" style="margin-left:50px" id="area1"></div>
|
||||
<div class="divborder span8" id="area2"></div>
|
||||
<div class="divborder span8" style="margin-left:50px" id="area2"></div>
|
||||
</div>
|
||||
<div class="row">
|
||||
<div class="divborder span8" style="margin-left:50px" id="area3"></div>
|
||||
<div class="divborder span8" id="area4"></div>
|
||||
<div class="divborder span8" style="margin-left:50px" id="area4"></div>
|
||||
</div>
|
||||
<div class="row">
|
||||
<div class="divborder span8" style="margin-left:50px" id="area5"></div>
|
||||
<div class="divborder span8" id="area6"></div>
|
||||
<div class="divborder span8" style="margin-left:50px" id="area6"></div>
|
||||
</div>
|
||||
<div class="row">
|
||||
<div class="divborder span8" style="margin-left:50px" id="area7"></div>
|
||||
<div class="span8" id="area8"></div>
|
||||
<div class="divborder span8" style="margin-left:50px" id="area8"></div>
|
||||
</div><br/><br/>
|
||||
|
||||
<script>
|
||||
|
@ -82,7 +82,11 @@
|
|||
''scheduler.handle-NODE_UPDATE.timecost'',
|
||||
''scheduler.handle-APP_ADDED.timecost'',
|
||||
''scheduler.handle-APP_REMOVED.timecost'',
|
||||
''scheduler.handle-CONTAINER_EXPIRED.timecost''];
|
||||
''scheduler.handle-CONTAINER_EXPIRED.timecost'',
|
||||
''scheduler.commit.success.timecost'',
|
||||
''scheduler.commit.failure.timecost''];
|
||||
legends[7] = [''scheduler.commit.success.throughput'',
|
||||
''scheduler.commit.failure.throughput''];
|
||||
|
||||
// title
|
||||
titles[0] = ''Cluster running applications & containers'';
|
||||
|
@ -91,7 +95,8 @@
|
|||
titles[3] = ''Cluster allocated & available vcores'';
|
||||
titles[4] = ''Queue allocated memory'';
|
||||
titles[5] = ''Queue allocated vcores'';
|
||||
titles[6] = ''Scheduler allocate & handle operation timecost'';
|
||||
titles[6] = ''Scheduler allocate & handle & commit operation timecost'';
|
||||
titles[7] = ''Scheduler commit success/failure operation throughput'';
|
||||
|
||||
// ylabels
|
||||
yLabels[0] = ''Number'';
|
||||
|
@ -101,12 +106,13 @@
|
|||
yLabels[4] = ''Memory (GB)'';
|
||||
yLabels[5] = ''Number'';
|
||||
yLabels[6] = ''Timecost (ms)'';
|
||||
yLabels[7] = ''Number'';
|
||||
|
||||
// is area?
|
||||
isAreas = [0, 0, 0, 0, 1, 1, 0];
|
||||
isAreas = [0, 0, 0, 0, 1, 1, 0, 0];
|
||||
|
||||
// draw all charts
|
||||
for (var i = 0; i < 7; i ++) '{'
|
||||
for (var i = 0; i < 8; i ++) '{'
|
||||
drawEachChart(i);
|
||||
'}'
|
||||
|
||||
|
@ -174,7 +180,7 @@
|
|||
data.push(point);
|
||||
|
||||
// clear old
|
||||
for (var i = 0; i < 7; i ++) '{'
|
||||
for (var i = 0; i < 8; i ++) '{'
|
||||
svgs[i].selectAll(''g.tick'').remove();
|
||||
svgs[i].selectAll(''g'').remove();
|
||||
var color = d3.scale.category10();
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||
import org.apache.hadoop.classification.InterfaceStability.Unstable;
|
||||
|
@ -44,6 +45,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerUpdates;
|
|||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppReport;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ResourceCommitRequest;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent;
|
||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent;
|
||||
|
@ -129,6 +131,33 @@ public class SLSCapacityScheduler extends CapacityScheduler implements
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean tryCommit(Resource cluster, ResourceCommitRequest r,
|
||||
boolean updatePending) {
|
||||
if (metricsON) {
|
||||
boolean isSuccess = false;
|
||||
long startTimeNs = System.nanoTime();
|
||||
try {
|
||||
isSuccess = super.tryCommit(cluster, r, updatePending);
|
||||
return isSuccess;
|
||||
} finally {
|
||||
long elapsedNs = System.nanoTime() - startTimeNs;
|
||||
if (isSuccess) {
|
||||
schedulerMetrics.getSchedulerCommitSuccessTimer()
|
||||
.update(elapsedNs, TimeUnit.NANOSECONDS);
|
||||
schedulerMetrics.increaseSchedulerCommitSuccessCounter();
|
||||
} else {
|
||||
schedulerMetrics.getSchedulerCommitFailureTimer()
|
||||
.update(elapsedNs, TimeUnit.NANOSECONDS);
|
||||
schedulerMetrics.increaseSchedulerCommitFailureCounter();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return super.tryCommit(cluster, r, updatePending);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handle(SchedulerEvent schedulerEvent) {
|
||||
if (!metricsON) {
|
||||
|
|
|
@ -89,11 +89,15 @@ public abstract class SchedulerMetrics {
|
|||
|
||||
// counters for scheduler allocate/handle operations
|
||||
private Counter schedulerAllocateCounter;
|
||||
private Counter schedulerCommitSuccessCounter;
|
||||
private Counter schedulerCommitFailureCounter;
|
||||
private Counter schedulerHandleCounter;
|
||||
private Map<SchedulerEventType, Counter> schedulerHandleCounterMap;
|
||||
|
||||
// Timers for scheduler allocate/handle operations
|
||||
private Timer schedulerAllocateTimer;
|
||||
private Timer schedulerCommitSuccessTimer;
|
||||
private Timer schedulerCommitFailureTimer;
|
||||
private Timer schedulerHandleTimer;
|
||||
private Map<SchedulerEventType, Timer> schedulerHandleTimerMap;
|
||||
private List<Histogram> schedulerHistogramList;
|
||||
|
@ -387,6 +391,10 @@ public abstract class SchedulerMetrics {
|
|||
// counters for scheduler operations
|
||||
schedulerAllocateCounter = metrics.counter(
|
||||
"counter.scheduler.operation.allocate");
|
||||
schedulerCommitSuccessCounter = metrics.counter(
|
||||
"counter.scheduler.operation.commit.success");
|
||||
schedulerCommitFailureCounter = metrics.counter(
|
||||
"counter.scheduler.operation.commit.failure");
|
||||
schedulerHandleCounter = metrics.counter(
|
||||
"counter.scheduler.operation.handle");
|
||||
schedulerHandleCounterMap = new HashMap<>();
|
||||
|
@ -401,6 +409,10 @@ public abstract class SchedulerMetrics {
|
|||
SLSConfiguration.METRICS_TIMER_WINDOW_SIZE_DEFAULT);
|
||||
schedulerAllocateTimer = new Timer(
|
||||
new SlidingWindowReservoir(timeWindowSize));
|
||||
schedulerCommitSuccessTimer = new Timer(
|
||||
new SlidingWindowReservoir(timeWindowSize));
|
||||
schedulerCommitFailureTimer = new Timer(
|
||||
new SlidingWindowReservoir(timeWindowSize));
|
||||
schedulerHandleTimer = new Timer(
|
||||
new SlidingWindowReservoir(timeWindowSize));
|
||||
schedulerHandleTimerMap = new HashMap<>();
|
||||
|
@ -417,6 +429,20 @@ public abstract class SchedulerMetrics {
|
|||
schedulerAllocateHistogram);
|
||||
schedulerHistogramList.add(schedulerAllocateHistogram);
|
||||
histogramTimerMap.put(schedulerAllocateHistogram, schedulerAllocateTimer);
|
||||
Histogram schedulerCommitHistogram = new Histogram(
|
||||
new SlidingWindowReservoir(SAMPLING_SIZE));
|
||||
metrics.register("sampler.scheduler.operation.commit.success.timecost",
|
||||
schedulerCommitHistogram);
|
||||
schedulerHistogramList.add(schedulerCommitHistogram);
|
||||
histogramTimerMap
|
||||
.put(schedulerCommitHistogram, schedulerCommitSuccessTimer);
|
||||
Histogram schedulerCommitFailureHistogram =
|
||||
new Histogram(new SlidingWindowReservoir(SAMPLING_SIZE));
|
||||
metrics.register("sampler.scheduler.operation.commit.failure.timecost",
|
||||
schedulerCommitFailureHistogram);
|
||||
schedulerHistogramList.add(schedulerCommitFailureHistogram);
|
||||
histogramTimerMap
|
||||
.put(schedulerCommitFailureHistogram, schedulerCommitFailureTimer);
|
||||
Histogram schedulerHandleHistogram = new Histogram(
|
||||
new SlidingWindowReservoir(SAMPLING_SIZE));
|
||||
metrics.register("sampler.scheduler.operation.handle.timecost",
|
||||
|
@ -534,6 +560,14 @@ public abstract class SchedulerMetrics {
|
|||
schedulerAllocateCounter.inc();
|
||||
}
|
||||
|
||||
void increaseSchedulerCommitSuccessCounter() {
|
||||
schedulerCommitSuccessCounter.inc();
|
||||
}
|
||||
|
||||
void increaseSchedulerCommitFailureCounter() {
|
||||
schedulerCommitFailureCounter.inc();
|
||||
}
|
||||
|
||||
void increaseSchedulerHandleCounter(SchedulerEventType schedulerEventType) {
|
||||
schedulerHandleCounter.inc();
|
||||
schedulerHandleCounterMap.get(schedulerEventType).inc();
|
||||
|
@ -543,6 +577,14 @@ public abstract class SchedulerMetrics {
|
|||
return schedulerAllocateTimer;
|
||||
}
|
||||
|
||||
Timer getSchedulerCommitSuccessTimer() {
|
||||
return schedulerCommitSuccessTimer;
|
||||
}
|
||||
|
||||
Timer getSchedulerCommitFailureTimer() {
|
||||
return schedulerCommitFailureTimer;
|
||||
}
|
||||
|
||||
Timer getSchedulerHandleTimer() {
|
||||
return schedulerHandleTimer;
|
||||
}
|
||||
|
|
|
@ -69,6 +69,8 @@ public class SLSWebApp extends HttpServlet {
|
|||
private transient Gauge availableMemoryGauge;
|
||||
private transient Gauge availableVCoresGauge;
|
||||
private transient Histogram allocateTimecostHistogram;
|
||||
private transient Histogram commitSuccessTimecostHistogram;
|
||||
private transient Histogram commitFailureTimecostHistogram;
|
||||
private transient Histogram handleTimecostHistogram;
|
||||
private transient Map<SchedulerEventType, Histogram>
|
||||
handleOperTimecostHistogramMap;
|
||||
|
@ -81,6 +83,12 @@ public class SLSWebApp extends HttpServlet {
|
|||
private String simulateTemplate;
|
||||
private String trackTemplate;
|
||||
|
||||
private transient Counter schedulerCommitSuccessCounter;
|
||||
private transient Counter schedulerCommitFailureCounter;
|
||||
private Long lastTrackingTime;
|
||||
private Long lastSchedulerCommitSuccessCount;
|
||||
private Long lastSchedulerCommitFailureCount;
|
||||
|
||||
{
|
||||
// load templates
|
||||
ClassLoader cl = Thread.currentThread().getContextClassLoader();
|
||||
|
@ -386,13 +394,26 @@ public class SLSWebApp extends HttpServlet {
|
|||
Double.parseDouble(availableVCoresGauge.getValue().toString());
|
||||
|
||||
// scheduler operation
|
||||
double allocateTimecost, handleTimecost;
|
||||
double allocateTimecost, commitSuccessTimecost, commitFailureTimecost,
|
||||
handleTimecost;
|
||||
if (allocateTimecostHistogram == null &&
|
||||
metrics.getHistograms().containsKey(
|
||||
"sampler.scheduler.operation.allocate.timecost")) {
|
||||
allocateTimecostHistogram = metrics.getHistograms()
|
||||
.get("sampler.scheduler.operation.allocate.timecost");
|
||||
}
|
||||
if (commitSuccessTimecostHistogram == null &&
|
||||
metrics.getHistograms().containsKey(
|
||||
"sampler.scheduler.operation.commit.success.timecost")) {
|
||||
commitSuccessTimecostHistogram = metrics.getHistograms()
|
||||
.get("sampler.scheduler.operation.commit.success.timecost");
|
||||
}
|
||||
if (commitFailureTimecostHistogram == null &&
|
||||
metrics.getHistograms().containsKey(
|
||||
"sampler.scheduler.operation.commit.failure.timecost")) {
|
||||
commitFailureTimecostHistogram = metrics.getHistograms()
|
||||
.get("sampler.scheduler.operation.commit.failure.timecost");
|
||||
}
|
||||
if (handleTimecostHistogram == null &&
|
||||
metrics.getHistograms().containsKey(
|
||||
"sampler.scheduler.operation.handle.timecost")) {
|
||||
|
@ -401,6 +422,10 @@ public class SLSWebApp extends HttpServlet {
|
|||
}
|
||||
allocateTimecost = allocateTimecostHistogram == null ? 0.0 :
|
||||
allocateTimecostHistogram.getSnapshot().getMean()/1000000;
|
||||
commitSuccessTimecost = commitSuccessTimecostHistogram == null ? 0.0 :
|
||||
commitSuccessTimecostHistogram.getSnapshot().getMean()/1000000;
|
||||
commitFailureTimecost = commitFailureTimecostHistogram == null ? 0.0 :
|
||||
commitFailureTimecostHistogram.getSnapshot().getMean()/1000000;
|
||||
handleTimecost = handleTimecostHistogram == null ? 0.0 :
|
||||
handleTimecostHistogram.getSnapshot().getMean()/1000000;
|
||||
// various handle operation
|
||||
|
@ -447,6 +472,41 @@ public class SLSWebApp extends HttpServlet {
|
|||
queueAllocatedVCoresMap.put(queue, queueAllocatedVCores);
|
||||
}
|
||||
|
||||
// calculate commit throughput, unit is number/second
|
||||
if (schedulerCommitSuccessCounter == null && metrics.getCounters()
|
||||
.containsKey("counter.scheduler.operation.commit.success")) {
|
||||
schedulerCommitSuccessCounter = metrics.getCounters()
|
||||
.get("counter.scheduler.operation.commit.success");
|
||||
}
|
||||
if (schedulerCommitFailureCounter == null && metrics.getCounters()
|
||||
.containsKey("counter.scheduler.operation.commit.failure")) {
|
||||
schedulerCommitFailureCounter = metrics.getCounters()
|
||||
.get("counter.scheduler.operation.commit.failure");
|
||||
}
|
||||
long schedulerCommitSuccessThroughput = 0;
|
||||
long schedulerCommitFailureThroughput = 0;
|
||||
if (schedulerCommitSuccessCounter != null
|
||||
&& schedulerCommitFailureCounter != null) {
|
||||
long currentTrackingTime = System.currentTimeMillis();
|
||||
long currentSchedulerCommitSucessCount =
|
||||
schedulerCommitSuccessCounter.getCount();
|
||||
long currentSchedulerCommitFailureCount =
|
||||
schedulerCommitFailureCounter.getCount();
|
||||
if (lastTrackingTime != null) {
|
||||
double intervalSeconds =
|
||||
(double) (currentTrackingTime - lastTrackingTime) / 1000;
|
||||
schedulerCommitSuccessThroughput = Math.round(
|
||||
(currentSchedulerCommitSucessCount
|
||||
- lastSchedulerCommitSuccessCount) / intervalSeconds);
|
||||
schedulerCommitFailureThroughput = Math.round(
|
||||
(currentSchedulerCommitFailureCount
|
||||
- lastSchedulerCommitFailureCount) / intervalSeconds);
|
||||
}
|
||||
lastTrackingTime = currentTrackingTime;
|
||||
lastSchedulerCommitSuccessCount = currentSchedulerCommitSucessCount;
|
||||
lastSchedulerCommitFailureCount = currentSchedulerCommitFailureCount;
|
||||
}
|
||||
|
||||
// package results
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("{");
|
||||
|
@ -469,6 +529,14 @@ public class SLSWebApp extends HttpServlet {
|
|||
}
|
||||
// scheduler allocate & handle
|
||||
sb.append(",\"scheduler.allocate.timecost\":").append(allocateTimecost);
|
||||
sb.append(",\"scheduler.commit.success.timecost\":")
|
||||
.append(commitSuccessTimecost);
|
||||
sb.append(",\"scheduler.commit.failure.timecost\":")
|
||||
.append(commitFailureTimecost);
|
||||
sb.append(",\"scheduler.commit.success.throughput\":")
|
||||
.append(schedulerCommitSuccessThroughput);
|
||||
sb.append(",\"scheduler.commit.failure.throughput\":")
|
||||
.append(schedulerCommitFailureThroughput);
|
||||
sb.append(",\"scheduler.handle.timecost\":").append(handleTimecost);
|
||||
for (SchedulerEventType e : SchedulerEventType.values()) {
|
||||
sb.append(",\"scheduler.handle-").append(e).append(".timecost\":")
|
||||
|
|
Loading…
Reference in New Issue