YARN-10702. Add cluster metric for amount of CPU used by RM Event Processor.

Contributed by Jim Brennan.
This commit is contained in:
Eric Badger 2021-04-08 18:33:42 +00:00
parent 3422691fb2
commit 899cef53bd
9 changed files with 187 additions and 7 deletions

View File

@ -2513,6 +2513,18 @@ public class YarnConfiguration extends Configuration {
public static final int
DEFAULT_YARN_DISPATCHER_PRINT_EVENTS_INFO_THRESHOLD = 5000;
/**
 * Resource manager dispatcher thread monitor sampling rate.
 * Units are samples per minute. This controls how often the CPU
 * utilization of the resource manager dispatcher thread is sampled.
 * The CPU utilization is displayed on the RM UI as "Scheduler Busy %".
 * The monitor is only created when the configured value is greater than
 * zero, so set this to zero (or a negative value) to disable it.
 */
public static final String
YARN_DISPATCHER_CPU_MONITOR_SAMPLES_PER_MIN =
YARN_PREFIX + "dispatcher.cpu-monitor.samples-per-min";
/** Default sampling rate: 60 samples per minute. */
public static final int
DEFAULT_YARN_DISPATCHER_CPU_MONITOR_SAMPLES_PER_MIN = 60;
/**
* CLASSPATH for YARN applications. A comma-separated list of CLASSPATH
* entries

View File

@ -131,4 +131,11 @@ public class EventDispatcher<T extends Event> extends
/**
 * Clears the exit-on-error flag of this dispatcher.
 */
public void disableExitOnError() {
  this.shouldExitOnError = false;
}
/**
 * Exposes the id of the event processor to subclasses.
 *
 * @return the value of {@code eventProcessor.getId()}
 */
protected long getEventProcessorId() {
  final long processorId = eventProcessor.getId();
  return processorId;
}
/**
 * Exposes the stopped flag of this dispatcher to subclasses.
 *
 * @return the current value of the {@code stopped} field
 */
protected boolean isStopped() {
  return stopped;
}
}

View File

@ -120,6 +120,19 @@
<value>5000</value>
</property>
<property>
<description>
Resource manager dispatcher thread CPU monitor sampling rate.
Units are samples per minute. This controls how often to sample
the CPU utilization of the resource manager dispatcher thread.
The CPU utilization is displayed on the RM UI as "Scheduler Busy %".
Set this to zero or a negative value to disable the dispatcher thread
monitor. Defaults to 60 samples per minute.
</description>
<name>yarn.dispatcher.cpu-monitor.samples-per-min</name>
<value>60</value>
</property>
<property>
<description>The expiry interval for application master reporting.</description>
<name>yarn.am.liveness-monitor.expiry-interval-ms</name>

View File

@ -59,6 +59,12 @@ public class ClusterMetrics {
@Metric("Memory Capability") MutableGaugeLong capabilityMB;
@Metric("Vcore Capability") MutableGaugeLong capabilityVirtualCores;
@Metric("GPU Capability") MutableGaugeLong capabilityGPUs;
// Average CPU usage of the RM event processor thread. The ResourceManager
// monitor publishes this in usec of CPU per second of wall-clock time.
@Metric("RM Event Processor CPU Usage 60 second Avg") MutableGaugeLong
rmEventProcCPUAvg;
// Maximum CPU usage sample of the RM event processor thread, in the same
// units as the average above.
@Metric("RM Event Processor CPU Usage 60 second Max") MutableGaugeLong
rmEventProcCPUMax;
// Whether the RM event processor CPU monitor is enabled; ClusterMetricsInfo
// reports the scheduler busy percentage as -1 when this is false.
private boolean rmEventProcMonitorEnable = false;
private static final MetricsInfo RECORD_INFO = info("ClusterMetrics",
"Metrics for the Yarn Cluster");
@ -94,6 +100,27 @@ public class ClusterMetrics {
INSTANCE = null;
}
/**
 * Records whether the RM event thread CPU monitor is enabled.
 *
 * @param value true if the monitor is enabled, false otherwise
 */
public void setRmEventProcMonitorEnable(boolean value) {
  this.rmEventProcMonitorEnable = value;
}
/**
 * Reports whether the RM event thread CPU monitor has been flagged as
 * enabled.
 *
 * @return the last value passed to the corresponding setter, or false if
 *         it was never set
 */
public boolean getRmEventProcMonitorEnable() {
  return this.rmEventProcMonitorEnable;
}
/**
 * Reads the RM event processor CPU usage average gauge.
 *
 * @return current value of the average gauge
 */
public long getRmEventProcCPUAvg() {
  return this.rmEventProcCPUAvg.value();
}
/**
 * Updates the RM event processor CPU usage average gauge.
 *
 * @param value new value for the average gauge
 */
public void setRmEventProcCPUAvg(long value) {
  this.rmEventProcCPUAvg.set(value);
}
/**
 * Reads the RM event processor CPU usage maximum gauge.
 *
 * @return current value of the maximum gauge
 */
public long getRmEventProcCPUMax() {
  return this.rmEventProcCPUMax.value();
}
/**
 * Updates the RM event processor CPU usage maximum gauge.
 *
 * @param value new value for the maximum gauge
 */
public void setRmEventProcCPUMax(long value) {
  this.rmEventProcCPUMax.set(value);
}
//Active Nodemanagers
public int getNumActiveNMs() {
return numActiveNMs.value();

View File

@ -48,8 +48,9 @@ import org.apache.hadoop.util.JvmPauseMonitor;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.VersionInfo;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.curator.ZKCuratorManager;
import org.apache.hadoop.util.VersionInfo;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
@ -131,6 +132,8 @@ import org.eclipse.jetty.webapp.WebAppContext;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.lang.management.ManagementFactory;
import java.lang.management.ThreadMXBean;
import java.net.InetSocketAddress;
import java.net.URI;
import java.net.URL;
@ -439,7 +442,21 @@ public class ResourceManager extends CompositeService
}
/**
 * Creates the dispatcher that delivers scheduler events to the scheduler.
 *
 * When the configured dispatcher CPU monitor sampling rate is greater than
 * zero, a {@code SchedulerEventDispatcher} (which also samples the CPU usage
 * of its event processor thread) is created and the monitor is flagged as
 * enabled in {@link ClusterMetrics}. Otherwise a plain
 * {@link EventDispatcher} is returned and no monitoring takes place.
 *
 * @return the event handler to which scheduler events should be sent
 */
protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
  String dispatcherName = "SchedulerEventDispatcher";
  int threadMonitorRate = conf.getInt(
      YarnConfiguration.YARN_DISPATCHER_CPU_MONITOR_SAMPLES_PER_MIN,
      YarnConfiguration.DEFAULT_YARN_DISPATCHER_CPU_MONITOR_SAMPLES_PER_MIN);

  // Zero or negative sampling rate: monitoring is disabled.
  if (threadMonitorRate <= 0) {
    return new EventDispatcher<>(this.scheduler, dispatcherName);
  }

  EventDispatcher<SchedulerEvent> dispatcher =
      new SchedulerEventDispatcher(dispatcherName, threadMonitorRate);
  // NOTE(review): this flags the monitor as enabled even when the JVM does
  // not support thread CPU time (the monitor then never publishes a sample
  // and the UI shows 0 rather than "unavailable") - confirm this is intended.
  ClusterMetrics.getMetrics().setRmEventProcMonitorEnable(true);
  return dispatcher;
}
protected Dispatcher createDispatcher() {
@ -978,7 +995,95 @@ public class ResourceManager extends CompositeService
}
}
/**
@Private
private class SchedulerEventDispatcher extends
EventDispatcher<SchedulerEvent> {
private final Thread eventProcessorMonitor;
SchedulerEventDispatcher(String name, int samplesPerMin) {
super(scheduler, name);
this.eventProcessorMonitor =
new Thread(new EventProcessorMonitor(getEventProcessorId(),
samplesPerMin));
this.eventProcessorMonitor
.setName("ResourceManager Event Processor Monitor");
}
// EventProcessorMonitor keeps track of how much CPU the EventProcessor
// thread is using. It takes a configurable number of samples per minute,
// and then reports the Avg and Max of previous 60 seconds as cluster
// metrics. Units are usecs per second of CPU used.
// Avg is not accurate until one minute of samples have been received.
private final class EventProcessorMonitor implements Runnable {
private final long tid;
private final boolean run;
private final ThreadMXBean tmxb;
private final ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics();
private final int samples;
EventProcessorMonitor(long id, int samplesPerMin) {
assert samplesPerMin > 0;
this.tid = id;
this.samples = samplesPerMin;
this.tmxb = ManagementFactory.getThreadMXBean();
if (clusterMetrics != null &&
tmxb != null && tmxb.isThreadCpuTimeSupported()) {
this.run = true;
clusterMetrics.setRmEventProcMonitorEnable(true);
} else {
this.run = false;
}
}
public void run() {
int index = 0;
long[] values = new long[samples];
int sleepMs = (60 * 1000) / samples;
while (run && !isStopped() && !Thread.currentThread().isInterrupted()) {
try {
long cpuBefore = tmxb.getThreadCpuTime(tid);
long wallClockBefore = Time.monotonicNow();
Thread.sleep(sleepMs);
long wallClockDelta = Time.monotonicNow() - wallClockBefore;
long cpuDelta = tmxb.getThreadCpuTime(tid) - cpuBefore;
// Nanoseconds / Milliseconds = usec per second
values[index] = cpuDelta / wallClockDelta;
index = (index + 1) % samples;
long max = 0;
long sum = 0;
for (int i = 0; i < samples; i++) {
sum += values[i];
max = Math.max(max, values[i]);
}
clusterMetrics.setRmEventProcCPUAvg(sum / samples);
clusterMetrics.setRmEventProcCPUMax(max);
} catch (InterruptedException e) {
LOG.error("Returning, interrupted : " + e);
return;
}
}
}
}
@Override
protected void serviceStart() throws Exception {
super.serviceStart();
this.eventProcessorMonitor.start();
}
@Override
protected void serviceStop() throws Exception {
super.serviceStop();
this.eventProcessorMonitor.interrupt();
try {
this.eventProcessorMonitor.join();
} catch (InterruptedException e) {
throw new YarnRuntimeException(e);
}
}
}
/**
* Transition to standby state in a new thread. The transition operation is
* asynchronous to avoid deadlock caused by cyclic dependency.
*/

View File

@ -204,7 +204,8 @@ public class MetricsOverviewTable extends HtmlBlock {
}
SchedulerInfo schedulerInfo = new SchedulerInfo(this.rm);
int schedBusy = clusterMetrics.getRmSchedulerBusyPercent();
div.h3("Scheduler Metrics").
table("#schedulermetricsoverview").
thead().$class("ui-widget-header").
@ -215,6 +216,7 @@ public class MetricsOverviewTable extends HtmlBlock {
th().$class("ui-state-default").__("Maximum Allocation").__().
th().$class("ui-state-default")
.__("Maximum Cluster Application Priority").__().
th().$class("ui-state-default").__("Scheduler Busy %").__().
__().
__().
tbody().$class("ui-widget-content").
@ -225,6 +227,7 @@ public class MetricsOverviewTable extends HtmlBlock {
td(schedulerInfo.getMinAllocation().toString()).
td(schedulerInfo.getMaxAllocation().toString()).
td(String.valueOf(schedulerInfo.getMaxClusterLevelAppPriority())).
td(schedBusy == -1 ? UNAVAILABLE : String.valueOf(schedBusy)).
__().
__().__();

View File

@ -55,6 +55,7 @@ public class ClusterMetricsInfo {
private long totalVirtualCores;
private int utilizedMBPercent;
private int utilizedVirtualCoresPercent;
// Scheduler busy percentage, computed as the RM event processor CPU average
// (usec per sec) divided by 10^4; -1 when the monitor is disabled.
private int rmSchedulerBusyPercent;
private int totalNodes;
private int lostNodes;
private int unhealthyNodes;
@ -139,7 +140,11 @@ public class ClusterMetricsInfo {
this.utilizedVirtualCoresPercent = baseCores <= 0 ? 0 :
(int) (clusterMetrics.getUtilizedVirtualCores() * 100 /
baseCores);
// Scheduler Busy is in usec per sec, so to get percent divide by 10^4
// Set to -1 if disabled.
this.rmSchedulerBusyPercent =
clusterMetrics.getRmEventProcMonitorEnable() ?
(int)(clusterMetrics.getRmEventProcCPUAvg() / 10000L) : -1;
this.activeNodes = clusterMetrics.getNumActiveNMs();
this.lostNodes = clusterMetrics.getNumLostNMs();
this.unhealthyNodes = clusterMetrics.getUnhealthyNMs();
@ -259,6 +264,10 @@ public class ClusterMetricsInfo {
return utilizedVirtualCoresPercent;
}
/**
 * @return the stored scheduler busy percentage
 */
public int getRmSchedulerBusyPercent() {
  return this.rmSchedulerBusyPercent;
}
/**
 * @param containersReserved value to store as the reserved container count
 */
public void setContainersReserved(final int containersReserved) {
  this.containersReserved = containersReserved;
}
@ -371,6 +380,10 @@ public class ClusterMetricsInfo {
this.utilizedVirtualCoresPercent = utilizedVirtualCoresPercent;
}
/**
 * @param rmSchedulerBusyPercent value to store as the scheduler busy
 *                               percentage
 */
public void setRmSchedulerBusyPercent(final int rmSchedulerBusyPercent) {
  this.rmSchedulerBusyPercent = rmSchedulerBusyPercent;
}
/**
 * @return the stored total cluster resources across partitions
 */
public ResourceInfo getTotalClusterResourcesAcrossPartition() {
  return this.totalClusterResourcesAcrossPartition;
}

View File

@ -52,7 +52,7 @@ public class TestNodesPage {
// Number of Actual Table Headers for NodesPage.NodesBlock might change in
// future. In that case this value should be adjusted to the new value.
// Raised from 22 to 23 by this change; presumably accounts for the new
// "Scheduler Busy %" header cell in the metrics overview table.
private final int numberOfThInMetricsTable = 23;
private final int numberOfActualTableHeaders = 18;
private final int numberOfThForOpportunisticContainers = 4;

View File

@ -459,7 +459,7 @@ public class TestRMWebServices extends JerseyTestBase {
Exception {
assertEquals("incorrect number of elements", 1, json.length());
JSONObject clusterinfo = json.getJSONObject("clusterMetrics");
assertEquals("incorrect number of elements", 29, clusterinfo.length());
assertEquals("incorrect number of elements", 30, clusterinfo.length());
verifyClusterMetrics(
clusterinfo.getInt("appsSubmitted"), clusterinfo.getInt("appsCompleted"),
clusterinfo.getInt("reservedMB"), clusterinfo.getInt("availableMB"),