HDFS-6403. Add metrics for log warnings reported by JVM pauses. Contributed by Yongjun Zhang.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1604074 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Aaron Myers 2014-06-20 02:38:00 +00:00
parent 46dc32e125
commit af6c91a80c
10 changed files with 89 additions and 15 deletions

View File

@ -38,6 +38,7 @@ import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.Interns; import org.apache.hadoop.metrics2.lib.Interns;
import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*; import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*;
import static org.apache.hadoop.metrics2.impl.MsInfo.*; import static org.apache.hadoop.metrics2.impl.MsInfo.*;
import org.apache.hadoop.util.JvmPauseMonitor;
/** /**
* JVM and logging related metrics. * JVM and logging related metrics.
@ -65,6 +66,7 @@ public class JvmMetrics implements MetricsSource {
ManagementFactory.getGarbageCollectorMXBeans(); ManagementFactory.getGarbageCollectorMXBeans();
final ThreadMXBean threadMXBean = ManagementFactory.getThreadMXBean(); final ThreadMXBean threadMXBean = ManagementFactory.getThreadMXBean();
final String processName, sessionId; final String processName, sessionId;
private JvmPauseMonitor pauseMonitor = null;
final ConcurrentHashMap<String, MetricsInfo[]> gcInfoCache = final ConcurrentHashMap<String, MetricsInfo[]> gcInfoCache =
new ConcurrentHashMap<String, MetricsInfo[]>(); new ConcurrentHashMap<String, MetricsInfo[]>();
@ -73,6 +75,10 @@ public class JvmMetrics implements MetricsSource {
this.sessionId = sessionId; this.sessionId = sessionId;
} }
public void setPauseMonitor(final JvmPauseMonitor pauseMonitor) {
this.pauseMonitor = pauseMonitor;
}
public static JvmMetrics create(String processName, String sessionId, public static JvmMetrics create(String processName, String sessionId,
MetricsSystem ms) { MetricsSystem ms) {
return ms.register(JvmMetrics.name(), JvmMetrics.description(), return ms.register(JvmMetrics.name(), JvmMetrics.description(),
@ -120,6 +126,15 @@ public class JvmMetrics implements MetricsSource {
} }
rb.addCounter(GcCount, count) rb.addCounter(GcCount, count)
.addCounter(GcTimeMillis, timeMillis); .addCounter(GcTimeMillis, timeMillis);
if (pauseMonitor != null) {
rb.addCounter(GcNumWarnThresholdExceeded,
pauseMonitor.getNumGcWarnThreadholdExceeded());
rb.addCounter(GcNumInfoThresholdExceeded,
pauseMonitor.getNumGcInfoThresholdExceeded());
rb.addCounter(GcTotalExtraSleepTime,
pauseMonitor.getTotalGcExtraSleepTime());
}
} }
private MetricsInfo[] getGcInfo(String gcName) { private MetricsInfo[] getGcInfo(String gcName) {

View File

@ -48,7 +48,10 @@ public enum JvmMetricsInfo implements MetricsInfo {
LogFatal("Total number of fatal log events"), LogFatal("Total number of fatal log events"),
LogError("Total number of error log events"), LogError("Total number of error log events"),
LogWarn("Total number of warning log events"), LogWarn("Total number of warning log events"),
LogInfo("Total number of info log events"); LogInfo("Total number of info log events"),
GcNumWarnThresholdExceeded("Number of times that the GC warn threshold is exceeded"),
GcNumInfoThresholdExceeded("Number of times that the GC info threshold is exceeded"),
GcTotalExtraSleepTime("Total GC extra sleep time in milliseconds");
private final String desc; private final String desc;

View File

@ -62,10 +62,13 @@ public class JvmPauseMonitor {
"jvm.pause.info-threshold.ms"; "jvm.pause.info-threshold.ms";
private static final long INFO_THRESHOLD_DEFAULT = 1000; private static final long INFO_THRESHOLD_DEFAULT = 1000;
private long numGcWarnThresholdExceeded = 0;
private long numGcInfoThresholdExceeded = 0;
private long totalGcExtraSleepTime = 0;
private Thread monitorThread; private Thread monitorThread;
private volatile boolean shouldRun = true; private volatile boolean shouldRun = true;
public JvmPauseMonitor(Configuration conf) { public JvmPauseMonitor(Configuration conf) {
this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT); this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT); this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
@ -87,6 +90,22 @@ public class JvmPauseMonitor {
Thread.currentThread().interrupt(); Thread.currentThread().interrupt();
} }
} }
public boolean isStarted() {
return monitorThread != null;
}
public long getNumGcWarnThreadholdExceeded() {
return numGcWarnThresholdExceeded;
}
public long getNumGcInfoThresholdExceeded() {
return numGcInfoThresholdExceeded;
}
public long getTotalGcExtraSleepTime() {
return totalGcExtraSleepTime;
}
private String formatMessage(long extraSleepTime, private String formatMessage(long extraSleepTime,
Map<String, GcTimes> gcTimesAfterSleep, Map<String, GcTimes> gcTimesAfterSleep,
@ -166,13 +185,15 @@ public class JvmPauseMonitor {
Map<String, GcTimes> gcTimesAfterSleep = getGcTimes(); Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
if (extraSleepTime > warnThresholdMs) { if (extraSleepTime > warnThresholdMs) {
++numGcWarnThresholdExceeded;
LOG.warn(formatMessage( LOG.warn(formatMessage(
extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep)); extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
} else if (extraSleepTime > infoThresholdMs) { } else if (extraSleepTime > infoThresholdMs) {
++numGcInfoThresholdExceeded;
LOG.info(formatMessage( LOG.info(formatMessage(
extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep)); extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
} }
totalGcExtraSleepTime += extraSleepTime;
gcTimesBeforeSleep = gcTimesAfterSleep; gcTimesBeforeSleep = gcTimesAfterSleep;
} }
} }

View File

@ -86,6 +86,14 @@ jvm context
*-------------------------------------+--------------------------------------+ *-------------------------------------+--------------------------------------+
|<<<LogInfo>>> | Total number of INFO logs |<<<LogInfo>>> | Total number of INFO logs
*-------------------------------------+--------------------------------------+ *-------------------------------------+--------------------------------------+
|<<<GcNumWarnThresholdExceeded>>> | Number of times that the GC warn
| threshold is exceeded
*-------------------------------------+--------------------------------------+
|<<<GcNumInfoThresholdExceeded>>> | Number of times that the GC info
| threshold is exceeded
*-------------------------------------+--------------------------------------+
|<<<GcTotalExtraSleepTime>>> | Total GC extra sleep time in msec
*-------------------------------------+--------------------------------------+
rpc context rpc context

View File

@ -19,18 +19,25 @@
package org.apache.hadoop.metrics2.source; package org.apache.hadoop.metrics2.source;
import org.junit.Test; import org.junit.Test;
import static org.mockito.Mockito.*; import static org.mockito.Mockito.*;
import static org.apache.hadoop.test.MetricsAsserts.*; import static org.apache.hadoop.test.MetricsAsserts.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.metrics2.MetricsCollector; import org.apache.hadoop.metrics2.MetricsCollector;
import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.util.JvmPauseMonitor;
import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*; import static org.apache.hadoop.metrics2.source.JvmMetricsInfo.*;
import static org.apache.hadoop.metrics2.impl.MsInfo.*; import static org.apache.hadoop.metrics2.impl.MsInfo.*;
public class TestJvmMetrics { public class TestJvmMetrics {
@Test public void testPresence() { @Test public void testPresence() {
MetricsRecordBuilder rb = getMetrics(new JvmMetrics("test", "test")); JvmPauseMonitor pauseMonitor = new JvmPauseMonitor(new Configuration());
JvmMetrics jvmMetrics = new JvmMetrics("test", "test");
jvmMetrics.setPauseMonitor(pauseMonitor);
MetricsRecordBuilder rb = getMetrics(jvmMetrics);
MetricsCollector mc = rb.parent(); MetricsCollector mc = rb.parent();
verify(mc).addRecord(JvmMetrics); verify(mc).addRecord(JvmMetrics);

View File

@ -453,6 +453,9 @@ Release 2.5.0 - UNRELEASED
HDFS-6480. Move waitForReady() from FSDirectory to FSNamesystem. (wheat9) HDFS-6480. Move waitForReady() from FSDirectory to FSNamesystem. (wheat9)
HDFS-6403. Add metrics for log warnings reported by JVM pauses. (Yongjun
Zhang via atm)
OPTIMIZATIONS OPTIMIZATIONS
HDFS-6214. Webhdfs has poor throughput for files >2GB (daryn) HDFS-6214. Webhdfs has poor throughput for files >2GB (daryn)

View File

@ -778,7 +778,8 @@ public class DataNode extends Configured
initIpcServer(conf); initIpcServer(conf);
metrics = DataNodeMetrics.create(conf, getDisplayName()); metrics = DataNodeMetrics.create(conf, getDisplayName());
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
blockPoolManager = new BlockPoolManager(this); blockPoolManager = new BlockPoolManager(this);
blockPoolManager.refreshNamenodes(conf); blockPoolManager.refreshNamenodes(conf);

View File

@ -90,13 +90,15 @@ public class DataNodeMetrics {
final MutableQuantiles[] sendDataPacketBlockedOnNetworkNanosQuantiles; final MutableQuantiles[] sendDataPacketBlockedOnNetworkNanosQuantiles;
@Metric MutableRate sendDataPacketTransferNanos; @Metric MutableRate sendDataPacketTransferNanos;
final MutableQuantiles[] sendDataPacketTransferNanosQuantiles; final MutableQuantiles[] sendDataPacketTransferNanosQuantiles;
final MetricsRegistry registry = new MetricsRegistry("datanode"); final MetricsRegistry registry = new MetricsRegistry("datanode");
final String name; final String name;
JvmMetrics jvmMetrics = null;
public DataNodeMetrics(String name, String sessionId, int[] intervals) {
public DataNodeMetrics(String name, String sessionId, int[] intervals,
final JvmMetrics jvmMetrics) {
this.name = name; this.name = name;
this.jvmMetrics = jvmMetrics;
registry.tag(SessionId, sessionId); registry.tag(SessionId, sessionId);
final int len = intervals.length; final int len = intervals.length;
@ -131,7 +133,7 @@ public class DataNodeMetrics {
public static DataNodeMetrics create(Configuration conf, String dnName) { public static DataNodeMetrics create(Configuration conf, String dnName) {
String sessionId = conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY); String sessionId = conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY);
MetricsSystem ms = DefaultMetricsSystem.instance(); MetricsSystem ms = DefaultMetricsSystem.instance();
JvmMetrics.create("DataNode", sessionId, ms); JvmMetrics jm = JvmMetrics.create("DataNode", sessionId, ms);
String name = "DataNodeActivity-"+ (dnName.isEmpty() String name = "DataNodeActivity-"+ (dnName.isEmpty()
? "UndefinedDataNodeName"+ DFSUtil.getRandom().nextInt() ? "UndefinedDataNodeName"+ DFSUtil.getRandom().nextInt()
: dnName.replace(':', '-')); : dnName.replace(':', '-'));
@ -141,11 +143,15 @@ public class DataNodeMetrics {
conf.getInts(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY); conf.getInts(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY);
return ms.register(name, null, new DataNodeMetrics(name, sessionId, return ms.register(name, null, new DataNodeMetrics(name, sessionId,
intervals)); intervals, jm));
} }
public String name() { return name; } public String name() { return name; }
public JvmMetrics getJvmMetrics() {
return jvmMetrics;
}
public void addHeartbeat(long latency) { public void addHeartbeat(long latency) {
heartbeats.add(latency); heartbeats.add(latency);
} }

View File

@ -598,7 +598,8 @@ public class NameNode implements NameNodeStatusMXBean {
pauseMonitor = new JvmPauseMonitor(conf); pauseMonitor = new JvmPauseMonitor(conf);
pauseMonitor.start(); pauseMonitor.start();
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
startCommonServices(conf); startCommonServices(conf);
} }

View File

@ -98,7 +98,11 @@ public class NameNodeMetrics {
@Metric("GetImageServlet putImage") @Metric("GetImageServlet putImage")
MutableRate putImage; MutableRate putImage;
NameNodeMetrics(String processName, String sessionId, int[] intervals) { JvmMetrics jvmMetrics = null;
NameNodeMetrics(String processName, String sessionId, int[] intervals,
final JvmMetrics jvmMetrics) {
this.jvmMetrics = jvmMetrics;
registry.tag(ProcessName, processName).tag(SessionId, sessionId); registry.tag(ProcessName, processName).tag(SessionId, sessionId);
final int len = intervals.length; final int len = intervals.length;
@ -124,14 +128,19 @@ public class NameNodeMetrics {
String sessionId = conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY); String sessionId = conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY);
String processName = r.toString(); String processName = r.toString();
MetricsSystem ms = DefaultMetricsSystem.instance(); MetricsSystem ms = DefaultMetricsSystem.instance();
JvmMetrics.create(processName, sessionId, ms); JvmMetrics jm = JvmMetrics.create(processName, sessionId, ms);
// Percentile measurement is off by default, by watching no intervals // Percentile measurement is off by default, by watching no intervals
int[] intervals = int[] intervals =
conf.getInts(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY); conf.getInts(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY);
return ms.register(new NameNodeMetrics(processName, sessionId, intervals)); return ms.register(new NameNodeMetrics(processName, sessionId,
intervals, jm));
} }
public JvmMetrics getJvmMetrics() {
return jvmMetrics;
}
public void shutdown() { public void shutdown() {
DefaultMetricsSystem.shutdown(); DefaultMetricsSystem.shutdown();
} }