HBASE-15614 Report metrics from JvmPauseMonitor

This commit is contained in:
Andrew Purtell 2016-04-15 17:22:02 -07:00
parent 18d70bc680
commit 2c26fe37ac
6 changed files with 176 additions and 23 deletions

View File

@ -0,0 +1,63 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.metrics;
/**
* Interface for sources that will export JvmPauseMonitor metrics
*/
public interface JvmPauseMonitorSource {
String INFO_THRESHOLD_COUNT_KEY = "pauseInfoThresholdExceeded";
String INFO_THRESHOLD_COUNT_DESC = "Count of INFO level pause threshold alerts";
String WARN_THRESHOLD_COUNT_KEY = "pauseWarnThresholdExceeded";
String WARN_THRESHOLD_COUNT_DESC = "Count of WARN level pause threshold alerts";
String PAUSE_TIME_WITH_GC_KEY = "pauseTimeWithGc";
String PAUSE_TIME_WITH_GC_DESC = "Histogram for excessive pause times with GC activity detected";
String PAUSE_TIME_WITHOUT_GC_KEY = "pauseTimeWithoutGc";
String PAUSE_TIME_WITHOUT_GC_DESC =
"Histogram for excessive pause times without GC activity detected";
/**
* Increment the INFO level threshold exceeded count
* @param count the count
*/
void incInfoThresholdExceeded(int count);
/**
* Increment the WARN level threshold exceeded count
* @param count the count
*/
void incWarnThresholdExceeded(int count);
/**
* Update the pause time histogram where GC activity was detected.
*
* @param t time it took
*/
void updatePauseTimeWithGc(long t);
/**
* Update the pause time histogram where GC activity was not detected.
*
* @param t time it took
*/
void updatePauseTimeWithoutGc(long t);
}

View File

@ -19,11 +19,12 @@
package org.apache.hadoop.hbase.regionserver;
import org.apache.hadoop.hbase.metrics.BaseSource;
import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource;
/**
* Interface for classes that expose metrics about the regionserver.
*/
public interface MetricsRegionServerSource extends BaseSource {
public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSource {
/**
* The name of the metrics

View File

@ -79,6 +79,12 @@ public class MetricsRegionServerSourceImpl
private final MutableFastCounter majorCompactedInputBytes;
private final MutableFastCounter majorCompactedOutputBytes;
// pause monitor metrics
private final MutableFastCounter infoPauseThresholdExceeded;
private final MutableFastCounter warnPauseThresholdExceeded;
private final MetricHistogram pausesWithGc;
private final MetricHistogram pausesWithoutGc;
public MetricsRegionServerSourceImpl(MetricsRegionServerWrapper rsWrap) {
this(METRICS_NAME, METRICS_DESCRIPTION, METRICS_CONTEXT, METRICS_JMX_CONTEXT, rsWrap);
}
@ -153,6 +159,14 @@ public class MetricsRegionServerSourceImpl
splitTimeHisto = getMetricsRegistry().newTimeHistogram(SPLIT_KEY);
splitRequest = getMetricsRegistry().newCounter(SPLIT_REQUEST_KEY, SPLIT_REQUEST_DESC, 0L);
splitSuccess = getMetricsRegistry().newCounter(SPLIT_SUCCESS_KEY, SPLIT_SUCCESS_DESC, 0L);
// pause monitor metrics
infoPauseThresholdExceeded = getMetricsRegistry().newCounter(INFO_THRESHOLD_COUNT_KEY,
INFO_THRESHOLD_COUNT_DESC, 0L);
warnPauseThresholdExceeded = getMetricsRegistry().newCounter(WARN_THRESHOLD_COUNT_KEY,
WARN_THRESHOLD_COUNT_DESC, 0L);
pausesWithGc = getMetricsRegistry().newTimeHistogram(PAUSE_TIME_WITH_GC_KEY);
pausesWithoutGc = getMetricsRegistry().newTimeHistogram(PAUSE_TIME_WITHOUT_GC_KEY);
}
@Override
@ -492,8 +506,29 @@ public class MetricsRegionServerSourceImpl
rsWrap.getZookeeperQuorum())
.tag(Interns.info(SERVER_NAME_NAME, SERVER_NAME_DESC), rsWrap.getServerName())
.tag(Interns.info(CLUSTER_ID_NAME, CLUSTER_ID_DESC), rsWrap.getClusterId());
}
metricsRegistry.snapshot(mrb, all);
}
@Override
public void incInfoThresholdExceeded(int count) {
infoPauseThresholdExceeded.incr(count);
}
@Override
public void incWarnThresholdExceeded(int count) {
warnPauseThresholdExceeded.incr(count);
}
@Override
public void updatePauseTimeWithGc(long t) {
pausesWithGc.add(t);
}
@Override
public void updatePauseTimeWithoutGc(long t) {
pausesWithoutGc.add(t);
}
}

View File

@ -772,8 +772,6 @@ public class HRegionServer extends HasThread implements
HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
}
this.pauseMonitor = new JvmPauseMonitor(conf);
pauseMonitor.start();
initializeZooKeeper();
if (!isStopped() && !isAborted()) {
@ -1410,6 +1408,9 @@ public class HRegionServer extends HasThread implements
// Init in here rather than in constructor after thread name has been set
this.metricsRegionServer = new MetricsRegionServer(new MetricsRegionServerWrapperImpl(this));
this.metricsTable = new MetricsTable(new MetricsTableWrapperAggregateImpl(this));
// Now that we have a metrics source, start the pause monitor
this.pauseMonitor = new JvmPauseMonitor(conf, getMetrics().getMetricsSource());
pauseMonitor.start();
startServiceThreads();
startHeapMemoryManager();

View File

@ -26,6 +26,7 @@ import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource;
import org.apache.hadoop.conf.Configuration;
import com.google.common.base.Joiner;
@ -56,22 +57,28 @@ public class JvmPauseMonitor {
/** log WARN if we detect a pause longer than this threshold */
private final long warnThresholdMs;
private static final String WARN_THRESHOLD_KEY =
public static final String WARN_THRESHOLD_KEY =
"jvm.pause.warn-threshold.ms";
private static final long WARN_THRESHOLD_DEFAULT = 10000;
/** log INFO if we detect a pause longer than this threshold */
private final long infoThresholdMs;
private static final String INFO_THRESHOLD_KEY =
public static final String INFO_THRESHOLD_KEY =
"jvm.pause.info-threshold.ms";
private static final long INFO_THRESHOLD_DEFAULT = 1000;
private Thread monitorThread;
private volatile boolean shouldRun = true;
private JvmPauseMonitorSource metricsSource;
public JvmPauseMonitor(Configuration conf) {
this(conf, null);
}
public JvmPauseMonitor(Configuration conf, JvmPauseMonitorSource metricsSource) {
this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
this.metricsSource = metricsSource;
}
public void start() {
@ -92,19 +99,7 @@ public class JvmPauseMonitor {
}
}
private String formatMessage(long extraSleepTime, Map<String, GcTimes> gcTimesAfterSleep,
Map<String, GcTimes> gcTimesBeforeSleep) {
Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
gcTimesBeforeSleep.keySet());
List<String> gcDiffs = Lists.newArrayList();
for (String name : gcBeanNames) {
GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name));
if (diff.gcCount != 0) {
gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString());
}
}
private String formatMessage(long extraSleepTime, List<String> gcDiffs) {
String ret = "Detected pause in JVM or host machine (eg GC): " + "pause of approximately "
+ extraSleepTime + "ms\n";
if (gcDiffs.isEmpty()) {
@ -160,20 +155,57 @@ public class JvmPauseMonitor {
} catch (InterruptedException ie) {
return;
}
long extraSleepTime = sw.elapsedMillis() - SLEEP_INTERVAL_MS;
Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
if (extraSleepTime > warnThresholdMs) {
LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
} else if (extraSleepTime > infoThresholdMs) {
LOG.info(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
if (extraSleepTime > infoThresholdMs) {
Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
gcTimesBeforeSleep.keySet());
List<String> gcDiffs = Lists.newArrayList();
for (String name : gcBeanNames) {
GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name));
if (diff.gcCount != 0) {
gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString());
}
}
updateMetrics(extraSleepTime, !gcDiffs.isEmpty());
if (extraSleepTime > warnThresholdMs) {
LOG.warn(formatMessage(extraSleepTime, gcDiffs));
} else {
LOG.info(formatMessage(extraSleepTime, gcDiffs));
}
}
gcTimesBeforeSleep = gcTimesAfterSleep;
}
}
}
public void updateMetrics(long sleepTime, boolean gcDetected) {
if (metricsSource != null) {
if (sleepTime > warnThresholdMs) {
metricsSource.incWarnThresholdExceeded(1);
} else {
metricsSource.incInfoThresholdExceeded(1);
}
if (gcDetected) {
metricsSource.updatePauseTimeWithGc(sleepTime);
} else {
metricsSource.updatePauseTimeWithoutGc(sleepTime);
}
}
}
public JvmPauseMonitorSource getMetricsSource() {
return metricsSource;
}
public void setMetricsSource(JvmPauseMonitorSource metricsSource) {
this.metricsSource = metricsSource;
}
/**
* Simple 'main' to facilitate manual testing of the pause monitor.
*

View File

@ -17,10 +17,12 @@
*/
package org.apache.hadoop.hbase.regionserver;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.CompatibilityFactory;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.apache.hadoop.hbase.test.MetricsAssertHelper;
import org.apache.hadoop.hbase.util.JvmPauseMonitor;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
@ -202,5 +204,24 @@ public class TestMetricsRegionServer {
HELPER.assertCounter("majorCompactedInputBytes", 400, serverSource);
HELPER.assertCounter("majorCompactedoutputBytes", 500, serverSource);
}
@Test
public void testPauseMonitor() {
Configuration conf = new Configuration();
conf.setLong(JvmPauseMonitor.INFO_THRESHOLD_KEY, 1000L);
conf.setLong(JvmPauseMonitor.WARN_THRESHOLD_KEY, 10000L);
JvmPauseMonitor monitor = new JvmPauseMonitor(conf, serverSource);
monitor.updateMetrics(1500, false);
HELPER.assertCounter("pauseInfoThresholdExceeded", 1, serverSource);
HELPER.assertCounter("pauseWarnThresholdExceeded", 0, serverSource);
HELPER.assertCounter("pauseTimeWithoutGc_num_ops", 1, serverSource);
HELPER.assertCounter("pauseTimeWithGc_num_ops", 0, serverSource);
monitor.updateMetrics(15000, true);
HELPER.assertCounter("pauseInfoThresholdExceeded", 1, serverSource);
HELPER.assertCounter("pauseWarnThresholdExceeded", 1, serverSource);
HELPER.assertCounter("pauseTimeWithoutGc_num_ops", 1, serverSource);
HELPER.assertCounter("pauseTimeWithGc_num_ops", 1, serverSource);
}
}