HBASE-15614 Report metrics from JvmPauseMonitor
This commit is contained in:
parent
18d70bc680
commit
2c26fe37ac
|
@ -0,0 +1,63 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.hbase.metrics;
|
||||
|
||||
/**
|
||||
* Interface for sources that will export JvmPauseMonitor metrics
|
||||
*/
|
||||
public interface JvmPauseMonitorSource {
|
||||
|
||||
String INFO_THRESHOLD_COUNT_KEY = "pauseInfoThresholdExceeded";
|
||||
String INFO_THRESHOLD_COUNT_DESC = "Count of INFO level pause threshold alerts";
|
||||
String WARN_THRESHOLD_COUNT_KEY = "pauseWarnThresholdExceeded";
|
||||
String WARN_THRESHOLD_COUNT_DESC = "Count of WARN level pause threshold alerts";
|
||||
|
||||
String PAUSE_TIME_WITH_GC_KEY = "pauseTimeWithGc";
|
||||
String PAUSE_TIME_WITH_GC_DESC = "Histogram for excessive pause times with GC activity detected";
|
||||
|
||||
String PAUSE_TIME_WITHOUT_GC_KEY = "pauseTimeWithoutGc";
|
||||
String PAUSE_TIME_WITHOUT_GC_DESC =
|
||||
"Histogram for excessive pause times without GC activity detected";
|
||||
|
||||
/**
|
||||
* Increment the INFO level threshold exceeded count
|
||||
* @param count the count
|
||||
*/
|
||||
void incInfoThresholdExceeded(int count);
|
||||
|
||||
/**
|
||||
* Increment the WARN level threshold exceeded count
|
||||
* @param count the count
|
||||
*/
|
||||
void incWarnThresholdExceeded(int count);
|
||||
|
||||
/**
|
||||
* Update the pause time histogram where GC activity was detected.
|
||||
*
|
||||
* @param t time it took
|
||||
*/
|
||||
void updatePauseTimeWithGc(long t);
|
||||
|
||||
/**
|
||||
* Update the pause time histogram where GC activity was not detected.
|
||||
*
|
||||
* @param t time it took
|
||||
*/
|
||||
void updatePauseTimeWithoutGc(long t);
|
||||
}
|
|
@ -19,11 +19,12 @@
|
|||
package org.apache.hadoop.hbase.regionserver;
|
||||
|
||||
import org.apache.hadoop.hbase.metrics.BaseSource;
|
||||
import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource;
|
||||
|
||||
/**
|
||||
* Interface for classes that expose metrics about the regionserver.
|
||||
*/
|
||||
public interface MetricsRegionServerSource extends BaseSource {
|
||||
public interface MetricsRegionServerSource extends BaseSource, JvmPauseMonitorSource {
|
||||
|
||||
/**
|
||||
* The name of the metrics
|
||||
|
|
|
@ -79,6 +79,12 @@ public class MetricsRegionServerSourceImpl
|
|||
private final MutableFastCounter majorCompactedInputBytes;
|
||||
private final MutableFastCounter majorCompactedOutputBytes;
|
||||
|
||||
// pause monitor metrics
|
||||
private final MutableFastCounter infoPauseThresholdExceeded;
|
||||
private final MutableFastCounter warnPauseThresholdExceeded;
|
||||
private final MetricHistogram pausesWithGc;
|
||||
private final MetricHistogram pausesWithoutGc;
|
||||
|
||||
public MetricsRegionServerSourceImpl(MetricsRegionServerWrapper rsWrap) {
|
||||
this(METRICS_NAME, METRICS_DESCRIPTION, METRICS_CONTEXT, METRICS_JMX_CONTEXT, rsWrap);
|
||||
}
|
||||
|
@ -153,6 +159,14 @@ public class MetricsRegionServerSourceImpl
|
|||
splitTimeHisto = getMetricsRegistry().newTimeHistogram(SPLIT_KEY);
|
||||
splitRequest = getMetricsRegistry().newCounter(SPLIT_REQUEST_KEY, SPLIT_REQUEST_DESC, 0L);
|
||||
splitSuccess = getMetricsRegistry().newCounter(SPLIT_SUCCESS_KEY, SPLIT_SUCCESS_DESC, 0L);
|
||||
|
||||
// pause monitor metrics
|
||||
infoPauseThresholdExceeded = getMetricsRegistry().newCounter(INFO_THRESHOLD_COUNT_KEY,
|
||||
INFO_THRESHOLD_COUNT_DESC, 0L);
|
||||
warnPauseThresholdExceeded = getMetricsRegistry().newCounter(WARN_THRESHOLD_COUNT_KEY,
|
||||
WARN_THRESHOLD_COUNT_DESC, 0L);
|
||||
pausesWithGc = getMetricsRegistry().newTimeHistogram(PAUSE_TIME_WITH_GC_KEY);
|
||||
pausesWithoutGc = getMetricsRegistry().newTimeHistogram(PAUSE_TIME_WITHOUT_GC_KEY);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -492,8 +506,29 @@ public class MetricsRegionServerSourceImpl
|
|||
rsWrap.getZookeeperQuorum())
|
||||
.tag(Interns.info(SERVER_NAME_NAME, SERVER_NAME_DESC), rsWrap.getServerName())
|
||||
.tag(Interns.info(CLUSTER_ID_NAME, CLUSTER_ID_DESC), rsWrap.getClusterId());
|
||||
|
||||
}
|
||||
|
||||
metricsRegistry.snapshot(mrb, all);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void incInfoThresholdExceeded(int count) {
|
||||
infoPauseThresholdExceeded.incr(count);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void incWarnThresholdExceeded(int count) {
|
||||
warnPauseThresholdExceeded.incr(count);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updatePauseTimeWithGc(long t) {
|
||||
pausesWithGc.add(t);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updatePauseTimeWithoutGc(long t) {
|
||||
pausesWithoutGc.add(t);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -772,8 +772,6 @@ public class HRegionServer extends HasThread implements
|
|||
HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
|
||||
healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
|
||||
}
|
||||
this.pauseMonitor = new JvmPauseMonitor(conf);
|
||||
pauseMonitor.start();
|
||||
|
||||
initializeZooKeeper();
|
||||
if (!isStopped() && !isAborted()) {
|
||||
|
@ -1410,6 +1408,9 @@ public class HRegionServer extends HasThread implements
|
|||
// Init in here rather than in constructor after thread name has been set
|
||||
this.metricsRegionServer = new MetricsRegionServer(new MetricsRegionServerWrapperImpl(this));
|
||||
this.metricsTable = new MetricsTable(new MetricsTableWrapperAggregateImpl(this));
|
||||
// Now that we have a metrics source, start the pause monitor
|
||||
this.pauseMonitor = new JvmPauseMonitor(conf, getMetrics().getMetricsSource());
|
||||
pauseMonitor.start();
|
||||
|
||||
startServiceThreads();
|
||||
startHeapMemoryManager();
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.util.Set;
|
|||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
|
@ -56,22 +57,28 @@ public class JvmPauseMonitor {
|
|||
|
||||
/** log WARN if we detect a pause longer than this threshold */
|
||||
private final long warnThresholdMs;
|
||||
private static final String WARN_THRESHOLD_KEY =
|
||||
public static final String WARN_THRESHOLD_KEY =
|
||||
"jvm.pause.warn-threshold.ms";
|
||||
private static final long WARN_THRESHOLD_DEFAULT = 10000;
|
||||
|
||||
/** log INFO if we detect a pause longer than this threshold */
|
||||
private final long infoThresholdMs;
|
||||
private static final String INFO_THRESHOLD_KEY =
|
||||
public static final String INFO_THRESHOLD_KEY =
|
||||
"jvm.pause.info-threshold.ms";
|
||||
private static final long INFO_THRESHOLD_DEFAULT = 1000;
|
||||
|
||||
private Thread monitorThread;
|
||||
private volatile boolean shouldRun = true;
|
||||
private JvmPauseMonitorSource metricsSource;
|
||||
|
||||
public JvmPauseMonitor(Configuration conf) {
|
||||
this(conf, null);
|
||||
}
|
||||
|
||||
public JvmPauseMonitor(Configuration conf, JvmPauseMonitorSource metricsSource) {
|
||||
this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
|
||||
this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
|
||||
this.metricsSource = metricsSource;
|
||||
}
|
||||
|
||||
public void start() {
|
||||
|
@ -92,19 +99,7 @@ public class JvmPauseMonitor {
|
|||
}
|
||||
}
|
||||
|
||||
private String formatMessage(long extraSleepTime, Map<String, GcTimes> gcTimesAfterSleep,
|
||||
Map<String, GcTimes> gcTimesBeforeSleep) {
|
||||
|
||||
Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
|
||||
gcTimesBeforeSleep.keySet());
|
||||
List<String> gcDiffs = Lists.newArrayList();
|
||||
for (String name : gcBeanNames) {
|
||||
GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name));
|
||||
if (diff.gcCount != 0) {
|
||||
gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString());
|
||||
}
|
||||
}
|
||||
|
||||
private String formatMessage(long extraSleepTime, List<String> gcDiffs) {
|
||||
String ret = "Detected pause in JVM or host machine (eg GC): " + "pause of approximately "
|
||||
+ extraSleepTime + "ms\n";
|
||||
if (gcDiffs.isEmpty()) {
|
||||
|
@ -160,20 +155,57 @@ public class JvmPauseMonitor {
|
|||
} catch (InterruptedException ie) {
|
||||
return;
|
||||
}
|
||||
|
||||
long extraSleepTime = sw.elapsedMillis() - SLEEP_INTERVAL_MS;
|
||||
Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
|
||||
|
||||
if (extraSleepTime > warnThresholdMs) {
|
||||
LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
|
||||
} else if (extraSleepTime > infoThresholdMs) {
|
||||
LOG.info(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
|
||||
if (extraSleepTime > infoThresholdMs) {
|
||||
Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
|
||||
gcTimesBeforeSleep.keySet());
|
||||
List<String> gcDiffs = Lists.newArrayList();
|
||||
for (String name : gcBeanNames) {
|
||||
GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name));
|
||||
if (diff.gcCount != 0) {
|
||||
gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString());
|
||||
}
|
||||
}
|
||||
|
||||
updateMetrics(extraSleepTime, !gcDiffs.isEmpty());
|
||||
|
||||
if (extraSleepTime > warnThresholdMs) {
|
||||
LOG.warn(formatMessage(extraSleepTime, gcDiffs));
|
||||
} else {
|
||||
LOG.info(formatMessage(extraSleepTime, gcDiffs));
|
||||
}
|
||||
}
|
||||
gcTimesBeforeSleep = gcTimesAfterSleep;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void updateMetrics(long sleepTime, boolean gcDetected) {
|
||||
if (metricsSource != null) {
|
||||
if (sleepTime > warnThresholdMs) {
|
||||
metricsSource.incWarnThresholdExceeded(1);
|
||||
} else {
|
||||
metricsSource.incInfoThresholdExceeded(1);
|
||||
}
|
||||
if (gcDetected) {
|
||||
metricsSource.updatePauseTimeWithGc(sleepTime);
|
||||
} else {
|
||||
metricsSource.updatePauseTimeWithoutGc(sleepTime);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public JvmPauseMonitorSource getMetricsSource() {
|
||||
return metricsSource;
|
||||
}
|
||||
|
||||
public void setMetricsSource(JvmPauseMonitorSource metricsSource) {
|
||||
this.metricsSource = metricsSource;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple 'main' to facilitate manual testing of the pause monitor.
|
||||
*
|
||||
|
|
|
@ -17,10 +17,12 @@
|
|||
*/
|
||||
package org.apache.hadoop.hbase.regionserver;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.CompatibilityFactory;
|
||||
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
|
||||
import org.apache.hadoop.hbase.testclassification.SmallTests;
|
||||
import org.apache.hadoop.hbase.test.MetricsAssertHelper;
|
||||
import org.apache.hadoop.hbase.util.JvmPauseMonitor;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
@ -202,5 +204,24 @@ public class TestMetricsRegionServer {
|
|||
HELPER.assertCounter("majorCompactedInputBytes", 400, serverSource);
|
||||
HELPER.assertCounter("majorCompactedoutputBytes", 500, serverSource);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPauseMonitor() {
|
||||
Configuration conf = new Configuration();
|
||||
conf.setLong(JvmPauseMonitor.INFO_THRESHOLD_KEY, 1000L);
|
||||
conf.setLong(JvmPauseMonitor.WARN_THRESHOLD_KEY, 10000L);
|
||||
JvmPauseMonitor monitor = new JvmPauseMonitor(conf, serverSource);
|
||||
monitor.updateMetrics(1500, false);
|
||||
HELPER.assertCounter("pauseInfoThresholdExceeded", 1, serverSource);
|
||||
HELPER.assertCounter("pauseWarnThresholdExceeded", 0, serverSource);
|
||||
HELPER.assertCounter("pauseTimeWithoutGc_num_ops", 1, serverSource);
|
||||
HELPER.assertCounter("pauseTimeWithGc_num_ops", 0, serverSource);
|
||||
monitor.updateMetrics(15000, true);
|
||||
HELPER.assertCounter("pauseInfoThresholdExceeded", 1, serverSource);
|
||||
HELPER.assertCounter("pauseWarnThresholdExceeded", 1, serverSource);
|
||||
HELPER.assertCounter("pauseTimeWithoutGc_num_ops", 1, serverSource);
|
||||
HELPER.assertCounter("pauseTimeWithGc_num_ops", 1, serverSource);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue