HDFS-13641. Add metrics for edit log tailing. Contributed by Chao Sun.

This commit is contained in:
Yiqun Lin 2018-06-13 20:05:55 +08:00
parent 6307962b93
commit 8e7548d33b
5 changed files with 140 additions and 5 deletions

View File

@ -181,6 +181,18 @@ Each metrics record contains tags such as ProcessName, SessionId, and Hostname a
| `WarmUpEDEKTimeAvgTime` | Average time of warming up EDEK in milliseconds |
| `ResourceCheckTime`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of NameNode resource check latency in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `StorageBlockReport`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of storage block report latency in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `EditLogTailTimeNumOps` | Total number of times the standby NameNode tailed the edit log |
| `EditLogTailTimeAvgTime` | Average time (in milliseconds) spent by standby NameNode in tailing edit log |
| `EditLogTailTime`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of time spent in tailing edit logs by standby NameNode, in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `EditLogFetchTimeNumOps` | Total number of times the standby NameNode fetched remote edit streams from journal nodes |
| `EditLogFetchTimeAvgTime` | Average time (in milliseconds) spent by standby NameNode in fetching remote edit streams from journal nodes |
| `EditLogFetchTime`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of time spent in fetching edit streams from journal nodes by standby NameNode, in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `NumEditLogLoadedNumOps` | Total number of times edits were loaded by standby NameNode |
| `NumEditLogLoadedAvgCount` | Average number of edits loaded by standby NameNode in each edit log tailing |
| `NumEditLogLoaded`*num*`s(50/75/90/95/99)thPercentileCount` | The 50/75/90/95/99th percentile of number of edits loaded by standby NameNode in each edit log tailing. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
| `EditLogTailIntervalNumOps` | Total number of intervals between edit log tailings by standby NameNode |
| `EditLogTailIntervalAvgTime` | Average time of intervals between edit log tailings by standby NameNode in milliseconds |
| `EditLogTailInterval`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of time between edit log tailings by standby NameNode, in milliseconds. Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
FSNamesystem
------------

View File

@ -367,17 +367,31 @@ public class MetricsAsserts {
}
/**
* Asserts that the NumOps and quantiles for a metric have been changed at
* some point to a non-zero value.
* Asserts that the NumOps and quantiles for a metric with value name
* "Latency" have been changed at some point to a non-zero value.
*
* @param prefix of the metric
* @param rb MetricsRecordBuilder with the metric
*/
public static void assertQuantileGauges(String prefix,
public static void assertQuantileGauges(String prefix,
MetricsRecordBuilder rb) {
assertQuantileGauges(prefix, rb, "Latency");
}
/**
* Asserts that the NumOps and quantiles for a metric have been changed at
* some point to a non-zero value, for the specified value name of the
* metrics (e.g., "Latency", "Count").
*
* @param prefix of the metric
* @param rb MetricsRecordBuilder with the metric
* @param valueName the value name for the metric
*/
public static void assertQuantileGauges(String prefix,
MetricsRecordBuilder rb, String valueName) {
verify(rb).addGauge(eqName(info(prefix + "NumOps", "")), geq(0l));
for (Quantile q : MutableQuantiles.quantiles) {
String nameTemplate = prefix + "%dthPercentileLatency";
String nameTemplate = prefix + "%dthPercentile" + valueName;
int percentile = (int) (100 * q.quantile);
verify(rb).addGauge(
eqName(info(String.format(nameTemplate, percentile), "")),

View File

@ -62,6 +62,7 @@ import static org.apache.hadoop.util.ExitUtil.terminate;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.hadoop.util.Time;
/**
@ -292,6 +293,7 @@ public class EditLogTailer {
LOG.debug("lastTxnId: " + lastTxnId);
}
Collection<EditLogInputStream> streams;
long startTime = Time.monotonicNow();
try {
streams = editLog.selectInputStreams(lastTxnId + 1, 0,
null, inProgressOk, true);
@ -302,6 +304,9 @@ public class EditLogTailer {
LOG.warn("Edits tailer failed to find any streams. Will try again " +
"later.", ioe);
return;
} finally {
NameNode.getNameNodeMetrics().addEditLogFetchTime(
Time.monotonicNow() - startTime);
}
if (LOG.isDebugEnabled()) {
LOG.debug("edit streams to load from: " + streams.size());
@ -322,6 +327,7 @@ public class EditLogTailer {
LOG.debug(String.format("Loaded %d edits starting from txid %d ",
editsLoaded, lastTxnId));
}
NameNode.getNameNodeMetrics().addNumEditLogLoaded(editsLoaded);
}
if (editsLoaded > 0) {
@ -446,10 +452,15 @@ public class EditLogTailer {
// name system lock will be acquired to further block even the block
// state updates.
namesystem.cpLockInterruptibly();
long startTime = Time.monotonicNow();
try {
NameNode.getNameNodeMetrics().addEditLogTailInterval(
startTime - lastLoadTimeMs);
doTailEdits();
} finally {
namesystem.cpUnlock();
NameNode.getNameNodeMetrics().addEditLogTailTime(
Time.monotonicNow() - startTime);
}
//Update NameDirSize Metric
namesystem.getFSImage().getStorage().updateNameDirSize();

View File

@ -32,6 +32,7 @@ import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.metrics2.lib.MutableStat;
import org.apache.hadoop.metrics2.source.JvmMetrics;
/**
@ -134,6 +135,18 @@ public class NameNodeMetrics {
@Metric("Time loading FS Image at startup in msec")
MutableGaugeInt fsImageLoadTime;
@Metric("Time tailing edit logs in msec")
MutableRate editLogTailTime;
private final MutableQuantiles[] editLogTailTimeQuantiles;
@Metric MutableRate editLogFetchTime;
private final MutableQuantiles[] editLogFetchTimeQuantiles;
@Metric(value = "Number of edits loaded", valueName = "Count")
MutableStat numEditLogLoaded;
private final MutableQuantiles[] numEditLogLoadedQuantiles;
@Metric("Time between edit log tailing in msec")
MutableRate editLogTailInterval;
private final MutableQuantiles[] editLogTailIntervalQuantiles;
@Metric("GetImageServlet getEdit")
MutableRate getEdit;
@Metric("GetImageServlet getImage")
@ -156,7 +169,11 @@ public class NameNodeMetrics {
generateEDEKTimeQuantiles = new MutableQuantiles[len];
warmUpEDEKTimeQuantiles = new MutableQuantiles[len];
resourceCheckTimeQuantiles = new MutableQuantiles[len];
editLogTailTimeQuantiles = new MutableQuantiles[len];
editLogFetchTimeQuantiles = new MutableQuantiles[len];
numEditLogLoadedQuantiles = new MutableQuantiles[len];
editLogTailIntervalQuantiles = new MutableQuantiles[len];
for (int i = 0; i < len; i++) {
int interval = intervals[i];
syncsQuantiles[i] = registry.newQuantiles(
@ -181,6 +198,18 @@ public class NameNodeMetrics {
resourceCheckTimeQuantiles[i] = registry.newQuantiles(
"resourceCheckTime" + interval + "s",
"resource check time", "ops", "latency", interval);
editLogTailTimeQuantiles[i] = registry.newQuantiles(
"editLogTailTime" + interval + "s",
"Edit log tailing time", "ops", "latency", interval);
editLogFetchTimeQuantiles[i] = registry.newQuantiles(
"editLogFetchTime" + interval + "s",
"Edit log fetch time", "ops", "latency", interval);
numEditLogLoadedQuantiles[i] = registry.newQuantiles(
"numEditLogLoaded" + interval + "s",
"Number of edits loaded", "ops", "count", interval);
editLogTailIntervalQuantiles[i] = registry.newQuantiles(
"editLogTailInterval" + interval + "s",
"Edit log tailing interval", "ops", "latency", interval);
}
}
@ -389,4 +418,32 @@ public class NameNodeMetrics {
q.add(latency);
}
}
public void addEditLogTailTime(long elapsed) {
editLogTailTime.add(elapsed);
for (MutableQuantiles q : editLogTailTimeQuantiles) {
q.add(elapsed);
}
}
public void addEditLogFetchTime(long elapsed) {
editLogFetchTime.add(elapsed);
for (MutableQuantiles q : editLogFetchTimeQuantiles) {
q.add(elapsed);
}
}
public void addNumEditLogLoaded(long loaded) {
numEditLogLoaded.add(loaded);
for (MutableQuantiles q : numEditLogLoadedQuantiles) {
q.add(loaded);
}
}
public void addEditLogTailInterval(long elapsed) {
editLogTailInterval.add(elapsed);
for (MutableQuantiles q : editLogTailIntervalQuantiles) {
q.add(elapsed);
}
}
}

View File

@ -32,6 +32,7 @@ import org.apache.hadoop.hdfs.client.HdfsAdmin;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt;
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
@ -1059,4 +1060,44 @@ public class TestNameNodeMetrics {
}
}
}
@Test
public void testEditLogTailing() throws Exception {
HdfsConfiguration conf = new HdfsConfiguration();
conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
conf.setInt(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY, 60);
MiniDFSCluster dfsCluster = null;
try {
dfsCluster = new MiniDFSCluster.Builder(conf)
.numDataNodes(0)
.nnTopology(MiniDFSNNTopology.simpleHATopology())
.build();
DistributedFileSystem dfs = dfsCluster.getFileSystem(0);
dfsCluster.transitionToActive(0);
dfsCluster.waitActive();
Path testDir = new Path("/testdir");
dfs.mkdir(testDir, FsPermission.getDefault());
dfsCluster.getNameNodeRpc(0).rollEditLog();
Thread.sleep(2 * 1000);
// We need to get the metrics for the SBN (excluding the NN from dfs
// cluster created in setUp() and the ANN).
MetricsRecordBuilder rb = getMetrics(NN_METRICS+"-2");
assertQuantileGauges("EditLogTailTime60s", rb);
assertQuantileGauges("EditLogFetchTime60s", rb);
assertQuantileGauges("NumEditLogLoaded60s", rb, "Count");
assertQuantileGauges("EditLogTailInterval60s", rb);
assertCounterGt("EditLogTailTimeNumOps", 0L, rb);
assertCounterGt("EditLogFetchTimeNumOps", 0L, rb);
assertCounterGt("NumEditLogLoadedNumOps", 0L, rb);
assertCounterGt("EditLogTailIntervalNumOps", 0L, rb);
} finally {
if (dfsCluster != null) {
dfsCluster.shutdown();
}
}
}
}