diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 997fd920c9d..2c662fedb7e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -3743,9 +3743,12 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean, * Perform resource checks and cache the results. */ void checkAvailableResources() { + long resourceCheckTime = monotonicNow(); Preconditions.checkState(nnResourceChecker != null, "nnResourceChecker not initialized"); hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace(); + resourceCheckTime = monotonicNow() - resourceCheckTime; + NameNode.getNameNodeMetrics().addResourceCheckTime(resourceCheckTime); } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java index 2bc36422526..5d0e8cb5351 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java @@ -86,6 +86,7 @@ import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.JvmPauseMonitor; import org.apache.hadoop.util.ServicePlugin; import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Time; import org.apache.htrace.core.Tracer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -365,8 +366,9 @@ public class NameNode extends ReconfigurableBase implements private final boolean haEnabled; private final HAContext haContext; protected final boolean allowStaleStandbyReads; - private 
AtomicBoolean started = new AtomicBoolean(false); + private AtomicBoolean started = new AtomicBoolean(false); + private final static int HEALTH_MONITOR_WARN_THRESHOLD_MS = 5000; /** httpServer */ protected NameNodeHttpServer httpServer; @@ -1715,7 +1717,14 @@ public class NameNode extends ReconfigurableBase implements if (!haEnabled) { return; // no-op, if HA is not enabled } + long start = Time.monotonicNow(); getNamesystem().checkAvailableResources(); + long end = Time.monotonicNow(); + if (end - start >= HEALTH_MONITOR_WARN_THRESHOLD_MS) { + // log a warning if it takes >= 5 seconds. + LOG.warn("Remote IP {} checking available resources took {}ms", + Server.getRemoteIp(), end - start); + } if (!getNamesystem().nameNodeHasResourcesAvailable()) { throw new HealthCheckFailedException( "The NameNode has no resources available"); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java index 8341c7a5684..c4cfa6adbad 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java @@ -119,6 +119,8 @@ public class NameNodeMetrics { private final MutableQuantiles[] generateEDEKTimeQuantiles; @Metric("Warm-up EDEK time") private MutableRate warmUpEDEKTime; private final MutableQuantiles[] warmUpEDEKTimeQuantiles; + @Metric("Resource check time") private MutableRate resourceCheckTime; + private final MutableQuantiles[] resourceCheckTimeQuantiles; @Metric("Duration in SafeMode at startup in msec") MutableGaugeInt safeModeTime; @@ -145,6 +147,7 @@ public class NameNodeMetrics { cacheReportQuantiles = new MutableQuantiles[len]; generateEDEKTimeQuantiles = new MutableQuantiles[len]; warmUpEDEKTimeQuantiles = new 
MutableQuantiles[len]; + resourceCheckTimeQuantiles = new MutableQuantiles[len]; for (int i = 0; i < len; i++) { int interval = intervals[i]; @@ -163,6 +166,9 @@ public class NameNodeMetrics { warmUpEDEKTimeQuantiles[i] = registry.newQuantiles( "warmupEDEKTime" + interval + "s", "Warm up EDEK time", "ops", "latency", interval); + resourceCheckTimeQuantiles[i] = registry.newQuantiles( + "resourceCheckTime" + interval + "s", + "resource check time", "ops", "latency", interval); } } @@ -353,4 +359,11 @@ public class NameNodeMetrics { q.add(latency); } } + + public void addResourceCheckTime(long latency) { + resourceCheckTime.add(latency); + for (MutableQuantiles q : resourceCheckTimeQuantiles) { + q.add(latency); + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java index 3974d0b8591..4ad742eaba1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java @@ -22,8 +22,13 @@ import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.FileSystemTestHelper; import org.apache.hadoop.fs.FileSystemTestWrapper; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.ha.HAServiceProtocol; +import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.client.CreateEncryptionZoneFlag; import org.apache.hadoop.hdfs.client.HdfsAdmin; + +import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT; +import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY; import static org.apache.hadoop.test.MetricsAsserts.assertCounter; import static 
org.apache.hadoop.test.MetricsAsserts.assertGauge; import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges; @@ -60,9 +65,11 @@ import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker; import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil; +import org.apache.hadoop.hdfs.tools.NNHAServiceTarget; import org.apache.hadoop.metrics2.MetricsRecordBuilder; import org.apache.hadoop.metrics2.MetricsSource; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; @@ -683,4 +690,34 @@ public class TestNameNodeMetrics { } } } + + @Test + public void testResourceCheck() throws Exception { + HdfsConfiguration conf = new HdfsConfiguration(); + MiniDFSCluster tmpCluster = new MiniDFSCluster.Builder(conf) + .numDataNodes(0) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .build(); + try { + MockNameNodeResourceChecker mockResourceChecker = + new MockNameNodeResourceChecker(conf); + tmpCluster.getNameNode(0).getNamesystem() + .setNNResourceChecker(mockResourceChecker); + NNHAServiceTarget haTarget = new NNHAServiceTarget(conf, + DFSUtil.getNamenodeNameServiceId( + new HdfsConfiguration()), "nn1"); + HAServiceProtocol rpc = haTarget.getHealthMonitorProxy(conf, conf.getInt( + HA_HM_RPC_TIMEOUT_KEY, HA_HM_RPC_TIMEOUT_DEFAULT)); + + MetricsRecordBuilder rb = getMetrics(NN_METRICS); + for (long i = 0; i < 10; i++) { + rpc.monitorHealth(); + assertQuantileGauges("ResourceCheckTime1s", rb); + } + } finally { + if (tmpCluster != null) { + tmpCluster.shutdown(); + } + } + } }