From 79187b5a5ef96312e10808da6589aa1124720cf1 Mon Sep 17 00:00:00 2001
From: Inigo Goiri
Date: Wed, 25 Mar 2020 11:30:54 -0700
Subject: [PATCH] HDFS-15075. Remove process command timing from
 BPServiceActor. Contributed by Xiaoqiao He.

(cherry picked from commit cdcb77a2c5ca99502d2ac2fbf803f22463eb1343)
(cherry picked from commit 5da46e197b63e4c847732a64e9694c9e485d5f2d)
---
 .../org/apache/hadoop/hdfs/DFSConfigKeys.java |  5 +++++
 .../hdfs/server/datanode/BPServiceActor.java  | 17 +++++++++--------
 .../hadoop/hdfs/server/datanode/DNConf.java   | 14 ++++++++++++++
 .../datanode/metrics/DataNodeMetrics.java     | 10 ++++++++++
 .../src/main/resources/hdfs-default.xml       |  9 +++++++++
 .../server/datanode/TestBPOfferService.java   |  4 ++++
 6 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index b02b5f4f554..18137877cac 100755
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -398,6 +398,11 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final boolean DFS_DATANODE_PMEM_CACHE_RECOVERY_DEFAULT =
       true;
 
+  public static final String DFS_DATANODE_PROCESS_COMMANDS_THRESHOLD_KEY =
+      "dfs.datanode.processcommands.threshold";
+  public static final long DFS_DATANODE_PROCESS_COMMANDS_THRESHOLD_DEFAULT =
+      TimeUnit.SECONDS.toMillis(2);
+
   public static final String DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_KEY = "dfs.namenode.datanode.registration.ip-hostname-check";
   public static final boolean DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_DEFAULT = true;
 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
index e36c401d534..e4a30cb4657 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BPServiceActor.java
@@ -683,15 +683,7 @@ class BPServiceActor implements Runnable {
         if (state == HAServiceState.ACTIVE) {
           handleRollingUpgradeStatus(resp);
         }
-
-        long startProcessCommands = monotonicNow();
         commandProcessingThread.enqueue(resp.getCommands());
-        long endProcessCommands = monotonicNow();
-        if (endProcessCommands - startProcessCommands > 2000) {
-          LOG.info("Took " + (endProcessCommands - startProcessCommands)
-              + "ms to process " + resp.getCommands().length
-              + " commands from NN");
-        }
       }
     }
     if (!dn.areIBRDisabledForTests() &&
@@ -1312,6 +1304,7 @@ class BPServiceActor implements Runnable {
      */
     private boolean processCommand(DatanodeCommand[] cmds) {
       if (cmds != null) {
+        long startProcessCommands = monotonicNow();
         for (DatanodeCommand cmd : cmds) {
           try {
             if (!bpos.processCommandFromActor(cmd, actor)) {
@@ -1330,6 +1323,14 @@ class BPServiceActor implements Runnable {
             LOG.warn("Error processing datanode Command", ioe);
           }
         }
+        long processCommandsMs = monotonicNow() - startProcessCommands;
+        if (cmds.length > 0) {
+          dn.getMetrics().addNumProcessedCommands(processCommandsMs);
+        }
+        if (processCommandsMs > dnConf.getProcessCommandsThresholdMs()) {
+          LOG.info("Took {} ms to process {} commands from NN",
+              processCommandsMs, cmds.length);
+        }
       }
       return true;
     }
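The two hunks above relocate the measurement. Since command processing became asynchronous via CommandProcessingThread, the old stopwatch in offerService() only timed a cheap enqueue() call, not the actual work; the timing now lives in processCommand() on the consumer thread, and the hardcoded 2000 ms gives way to the configurable dnConf.getProcessCommandsThresholdMs(). Below is a minimal, self-contained sketch of the threshold-logging pattern; the SlowCommandLogger class and processAll() method are illustrative stand-ins, not Hadoop APIs.

// Minimal sketch of the timing pattern processCommand() now uses
// (illustrative names; SlowCommandLogger is not a Hadoop class).
import java.util.concurrent.TimeUnit;

public class SlowCommandLogger {
  private final long thresholdMs;

  public SlowCommandLogger(long thresholdMs) {
    // In the patch this value comes from dnConf.getProcessCommandsThresholdMs().
    this.thresholdMs = thresholdMs;
  }

  public void processAll(Runnable[] commands) {
    // System.nanoTime() stands in for Hadoop's Time.monotonicNow();
    // both are monotonic, so wall-clock adjustments cannot skew the delta.
    long start = System.nanoTime();
    for (Runnable command : commands) {
      command.run();
    }
    long elapsedMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
    if (elapsedMs > thresholdMs) {
      System.out.printf("Took %d ms to process %d commands%n",
          elapsedMs, commands.length);
    }
  }
}

Timing on the consumer thread is the substance of the fix: wrapping enqueue() measured only how long it took to add commands to a queue, never how long they took to execute.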
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
index 5252e68dc11..ea57012b435 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
@@ -35,6 +35,8 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_OUTLIERS_REPORT_
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PMEM_CACHE_DIRS_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PMEM_CACHE_RECOVERY_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PMEM_CACHE_RECOVERY_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PROCESS_COMMANDS_THRESHOLD_DEFAULT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PROCESS_COMMANDS_THRESHOLD_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_OVERWRITE_DOWNSTREAM_DERIVED_QOP_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ENCRYPT_DATA_OVERWRITE_DOWNSTREAM_DERIVED_QOP_KEY;
 import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_CLIENT_SOCKET_TIMEOUT_KEY;
@@ -122,6 +124,8 @@ public class DNConf {
   final long xceiverStopTimeout;
   final long restartReplicaExpiry;
 
+  private final long processCommandsThresholdMs;
+
   final long maxLockedMemory;
   private final String[] pmemDirs;
 
@@ -298,6 +302,12 @@ public class DNConf {
     this.pmemCacheRecoveryEnabled = getConf().getBoolean(
         DFS_DATANODE_PMEM_CACHE_RECOVERY_KEY,
         DFS_DATANODE_PMEM_CACHE_RECOVERY_DEFAULT);
+
+    this.processCommandsThresholdMs = getConf().getTimeDuration(
+        DFS_DATANODE_PROCESS_COMMANDS_THRESHOLD_KEY,
+        DFS_DATANODE_PROCESS_COMMANDS_THRESHOLD_DEFAULT,
+        TimeUnit.MILLISECONDS
+    );
   }
 
   // We get minimumNameNodeVersion via a method so it can be mocked out in tests.
@@ -460,4 +470,8 @@ public class DNConf {
   public boolean getPmemCacheRecoveryEnabled() {
     return pmemCacheRecoveryEnabled;
   }
+
+  public long getProcessCommandsThresholdMs() {
+    return processCommandsThresholdMs;
+  }
 }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java
index 888456b5814..2565ab0b6a4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/metrics/DataNodeMetrics.java
@@ -165,6 +165,8 @@ public class DataNodeMetrics {
   private MutableCounterLong sumOfActorCommandQueueLength;
   @Metric("Num of processed commands of all BPServiceActors")
   private MutableCounterLong numProcessedCommands;
+  @Metric("Rate of processed commands of all BPServiceActors")
+  private MutableRate processedCommandsOp;
 
   final MetricsRegistry registry = new MetricsRegistry("datanode");
   final String name;
@@ -553,4 +555,12 @@ public class DataNodeMetrics {
   public void incrNumProcessedCommands() {
     numProcessedCommands.incr();
   }
+
+  /**
+   * Add a sample to the processedCommandsOp metric.
+   * @param latency time in milliseconds taken to process the commands
+   */
+  public void addNumProcessedCommands(long latency) {
+    processedCommandsOp.add(latency);
+  }
 }
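A MutableRate keeps both a sample count and a running average, so the single processedCommandsOp.add(latency) call above surfaces in a metrics snapshot as two values, ProcessedCommandsOpNumOps and ProcessedCommandsOpAvgTime; the NumOps half is what the test at the end of this patch asserts. For intuition only, here is a toy stand-in assuming nothing beyond plain Java (the real MutableRate also handles snapshot intervals and metrics2 publication):

// Toy stand-in for Hadoop's MutableRate, for intuition only: each
// add(latency) bumps a sample count (exposed as <Name>NumOps) and folds
// the value into a running mean (exposed as <Name>AvgTime).
public class ToyRate {
  private long numOps;      // -> ProcessedCommandsOpNumOps
  private double totalTime; // used to derive ProcessedCommandsOpAvgTime

  public synchronized void add(long latencyMs) {
    numOps++;
    totalTime += latencyMs;
  }

  public synchronized long numOps() {
    return numOps;
  }

  public synchronized double avgTime() {
    return numOps == 0 ? 0.0 : totalTime / numOps;
  }
}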
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
index c36b333270d..5dd6e83fc6c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
@@ -2933,6 +2933,15 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.datanode.processcommands.threshold</name>
+  <value>2s</value>
+  <description>The threshold in milliseconds at which we will log slow
+    command processing in BPServiceActor. By default, this parameter is set
+    to 2 seconds.
+  </description>
+</property>
+
 <property>
   <name>dfs.client.refresh.read-block-locations.ms</name>
   <value>0</value>
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java
index 71162cbf06c..8cf7299dfc1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBPOfferService.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.hdfs.HdfsConfiguration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.server.protocol.SlowDiskReports;
 
+import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
 import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
 import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
 import static org.junit.Assert.assertEquals;
@@ -1078,6 +1079,9 @@ public class TestBPOfferService {
       assertTrue("Process command nums is not expected.",
           getLongCounter("NumProcessedCommands", mrb) > 0);
       assertEquals(0, getLongCounter("SumOfActorCommandQueueLength", mrb));
+      // Check the new metric result for processedCommandsOp.
+      // The one command sent back to the DataNode here is #FinalizeCommand.
+      assertCounter("ProcessedCommandsOpNumOps", 1L, mrb);
     } finally {
       if (cluster != null) {
         cluster.shutdown();
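For completeness, the 2s value in hdfs-default.xml works because DNConf reads the key with Configuration.getTimeDuration(), which accepts unit-suffixed durations (for example "2s" or "2000ms") and converts them to the requested TimeUnit. A small usage sketch; the "5s" override below is only an example, not part of the patch:

// Sketch of how the new key is read (mirrors the DNConf hunk above).
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;

public class ThresholdReader {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Example override; any value with a time suffix is accepted.
    conf.set("dfs.datanode.processcommands.threshold", "5s");
    long thresholdMs = conf.getTimeDuration(
        "dfs.datanode.processcommands.threshold",
        TimeUnit.SECONDS.toMillis(2), // default: 2000 ms
        TimeUnit.MILLISECONDS);
    System.out.println("Slow-command log threshold: " + thresholdMs + " ms");
  }
}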