diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md index b709b2a52ac..05a4edb250d 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md @@ -219,6 +219,7 @@ Each metrics record contains tags such as HAState and Hostname as additional inf | `TotalSyncCount` | Total number of sync operations performed by edit log | | `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operation| | `NameDirSize` | NameNode name directories size in bytes | +| `NumTimedOutPendingReplications` | The total number of timed-out replications; this is not the number of unique blocks that timed out. Note: The metric name will be changed to `NumTimedOutPendingReconstructions` in the Hadoop 3 release. | JournalNode ----------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index d02e9ea8479..2de8aac7dba 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -182,6 +182,10 @@ public class BlockManager implements BlockStatsMXBean { public int getPendingDataNodeMessageCount() { return pendingDNMessages.count(); } + /** Used by metrics. 
*/ + public long getNumTimedOutPendingReplications() { + return pendingReplications.getNumTimedOuts(); + } /**replicationRecheckInterval is how often namenode checks for new replication work*/ private final long replicationRecheckInterval; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java index 71939de6ba8..88eaaca3ba4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/PendingReplicationBlocks.java @@ -50,6 +50,7 @@ class PendingReplicationBlocks { private final ArrayList timedOutItems; Daemon timerThread = null; private volatile boolean fsRunning = true; + private long timedOutCount = 0L; // // It might take anywhere between 5 to 10 minutes before @@ -125,6 +126,7 @@ class PendingReplicationBlocks { synchronized (pendingReplications) { pendingReplications.clear(); timedOutItems.clear(); + timedOutCount = 0L; } } @@ -148,6 +150,16 @@ class PendingReplicationBlocks { return 0; } + /** + * Used for metrics. + * @return The number of timeouts + */ + long getNumTimedOuts() { + synchronized (timedOutItems) { + return timedOutCount + timedOutItems.size(); + } + } + /** * Returns a list of blocks that have timed out their * replication requests. 
Returns null if no blocks have @@ -158,9 +170,11 @@ class PendingReplicationBlocks { if (timedOutItems.size() <= 0) { return null; } + int size = timedOutItems.size(); BlockInfo[] blockList = timedOutItems.toArray( - new BlockInfo[timedOutItems.size()]); + new BlockInfo[size]); timedOutItems.clear(); + timedOutCount += size; return blockList; } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 88ff62ed00d..9cd172015be 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -5154,7 +5154,12 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean, public long getExcessBlocks() { return blockManager.getExcessBlocksCount(); } - + + @Metric + public long getNumTimedOutPendingReplications() { + return blockManager.getNumTimedOutPendingReplications(); + } + // HA-only metric @Metric public long getPostponedMisreplicatedBlocks() { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingReplication.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingReplication.java index 18f28d53efc..0a4b2353639 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingReplication.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestPendingReplication.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hdfs.server.blockmanagement; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; 
import java.util.ArrayList; @@ -117,14 +119,15 @@ public class TestPendingReplication { // // verify that nothing has timed out so far // - assertTrue(pendingReplications.getTimedOutBlocks() == null); + assertNull(pendingReplications.getTimedOutBlocks()); + assertEquals(0L, pendingReplications.getNumTimedOuts()); // // Wait for one second and then insert some more items. // try { Thread.sleep(1000); - } catch (Exception e) { + } catch (Exception ignored) { } for (int i = 10; i < 15; i++) { @@ -133,7 +136,8 @@ public class TestPendingReplication { DatanodeStorageInfo.toDatanodeDescriptors( DFSTestUtil.createDatanodeStorageInfos(i))); } - assertTrue(pendingReplications.size() == 15); + assertEquals(15, pendingReplications.size()); + assertEquals(0L, pendingReplications.getNumTimedOuts()); // // Wait for everything to timeout. @@ -153,10 +157,14 @@ public class TestPendingReplication { // Verify that everything has timed out. // assertEquals("Size of pendingReplications ", 0, pendingReplications.size()); + assertEquals(15L, pendingReplications.getNumTimedOuts()); Block[] timedOut = pendingReplications.getTimedOutBlocks(); - assertTrue(timedOut != null && timedOut.length == 15); - for (int i = 0; i < timedOut.length; i++) { - assertTrue(timedOut[i].getBlockId() < 15); + assertNotNull(timedOut); + assertEquals(15, timedOut.length); + // Verify the number is not reset + assertEquals(15L, pendingReplications.getNumTimedOuts()); + for (Block block : timedOut) { + assertTrue(block.getBlockId() < 15); } pendingReplications.stop(); }