HDFS-10341. Add a metric to expose the timeout number of pending replication blocks. (aajisaka)

(cherry picked from commit b6d5546e24)
This commit is contained in:
Akira Ajisaka 2016-06-03 16:04:11 +09:00
parent a8a2f4b500
commit d0dc5aaa2d
5 changed files with 40 additions and 8 deletions

View File

@ -219,6 +219,7 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
| `TotalSyncCount` | Total number of sync operations performed by edit log | | `TotalSyncCount` | Total number of sync operations performed by edit log |
| `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operation| | `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operation|
| `NameDirSize` | NameNode name directories size in bytes | | `NameDirSize` | NameNode name directories size in bytes |
| `NumTimedOutPendingReplications` | The number of timed-out replications. This is not the number of unique blocks that timed out. Note: The metric name will be changed to `NumTimedOutPendingReconstructions` in the Hadoop 3 release. |
JournalNode JournalNode
----------- -----------

View File

@ -182,6 +182,10 @@ public class BlockManager implements BlockStatsMXBean {
public int getPendingDataNodeMessageCount() { public int getPendingDataNodeMessageCount() {
return pendingDNMessages.count(); return pendingDNMessages.count();
} }
/**
 * Exposes the pending-replication timeout count; used by metrics.
 *
 * @return the value reported by
 *         {@code pendingReplications.getNumTimedOuts()}
 */
public long getNumTimedOutPendingReplications() {
  final long timedOuts = pendingReplications.getNumTimedOuts();
  return timedOuts;
}
/**replicationRecheckInterval is how often namenode checks for new replication work*/ /**replicationRecheckInterval is how often namenode checks for new replication work*/
private final long replicationRecheckInterval; private final long replicationRecheckInterval;

View File

@ -50,6 +50,7 @@ class PendingReplicationBlocks {
private final ArrayList<BlockInfo> timedOutItems; private final ArrayList<BlockInfo> timedOutItems;
Daemon timerThread = null; Daemon timerThread = null;
private volatile boolean fsRunning = true; private volatile boolean fsRunning = true;
private long timedOutCount = 0L;
// //
// It might take anywhere between 5 to 10 minutes before // It might take anywhere between 5 to 10 minutes before
@ -125,6 +126,7 @@ class PendingReplicationBlocks {
synchronized (pendingReplications) { synchronized (pendingReplications) {
pendingReplications.clear(); pendingReplications.clear();
timedOutItems.clear(); timedOutItems.clear();
timedOutCount = 0L;
} }
} }
@ -148,6 +150,16 @@ class PendingReplicationBlocks {
return 0; return 0;
} }
/**
 * Used for metrics. Reports the running timeout total: the count already
 * accumulated in {@code timedOutCount} plus the entries currently held in
 * {@code timedOutItems}.
 *
 * @return The number of timeouts
 */
long getNumTimedOuts() {
  // NOTE(review): timedOutCount is also reset elsewhere while holding the
  // pendingReplications monitor rather than timedOutItems -- confirm that
  // this read cannot observe a stale value.
  synchronized (timedOutItems) {
    final long stillQueued = timedOutItems.size();
    return timedOutCount + stillQueued;
  }
}
/** /**
* Returns a list of blocks that have timed out their * Returns a list of blocks that have timed out their
* replication requests. Returns null if no blocks have * replication requests. Returns null if no blocks have
@ -158,9 +170,11 @@ class PendingReplicationBlocks {
if (timedOutItems.size() <= 0) { if (timedOutItems.size() <= 0) {
return null; return null;
} }
int size = timedOutItems.size();
BlockInfo[] blockList = timedOutItems.toArray( BlockInfo[] blockList = timedOutItems.toArray(
new BlockInfo[timedOutItems.size()]); new BlockInfo[size]);
timedOutItems.clear(); timedOutItems.clear();
timedOutCount += size;
return blockList; return blockList;
} }
} }

View File

@ -5155,6 +5155,11 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
return blockManager.getExcessBlocksCount(); return blockManager.getExcessBlocksCount();
} }
/**
 * Metric getter that delegates to
 * {@code blockManager.getNumTimedOutPendingReplications()}.
 *
 * @return the timeout count reported by the block manager
 */
@Metric
public long getNumTimedOutPendingReplications() {
  final long timedOuts = blockManager.getNumTimedOutPendingReplications();
  return timedOuts;
}
// HA-only metric // HA-only metric
@Metric @Metric
public long getPostponedMisreplicatedBlocks() { public long getPostponedMisreplicatedBlocks() {

View File

@ -18,6 +18,8 @@
package org.apache.hadoop.hdfs.server.blockmanagement; package org.apache.hadoop.hdfs.server.blockmanagement;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import java.util.ArrayList; import java.util.ArrayList;
@ -117,14 +119,15 @@ public class TestPendingReplication {
// //
// verify that nothing has timed out so far // verify that nothing has timed out so far
// //
assertTrue(pendingReplications.getTimedOutBlocks() == null); assertNull(pendingReplications.getTimedOutBlocks());
assertEquals(0L, pendingReplications.getNumTimedOuts());
// //
// Wait for one second and then insert some more items. // Wait for one second and then insert some more items.
// //
try { try {
Thread.sleep(1000); Thread.sleep(1000);
} catch (Exception e) { } catch (Exception ignored) {
} }
for (int i = 10; i < 15; i++) { for (int i = 10; i < 15; i++) {
@ -133,7 +136,8 @@ public class TestPendingReplication {
DatanodeStorageInfo.toDatanodeDescriptors( DatanodeStorageInfo.toDatanodeDescriptors(
DFSTestUtil.createDatanodeStorageInfos(i))); DFSTestUtil.createDatanodeStorageInfos(i)));
} }
assertTrue(pendingReplications.size() == 15); assertEquals(15, pendingReplications.size());
assertEquals(0L, pendingReplications.getNumTimedOuts());
// //
// Wait for everything to timeout. // Wait for everything to timeout.
@ -153,10 +157,14 @@ public class TestPendingReplication {
// Verify that everything has timed out. // Verify that everything has timed out.
// //
assertEquals("Size of pendingReplications ", 0, pendingReplications.size()); assertEquals("Size of pendingReplications ", 0, pendingReplications.size());
assertEquals(15L, pendingReplications.getNumTimedOuts());
Block[] timedOut = pendingReplications.getTimedOutBlocks(); Block[] timedOut = pendingReplications.getTimedOutBlocks();
assertTrue(timedOut != null && timedOut.length == 15); assertNotNull(timedOut);
for (int i = 0; i < timedOut.length; i++) { assertEquals(15, timedOut.length);
assertTrue(timedOut[i].getBlockId() < 15); // Verify the number is not reset
assertEquals(15L, pendingReplications.getNumTimedOuts());
for (Block block : timedOut) {
assertTrue(block.getBlockId() < 15);
} }
pendingReplications.stop(); pendingReplications.stop();
} }