HDFS-10341. Add a metric to expose the timeout number of pending replication blocks. (aajisaka)
This commit is contained in:
parent
cddf6b4fc7
commit
b6d5546e24
|
@ -219,6 +219,7 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
|
|||
| `TotalSyncCount` | Total number of sync operations performed by edit log |
|
||||
| `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operations |
|
||||
| `NameDirSize` | NameNode name directories size in bytes |
|
||||
| `NumTimedOutPendingReplications` | The number of timed-out replications, not the number of unique blocks that timed out. Note: The metric name will be changed to `NumTimedOutPendingReconstructions` in the Hadoop 3 release. |
|
||||
|
||||
JournalNode
|
||||
-----------
|
||||
|
|
|
@ -186,6 +186,10 @@ public class BlockManager implements BlockStatsMXBean {
|
|||
public int getPendingDataNodeMessageCount() {
|
||||
return pendingDNMessages.count();
|
||||
}
|
||||
/** Used by metrics. Returns the timed-out count reported by pendingReplications.getNumTimedOuts(). */
|
||||
public long getNumTimedOutPendingReplications() {
|
||||
return pendingReplications.getNumTimedOuts();
|
||||
}
|
||||
|
||||
/** How often the NameNode checks for new replication work. */
|
||||
private final long replicationRecheckInterval;
|
||||
|
|
|
@ -50,6 +50,7 @@ class PendingReplicationBlocks {
|
|||
private final ArrayList<BlockInfo> timedOutItems;
|
||||
Daemon timerThread = null;
|
||||
private volatile boolean fsRunning = true;
|
||||
// Running total of timed-out items that have already been drained from timedOutItems;
// getNumTimedOuts() adds the current size of timedOutItems on top of this value.
// NOTE(review): this field is reset inside synchronized (pendingReplications) but read
// inside synchronized (timedOutItems) -- confirm the two locks give the intended visibility.
private long timedOutCount = 0L;
|
||||
|
||||
//
|
||||
// It might take anywhere between 5 to 10 minutes before
|
||||
|
@ -125,6 +126,7 @@ class PendingReplicationBlocks {
|
|||
synchronized (pendingReplications) {
|
||||
pendingReplications.clear();
|
||||
timedOutItems.clear();
|
||||
timedOutCount = 0L;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -148,6 +150,16 @@ class PendingReplicationBlocks {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used for metrics. Reads the counter and the list size while holding the timedOutItems lock.
|
||||
* @return the number of timeouts: timedOutCount plus the current size of timedOutItems
|
||||
*/
|
||||
long getNumTimedOuts() {
|
||||
synchronized (timedOutItems) {
|
||||
return timedOutCount + timedOutItems.size();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of blocks that have timed out their
|
||||
* replication requests. Returns null if no blocks have
|
||||
|
@ -158,9 +170,11 @@ class PendingReplicationBlocks {
|
|||
if (timedOutItems.size() <= 0) {
|
||||
return null;
|
||||
}
|
||||
int size = timedOutItems.size();
|
||||
BlockInfo[] blockList = timedOutItems.toArray(
|
||||
new BlockInfo[timedOutItems.size()]);
|
||||
new BlockInfo[size]);
|
||||
timedOutItems.clear();
|
||||
timedOutCount += size;
|
||||
return blockList;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4466,7 +4466,12 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
|
|||
public long getExcessBlocks() {
|
||||
return blockManager.getExcessBlocksCount();
|
||||
}
|
||||
|
||||
|
||||
/** Metric getter: returns blockManager.getNumTimedOutPendingReplications(). */
@Metric
|
||||
public long getNumTimedOutPendingReplications() {
|
||||
return blockManager.getNumTimedOutPendingReplications();
|
||||
}
|
||||
|
||||
// HA-only metric
|
||||
@Metric
|
||||
public long getPostponedMisreplicatedBlocks() {
|
||||
|
|
|
@ -18,6 +18,8 @@
|
|||
package org.apache.hadoop.hdfs.server.blockmanagement;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
@ -117,14 +119,15 @@ public class TestPendingReplication {
|
|||
//
|
||||
// verify that nothing has timed out so far
|
||||
//
|
||||
assertTrue(pendingReplications.getTimedOutBlocks() == null);
|
||||
assertNull(pendingReplications.getTimedOutBlocks());
|
||||
assertEquals(0L, pendingReplications.getNumTimedOuts());
|
||||
|
||||
//
|
||||
// Wait for one second and then insert some more items.
|
||||
//
|
||||
try {
|
||||
Thread.sleep(1000);
|
||||
} catch (Exception e) {
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
|
||||
for (int i = 10; i < 15; i++) {
|
||||
|
@ -133,7 +136,8 @@ public class TestPendingReplication {
|
|||
DatanodeStorageInfo.toDatanodeDescriptors(
|
||||
DFSTestUtil.createDatanodeStorageInfos(i)));
|
||||
}
|
||||
assertTrue(pendingReplications.size() == 15);
|
||||
assertEquals(15, pendingReplications.size());
|
||||
assertEquals(0L, pendingReplications.getNumTimedOuts());
|
||||
|
||||
//
|
||||
// Wait for everything to timeout.
|
||||
|
@ -153,10 +157,14 @@ public class TestPendingReplication {
|
|||
// Verify that everything has timed out.
|
||||
//
|
||||
assertEquals("Size of pendingReplications ", 0, pendingReplications.size());
|
||||
assertEquals(15L, pendingReplications.getNumTimedOuts());
|
||||
Block[] timedOut = pendingReplications.getTimedOutBlocks();
|
||||
assertTrue(timedOut != null && timedOut.length == 15);
|
||||
for (int i = 0; i < timedOut.length; i++) {
|
||||
assertTrue(timedOut[i].getBlockId() < 15);
|
||||
assertNotNull(timedOut);
|
||||
assertEquals(15, timedOut.length);
|
||||
// Verify the number is not reset
|
||||
assertEquals(15L, pendingReplications.getNumTimedOuts());
|
||||
for (Block block : timedOut) {
|
||||
assertTrue(block.getBlockId() < 15);
|
||||
}
|
||||
pendingReplications.stop();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue