HDFS-10341. Add a metric to expose the timeout number of pending replication blocks. (aajisaka)
(cherry picked from commit b6d5546e24)
This commit is contained in:
parent
a8a2f4b500
commit
d0dc5aaa2d
|
@ -219,6 +219,7 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
|
||||||
| `TotalSyncCount` | Total number of sync operations performed by edit log |
|
| `TotalSyncCount` | Total number of sync operations performed by edit log |
|
||||||
| `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operation|
|
| `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operation|
|
||||||
| `NameDirSize` | NameNode name directories size in bytes |
|
| `NameDirSize` | NameNode name directories size in bytes |
|
||||||
|
| `NumTimedOutPendingReplications` | The number of timed-out replications. This is not the number of unique blocks that timed out. Note: the metric name will be changed to `NumTimedOutPendingReconstructions` in the Hadoop 3 release. |
|
||||||
|
|
||||||
JournalNode
|
JournalNode
|
||||||
-----------
|
-----------
|
||||||
|
|
|
@ -182,6 +182,10 @@ public class BlockManager implements BlockStatsMXBean {
|
||||||
public int getPendingDataNodeMessageCount() {
|
public int getPendingDataNodeMessageCount() {
|
||||||
return pendingDNMessages.count();
|
return pendingDNMessages.count();
|
||||||
}
|
}
|
||||||
|
/** Used by metrics. */
|
||||||
|
public long getNumTimedOutPendingReplications() {
|
||||||
|
return pendingReplications.getNumTimedOuts();
|
||||||
|
}
|
||||||
|
|
||||||
/**replicationRecheckInterval is how often namenode checks for new replication work*/
|
/**replicationRecheckInterval is how often namenode checks for new replication work*/
|
||||||
private final long replicationRecheckInterval;
|
private final long replicationRecheckInterval;
|
||||||
|
|
|
@ -50,6 +50,7 @@ class PendingReplicationBlocks {
|
||||||
private final ArrayList<BlockInfo> timedOutItems;
|
private final ArrayList<BlockInfo> timedOutItems;
|
||||||
Daemon timerThread = null;
|
Daemon timerThread = null;
|
||||||
private volatile boolean fsRunning = true;
|
private volatile boolean fsRunning = true;
|
||||||
|
private long timedOutCount = 0L;
|
||||||
|
|
||||||
//
|
//
|
||||||
// It might take anywhere between 5 to 10 minutes before
|
// It might take anywhere between 5 to 10 minutes before
|
||||||
|
@ -125,6 +126,7 @@ class PendingReplicationBlocks {
|
||||||
synchronized (pendingReplications) {
|
synchronized (pendingReplications) {
|
||||||
pendingReplications.clear();
|
pendingReplications.clear();
|
||||||
timedOutItems.clear();
|
timedOutItems.clear();
|
||||||
|
timedOutCount = 0L;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -148,6 +150,16 @@ class PendingReplicationBlocks {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used for metrics.
|
||||||
|
* @return The number of timeouts
|
||||||
|
*/
|
||||||
|
long getNumTimedOuts() {
|
||||||
|
synchronized (timedOutItems) {
|
||||||
|
return timedOutCount + timedOutItems.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a list of blocks that have timed out their
|
* Returns a list of blocks that have timed out their
|
||||||
* replication requests. Returns null if no blocks have
|
* replication requests. Returns null if no blocks have
|
||||||
|
@ -158,9 +170,11 @@ class PendingReplicationBlocks {
|
||||||
if (timedOutItems.size() <= 0) {
|
if (timedOutItems.size() <= 0) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
int size = timedOutItems.size();
|
||||||
BlockInfo[] blockList = timedOutItems.toArray(
|
BlockInfo[] blockList = timedOutItems.toArray(
|
||||||
new BlockInfo[timedOutItems.size()]);
|
new BlockInfo[size]);
|
||||||
timedOutItems.clear();
|
timedOutItems.clear();
|
||||||
|
timedOutCount += size;
|
||||||
return blockList;
|
return blockList;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -5155,6 +5155,11 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
|
||||||
return blockManager.getExcessBlocksCount();
|
return blockManager.getExcessBlocksCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Metric
|
||||||
|
public long getNumTimedOutPendingReplications() {
|
||||||
|
return blockManager.getNumTimedOutPendingReplications();
|
||||||
|
}
|
||||||
|
|
||||||
// HA-only metric
|
// HA-only metric
|
||||||
@Metric
|
@Metric
|
||||||
public long getPostponedMisreplicatedBlocks() {
|
public long getPostponedMisreplicatedBlocks() {
|
||||||
|
|
|
@ -18,6 +18,8 @@
|
||||||
package org.apache.hadoop.hdfs.server.blockmanagement;
|
package org.apache.hadoop.hdfs.server.blockmanagement;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.junit.Assert.assertNull;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -117,14 +119,15 @@ public class TestPendingReplication {
|
||||||
//
|
//
|
||||||
// verify that nothing has timed out so far
|
// verify that nothing has timed out so far
|
||||||
//
|
//
|
||||||
assertTrue(pendingReplications.getTimedOutBlocks() == null);
|
assertNull(pendingReplications.getTimedOutBlocks());
|
||||||
|
assertEquals(0L, pendingReplications.getNumTimedOuts());
|
||||||
|
|
||||||
//
|
//
|
||||||
// Wait for one second and then insert some more items.
|
// Wait for one second and then insert some more items.
|
||||||
//
|
//
|
||||||
try {
|
try {
|
||||||
Thread.sleep(1000);
|
Thread.sleep(1000);
|
||||||
} catch (Exception e) {
|
} catch (Exception ignored) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 10; i < 15; i++) {
|
for (int i = 10; i < 15; i++) {
|
||||||
|
@ -133,7 +136,8 @@ public class TestPendingReplication {
|
||||||
DatanodeStorageInfo.toDatanodeDescriptors(
|
DatanodeStorageInfo.toDatanodeDescriptors(
|
||||||
DFSTestUtil.createDatanodeStorageInfos(i)));
|
DFSTestUtil.createDatanodeStorageInfos(i)));
|
||||||
}
|
}
|
||||||
assertTrue(pendingReplications.size() == 15);
|
assertEquals(15, pendingReplications.size());
|
||||||
|
assertEquals(0L, pendingReplications.getNumTimedOuts());
|
||||||
|
|
||||||
//
|
//
|
||||||
// Wait for everything to timeout.
|
// Wait for everything to timeout.
|
||||||
|
@ -153,10 +157,14 @@ public class TestPendingReplication {
|
||||||
// Verify that everything has timed out.
|
// Verify that everything has timed out.
|
||||||
//
|
//
|
||||||
assertEquals("Size of pendingReplications ", 0, pendingReplications.size());
|
assertEquals("Size of pendingReplications ", 0, pendingReplications.size());
|
||||||
|
assertEquals(15L, pendingReplications.getNumTimedOuts());
|
||||||
Block[] timedOut = pendingReplications.getTimedOutBlocks();
|
Block[] timedOut = pendingReplications.getTimedOutBlocks();
|
||||||
assertTrue(timedOut != null && timedOut.length == 15);
|
assertNotNull(timedOut);
|
||||||
for (int i = 0; i < timedOut.length; i++) {
|
assertEquals(15, timedOut.length);
|
||||||
assertTrue(timedOut[i].getBlockId() < 15);
|
// Verify the number is not reset
|
||||||
|
assertEquals(15L, pendingReplications.getNumTimedOuts());
|
||||||
|
for (Block block : timedOut) {
|
||||||
|
assertTrue(block.getBlockId() < 15);
|
||||||
}
|
}
|
||||||
pendingReplications.stop();
|
pendingReplications.stop();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue