HDFS-6425. Large postponedMisreplicatedBlocks has impact on blockReport latency. Contributed by Ming Ma.
This commit is contained in:
parent
07bb0b0bbb
commit
b7923a356e
|
@ -595,6 +595,9 @@ Release 2.7.0 - UNRELEASED
|
||||||
|
|
||||||
HDFS-7516. Fix findbugs warnings in hdfs-nfs project. (brandonli)
|
HDFS-7516. Fix findbugs warnings in hdfs-nfs project. (brandonli)
|
||||||
|
|
||||||
|
HDFS-6425. Large postponedMisreplicatedBlocks has impact on blockReport
|
||||||
|
latency. (Ming Ma via kihwal)
|
||||||
|
|
||||||
Release 2.6.1 - UNRELEASED
|
Release 2.6.1 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -325,6 +325,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
|
||||||
public static final String DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY = "dfs.namenode.write.stale.datanode.ratio";
|
public static final String DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_KEY = "dfs.namenode.write.stale.datanode.ratio";
|
||||||
public static final float DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_DEFAULT = 0.5f;
|
public static final float DFS_NAMENODE_USE_STALE_DATANODE_FOR_WRITE_RATIO_DEFAULT = 0.5f;
|
||||||
|
|
||||||
|
// Number of blocks to rescan for each iteration of postponedMisreplicatedBlocks.
|
||||||
|
public static final String DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY = "dfs.namenode.blocks.per.postponedblocks.rescan";
|
||||||
|
public static final long DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY_DEFAULT = 10000;
|
||||||
|
|
||||||
// Replication monitoring related keys
|
// Replication monitoring related keys
|
||||||
public static final String DFS_NAMENODE_INVALIDATE_WORK_PCT_PER_ITERATION =
|
public static final String DFS_NAMENODE_INVALIDATE_WORK_PCT_PER_ITERATION =
|
||||||
"dfs.namenode.invalidate.work.pct.per.iteration";
|
"dfs.namenode.invalidate.work.pct.per.iteration";
|
||||||
|
|
|
@ -1050,21 +1050,6 @@ public class BlockManager {
|
||||||
|
|
||||||
node.resetBlocks();
|
node.resetBlocks();
|
||||||
invalidateBlocks.remove(node);
|
invalidateBlocks.remove(node);
|
||||||
|
|
||||||
// If the DN hasn't block-reported since the most recent
|
|
||||||
// failover, then we may have been holding up on processing
|
|
||||||
// over-replicated blocks because of it. But we can now
|
|
||||||
// process those blocks.
|
|
||||||
boolean stale = false;
|
|
||||||
for(DatanodeStorageInfo storage : node.getStorageInfos()) {
|
|
||||||
if (storage.areBlockContentsStale()) {
|
|
||||||
stale = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (stale) {
|
|
||||||
rescanPostponedMisreplicatedBlocks();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Remove the blocks associated to the given DatanodeStorageInfo. */
|
/** Remove the blocks associated to the given DatanodeStorageInfo. */
|
||||||
|
@ -1818,17 +1803,7 @@ public class BlockManager {
|
||||||
invalidatedBlocks = processReport(storageInfo, newReport);
|
invalidatedBlocks = processReport(storageInfo, newReport);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now that we have an up-to-date block report, we know that any
|
|
||||||
// deletions from a previous NN iteration have been accounted for.
|
|
||||||
boolean staleBefore = storageInfo.areBlockContentsStale();
|
|
||||||
storageInfo.receivedBlockReport();
|
storageInfo.receivedBlockReport();
|
||||||
if (staleBefore && !storageInfo.areBlockContentsStale()) {
|
|
||||||
LOG.info("BLOCK* processReport: Received first block report from "
|
|
||||||
+ storage + " after starting up or becoming active. Its block "
|
|
||||||
+ "contents are no longer considered stale");
|
|
||||||
rescanPostponedMisreplicatedBlocks();
|
|
||||||
}
|
|
||||||
|
|
||||||
} finally {
|
} finally {
|
||||||
endTime = Time.now();
|
endTime = Time.now();
|
||||||
namesystem.writeUnlock();
|
namesystem.writeUnlock();
|
||||||
|
@ -1857,31 +1832,74 @@ public class BlockManager {
|
||||||
/**
|
/**
|
||||||
* Rescan the list of blocks which were previously postponed.
|
* Rescan the list of blocks which were previously postponed.
|
||||||
*/
|
*/
|
||||||
private void rescanPostponedMisreplicatedBlocks() {
|
void rescanPostponedMisreplicatedBlocks() {
|
||||||
for (Iterator<Block> it = postponedMisreplicatedBlocks.iterator();
|
if (getPostponedMisreplicatedBlocksCount() == 0) {
|
||||||
it.hasNext();) {
|
return;
|
||||||
Block b = it.next();
|
}
|
||||||
|
long startTimeRescanPostponedMisReplicatedBlocks = Time.now();
|
||||||
BlockInfo bi = blocksMap.getStoredBlock(b);
|
long startPostponedMisReplicatedBlocksCount =
|
||||||
if (bi == null) {
|
getPostponedMisreplicatedBlocksCount();
|
||||||
|
namesystem.writeLock();
|
||||||
|
try {
|
||||||
|
// blocksPerRescan is the configured number of blocks per rescan.
|
||||||
|
// Randomly select blocksPerRescan consecutive blocks from the HashSet
|
||||||
|
// when the number of blocks remaining is larger than blocksPerRescan.
|
||||||
|
// The reason we don't always pick the first blocksPerRescan blocks is to
|
||||||
|
// handle the case if for some reason some datanodes remain in
|
||||||
|
// content stale state for a long time and only impact the first
|
||||||
|
// blocksPerRescan blocks.
|
||||||
|
int i = 0;
|
||||||
|
long startIndex = 0;
|
||||||
|
long blocksPerRescan =
|
||||||
|
datanodeManager.getBlocksPerPostponedMisreplicatedBlocksRescan();
|
||||||
|
long base = getPostponedMisreplicatedBlocksCount() - blocksPerRescan;
|
||||||
|
if (base > 0) {
|
||||||
|
startIndex = DFSUtil.getRandom().nextLong() % (base+1);
|
||||||
|
if (startIndex < 0) {
|
||||||
|
startIndex += (base+1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Iterator<Block> it = postponedMisreplicatedBlocks.iterator();
|
||||||
|
for (int tmp = 0; tmp < startIndex; tmp++) {
|
||||||
|
it.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (;it.hasNext(); i++) {
|
||||||
|
Block b = it.next();
|
||||||
|
if (i >= blocksPerRescan) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
BlockInfo bi = blocksMap.getStoredBlock(b);
|
||||||
|
if (bi == null) {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
|
||||||
|
"Postponed mis-replicated block " + b + " no longer found " +
|
||||||
|
"in block map.");
|
||||||
|
}
|
||||||
|
it.remove();
|
||||||
|
postponedMisreplicatedBlocksCount.decrementAndGet();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
MisReplicationResult res = processMisReplicatedBlock(bi);
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
|
LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
|
||||||
"Postponed mis-replicated block " + b + " no longer found " +
|
"Re-scanned block " + b + ", result is " + res);
|
||||||
"in block map.");
|
}
|
||||||
|
if (res != MisReplicationResult.POSTPONE) {
|
||||||
|
it.remove();
|
||||||
|
postponedMisreplicatedBlocksCount.decrementAndGet();
|
||||||
}
|
}
|
||||||
it.remove();
|
|
||||||
postponedMisreplicatedBlocksCount.decrementAndGet();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
MisReplicationResult res = processMisReplicatedBlock(bi);
|
|
||||||
if (LOG.isDebugEnabled()) {
|
|
||||||
LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
|
|
||||||
"Re-scanned block " + b + ", result is " + res);
|
|
||||||
}
|
|
||||||
if (res != MisReplicationResult.POSTPONE) {
|
|
||||||
it.remove();
|
|
||||||
postponedMisreplicatedBlocksCount.decrementAndGet();
|
|
||||||
}
|
}
|
||||||
|
} finally {
|
||||||
|
namesystem.writeUnlock();
|
||||||
|
long endPostponedMisReplicatedBlocksCount =
|
||||||
|
getPostponedMisreplicatedBlocksCount();
|
||||||
|
LOG.info("Rescan of postponedMisreplicatedBlocks completed in " +
|
||||||
|
(Time.now() - startTimeRescanPostponedMisReplicatedBlocks) +
|
||||||
|
" msecs. " + endPostponedMisReplicatedBlocksCount +
|
||||||
|
" blocks are left. " + (startPostponedMisReplicatedBlocksCount -
|
||||||
|
endPostponedMisReplicatedBlocksCount) + " blocks are removed.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3580,6 +3598,7 @@ public class BlockManager {
|
||||||
if (namesystem.isPopulatingReplQueues()) {
|
if (namesystem.isPopulatingReplQueues()) {
|
||||||
computeDatanodeWork();
|
computeDatanodeWork();
|
||||||
processPendingReplications();
|
processPendingReplications();
|
||||||
|
rescanPostponedMisreplicatedBlocks();
|
||||||
}
|
}
|
||||||
Thread.sleep(replicationRecheckInterval);
|
Thread.sleep(replicationRecheckInterval);
|
||||||
} catch (Throwable t) {
|
} catch (Throwable t) {
|
||||||
|
@ -3648,6 +3667,8 @@ public class BlockManager {
|
||||||
excessReplicateMap.clear();
|
excessReplicateMap.clear();
|
||||||
invalidateBlocks.clear();
|
invalidateBlocks.clear();
|
||||||
datanodeManager.clearPendingQueues();
|
datanodeManager.clearPendingQueues();
|
||||||
|
postponedMisreplicatedBlocks.clear();
|
||||||
|
postponedMisreplicatedBlocksCount.set(0);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -133,13 +133,18 @@ public class DatanodeManager {
|
||||||
* writing to stale datanodes, i.e., continue using stale nodes for writing.
|
* writing to stale datanodes, i.e., continue using stale nodes for writing.
|
||||||
*/
|
*/
|
||||||
private final float ratioUseStaleDataNodesForWrite;
|
private final float ratioUseStaleDataNodesForWrite;
|
||||||
|
|
||||||
/** The number of stale DataNodes */
|
/** The number of stale DataNodes */
|
||||||
private volatile int numStaleNodes;
|
private volatile int numStaleNodes;
|
||||||
|
|
||||||
/** The number of stale storages */
|
/** The number of stale storages */
|
||||||
private volatile int numStaleStorages;
|
private volatile int numStaleStorages;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of blocks to check for each postponedMisreplicatedBlocks iteration
|
||||||
|
*/
|
||||||
|
private final long blocksPerPostponedMisreplicatedBlocksRescan;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Whether or not this cluster has ever consisted of more than 1 rack,
|
* Whether or not this cluster has ever consisted of more than 1 rack,
|
||||||
* according to the NetworkTopology.
|
* according to the NetworkTopology.
|
||||||
|
@ -259,6 +264,9 @@ public class DatanodeManager {
|
||||||
this.timeBetweenResendingCachingDirectivesMs = conf.getLong(
|
this.timeBetweenResendingCachingDirectivesMs = conf.getLong(
|
||||||
DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS,
|
DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS,
|
||||||
DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS_DEFAULT);
|
DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_RETRY_INTERVAL_MS_DEFAULT);
|
||||||
|
this.blocksPerPostponedMisreplicatedBlocksRescan = conf.getLong(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY_DEFAULT);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static long getStaleIntervalFromConf(Configuration conf,
|
private static long getStaleIntervalFromConf(Configuration conf,
|
||||||
|
@ -1133,6 +1141,10 @@ public class DatanodeManager {
|
||||||
* ratioUseStaleDataNodesForWrite);
|
* ratioUseStaleDataNodesForWrite);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public long getBlocksPerPostponedMisreplicatedBlocksRescan() {
|
||||||
|
return blocksPerPostponedMisreplicatedBlocksRescan;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return The time interval used to mark DataNodes as stale.
|
* @return The time interval used to mark DataNodes as stale.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -2253,4 +2253,12 @@
|
||||||
</description>
|
</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>dfs.namenode.blocks.per.postponedblocks.rescan</name>
|
||||||
|
<value>10000</value>
|
||||||
|
<description>Number of blocks to rescan for each iteration of
|
||||||
|
postponedMisreplicatedBlocks.
|
||||||
|
</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
</configuration>
|
</configuration>
|
||||||
|
|
|
@ -238,6 +238,14 @@ public class BlockManagerTestUtil {
|
||||||
return dn.updateStorage(s);
|
return dn.updateStorage(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call heartbeat check function of HeartbeatManager
|
||||||
|
* @param bm the BlockManager to manipulate
|
||||||
|
*/
|
||||||
|
public static void rescanPostponedMisreplicatedBlocks(BlockManager bm) {
|
||||||
|
bm.rescanPostponedMisreplicatedBlocks();
|
||||||
|
}
|
||||||
|
|
||||||
public static DatanodeDescriptor getLocalDatanodeDescriptor(
|
public static DatanodeDescriptor getLocalDatanodeDescriptor(
|
||||||
boolean initializeStorage) {
|
boolean initializeStorage) {
|
||||||
DatanodeDescriptor dn = new DatanodeDescriptor(DFSTestUtil.getLocalDatanodeID());
|
DatanodeDescriptor dn = new DatanodeDescriptor(DFSTestUtil.getLocalDatanodeID());
|
||||||
|
|
|
@ -165,7 +165,12 @@ public class TestDNFencing {
|
||||||
|
|
||||||
banner("Metadata after nodes have all block-reported");
|
banner("Metadata after nodes have all block-reported");
|
||||||
doMetasave(nn2);
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
// Force a rescan of postponedMisreplicatedBlocks.
|
||||||
|
BlockManager nn2BM = nn2.getNamesystem().getBlockManager();
|
||||||
|
BlockManagerTestUtil.checkHeartbeat(nn2BM);
|
||||||
|
BlockManagerTestUtil.rescanPostponedMisreplicatedBlocks(nn2BM);
|
||||||
|
|
||||||
// The blocks should no longer be postponed.
|
// The blocks should no longer be postponed.
|
||||||
assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
||||||
|
|
||||||
|
@ -251,7 +256,12 @@ public class TestDNFencing {
|
||||||
|
|
||||||
banner("Metadata after nodes have all block-reported");
|
banner("Metadata after nodes have all block-reported");
|
||||||
doMetasave(nn2);
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
// Force a rescan of postponedMisreplicatedBlocks.
|
||||||
|
BlockManager nn2BM = nn2.getNamesystem().getBlockManager();
|
||||||
|
BlockManagerTestUtil.checkHeartbeat(nn2BM);
|
||||||
|
BlockManagerTestUtil.rescanPostponedMisreplicatedBlocks(nn2BM);
|
||||||
|
|
||||||
// The block should no longer be postponed.
|
// The block should no longer be postponed.
|
||||||
assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
||||||
|
|
||||||
|
@ -347,6 +357,11 @@ public class TestDNFencing {
|
||||||
banner("Metadata after nodes have all block-reported");
|
banner("Metadata after nodes have all block-reported");
|
||||||
doMetasave(nn2);
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
// Force a rescan of postponedMisreplicatedBlocks.
|
||||||
|
BlockManager nn2BM = nn2.getNamesystem().getBlockManager();
|
||||||
|
BlockManagerTestUtil.checkHeartbeat(nn2BM);
|
||||||
|
BlockManagerTestUtil.rescanPostponedMisreplicatedBlocks(nn2BM);
|
||||||
|
|
||||||
// The block should no longer be postponed.
|
// The block should no longer be postponed.
|
||||||
assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
||||||
|
|
||||||
|
|
|
@ -109,6 +109,10 @@ public class TestDNFencingWithReplication {
|
||||||
HAStressTestHarness harness = new HAStressTestHarness();
|
HAStressTestHarness harness = new HAStressTestHarness();
|
||||||
harness.conf.setInt(
|
harness.conf.setInt(
|
||||||
DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000);
|
DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000);
|
||||||
|
harness.conf.setInt(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY, 1);
|
||||||
|
harness.conf.setInt(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 1);
|
||||||
|
|
||||||
final MiniDFSCluster cluster = harness.startCluster();
|
final MiniDFSCluster cluster = harness.startCluster();
|
||||||
try {
|
try {
|
||||||
|
|
Loading…
Reference in New Issue