HDFS-10477. Stopping the decommission of a rack of DataNodes caused the NameNode to fail over to standby. Contributed by yunjiong zhao and Wei-Chiu Chuang.

This commit is contained in:
Wei-Chiu Chuang 2019-04-10 14:36:07 -07:00
parent 66ac003128
commit 8af767b069
1 changed file with 35 additions and 10 deletions

View File

@ -3623,19 +3623,44 @@ public class BlockManager implements BlockStatsMXBean {
if (!isPopulatingReplQueues()) { if (!isPopulatingReplQueues()) {
return; return;
} }
final Iterator<BlockInfo> it = srcNode.getBlockIterator();
int numOverReplicated = 0; int numOverReplicated = 0;
for (DatanodeStorageInfo datanodeStorageInfo : srcNode.getStorageInfos()) {
// the namesystem lock is released between iterations. Make sure the
// storage is not removed before continuing.
if (srcNode.getStorageInfo(datanodeStorageInfo.getStorageID()) == null) {
continue;
}
final Iterator<BlockInfo> it = datanodeStorageInfo.getBlockIterator();
while(it.hasNext()) { while(it.hasNext()) {
final BlockInfo block = it.next(); final BlockInfo block = it.next();
short expectedReplication = getExpectedReplicaNum(block); if (block.isDeleted()) {
//Orphan block, will be handled eventually, skip
continue;
}
short expectedReplication = this.getExpectedReplicaNum(block);
NumberReplicas num = countNodes(block); NumberReplicas num = countNodes(block);
int numCurrentReplica = num.liveReplicas(); int numCurrentReplica = num.liveReplicas();
if (numCurrentReplica > expectedReplication) { if (numCurrentReplica > expectedReplication) {
// over-replicated block // over-replicated block
processOverReplicatedBlock(block, expectedReplication, null, null); processOverReplicatedBlock(block, expectedReplication, null,
null);
numOverReplicated++; numOverReplicated++;
} }
} }
// Tests such as TestDefaultBlockPlacementPolicy.
// testPlacementWithLocalRackNodesDecommissioned call this method without
// holding the lock; the write lock is held only when the caller is
// DatanodeManager.refreshNodes.
if (namesystem.hasWriteLock()) {
namesystem.writeUnlock();
try {
Thread.sleep(1);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
namesystem.writeLock();
}
}
LOG.info("Invalidated " + numOverReplicated + " over-replicated blocks on " + LOG.info("Invalidated " + numOverReplicated + " over-replicated blocks on " +
srcNode + " during recommissioning"); srcNode + " during recommissioning");
} }