HDFS-16479. EC: NameNode should not send a reconstruction work when the source datanodes are insufficient (#4138)

(cherry picked from commit 2efab92959)
2022-04-14 11:23:38 +09:00 · 2022-04-14 11:23:38 +09:00 · 52abc9f132
parent 52c6d77274
commit 52abc9f132
2 changed files with 106 additions and 0 deletions
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@ -2118,6 +2118,16 @@ public class BlockManager implements BlockStatsMXBean {
      return null;
    }
    // skip if source datanodes for reconstructing ec block are not enough
    if (block.isStriped()) {
      BlockInfoStriped stripedBlock = (BlockInfoStriped) block;
      if (stripedBlock.getRealDataBlockNum() > srcNodes.length) {
        LOG.debug("Block {} cannot be reconstructed due to shortage of source datanodes ", block);
        NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled();
        return null;
      }
    }
    // liveReplicaNodes can include READ_ONLY_SHARED replicas which are
    // not included in the numReplicas.liveReplicas() count
    assert liveReplicaNodes.size() >= numReplicas.liveReplicas();
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
@ -852,6 +852,102 @@ public class TestBlockManager {
        0, numReplicas.redundantInternalBlocks());
  }
  /**
   * Schedules reconstruction for a striped block group that has four linked
   * storages, expects work to be returned, then raises the pending-replication
   * count on two of those nodes {@code bm.maxReplicationStreams} times each and
   * expects the next scheduling attempt to return {@code null}.
   */
  @Test
  public void testSkipReconstructionWithManyBusyNodes() {
    final long groupId = -9223372036854775776L; // real ec block id
    // RS-3-2 EC policy
    final ErasureCodingPolicy policy =
        SystemErasureCodingPolicies.getPolicies().get(1);

    // Block group sized as one cell per data unit: 3 data blocks + 2 parity blocks.
    final Block groupBlock =
        new Block(groupId, policy.getCellSize() * policy.getNumDataUnits(), 0);
    final BlockInfoStriped stripedGroup = new BlockInfoStriped(groupBlock, policy);

    // Only 4 storages are registered, so 1 internal block is missing.
    final DatanodeStorageInfo[] storages = {
        DFSTestUtil.createDatanodeStorageInfo(
            "storage1", "1.1.1.1", "rack1", "host1"),
        DFSTestUtil.createDatanodeStorageInfo(
            "storage2", "2.2.2.2", "rack2", "host2"),
        DFSTestUtil.createDatanodeStorageInfo(
            "storage3", "3.3.3.3", "rack3", "host3"),
        DFSTestUtil.createDatanodeStorageInfo(
            "storage4", "4.4.4.4", "rack4", "host4"),
    };

    // Link the storages: the first one reports the group block itself, the
    // remaining ones report the internal blocks with consecutive ids.
    stripedGroup.addStorage(storages[0], groupBlock);
    for (int idx = 1; idx < storages.length; idx++) {
      stripedGroup.addStorage(storages[idx], new Block(groupId + idx, 0, 0));
    }

    addEcBlockToBM(groupId, policy);
    stripedGroup.setBlockCollectionId(mockINodeId);

    // With every source node idle, reconstruction work must be produced.
    assertNotNull(bm.scheduleReconstruction(stripedGroup, 3));

    // Drive the last two nodes up to maxReplicationStreams pending replications.
    final DatanodeStorageInfo[] saturated = {storages[2], storages[3]};
    for (int round = 0; round < bm.maxReplicationStreams; round++) {
      for (DatanodeStorageInfo busy : saturated) {
        busy.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
      }
    }

    // Too few non-busy source nodes remain, so no work should be scheduled.
    assertNull(bm.scheduleReconstruction(stripedGroup, 3));
  }
  /**
   * Same scenario as the test above, but for a block group whose length covers
   * one data unit fewer than the policy allows. Scheduling is expected to keep
   * succeeding after one source node is saturated and to return {@code null}
   * after a second one is saturated as well.
   */
  @Test
  public void testSkipReconstructionWithManyBusyNodes2() {
    final long groupId = -9223372036854775776L; // real ec block id
    // RS-3-2 EC policy
    final ErasureCodingPolicy policy =
        SystemErasureCodingPolicies.getPolicies().get(1);

    // Block group sized one cell short of a full stripe:
    // 2 data blocks + 2 parity blocks.
    final Block groupBlock = new Block(groupId,
        policy.getCellSize() * (policy.getNumDataUnits() - 1), 0);
    final BlockInfoStriped stripedGroup = new BlockInfoStriped(groupBlock, policy);

    // Only 3 storages are registered, so 1 internal block is missing.
    final DatanodeStorageInfo first = DFSTestUtil.createDatanodeStorageInfo(
        "storage1", "1.1.1.1", "rack1", "host1");
    final DatanodeStorageInfo second = DFSTestUtil.createDatanodeStorageInfo(
        "storage2", "2.2.2.2", "rack2", "host2");
    final DatanodeStorageInfo third = DFSTestUtil.createDatanodeStorageInfo(
        "storage3", "3.3.3.3", "rack3", "host3");

    // Link each storage to its internal block.
    stripedGroup.addStorage(first, groupBlock);
    stripedGroup.addStorage(second, new Block(groupId + 1, 0, 0));
    stripedGroup.addStorage(third, new Block(groupId + 2, 0, 0));

    addEcBlockToBM(groupId, policy);
    stripedGroup.setBlockCollectionId(mockINodeId);

    // With every source node idle, reconstruction work must be produced.
    assertNotNull(bm.scheduleReconstruction(stripedGroup, 3));

    // Saturate one node up to maxReplicationStreams.
    for (int left = bm.maxReplicationStreams; left > 0; left--) {
      second.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
    }

    // Two source nodes are still free, which is enough to rebuild 2 data blocks.
    assertNotNull(bm.scheduleReconstruction(stripedGroup, 3));

    // Saturate one more node up to maxReplicationStreams.
    for (int left = bm.maxReplicationStreams; left > 0; left--) {
      third.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
    }

    // Too few non-busy source nodes remain, so no work should be scheduled.
    assertNull(bm.scheduleReconstruction(stripedGroup, 3));
  }
  @Test
  public void testFavorDecomUntilHardLimit() throws Exception {
    bm.maxReplicationStreams = 0;