HDFS-15795. EC: Wrong checksum when reconstruction was failed by exception. Contributed by Yushi Hayasaka (#2657)

(cherry picked from commit 18978f2e20)
crossfire 2021-02-02 18:02:09 +09:00 committed by S O'Donnell
parent f97709beaa
commit e0f8462b39
4 changed files with 48 additions and 3 deletions

View File

@@ -480,8 +480,9 @@ void compute() throws IOException {
       // Before populating the blockChecksum at this index, record the byte
       // offset where it will begin.
       blockChecksumPositions[idx] = blockChecksumBuf.getLength();
+      ExtendedBlock block = null;
       try {
-        ExtendedBlock block = getInternalBlock(numDataUnits, idx);
+        block = getInternalBlock(numDataUnits, idx);
         LiveBlockInfo liveBlkInfo = liveDns.get((byte) idx);
         if (liveBlkInfo == null) {
@@ -502,7 +503,9 @@ void compute() throws IOException {
           break; // done with the computation, simply return.
         }
       } catch (IOException e) {
-        LOG.warn("Failed to get the checksum", e);
+        LOG.warn("Failed to get the checksum for block {} at index {} "
+            + "in blockGroup {}", block, idx, blockGroup, e);
+        throw e;
       }
     }
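Why this matters: before the change, the catch block only logged the IOException, so compute() kept going and could hand back a block-group checksum that silently omitted the failed internal block's contribution. Rethrowing makes the whole checksum request fail on this datanode, so the client can fall back to another one. Below is a minimal sketch of the two behaviors; all names are hypothetical stand-ins, not the Hadoop classes.

```java
import java.io.IOException;
import java.util.zip.CRC32;

/**
 * Minimal sketch of the bug pattern fixed above. All names here are
 * hypothetical stand-ins, not the Hadoop classes.
 */
public class SwallowVsRethrow {

  /**
   * Combines per-block data into one checksum, the way compute()
   * combines per-block checksums into a block-group checksum.
   */
  static long checksumAll(byte[][] blocks, boolean rethrow) throws IOException {
    CRC32 digest = new CRC32();
    for (int idx = 0; idx < blocks.length; idx++) {
      try {
        digest.update(fetch(blocks, idx));
      } catch (IOException e) {
        System.err.println("Failed to get the checksum at index " + idx);
        if (rethrow) {
          throw e; // after the fix: fail fast so the caller can retry elsewhere
        }
        // before the fix: swallow and continue -> silently wrong result
      }
    }
    return digest.getValue();
  }

  static byte[] fetch(byte[][] blocks, int idx) throws IOException {
    if (blocks[idx] == null) {
      throw new IOException("block " + idx + " unavailable");
    }
    return blocks[idx];
  }

  public static void main(String[] args) throws IOException {
    byte[][] healthy = {{1, 2, 3}, {4, 5, 6}};
    byte[][] broken = {{1, 2, 3}, null};
    System.out.println(checksumAll(healthy, true)); // correct checksum
    System.out.println(checksumAll(broken, false)); // wrong, but no error raised
  }
}
```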

View File

@@ -96,6 +96,12 @@ public void throwTooManyOpenFiles() throws FileNotFoundException {
    */
   public void stripedBlockReconstruction() throws IOException {}
 
+  /**
+   * Used as a hook to inject a failure into the erasure coding checksum
+   * reconstruction process.
+   */
+  public void stripedBlockChecksumReconstruction() throws IOException {}
+
   /**
    * Used as a hook to inject latency when read block
    * in erasure coding reconstruction process.
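The new method follows the existing DataNodeFaultInjector pattern: a process-wide, swappable singleton whose hooks are no-ops in production, so a test can substitute an instance that throws at a marked code path. A stripped-down sketch of that pattern, with hypothetical names rather than the Hadoop class:

```java
import java.io.IOException;

/** Stripped-down sketch of the fault-injector pattern; hypothetical names. */
public class FaultInjector {
  private static FaultInjector instance = new FaultInjector();

  /** Production code fetches the current injector at each hook site. */
  public static FaultInjector get() {
    return instance;
  }

  /** Tests swap in their own instance, and restore the old one afterwards. */
  public static void set(FaultInjector injector) {
    instance = injector;
  }

  /** No-op in production; a test can override or stub this to throw. */
  public void beforeChecksumReconstruction() throws IOException {}
}
```

A test installs a throwing instance with set() and restores the original in a finally block, which is exactly what the new test further down does.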

View File

@@ -23,6 +23,7 @@
 import java.util.Arrays;
 
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
 import org.apache.hadoop.io.DataOutputBuffer;
 
 /**
@@ -75,6 +76,7 @@ public void reconstruct() throws IOException {
     prepareDigester();
     long maxTargetLength = getMaxTargetLength();
     while (requestedLen > 0 && getPositionInBlock() < maxTargetLength) {
+      DataNodeFaultInjector.get().stripedBlockChecksumReconstruction();
       long remaining = maxTargetLength - getPositionInBlock();
       final int toReconstructLen = (int) Math
           .min(getStripedReader().getBufferSize(), remaining);
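Note that the hook fires at the top of the loop, before any data for the current chunk is decoded or digested, so an injected exception aborts the checksum-reconstruction task on its very first iteration. A rough sketch of that control flow, with hypothetical names mirroring the loop above:

```java
import java.io.IOException;

/** Rough sketch of the reconstruction loop's control flow; hypothetical names. */
class ReconstructLoopSketch {

  interface Hook {
    void fire() throws IOException;
  }

  /** Returns the number of chunks processed, or throws on an injected failure. */
  static int reconstruct(long targetLength, int bufferSize, Hook hook)
      throws IOException {
    long positionInBlock = 0;
    int chunks = 0;
    while (positionInBlock < targetLength) {
      hook.fire(); // injection point: runs before the chunk is reconstructed
      long remaining = targetLength - positionInBlock;
      int toReconstructLen = (int) Math.min(bufferSize, remaining);
      positionInBlock += toReconstructLen; // stand-in for decode + digest of one chunk
      chunks++;
    }
    return chunks;
  }
}
```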

View File

@@ -27,6 +27,7 @@
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.junit.After;
 import org.junit.Assert;
@@ -43,6 +44,8 @@
 import java.util.Random;
 
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.mock;
 
 /**
  * This test serves a prototype to demo the idea proposed so far. It creates two
@@ -534,6 +537,37 @@ public void testStripedFileChecksumWithMissedDataBlocksRangeQuery20()
         bytesPerCRC - 1);
   }
 
+  @Test(timeout = 90000)
+  public void testStripedFileChecksumWithReconstructFail()
+      throws Exception {
+    String stripedFile4 = ecDir + "/stripedFileChecksum4";
+    prepareTestFiles(fileSize, new String[] {stripedFile4});
+
+    // Get the checksum while all blocks are healthy.
+    FileChecksum fileChecksum = getFileChecksum(stripedFile4, -1, false);
+
+    DataNodeFaultInjector oldInjector = DataNodeFaultInjector.get();
+    DataNodeFaultInjector newInjector = mock(DataNodeFaultInjector.class);
+    doThrow(new IOException())
+        .doNothing()
+        .when(newInjector)
+        .stripedBlockChecksumReconstruction();
+    DataNodeFaultInjector.set(newInjector);
+
+    try {
+      // Get the checksum again, this time with reconstruction. If the
+      // reconstruction task fails, the client tries to get the checksum from
+      // another DN that has a block of the block group, because it failed to
+      // get the result from the first one.
+      FileChecksum fileChecksum1 = getFileChecksum(stripedFile4, -1, true);
+
+      Assert.assertEquals("checksums should be the same", fileChecksum,
+          fileChecksum1);
+    } finally {
+      DataNodeFaultInjector.set(oldInjector);
+    }
+  }
+
   @Test(timeout = 90000)
   public void testMixedBytesPerChecksum() throws Exception {
     int fileLength = bytesPerCRC * 3;
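A note on the stubbing in the new test: Mockito applies consecutive stubs to consecutive calls, so doThrow(...).doNothing() makes the hook throw exactly once and then become a no-op. The first checksum-with-reconstruction attempt therefore fails, and the client's retry against another datanode succeeds, which is what the assertion checks. A standalone sketch of just that stubbing behavior, assuming mockito-core on the classpath:

```java
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;

import java.io.IOException;
import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;

/**
 * Standalone illustration of the stubbing used in the test above:
 * consecutive doThrow()/doNothing() stubs apply to consecutive calls,
 * so the hook fails exactly once and then becomes a no-op.
 */
public class InjectorStubDemo {
  public static void main(String[] args) throws IOException {
    DataNodeFaultInjector injector = mock(DataNodeFaultInjector.class);
    doThrow(new IOException("injected"))
        .doNothing()
        .when(injector)
        .stripedBlockChecksumReconstruction();

    try {
      injector.stripedBlockChecksumReconstruction(); // first call: throws
    } catch (IOException expected) {
      System.out.println("first call failed as stubbed: " + expected.getMessage());
    }
    injector.stripedBlockChecksumReconstruction();   // second call: no-op
    System.out.println("second call succeeded");
  }
}
```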