From e0f8462b39a8ac8b2517d2e09fc04f5744e518f5 Mon Sep 17 00:00:00 2001 From: crossfire Date: Tue, 2 Feb 2021 18:02:09 +0900 Subject: [PATCH] HDFS-15795. EC: Wrong checksum when reconstruction was failed by exception. Contributed by Yushi Hayasaka (#2657) (cherry picked from commit 18978f2e204d105fb05807d33387a048e9ddb762) --- .../server/datanode/BlockChecksumHelper.java | 7 ++-- .../datanode/DataNodeFaultInjector.java | 6 ++++ .../StripedBlockChecksumReconstructor.java | 4 ++- .../apache/hadoop/hdfs/TestFileChecksum.java | 34 +++++++++++++++++++ 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockChecksumHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockChecksumHelper.java index 13681e55712..1895b449c69 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockChecksumHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockChecksumHelper.java @@ -480,8 +480,9 @@ final class BlockChecksumHelper { // Before populating the blockChecksum at this index, record the byte // offset where it will begin. blockChecksumPositions[idx] = blockChecksumBuf.getLength(); + ExtendedBlock block = null; try { - ExtendedBlock block = getInternalBlock(numDataUnits, idx); + block = getInternalBlock(numDataUnits, idx); LiveBlockInfo liveBlkInfo = liveDns.get((byte) idx); if (liveBlkInfo == null) { @@ -502,7 +503,9 @@ final class BlockChecksumHelper { break; // done with the computation, simply return. } } catch (IOException e) { - LOG.warn("Failed to get the checksum", e); + LOG.warn("Failed to get the checksum for block {} at index {} " + + "in blockGroup {}", block, idx, blockGroup, e); + throw e; } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java index b55d7939f6e..03d0585de3f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java @@ -96,6 +96,12 @@ public class DataNodeFaultInjector { */ public void stripedBlockReconstruction() throws IOException {} + /** + * Used as a hook to inject failure in erasure coding checksum reconstruction + * process. + */ + public void stripedBlockChecksumReconstruction() throws IOException {} + /** * Used as a hook to inject latency when read block * in erasure coding reconstruction process. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java index a600626f124..e28d6c556b8 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/erasurecode/StripedBlockChecksumReconstructor.java @@ -23,6 +23,7 @@ import java.nio.ByteBuffer; import java.util.Arrays; import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector; import org.apache.hadoop.io.DataOutputBuffer; /** @@ -75,6 +76,7 @@ public abstract class StripedBlockChecksumReconstructor prepareDigester(); long maxTargetLength = getMaxTargetLength(); while (requestedLen > 0 && getPositionInBlock() < maxTargetLength) { + DataNodeFaultInjector.get().stripedBlockChecksumReconstruction(); long remaining = maxTargetLength - getPositionInBlock(); final int toReconstructLen = (int) Math .min(getStripedReader().getBufferSize(), remaining); @@ -225,4 +227,4 @@ public abstract class StripedBlockChecksumReconstructor getStripedReader().close(); cleanup(); } -} \ No newline at end of file +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java index 0ff2d4b3cda..83ac946ac7d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestFileChecksum.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector; import org.apache.hadoop.test.GenericTestUtils; import org.junit.After; import org.junit.Assert; @@ -43,6 +44,8 @@ import java.io.IOException; import java.util.Random; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; /** * This test serves a prototype to demo the idea proposed so far. It creates two @@ -534,6 +537,37 @@ public class TestFileChecksum { bytesPerCRC - 1); } + @Test(timeout = 90000) + public void testStripedFileChecksumWithReconstructFail() + throws Exception { + String stripedFile4 = ecDir + "/stripedFileChecksum4"; + prepareTestFiles(fileSize, new String[] {stripedFile4}); + + // get checksum + FileChecksum fileChecksum = getFileChecksum(stripedFile4, -1, false); + + DataNodeFaultInjector oldInjector = DataNodeFaultInjector.get(); + DataNodeFaultInjector newInjector = mock(DataNodeFaultInjector.class); + doThrow(new IOException()) + .doNothing() + .when(newInjector) + .stripedBlockChecksumReconstruction(); + DataNodeFaultInjector.set(newInjector); + + try { + // Get checksum again with reconstruction. + // If the reconstruction task fails, a client try to get checksum from + // another DN which has a block of the block group because of a failure of + // getting result. + FileChecksum fileChecksum1 = getFileChecksum(stripedFile4, -1, true); + + Assert.assertEquals("checksum should be same", fileChecksum, + fileChecksum1); + } finally { + DataNodeFaultInjector.set(oldInjector); + } + } + @Test(timeout = 90000) public void testMixedBytesPerChecksum() throws Exception { int fileLength = bytesPerCRC * 3;