HDFS-15795. EC: Wrong checksum when reconstruction was failed by exception. Contributed by Yushi Hayasaka (#2657)

This commit is contained in:
crossfire 2021-02-02 18:02:09 +09:00 committed by GitHub
parent 9bf2ac07bb
commit 18978f2e20
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 48 additions and 3 deletions

View File

@ -480,8 +480,9 @@ final class BlockChecksumHelper {
// Before populating the blockChecksum at this index, record the byte // Before populating the blockChecksum at this index, record the byte
// offset where it will begin. // offset where it will begin.
blockChecksumPositions[idx] = blockChecksumBuf.getLength(); blockChecksumPositions[idx] = blockChecksumBuf.getLength();
ExtendedBlock block = null;
try { try {
ExtendedBlock block = getInternalBlock(numDataUnits, idx); block = getInternalBlock(numDataUnits, idx);
LiveBlockInfo liveBlkInfo = liveDns.get((byte) idx); LiveBlockInfo liveBlkInfo = liveDns.get((byte) idx);
if (liveBlkInfo == null) { if (liveBlkInfo == null) {
@ -502,7 +503,9 @@ final class BlockChecksumHelper {
break; // done with the computation, simply return. break; // done with the computation, simply return.
} }
} catch (IOException e) { } catch (IOException e) {
LOG.warn("Failed to get the checksum", e); LOG.warn("Failed to get the checksum for block {} at index {} "
+ "in blockGroup {}", block, idx, blockGroup, e);
throw e;
} }
} }

View File

@ -106,6 +106,12 @@ public class DataNodeFaultInjector {
*/ */
public void stripedBlockReconstruction() throws IOException {} public void stripedBlockReconstruction() throws IOException {}
/**
 * Used as a hook to inject failure in erasure coding checksum reconstruction
 * process.
 *
 * <p>This default implementation is a no-op; a test can install an injector
 * that overrides (or mocks) this method to make it throw.
 *
 * @throws IOException never thrown by this default implementation; declared
 *     so that a replacement injector can fail the caller with an I/O error
 */
public void stripedBlockChecksumReconstruction() throws IOException {}
/** /**
* Used as a hook to inject latency when read block * Used as a hook to inject latency when read block
* in erasure coding reconstruction process. * in erasure coding reconstruction process.

View File

@ -23,6 +23,7 @@ import java.nio.ByteBuffer;
import java.util.Arrays; import java.util.Arrays;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.DataOutputBuffer;
/** /**
@ -75,6 +76,7 @@ public abstract class StripedBlockChecksumReconstructor
prepareDigester(); prepareDigester();
long maxTargetLength = getMaxTargetLength(); long maxTargetLength = getMaxTargetLength();
while (requestedLen > 0 && getPositionInBlock() < maxTargetLength) { while (requestedLen > 0 && getPositionInBlock() < maxTargetLength) {
DataNodeFaultInjector.get().stripedBlockChecksumReconstruction();
long remaining = maxTargetLength - getPositionInBlock(); long remaining = maxTargetLength - getPositionInBlock();
final int toReconstructLen = (int) Math final int toReconstructLen = (int) Math
.min(getStripedReader().getBufferSize(), remaining); .min(getStripedReader().getBufferSize(), remaining);
@ -225,4 +227,4 @@ public abstract class StripedBlockChecksumReconstructor
getStripedReader().close(); getStripedReader().close();
cleanup(); cleanup();
} }
} }

View File

@ -28,6 +28,7 @@ import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy;
import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.GenericTestUtils;
import org.junit.After; import org.junit.After;
import org.junit.Assert; import org.junit.Assert;
@ -46,6 +47,8 @@ import java.io.IOException;
import java.util.Random; import java.util.Random;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
/** /**
* This test serves a prototype to demo the idea proposed so far. It creates two * This test serves a prototype to demo the idea proposed so far. It creates two
@ -517,6 +520,37 @@ public class TestFileChecksum {
bytesPerCRC - 1); bytesPerCRC - 1);
} }
/**
 * Checks that the checksum of a striped file is unchanged when the checksum
 * reconstruction hook fails once with an {@link IOException}.
 *
 * <p>A baseline checksum is taken first. A mocked
 * {@link DataNodeFaultInjector} is then installed whose
 * {@code stripedBlockChecksumReconstruction()} throws on the first call and
 * does nothing afterwards, and the checksum is requested again. The original
 * injector is always restored.
 */
@Test(timeout = 90000)
public void testStripedFileChecksumWithReconstructFail()
    throws Exception {
  final String targetFile = ecDir + "/stripedFileChecksum4";
  prepareTestFiles(fileSize, new String[] {targetFile});

  // Baseline checksum, taken before any fault is injected.
  final FileChecksum expected = getFileChecksum(targetFile, -1, false);

  // Swap in an injector that fails the first reconstruction attempt only.
  final DataNodeFaultInjector savedInjector = DataNodeFaultInjector.get();
  final DataNodeFaultInjector failOnceInjector =
      mock(DataNodeFaultInjector.class);
  doThrow(new IOException())
      .doNothing()
      .when(failOnceInjector)
      .stripedBlockChecksumReconstruction();
  DataNodeFaultInjector.set(failOnceInjector);

  try {
    // Request the checksum again, this time with reconstruction.
    // When the reconstruction task fails, the client fails to get a result
    // and tries another DN that holds a block of the same block group, so
    // the checksum is still expected to match the baseline.
    final FileChecksum actual = getFileChecksum(targetFile, -1, true);
    Assert.assertEquals("checksum should be same", expected, actual);
  } finally {
    // Restore the original injector regardless of the outcome.
    DataNodeFaultInjector.set(savedInjector);
  }
}
@Test(timeout = 90000) @Test(timeout = 90000)
public void testMixedBytesPerChecksum() throws Exception { public void testMixedBytesPerChecksum() throws Exception {
int fileLength = bytesPerCRC * 3; int fileLength = bytesPerCRC * 3;