HDFS-15795. EC: Wrong checksum when reconstruction was failed by exception. Contributed by Yushi Hayasaka (#2657)

(cherry picked from commit 18978f2e20)
This commit is contained in:
crossfire 2021-02-02 18:02:09 +09:00 committed by S O'Donnell
parent f97709beaa
commit e0f8462b39
4 changed files with 48 additions and 3 deletions

View File

@ -480,8 +480,9 @@ final class BlockChecksumHelper {
// Before populating the blockChecksum at this index, record the byte
// offset where it will begin.
blockChecksumPositions[idx] = blockChecksumBuf.getLength();
ExtendedBlock block = null;
try {
ExtendedBlock block = getInternalBlock(numDataUnits, idx);
block = getInternalBlock(numDataUnits, idx);
LiveBlockInfo liveBlkInfo = liveDns.get((byte) idx);
if (liveBlkInfo == null) {
@ -502,7 +503,9 @@ final class BlockChecksumHelper {
break; // done with the computation, simply return.
}
} catch (IOException e) {
LOG.warn("Failed to get the checksum", e);
LOG.warn("Failed to get the checksum for block {} at index {} "
+ "in blockGroup {}", block, idx, blockGroup, e);
throw e;
}
}

View File

@ -96,6 +96,12 @@ public class DataNodeFaultInjector {
*/
public void stripedBlockReconstruction() throws IOException {}
/**
 * Used as a hook to inject failure in the erasure coding checksum
 * reconstruction process.
 *
 * <p>This default implementation does nothing; a test can install an
 * injector that overrides (or mocks) this method to throw instead.
 *
 * @throws IOException never thrown by this default implementation; declared
 *     so that a replacement injector can simulate a failure.
 */
public void stripedBlockChecksumReconstruction() throws IOException {}
/**
* Used as a hook to inject latency when read block
* in erasure coding reconstruction process.

View File

@ -23,6 +23,7 @@ import java.nio.ByteBuffer;
import java.util.Arrays;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
import org.apache.hadoop.io.DataOutputBuffer;
/**
@ -75,6 +76,7 @@ public abstract class StripedBlockChecksumReconstructor
prepareDigester();
long maxTargetLength = getMaxTargetLength();
while (requestedLen > 0 && getPositionInBlock() < maxTargetLength) {
DataNodeFaultInjector.get().stripedBlockChecksumReconstruction();
long remaining = maxTargetLength - getPositionInBlock();
final int toReconstructLen = (int) Math
.min(getStripedReader().getBufferSize(), remaining);

View File

@ -27,6 +27,7 @@ import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
import org.apache.hadoop.test.GenericTestUtils;
import org.junit.After;
import org.junit.Assert;
@ -43,6 +44,8 @@ import java.io.IOException;
import java.util.Random;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;
/**
* This test serves as a prototype to demo the idea proposed so far. It creates two
@ -534,6 +537,37 @@ public class TestFileChecksum {
bytesPerCRC - 1);
}
/**
 * Checks that the checksum of a striped file is unchanged when the first
 * call to the checksum-reconstruction fault hook throws an
 * {@link IOException}.
 *
 * <p>A baseline checksum is taken first. A mocked
 * {@link DataNodeFaultInjector} is then installed whose
 * {@code stripedBlockChecksumReconstruction()} throws on its first call and
 * does nothing afterwards, and the checksum is requested again. The two
 * checksums must be equal. The original injector is always restored.
 */
@Test(timeout = 90000)
public void testStripedFileChecksumWithReconstructFail()
    throws Exception {
  final String targetFile = ecDir + "/stripedFileChecksum4";
  prepareTestFiles(fileSize, new String[] {targetFile});

  // Baseline checksum, taken before any fault is injected.
  final FileChecksum expectedChecksum =
      getFileChecksum(targetFile, -1, false);

  // Remember the current injector so it can be put back afterwards.
  final DataNodeFaultInjector savedInjector = DataNodeFaultInjector.get();

  // The mocked hook fails on its first invocation only.
  final DataNodeFaultInjector failingInjector =
      mock(DataNodeFaultInjector.class);
  doThrow(new IOException())
      .doNothing()
      .when(failingInjector)
      .stripedBlockChecksumReconstruction();
  DataNodeFaultInjector.set(failingInjector);

  try {
    // Request the checksum again, this time with reconstruction.
    // If the reconstruction task fails, the client tries to get the
    // checksum from another DN that holds a block of the same block group,
    // because it failed to get a result from the first one.
    final FileChecksum actualChecksum =
        getFileChecksum(targetFile, -1, true);
    Assert.assertEquals("checksum should be same", expectedChecksum,
        actualChecksum);
  } finally {
    DataNodeFaultInjector.set(savedInjector);
  }
}
@Test(timeout = 90000)
public void testMixedBytesPerChecksum() throws Exception {
int fileLength = bytesPerCRC * 3;