HDFS-11160. VolumeScanner reports write-in-progress replicas as corrupt incorrectly. Contributed by Wei-Chiu Chuang and Yongjun Zhang.
This commit is contained in:
parent
6fce191549
commit
aebb9127ba
|
@ -64,7 +64,15 @@ public class BlockScanner {
|
||||||
/**
|
/**
|
||||||
* The scanner configuration.
|
* The scanner configuration.
|
||||||
*/
|
*/
|
||||||
private final Conf conf;
|
private Conf conf;
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
void setConf(Conf conf) {
|
||||||
|
this.conf = conf;
|
||||||
|
for (Entry<String, VolumeScanner> entry : scanners.entrySet()) {
|
||||||
|
entry.getValue().setConf(conf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The cached scanner configuration.
|
* The cached scanner configuration.
|
||||||
|
|
|
@ -241,13 +241,22 @@ class BlockSender implements java.io.Closeable {
|
||||||
"If verifying checksum, currently must also send it.");
|
"If verifying checksum, currently must also send it.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if there is an append write happening right after the BlockSender
|
||||||
|
// is constructed, the last partial checksum may be overwritten by the
|
||||||
|
// append, so the BlockSender needs to use the partial checksum from before
|
||||||
|
// the append write.
|
||||||
|
ChunkChecksum chunkChecksum = null;
|
||||||
final long replicaVisibleLength;
|
final long replicaVisibleLength;
|
||||||
try(AutoCloseableLock lock = datanode.data.acquireDatasetLock()) {
|
try(AutoCloseableLock lock = datanode.data.acquireDatasetLock()) {
|
||||||
replica = getReplica(block, datanode);
|
replica = getReplica(block, datanode);
|
||||||
replicaVisibleLength = replica.getVisibleLength();
|
replicaVisibleLength = replica.getVisibleLength();
|
||||||
|
if (replica instanceof FinalizedReplica) {
|
||||||
|
// Load last checksum in case the replica is being written
|
||||||
|
// concurrently
|
||||||
|
final FinalizedReplica frep = (FinalizedReplica) replica;
|
||||||
|
chunkChecksum = frep.getLastChecksumAndDataLen();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// if there is a write in progress
|
|
||||||
ChunkChecksum chunkChecksum = null;
|
|
||||||
if (replica.getState() == ReplicaState.RBW) {
|
if (replica.getState() == ReplicaState.RBW) {
|
||||||
final ReplicaInPipeline rbw = (ReplicaInPipeline) replica;
|
final ReplicaInPipeline rbw = (ReplicaInPipeline) replica;
|
||||||
waitForMinLength(rbw, startOffset + length);
|
waitForMinLength(rbw, startOffset + length);
|
||||||
|
@ -547,7 +556,6 @@ class BlockSender implements java.io.Closeable {
|
||||||
if (lastDataPacket && lastChunkChecksum != null) {
|
if (lastDataPacket && lastChunkChecksum != null) {
|
||||||
int start = checksumOff + checksumDataLen - checksumSize;
|
int start = checksumOff + checksumDataLen - checksumSize;
|
||||||
byte[] updatedChecksum = lastChunkChecksum.getChecksum();
|
byte[] updatedChecksum = lastChunkChecksum.getChecksum();
|
||||||
|
|
||||||
if (updatedChecksum != null) {
|
if (updatedChecksum != null) {
|
||||||
System.arraycopy(updatedChecksum, 0, buf, start, checksumSize);
|
System.arraycopy(updatedChecksum, 0, buf, start, checksumSize);
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,8 @@
|
||||||
package org.apache.hadoop.hdfs.server.datanode;
|
package org.apache.hadoop.hdfs.server.datanode;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.hadoop.hdfs.protocol.Block;
|
import org.apache.hadoop.hdfs.protocol.Block;
|
||||||
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
|
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
|
||||||
|
@ -113,4 +115,31 @@ public class FinalizedReplica extends LocalReplica {
|
||||||
throw new UnsupportedOperationException("Replica of type " + getState() +
|
throw new UnsupportedOperationException("Replica of type " + getState() +
|
||||||
" does not support createInfo");
|
" does not support createInfo");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the last chunk checksum and the length of the block corresponding
|
||||||
|
* to that checksum.
|
||||||
|
* Note: must be called with the FsDataset lock acquired. May improve to
|
||||||
|
* lock only the FsVolume in the future.
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public ChunkChecksum getLastChecksumAndDataLen() throws IOException {
|
||||||
|
ChunkChecksum chunkChecksum = null;
|
||||||
|
try {
|
||||||
|
byte[] lastChecksum = getVolume().loadLastPartialChunkChecksum(
|
||||||
|
getBlockFile(), getMetaFile());
|
||||||
|
if (lastChecksum != null) {
|
||||||
|
chunkChecksum =
|
||||||
|
new ChunkChecksum(getVisibleLength(), lastChecksum);
|
||||||
|
}
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
// meta file is lost. Try to continue anyway.
|
||||||
|
DataNode.LOG.warn("meta file " + getMetaFile() +
|
||||||
|
" is missing!");
|
||||||
|
} catch (IOException ioe) {
|
||||||
|
DataNode.LOG.warn("Unable to read checksum from meta file " +
|
||||||
|
getMetaFile(), ioe);
|
||||||
|
}
|
||||||
|
return chunkChecksum;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,7 +69,12 @@ public class VolumeScanner extends Thread {
|
||||||
/**
|
/**
|
||||||
* The configuration.
|
* The configuration.
|
||||||
*/
|
*/
|
||||||
private final Conf conf;
|
private Conf conf;
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
void setConf(Conf conf) {
|
||||||
|
this.conf = conf;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The DataNode this VolumeScanner is associated with.
|
* The DataNode this VolumeScanner is associated with.
|
||||||
|
@ -429,6 +434,7 @@ public class VolumeScanner extends Thread {
|
||||||
if (block == null) {
|
if (block == null) {
|
||||||
return -1; // block not found.
|
return -1; // block not found.
|
||||||
}
|
}
|
||||||
|
LOG.debug("start scanning block {}", block);
|
||||||
BlockSender blockSender = null;
|
BlockSender blockSender = null;
|
||||||
try {
|
try {
|
||||||
blockSender = new BlockSender(block, 0, -1,
|
blockSender = new BlockSender(block, 0, -1,
|
||||||
|
@ -610,6 +616,7 @@ public class VolumeScanner extends Thread {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (timeout > 0) {
|
if (timeout > 0) {
|
||||||
|
LOG.debug("{}: wait for {} milliseconds", this, timeout);
|
||||||
wait(timeout);
|
wait(timeout);
|
||||||
if (stopping) {
|
if (stopping) {
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -401,6 +401,17 @@ public interface FsVolumeSpi
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load last partial chunk checksum from checksum file.
|
||||||
|
* Must be called with the FsDataset lock acquired.
|
||||||
|
* @param blockFile
|
||||||
|
* @param metaFile
|
||||||
|
* @return the last partial checksum
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
byte[] loadLastPartialChunkChecksum(File blockFile, File metaFile)
|
||||||
|
throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compile a list of {@link ScanInfo} for the blocks in
|
* Compile a list of {@link ScanInfo} for the blocks in
|
||||||
* the block pool with id {@code bpid}.
|
* the block pool with id {@code bpid}.
|
||||||
|
|
|
@ -1119,7 +1119,8 @@ public class FsVolumeImpl implements FsVolumeSpi {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private byte[] loadLastPartialChunkChecksum(
|
@Override
|
||||||
|
public byte[] loadLastPartialChunkChecksum(
|
||||||
File blockFile, File metaFile) throws IOException {
|
File blockFile, File metaFile) throws IOException {
|
||||||
// readHeader closes the temporary FileInputStream.
|
// readHeader closes the temporary FileInputStream.
|
||||||
DataChecksum dcs = BlockMetadataHeader
|
DataChecksum dcs = BlockMetadataHeader
|
||||||
|
@ -1135,13 +1136,22 @@ public class FsVolumeImpl implements FsVolumeSpi {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
int offsetInChecksum = BlockMetadataHeader.getHeaderSize() +
|
long offsetInChecksum = BlockMetadataHeader.getHeaderSize() +
|
||||||
(int)(onDiskLen / bytesPerChecksum * checksumSize);
|
(onDiskLen / bytesPerChecksum) * checksumSize;
|
||||||
byte[] lastChecksum = new byte[checksumSize];
|
byte[] lastChecksum = new byte[checksumSize];
|
||||||
try (RandomAccessFile raf = fileIoProvider.getRandomAccessFile(
|
try (RandomAccessFile raf = fileIoProvider.getRandomAccessFile(
|
||||||
this, metaFile, "r")) {
|
this, metaFile, "r")) {
|
||||||
raf.seek(offsetInChecksum);
|
raf.seek(offsetInChecksum);
|
||||||
raf.read(lastChecksum, 0, checksumSize);
|
int readBytes = raf.read(lastChecksum, 0, checksumSize);
|
||||||
|
if (readBytes == -1) {
|
||||||
|
throw new IOException("Expected to read " + checksumSize +
|
||||||
|
" bytes from offset " + offsetInChecksum +
|
||||||
|
" but reached end of file.");
|
||||||
|
} else if (readBytes != checksumSize) {
|
||||||
|
throw new IOException("Expected to read " + checksumSize +
|
||||||
|
" bytes from offset " + offsetInChecksum + " but read " +
|
||||||
|
readBytes + " bytes.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return lastChecksum;
|
return lastChecksum;
|
||||||
}
|
}
|
||||||
|
|
|
@ -556,6 +556,12 @@ public class SimulatedFSDataset implements FsDatasetSpi<FsVolumeSpi> {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] loadLastPartialChunkChecksum(
|
||||||
|
File blockFile, File metaFile) throws IOException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public LinkedList<ScanInfo> compileReport(String bpid,
|
public LinkedList<ScanInfo> compileReport(String bpid,
|
||||||
LinkedList<ScanInfo> report, ReportCompiler reportCompiler)
|
LinkedList<ScanInfo> report, ReportCompiler reportCompiler)
|
||||||
|
|
|
@ -36,8 +36,12 @@ import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.Semaphore;
|
import java.util.concurrent.Semaphore;
|
||||||
|
import java.util.concurrent.TimeoutException;
|
||||||
|
|
||||||
import com.google.common.base.Supplier;
|
import com.google.common.base.Supplier;
|
||||||
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.hdfs.AppendTestUtil;
|
||||||
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
||||||
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
|
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
|
||||||
import org.apache.hadoop.hdfs.server.datanode.FsDatasetTestUtils.MaterializedReplica;
|
import org.apache.hadoop.hdfs.server.datanode.FsDatasetTestUtils.MaterializedReplica;
|
||||||
|
@ -870,4 +874,100 @@ public class TestBlockScanner {
|
||||||
}
|
}
|
||||||
info.sem.release(1);
|
info.sem.release(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test concurrent append and scan.
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
@Test(timeout=120000)
|
||||||
|
public void testAppendWhileScanning() throws Exception {
|
||||||
|
GenericTestUtils.setLogLevel(DataNode.LOG, Level.ALL);
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
// throttle the block scanner: 1MB per second
|
||||||
|
conf.setLong(DFS_BLOCK_SCANNER_VOLUME_BYTES_PER_SECOND, 1048576);
|
||||||
|
// Set a really long scan period.
|
||||||
|
conf.setLong(DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, 100L);
|
||||||
|
conf.set(INTERNAL_VOLUME_SCANNER_SCAN_RESULT_HANDLER,
|
||||||
|
TestScanResultHandler.class.getName());
|
||||||
|
conf.setLong(INTERNAL_DFS_BLOCK_SCANNER_CURSOR_SAVE_INTERVAL_MS, 0L);
|
||||||
|
final int numExpectedFiles = 1;
|
||||||
|
final int numExpectedBlocks = 1;
|
||||||
|
final int numNameServices = 1;
|
||||||
|
// the initial file length can not be too small.
|
||||||
|
// Otherwise checksum file stream buffer will be pre-filled and
|
||||||
|
// BlockSender will not see the updated checksum.
|
||||||
|
final int initialFileLength = 2*1024*1024+100;
|
||||||
|
final TestContext ctx = new TestContext(conf, numNameServices);
|
||||||
|
// create one file, with one block.
|
||||||
|
ctx.createFiles(0, numExpectedFiles, initialFileLength);
|
||||||
|
final TestScanResultHandler.Info info =
|
||||||
|
TestScanResultHandler.getInfo(ctx.volumes.get(0));
|
||||||
|
String storageID = ctx.volumes.get(0).getStorageID();
|
||||||
|
synchronized (info) {
|
||||||
|
info.sem = new Semaphore(numExpectedBlocks*2);
|
||||||
|
info.shouldRun = true;
|
||||||
|
info.notify();
|
||||||
|
}
|
||||||
|
// VolumeScanner scans the first block when DN starts.
|
||||||
|
// Due to throttler, this should take approximately 2 seconds.
|
||||||
|
waitForRescan(info, numExpectedBlocks);
|
||||||
|
|
||||||
|
// update throttler to schedule rescan immediately.
|
||||||
|
// this number must be larger than initial file length, otherwise
|
||||||
|
// throttler prevents immediate rescan.
|
||||||
|
conf.setLong(DFS_BLOCK_SCANNER_VOLUME_BYTES_PER_SECOND,
|
||||||
|
initialFileLength+32*1024);
|
||||||
|
BlockScanner.Conf newConf = new BlockScanner.Conf(conf);
|
||||||
|
ctx.datanode.getBlockScanner().setConf(newConf);
|
||||||
|
// schedule the first block for scanning
|
||||||
|
ExtendedBlock first = ctx.getFileBlock(0, 0);
|
||||||
|
ctx.datanode.getBlockScanner().markSuspectBlock(storageID, first);
|
||||||
|
|
||||||
|
// append the file before VolumeScanner completes scanning the block,
|
||||||
|
// which takes approximately 2 seconds to complete.
|
||||||
|
FileSystem fs = ctx.cluster.getFileSystem();
|
||||||
|
FSDataOutputStream os = fs.append(ctx.getPath(0));
|
||||||
|
long seed = -1;
|
||||||
|
int size = 200;
|
||||||
|
final byte[] bytes = AppendTestUtil.randomBytes(seed, size);
|
||||||
|
os.write(bytes);
|
||||||
|
os.hflush();
|
||||||
|
os.close();
|
||||||
|
fs.close();
|
||||||
|
|
||||||
|
// verify that volume scanner does not find bad blocks after append.
|
||||||
|
waitForRescan(info, numExpectedBlocks);
|
||||||
|
|
||||||
|
GenericTestUtils.setLogLevel(DataNode.LOG, Level.INFO);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void waitForRescan(final TestScanResultHandler.Info info,
|
||||||
|
final int numExpectedBlocks)
|
||||||
|
throws TimeoutException, InterruptedException {
|
||||||
|
LOG.info("Waiting for the first 1 blocks to be scanned.");
|
||||||
|
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||||
|
@Override
|
||||||
|
public Boolean get() {
|
||||||
|
synchronized (info) {
|
||||||
|
if (info.blocksScanned >= numExpectedBlocks) {
|
||||||
|
LOG.info("info = {}. blockScanned has now reached 1.", info);
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
LOG.info("info = {}. Waiting for blockScanned to reach 1.", info);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, 1000, 30000);
|
||||||
|
|
||||||
|
synchronized (info) {
|
||||||
|
assertEquals("Expected 1 good block.",
|
||||||
|
numExpectedBlocks, info.goodBlocks.size());
|
||||||
|
info.goodBlocks.clear();
|
||||||
|
assertEquals("Expected 1 blocksScanned",
|
||||||
|
numExpectedBlocks, info.blocksScanned);
|
||||||
|
assertEquals("Did not expect bad blocks.", 0, info.badBlocks.size());
|
||||||
|
info.blocksScanned = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -898,6 +898,12 @@ public class TestDirectoryScanner {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] loadLastPartialChunkChecksum(
|
||||||
|
File blockFile, File metaFile) throws IOException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public LinkedList<ScanInfo> compileReport(String bpid,
|
public LinkedList<ScanInfo> compileReport(String bpid,
|
||||||
LinkedList<ScanInfo> report, ReportCompiler reportCompiler)
|
LinkedList<ScanInfo> report, ReportCompiler reportCompiler)
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
|
|
||||||
package org.apache.hadoop.hdfs.server.datanode.extdataset;
|
package org.apache.hadoop.hdfs.server.datanode.extdataset;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.nio.channels.ClosedChannelException;
|
import java.nio.channels.ClosedChannelException;
|
||||||
|
@ -108,6 +109,12 @@ public class ExternalVolumeImpl implements FsVolumeSpi {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] loadLastPartialChunkChecksum(
|
||||||
|
File blockFile, File metaFile) throws IOException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public LinkedList<ScanInfo> compileReport(String bpid,
|
public LinkedList<ScanInfo> compileReport(String bpid,
|
||||||
LinkedList<ScanInfo> report, ReportCompiler reportCompiler)
|
LinkedList<ScanInfo> report, ReportCompiler reportCompiler)
|
||||||
|
|
Loading…
Reference in New Issue