HDFS-8224. Schedule a block for scanning if its metadata file is corrupt. Contributed by Rushabh S Shah.
This commit is contained in:
parent
7992c0b42c
commit
d00d3add9e
|
@ -122,8 +122,8 @@ public class DataChecksum implements Checksum {
|
||||||
int bpc = in.readInt();
|
int bpc = in.readInt();
|
||||||
DataChecksum summer = newDataChecksum(Type.valueOf(type), bpc );
|
DataChecksum summer = newDataChecksum(Type.valueOf(type), bpc );
|
||||||
if ( summer == null ) {
|
if ( summer == null ) {
|
||||||
throw new IOException( "Could not create DataChecksum of type " +
|
throw new InvalidChecksumSizeException("Could not create DataChecksum "
|
||||||
type + " with bytesPerChecksum " + bpc );
|
+ "of type " + type + " with bytesPerChecksum " + bpc);
|
||||||
}
|
}
|
||||||
return summer;
|
return summer;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
/**
|
||||||
|
* Thrown when bytesPerChecksun field in the meta file is less than
|
||||||
|
* or equal to 0 or type is invalid.
|
||||||
|
**/
|
||||||
|
public class InvalidChecksumSizeException extends IOException {
|
||||||
|
|
||||||
|
private static final long serialVersionUID = 1L;
|
||||||
|
|
||||||
|
public InvalidChecksumSizeException(String s) {
|
||||||
|
super(s);
|
||||||
|
}
|
||||||
|
}
|
|
@ -207,6 +207,7 @@ import org.apache.hadoop.util.Daemon;
|
||||||
import org.apache.hadoop.util.DiskChecker;
|
import org.apache.hadoop.util.DiskChecker;
|
||||||
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
|
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
|
||||||
import org.apache.hadoop.util.GenericOptionsParser;
|
import org.apache.hadoop.util.GenericOptionsParser;
|
||||||
|
import org.apache.hadoop.util.InvalidChecksumSizeException;
|
||||||
import org.apache.hadoop.util.JvmPauseMonitor;
|
import org.apache.hadoop.util.JvmPauseMonitor;
|
||||||
import org.apache.hadoop.util.ServicePlugin;
|
import org.apache.hadoop.util.ServicePlugin;
|
||||||
import org.apache.hadoop.util.StringUtils;
|
import org.apache.hadoop.util.StringUtils;
|
||||||
|
@ -348,7 +349,7 @@ public class DataNode extends ReconfigurableBase
|
||||||
BlockPoolTokenSecretManager blockPoolTokenSecretManager;
|
BlockPoolTokenSecretManager blockPoolTokenSecretManager;
|
||||||
private boolean hasAnyBlockPoolRegistered = false;
|
private boolean hasAnyBlockPoolRegistered = false;
|
||||||
|
|
||||||
private final BlockScanner blockScanner;
|
private BlockScanner blockScanner;
|
||||||
private DirectoryScanner directoryScanner = null;
|
private DirectoryScanner directoryScanner = null;
|
||||||
|
|
||||||
/** Activated plug-ins. */
|
/** Activated plug-ins. */
|
||||||
|
@ -2140,7 +2141,8 @@ public class DataNode extends ReconfigurableBase
|
||||||
LOG.warn(msg);
|
LOG.warn(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void transferBlock(ExtendedBlock block, DatanodeInfo[] xferTargets,
|
@VisibleForTesting
|
||||||
|
void transferBlock(ExtendedBlock block, DatanodeInfo[] xferTargets,
|
||||||
StorageType[] xferTargetStorageTypes) throws IOException {
|
StorageType[] xferTargetStorageTypes) throws IOException {
|
||||||
BPOfferService bpos = getBPOSForBlock(block);
|
BPOfferService bpos = getBPOSForBlock(block);
|
||||||
DatanodeRegistration bpReg = getDNRegistrationForBP(block.getBlockPoolId());
|
DatanodeRegistration bpReg = getDNRegistrationForBP(block.getBlockPoolId());
|
||||||
|
@ -2425,6 +2427,13 @@ public class DataNode extends ReconfigurableBase
|
||||||
metrics.incrBlocksReplicated();
|
metrics.incrBlocksReplicated();
|
||||||
}
|
}
|
||||||
} catch (IOException ie) {
|
} catch (IOException ie) {
|
||||||
|
if (ie instanceof InvalidChecksumSizeException) {
|
||||||
|
// Add the block to the front of the scanning queue if metadata file
|
||||||
|
// is corrupt. We already add the block to front of scanner if the
|
||||||
|
// peer disconnects.
|
||||||
|
LOG.info("Adding block: " + b + " for scanning");
|
||||||
|
blockScanner.markSuspectBlock(data.getVolume(b).getStorageID(), b);
|
||||||
|
}
|
||||||
LOG.warn(bpReg + ":Failed to transfer " + b + " to " +
|
LOG.warn(bpReg + ":Failed to transfer " + b + " to " +
|
||||||
targets[0] + " got ", ie);
|
targets[0] + " got ", ie);
|
||||||
// check if there are any disk problem
|
// check if there are any disk problem
|
||||||
|
@ -3459,4 +3468,9 @@ public class DataNode extends ReconfigurableBase
|
||||||
DiskBalancerException.Result.UNKNOWN_KEY);
|
DiskBalancerException.Result.UNKNOWN_KEY);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
void setBlockScanner(BlockScanner blockScanner) {
|
||||||
|
this.blockScanner = blockScanner;
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -23,10 +23,12 @@ import static org.junit.Assert.assertTrue;
|
||||||
import java.io.DataOutputStream;
|
import java.io.DataOutputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.RandomAccessFile;
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
import java.net.Socket;
|
import java.net.Socket;
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.FileUtil;
|
import org.apache.hadoop.fs.FileUtil;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
@ -37,11 +39,13 @@ import org.apache.hadoop.hdfs.DFSTestUtil;
|
||||||
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
|
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
|
||||||
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
||||||
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
|
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
|
||||||
import org.apache.hadoop.hdfs.protocol.datatransfer.BlockConstructionStage;
|
import org.apache.hadoop.hdfs.protocol.datatransfer.BlockConstructionStage;
|
||||||
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
|
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
|
||||||
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
|
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
|
||||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
|
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
|
||||||
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
|
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
||||||
|
@ -50,6 +54,7 @@ import org.apache.hadoop.util.Time;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test that datanodes can correctly handle errors during block read/write.
|
* Test that datanodes can correctly handle errors during block read/write.
|
||||||
|
@ -223,4 +228,54 @@ public class TestDiskError {
|
||||||
long lastDiskErrorCheck = dataNode.getLastDiskErrorCheck();
|
long lastDiskErrorCheck = dataNode.getLastDiskErrorCheck();
|
||||||
assertTrue("Disk Error check is not performed within " + dataNode.checkDiskErrorInterval + " ms", ((Time.monotonicNow()-lastDiskErrorCheck) < (dataNode.checkDiskErrorInterval + slackTime)));
|
assertTrue("Disk Error check is not performed within " + dataNode.checkDiskErrorInterval + " ms", ((Time.monotonicNow()-lastDiskErrorCheck) < (dataNode.checkDiskErrorInterval + slackTime)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDataTransferWhenBytesPerChecksumIsZero() throws IOException {
|
||||||
|
DataNode dn0 = cluster.getDataNodes().get(0);
|
||||||
|
// Make a mock blockScanner class and return false whenever isEnabled is
|
||||||
|
// called on blockScanner
|
||||||
|
BlockScanner mockScanner = Mockito.mock(BlockScanner.class);
|
||||||
|
Mockito.when(mockScanner.isEnabled()).thenReturn(false);
|
||||||
|
dn0.setBlockScanner(mockScanner);
|
||||||
|
Path filePath = new Path("test.dat");
|
||||||
|
FSDataOutputStream out = fs.create(filePath, (short) 1);
|
||||||
|
out.write(1);
|
||||||
|
out.hflush();
|
||||||
|
out.close();
|
||||||
|
// Corrupt the metadata file. Insert all 0's in the type and
|
||||||
|
// bytesPerChecksum files of the metadata header.
|
||||||
|
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath);
|
||||||
|
File metadataFile = cluster.getBlockMetadataFile(0, block);
|
||||||
|
RandomAccessFile raFile = new RandomAccessFile(metadataFile, "rw");
|
||||||
|
raFile.seek(2);
|
||||||
|
raFile.writeByte(0);
|
||||||
|
raFile.writeInt(0);
|
||||||
|
raFile.close();
|
||||||
|
String datanodeId0 = dn0.getDatanodeUuid();
|
||||||
|
LocatedBlock lb = DFSTestUtil.getAllBlocks(fs, filePath).get(0);
|
||||||
|
String storageId = lb.getStorageIDs()[0];
|
||||||
|
cluster.startDataNodes(conf, 1, true, null, null);
|
||||||
|
DataNode dn1 = null;
|
||||||
|
for (int i = 0; i < cluster.getDataNodes().size(); i++) {
|
||||||
|
if (!cluster.getDataNodes().get(i).equals(datanodeId0)) {
|
||||||
|
dn1 = cluster.getDataNodes().get(i);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DatanodeDescriptor dnd1 =
|
||||||
|
NameNodeAdapter.getDatanode(cluster.getNamesystem(),
|
||||||
|
dn1.getDatanodeId());
|
||||||
|
|
||||||
|
dn0.transferBlock(block, new DatanodeInfo[]{dnd1},
|
||||||
|
new StorageType[]{StorageType.DISK});
|
||||||
|
// Sleep for 1 second so the DataTrasnfer daemon can start transfer.
|
||||||
|
try {
|
||||||
|
Thread.sleep(1000);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
// Do nothing
|
||||||
|
}
|
||||||
|
Mockito.verify(mockScanner).markSuspectBlock(Mockito.eq(storageId),
|
||||||
|
Mockito.eq(block));
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue