HDFS-8224. Schedule a block for scanning if its metadata file is corrupt. Contributed by Rushabh S Shah.

This commit is contained in:
Wei-Chiu Chuang 2016-08-09 18:11:34 -07:00
parent 7992c0b42c
commit d00d3add9e
4 changed files with 106 additions and 5 deletions

View File

@ -122,8 +122,8 @@ public class DataChecksum implements Checksum {
int bpc = in.readInt(); int bpc = in.readInt();
DataChecksum summer = newDataChecksum(Type.valueOf(type), bpc ); DataChecksum summer = newDataChecksum(Type.valueOf(type), bpc );
if ( summer == null ) { if ( summer == null ) {
throw new IOException( "Could not create DataChecksum of type " + throw new InvalidChecksumSizeException("Could not create DataChecksum "
type + " with bytesPerChecksum " + bpc ); + "of type " + type + " with bytesPerChecksum " + bpc);
} }
return summer; return summer;
} }

View File

@ -0,0 +1,32 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.util;
import java.io.IOException;
/**
* Thrown when bytesPerChecksun field in the meta file is less than
* or equal to 0 or type is invalid.
**/
public class InvalidChecksumSizeException extends IOException {
private static final long serialVersionUID = 1L;
public InvalidChecksumSizeException(String s) {
super(s);
}
}

View File

@ -207,6 +207,7 @@ import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.DiskChecker; import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.DiskChecker.DiskErrorException; import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.InvalidChecksumSizeException;
import org.apache.hadoop.util.JvmPauseMonitor; import org.apache.hadoop.util.JvmPauseMonitor;
import org.apache.hadoop.util.ServicePlugin; import org.apache.hadoop.util.ServicePlugin;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
@ -348,7 +349,7 @@ public class DataNode extends ReconfigurableBase
BlockPoolTokenSecretManager blockPoolTokenSecretManager; BlockPoolTokenSecretManager blockPoolTokenSecretManager;
private boolean hasAnyBlockPoolRegistered = false; private boolean hasAnyBlockPoolRegistered = false;
private final BlockScanner blockScanner; private BlockScanner blockScanner;
private DirectoryScanner directoryScanner = null; private DirectoryScanner directoryScanner = null;
/** Activated plug-ins. */ /** Activated plug-ins. */
@ -2140,7 +2141,8 @@ public class DataNode extends ReconfigurableBase
LOG.warn(msg); LOG.warn(msg);
} }
private void transferBlock(ExtendedBlock block, DatanodeInfo[] xferTargets, @VisibleForTesting
void transferBlock(ExtendedBlock block, DatanodeInfo[] xferTargets,
StorageType[] xferTargetStorageTypes) throws IOException { StorageType[] xferTargetStorageTypes) throws IOException {
BPOfferService bpos = getBPOSForBlock(block); BPOfferService bpos = getBPOSForBlock(block);
DatanodeRegistration bpReg = getDNRegistrationForBP(block.getBlockPoolId()); DatanodeRegistration bpReg = getDNRegistrationForBP(block.getBlockPoolId());
@ -2425,6 +2427,13 @@ public class DataNode extends ReconfigurableBase
metrics.incrBlocksReplicated(); metrics.incrBlocksReplicated();
} }
} catch (IOException ie) { } catch (IOException ie) {
if (ie instanceof InvalidChecksumSizeException) {
// Add the block to the front of the scanning queue if metadata file
// is corrupt. We already add the block to front of scanner if the
// peer disconnects.
LOG.info("Adding block: " + b + " for scanning");
blockScanner.markSuspectBlock(data.getVolume(b).getStorageID(), b);
}
LOG.warn(bpReg + ":Failed to transfer " + b + " to " + LOG.warn(bpReg + ":Failed to transfer " + b + " to " +
targets[0] + " got ", ie); targets[0] + " got ", ie);
// check if there are any disk problem // check if there are any disk problem
@ -3459,4 +3468,9 @@ public class DataNode extends ReconfigurableBase
DiskBalancerException.Result.UNKNOWN_KEY); DiskBalancerException.Result.UNKNOWN_KEY);
} }
} }
@VisibleForTesting
void setBlockScanner(BlockScanner blockScanner) {
this.blockScanner = blockScanner;
}
} }

View File

@ -23,10 +23,12 @@ import static org.junit.Assert.assertTrue;
import java.io.DataOutputStream; import java.io.DataOutputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.RandomAccessFile;
import java.net.InetSocketAddress; import java.net.InetSocketAddress;
import java.net.Socket; import java.net.Socket;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
@ -37,11 +39,13 @@ import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks; import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.datatransfer.BlockConstructionStage; import org.apache.hadoop.hdfs.protocol.datatransfer.BlockConstructionStage;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender; import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager; import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
@ -50,6 +54,7 @@ import org.apache.hadoop.util.Time;
import org.junit.After; import org.junit.After;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import org.mockito.Mockito;
/** /**
* Test that datanodes can correctly handle errors during block read/write. * Test that datanodes can correctly handle errors during block read/write.
@ -223,4 +228,54 @@ public class TestDiskError {
long lastDiskErrorCheck = dataNode.getLastDiskErrorCheck(); long lastDiskErrorCheck = dataNode.getLastDiskErrorCheck();
assertTrue("Disk Error check is not performed within " + dataNode.checkDiskErrorInterval + " ms", ((Time.monotonicNow()-lastDiskErrorCheck) < (dataNode.checkDiskErrorInterval + slackTime))); assertTrue("Disk Error check is not performed within " + dataNode.checkDiskErrorInterval + " ms", ((Time.monotonicNow()-lastDiskErrorCheck) < (dataNode.checkDiskErrorInterval + slackTime)));
} }
@Test
public void testDataTransferWhenBytesPerChecksumIsZero() throws IOException {
DataNode dn0 = cluster.getDataNodes().get(0);
// Make a mock blockScanner class and return false whenever isEnabled is
// called on blockScanner
BlockScanner mockScanner = Mockito.mock(BlockScanner.class);
Mockito.when(mockScanner.isEnabled()).thenReturn(false);
dn0.setBlockScanner(mockScanner);
Path filePath = new Path("test.dat");
FSDataOutputStream out = fs.create(filePath, (short) 1);
out.write(1);
out.hflush();
out.close();
// Corrupt the metadata file. Insert all 0's in the type and
// bytesPerChecksum files of the metadata header.
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath);
File metadataFile = cluster.getBlockMetadataFile(0, block);
RandomAccessFile raFile = new RandomAccessFile(metadataFile, "rw");
raFile.seek(2);
raFile.writeByte(0);
raFile.writeInt(0);
raFile.close();
String datanodeId0 = dn0.getDatanodeUuid();
LocatedBlock lb = DFSTestUtil.getAllBlocks(fs, filePath).get(0);
String storageId = lb.getStorageIDs()[0];
cluster.startDataNodes(conf, 1, true, null, null);
DataNode dn1 = null;
for (int i = 0; i < cluster.getDataNodes().size(); i++) {
if (!cluster.getDataNodes().get(i).equals(datanodeId0)) {
dn1 = cluster.getDataNodes().get(i);
break;
}
}
DatanodeDescriptor dnd1 =
NameNodeAdapter.getDatanode(cluster.getNamesystem(),
dn1.getDatanodeId());
dn0.transferBlock(block, new DatanodeInfo[]{dnd1},
new StorageType[]{StorageType.DISK});
// Sleep for 1 second so the DataTrasnfer daemon can start transfer.
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
// Do nothing
}
Mockito.verify(mockScanner).markSuspectBlock(Mockito.eq(storageId),
Mockito.eq(block));
}
} }