HDFS-7917. Use file to replace data dirs in test to simulate a disk failure. Contributed by Lei (Eddy) Xu.
commit 2c238ae4e0
parent 972f1f1ab9
@@ -774,6 +774,9 @@ Release 2.7.0 - UNRELEASED
 
     HDFS-7962. Remove duplicated logs in BlockManager. (yliu)
 
+    HDFS-7917. Use file to replace data dirs in test to simulate a disk failure.
+    (Lei (Eddy) Xu via cnauroth)
+
   OPTIMIZATIONS
 
     HDFS-7454. Reduce memory footprint for AclEntries in NameNode.
@@ -41,6 +41,8 @@ import com.google.common.base.Preconditions;
  *
  */
 public class DataNodeTestUtils {
+  private static final String DIR_FAILURE_SUFFIX = ".origin";
+
   public static DatanodeRegistration
       getDNRegistrationForBP(DataNode dn, String bpid) throws IOException {
     return dn.getDNRegistrationForBP(bpid);
@@ -159,4 +161,61 @@ public class DataNodeTestUtils {
       final String bpid, final long blkId) {
     return FsDatasetTestUtil.fetchReplicaInfo(dn.getFSDataset(), bpid, blkId);
   }
+
+  /**
+   * It injects disk failures to data dirs by replacing these data dirs with
+   * regular files.
+   *
+   * @param dirs data directories.
+   * @throws IOException on I/O error.
+   */
+  public static void injectDataDirFailure(File... dirs) throws IOException {
+    for (File dir : dirs) {
+      File renamedTo = new File(dir.getPath() + DIR_FAILURE_SUFFIX);
+      if (renamedTo.exists()) {
+        throw new IOException(String.format(
+            "Can not inject failure to dir: %s because %s exists.",
+            dir, renamedTo));
+      }
+      if (!dir.renameTo(renamedTo)) {
+        throw new IOException(String.format("Failed to rename %s to %s.",
+            dir, renamedTo));
+      }
+      if (!dir.createNewFile()) {
+        throw new IOException(String.format(
+            "Failed to create file %s to inject disk failure.", dir));
+      }
+    }
+  }
+
+  /**
+   * Restore the injected data dir failures.
+   *
+   * @see {@link #injectDataDirFailure}.
+   * @param dirs data directories.
+   * @throws IOException
+   */
+  public static void restoreDataDirFromFailure(File... dirs)
+      throws IOException {
+    for (File dir : dirs) {
+      File renamedDir = new File(dir.getPath() + DIR_FAILURE_SUFFIX);
+      if (renamedDir.exists()) {
+        if (dir.exists()) {
+          if (!dir.isFile()) {
+            throw new IOException(
+                "Injected failure data dir is supposed to be file: " + dir);
+          }
+          if (!dir.delete()) {
+            throw new IOException(
+                "Failed to delete injected failure data dir: " + dir);
+          }
+        }
+        if (!renamedDir.renameTo(dir)) {
+          throw new IOException(String.format(
+              "Failed to recover injected failure data dir %s to %s.",
+              renamedDir, dir));
+        }
+      }
+    }
+  }
 }
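The test changes below switch from chmod-based failure simulation to these new helpers. A minimal sketch of the intended call pattern follows; the driver class and the volume path are hypothetical and only illustrate the helpers introduced above (in the real tests the directory comes from the MiniDFSCluster data dir):

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;

public class DiskFailureInjectionSketch {
  public static void main(String[] args) throws IOException {
    // Hypothetical volume directory; real tests use something like
    // new File(dataDir, "data1") from the MiniDFSCluster setup.
    File dnVol = new File(args[0]);

    // Rename the directory to "<dir>.origin" and put a regular file in its
    // place, so the DataNode sees the volume as failed.
    DataNodeTestUtils.injectDataDirFailure(dnVol);

    // ... exercise the DataNode/cluster so it detects the failed volume ...

    // Delete the stand-in file and rename "<dir>.origin" back, restoring the
    // original data directory for later tests.
    DataNodeTestUtils.restoreDataDirFromFailure(dnVol);
  }
}

Unlike the previous FileUtil.setExecutable approach, this works without chmod support and needs no permission cleanup in tearDown().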
@@ -26,7 +26,6 @@ import org.apache.hadoop.fs.BlockLocation;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.BlockMissingException;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
@@ -682,9 +681,7 @@ public class TestDataNodeHotSwapVolumes {
         failedVolume != null);
     long used = failedVolume.getDfsUsed();
 
-    try {
-      assertTrue("Couldn't chmod local vol: " + dirToFail,
-          FileUtil.setExecutable(dirToFail, false));
+    DataNodeTestUtils.injectDataDirFailure(dirToFail);
     // Call and wait DataNode to detect disk failure.
     long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
     dn.checkDiskErrorAsync();
@@ -694,14 +691,8 @@ public class TestDataNodeHotSwapVolumes {
 
     createFile(new Path("/test1"), 32, (short)2);
     assertEquals(used, failedVolume.getDfsUsed());
-    } finally {
-      // Need to restore the mode on dirToFail. Otherwise, if an Exception
-      // is thrown above, the following tests can not delete this data directory
-      // and thus fail to start MiniDFSCluster.
-      assertTrue("Couldn't restore executable for: " + dirToFail,
-          FileUtil.setExecutable(dirToFail, true));
-    }
 
+    DataNodeTestUtils.restoreDataDirFromFailure(dirToFail);
     dn.reconfigurePropertyImpl(DFS_DATANODE_DATA_DIR_KEY, oldDataDir);
 
     createFile(new Path("/test2"), 32, (short)2);
@@ -121,10 +121,6 @@ public class TestDataNodeVolumeFailure {
     if(cluster != null) {
       cluster.shutdown();
     }
-    for (int i = 0; i < 3; i++) {
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true);
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true);
-    }
   }
 
   /*
@@ -217,7 +213,7 @@ public class TestDataNodeVolumeFailure {
     DFSTestUtil.waitReplication(fs, file1, (short) 2);
 
     File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
-    assertTrue(FileUtil.setExecutable(dn0Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn0Vol1);
     DataNode dn0 = cluster.getDataNodes().get(0);
     long lastDiskErrorCheck = dn0.getLastDiskErrorCheck();
     dn0.checkDiskErrorAsync();
@@ -291,8 +287,7 @@ public class TestDataNodeVolumeFailure {
     // Fail the first volume on both datanodes
     File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);
 
     Path file2 = new Path("/test2");
     DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
@@ -34,7 +34,6 @@ import org.apache.commons.logging.impl.Log4JLogger;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.ReconfigurationException;
 import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSTestUtil;
@@ -87,19 +86,6 @@ public class TestDataNodeVolumeFailureReporting {
 
   @After
   public void tearDown() throws Exception {
-    // Restore executable permission on all directories where a failure may have
-    // been simulated by denying execute access. This is based on the maximum
-    // number of datanodes and the maximum number of storages per data node used
-    // throughout the tests in this suite.
-    assumeTrue(!Path.WINDOWS);
-    int maxDataNodes = 3;
-    int maxStoragesPerDataNode = 4;
-    for (int i = 0; i < maxDataNodes; i++) {
-      for (int j = 1; j <= maxStoragesPerDataNode; j++) {
-        String subDir = "data" + ((i * maxStoragesPerDataNode) + j);
-        FileUtil.setExecutable(new File(dataDir, subDir), true);
-      }
-    }
     IOUtils.cleanup(LOG, fs);
     if (cluster != null) {
       cluster.shutdown();
@@ -141,8 +127,7 @@ public class TestDataNodeVolumeFailureReporting {
      * fail. The client does not retry failed nodes even though
      * perhaps they could succeed because just a single volume failed.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);
 
     /*
      * Create file1 and wait for 3 replicas (ie all DNs can still
@@ -179,7 +164,7 @@ public class TestDataNodeVolumeFailureReporting {
      * Now fail a volume on the third datanode. We should be able to get
      * three replicas since we've already identified the other failures.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn3Vol1);
     Path file2 = new Path("/test2");
     DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
     DFSTestUtil.waitReplication(fs, file2, (short)3);
@@ -208,7 +193,7 @@ public class TestDataNodeVolumeFailureReporting {
      * and that it's no longer up. Only wait for two replicas since
      * we'll never get a third.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, false));
+    DataNodeTestUtils.injectDataDirFailure(dn3Vol2);
     Path file3 = new Path("/test3");
     DFSTestUtil.createFile(fs, file3, 1024, (short)3, 1L);
     DFSTestUtil.waitReplication(fs, file3, (short)2);
@@ -233,10 +218,8 @@ public class TestDataNodeVolumeFailureReporting {
      * restart, so file creation should be able to succeed after
      * restoring the data directories and restarting the datanodes.
      */
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, true));
+    DataNodeTestUtils.restoreDataDirFromFailure(
+        dn1Vol1, dn2Vol1, dn3Vol1, dn3Vol2);
     cluster.restartDataNodes();
     cluster.waitActive();
     Path file4 = new Path("/test4");
@@ -275,8 +258,7 @@ public class TestDataNodeVolumeFailureReporting {
     // third healthy so one node in the pipeline will not fail).
     File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);
 
     Path file1 = new Path("/test1");
     DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L);
@@ -323,14 +305,7 @@ public class TestDataNodeVolumeFailureReporting {
 
     // Make the first two volume directories on the first two datanodes
     // non-accessible.
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1,
-        false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol2,
-        false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1,
-        false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol2,
-        false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn1Vol2, dn2Vol1, dn2Vol2);
 
     // Create file1 and wait for 3 replicas (ie all DNs can still store a block).
     // Then assert that all DNs are up, despite the volume failures.
@@ -380,8 +355,8 @@ public class TestDataNodeVolumeFailureReporting {
     File dn1Vol2 = new File(dataDir, "data"+(2*0+2));
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
     File dn2Vol2 = new File(dataDir, "data"+(2*1+2));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn1Vol1);
+    DataNodeTestUtils.injectDataDirFailure(dn2Vol1);
 
     Path file1 = new Path("/test1");
     DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L);
@@ -449,8 +424,7 @@ public class TestDataNodeVolumeFailureReporting {
 
     // Replace failed volume with healthy volume and run reconfigure DataNode.
     // The failed volume information should be cleared.
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
+    DataNodeTestUtils.restoreDataDirFromFailure(dn1Vol1, dn2Vol1);
     reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
     reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);
 
@@ -76,10 +76,6 @@ public class TestDataNodeVolumeFailureToleration {
 
   @After
   public void tearDown() throws Exception {
-    for (int i = 0; i < 3; i++) {
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true);
-      FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true);
-    }
     cluster.shutdown();
   }
 
@@ -152,7 +148,7 @@ public class TestDataNodeVolumeFailureToleration {
 
     // Fail a volume on the 2nd DN
     File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
+    DataNodeTestUtils.injectDataDirFailure(dn2Vol1);
 
     // Should only get two replicas (the first DN and the 3rd)
     Path file1 = new Path("/test1");
@@ -165,7 +161,7 @@ public class TestDataNodeVolumeFailureToleration {
 
     // If we restore the volume we should still only be able to get
     // two replicas since the DN is still considered dead.
-    assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
+    DataNodeTestUtils.restoreDataDirFromFailure(dn2Vol1);
     Path file2 = new Path("/test2");
     DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
     DFSTestUtil.waitReplication(fs, file2, (short)2);