HDFS-7917. Use file to replace data dirs in test to simulate a disk failure. Contributed by Lei (Eddy) Xu.

(cherry picked from commit 2c238ae4e0)
(cherry picked from commit 01c0bcb176)
This commit is contained in:
cnauroth 2015-03-23 16:29:51 -07:00
parent 2742f12b58
commit 8e1c33e703
6 changed files with 88 additions and 70 deletions

View File

@ -444,6 +444,9 @@ Release 2.7.0 - UNRELEASED
HDFS-7962. Remove duplicated logs in BlockManager. (yliu) HDFS-7962. Remove duplicated logs in BlockManager. (yliu)
HDFS-7917. Use file to replace data dirs in test to simulate a disk failure.
(Lei (Eddy) Xu via cnauroth)
OPTIMIZATIONS OPTIMIZATIONS
HDFS-7454. Reduce memory footprint for AclEntries in NameNode. HDFS-7454. Reduce memory footprint for AclEntries in NameNode.

View File

@ -40,7 +40,9 @@ import com.google.common.base.Preconditions;
* Utility class for accessing package-private DataNode information during tests. * Utility class for accessing package-private DataNode information during tests.
* *
*/ */
public class DataNodeTestUtils { public class DataNodeTestUtils {
private static final String DIR_FAILURE_SUFFIX = ".origin";
public static DatanodeRegistration public static DatanodeRegistration
getDNRegistrationForBP(DataNode dn, String bpid) throws IOException { getDNRegistrationForBP(DataNode dn, String bpid) throws IOException {
return dn.getDNRegistrationForBP(bpid); return dn.getDNRegistrationForBP(bpid);
@ -159,4 +161,61 @@ public class DataNodeTestUtils {
final String bpid, final long blkId) { final String bpid, final long blkId) {
return FsDatasetTestUtil.fetchReplicaInfo(dn.getFSDataset(), bpid, blkId); return FsDatasetTestUtil.fetchReplicaInfo(dn.getFSDataset(), bpid, blkId);
} }
/**
 * Injects disk failures into data dirs by replacing each data dir with a
 * regular file.
 *
 * For every entry, the directory is first renamed to the same path with
 * {@code DIR_FAILURE_SUFFIX} appended, and then an empty regular file is
 * created at the original path.
 *
 * If the placeholder file cannot be created, a best-effort attempt is made
 * to rename the directory back, so that a failed injection does not leave
 * the data dir missing. Directories processed in earlier iterations are not
 * rolled back.
 *
 * @param dirs data directories.
 * @throws IOException if the renamed path already exists, if the rename
 *         fails, or if the placeholder file cannot be created.
 */
public static void injectDataDirFailure(File... dirs) throws IOException {
  for (File dir : dirs) {
    File renamedTo = new File(dir.getPath() + DIR_FAILURE_SUFFIX);
    if (renamedTo.exists()) {
      throw new IOException(String.format(
          "Can not inject failure to dir: %s because %s exists.",
          dir, renamedTo));
    }
    if (!dir.renameTo(renamedTo)) {
      throw new IOException(String.format("Failed to rename %s to %s.",
          dir, renamedTo));
    }
    boolean placeholderCreated = false;
    try {
      placeholderCreated = dir.createNewFile();
    } finally {
      // Roll back the rename when the placeholder was not created (either
      // createNewFile() returned false or threw). Only attempted when
      // nothing occupies the original path. The result is deliberately
      // ignored: the original failure is what gets reported to the caller.
      if (!placeholderCreated && !dir.exists()) {
        renamedTo.renameTo(dir);
      }
    }
    if (!placeholderCreated) {
      throw new IOException(String.format(
          "Failed to create file %s to inject disk failure.", dir));
    }
  }
}
/**
 * Reverts disk failures previously injected into the given data dirs.
 *
 * For each entry, the path with {@code DIR_FAILURE_SUFFIX} appended is
 * checked. If it does not exist, the entry is skipped. Otherwise any regular
 * file found at the original path is deleted and the suffixed path is
 * renamed back to the original path.
 *
 * @see #injectDataDirFailure(File...)
 * @param dirs data directories.
 * @throws IOException if the original path exists but is not a regular
 *         file, if that file cannot be deleted, or if renaming the suffixed
 *         path back fails.
 */
public static void restoreDataDirFromFailure(File... dirs)
    throws IOException {
  for (File dataDir : dirs) {
    final File backup = new File(dataDir.getPath() + DIR_FAILURE_SUFFIX);
    if (!backup.exists()) {
      // No suffixed copy on disk, so there is nothing to restore here.
      continue;
    }

    // Clear the placeholder occupying the original path, if there is one.
    if (dataDir.exists()) {
      if (!dataDir.isFile()) {
        throw new IOException(
            "Injected failure data dir is supposed to be file: " + dataDir);
      }
      final boolean removed = dataDir.delete();
      if (!removed) {
        throw new IOException(
            "Failed to delete injected failure data dir: " + dataDir);
      }
    }

    final boolean movedBack = backup.renameTo(dataDir);
    if (!movedBack) {
      throw new IOException(String.format(
          "Failed to recover injected failure data dir %s to %s.",
          backup, dataDir));
    }
  }
}
} }

View File

@ -26,7 +26,6 @@ import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockMissingException; import org.apache.hadoop.hdfs.BlockMissingException;
import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSConfigKeys;
@ -682,26 +681,18 @@ public class TestDataNodeHotSwapVolumes {
failedVolume != null); failedVolume != null);
long used = failedVolume.getDfsUsed(); long used = failedVolume.getDfsUsed();
try { DataNodeTestUtils.injectDataDirFailure(dirToFail);
assertTrue("Couldn't chmod local vol: " + dirToFail, // Call and wait DataNode to detect disk failure.
FileUtil.setExecutable(dirToFail, false)); long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
// Call and wait DataNode to detect disk failure. dn.checkDiskErrorAsync();
long lastDiskErrorCheck = dn.getLastDiskErrorCheck(); while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
dn.checkDiskErrorAsync(); Thread.sleep(100);
while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
Thread.sleep(100);
}
createFile(new Path("/test1"), 32, (short)2);
assertEquals(used, failedVolume.getDfsUsed());
} finally {
// Need to restore the mode on dirToFail. Otherwise, if an Exception
// is thrown above, the following tests can not delete this data directory
// and thus fail to start MiniDFSCluster.
assertTrue("Couldn't restore executable for: " + dirToFail,
FileUtil.setExecutable(dirToFail, true));
} }
createFile(new Path("/test1"), 32, (short)2);
assertEquals(used, failedVolume.getDfsUsed());
DataNodeTestUtils.restoreDataDirFromFailure(dirToFail);
dn.reconfigurePropertyImpl(DFS_DATANODE_DATA_DIR_KEY, oldDataDir); dn.reconfigurePropertyImpl(DFS_DATANODE_DATA_DIR_KEY, oldDataDir);
createFile(new Path("/test2"), 32, (short)2); createFile(new Path("/test2"), 32, (short)2);

View File

@ -121,10 +121,6 @@ public class TestDataNodeVolumeFailure {
if(cluster != null) { if(cluster != null) {
cluster.shutdown(); cluster.shutdown();
} }
for (int i = 0; i < 3; i++) {
FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true);
FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true);
}
} }
/* /*
@ -159,7 +155,7 @@ public class TestDataNodeVolumeFailure {
!deteteBlocks(failedDir) !deteteBlocks(failedDir)
) { ) {
throw new IOException("Could not delete hdfs directory '" + failedDir + "'"); throw new IOException("Could not delete hdfs directory '" + failedDir + "'");
} }
data_fail.setReadOnly(); data_fail.setReadOnly();
failedDir.setReadOnly(); failedDir.setReadOnly();
System.out.println("Deleteing " + failedDir.getPath() + "; exist=" + failedDir.exists()); System.out.println("Deleteing " + failedDir.getPath() + "; exist=" + failedDir.exists());
@ -217,7 +213,7 @@ public class TestDataNodeVolumeFailure {
DFSTestUtil.waitReplication(fs, file1, (short) 2); DFSTestUtil.waitReplication(fs, file1, (short) 2);
File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1)); File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
assertTrue(FileUtil.setExecutable(dn0Vol1, false)); DataNodeTestUtils.injectDataDirFailure(dn0Vol1);
DataNode dn0 = cluster.getDataNodes().get(0); DataNode dn0 = cluster.getDataNodes().get(0);
long lastDiskErrorCheck = dn0.getLastDiskErrorCheck(); long lastDiskErrorCheck = dn0.getLastDiskErrorCheck();
dn0.checkDiskErrorAsync(); dn0.checkDiskErrorAsync();
@ -291,8 +287,7 @@ public class TestDataNodeVolumeFailure {
// Fail the first volume on both datanodes // Fail the first volume on both datanodes
File dn1Vol1 = new File(dataDir, "data"+(2*0+1)); File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
File dn2Vol1 = new File(dataDir, "data"+(2*1+1)); File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false)); DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
Path file2 = new Path("/test2"); Path file2 = new Path("/test2");
DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L); DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);

View File

@ -34,7 +34,6 @@ import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.ReconfigurationException; import org.apache.hadoop.conf.ReconfigurationException;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil; import org.apache.hadoop.hdfs.DFSTestUtil;
@ -87,19 +86,6 @@ public class TestDataNodeVolumeFailureReporting {
@After @After
public void tearDown() throws Exception { public void tearDown() throws Exception {
// Restore executable permission on all directories where a failure may have
// been simulated by denying execute access. This is based on the maximum
// number of datanodes and the maximum number of storages per data node used
// throughout the tests in this suite.
assumeTrue(!Path.WINDOWS);
int maxDataNodes = 3;
int maxStoragesPerDataNode = 4;
for (int i = 0; i < maxDataNodes; i++) {
for (int j = 1; j <= maxStoragesPerDataNode; j++) {
String subDir = "data" + ((i * maxStoragesPerDataNode) + j);
FileUtil.setExecutable(new File(dataDir, subDir), true);
}
}
IOUtils.cleanup(LOG, fs); IOUtils.cleanup(LOG, fs);
if (cluster != null) { if (cluster != null) {
cluster.shutdown(); cluster.shutdown();
@ -141,8 +127,7 @@ public class TestDataNodeVolumeFailureReporting {
* fail. The client does not retry failed nodes even though * fail. The client does not retry failed nodes even though
* perhaps they could succeed because just a single volume failed. * perhaps they could succeed because just a single volume failed.
*/ */
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false)); DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
/* /*
* Create file1 and wait for 3 replicas (ie all DNs can still * Create file1 and wait for 3 replicas (ie all DNs can still
@ -179,7 +164,7 @@ public class TestDataNodeVolumeFailureReporting {
* Now fail a volume on the third datanode. We should be able to get * Now fail a volume on the third datanode. We should be able to get
* three replicas since we've already identified the other failures. * three replicas since we've already identified the other failures.
*/ */
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, false)); DataNodeTestUtils.injectDataDirFailure(dn3Vol1);
Path file2 = new Path("/test2"); Path file2 = new Path("/test2");
DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L); DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
DFSTestUtil.waitReplication(fs, file2, (short)3); DFSTestUtil.waitReplication(fs, file2, (short)3);
@ -208,7 +193,7 @@ public class TestDataNodeVolumeFailureReporting {
* and that it's no longer up. Only wait for two replicas since * and that it's no longer up. Only wait for two replicas since
* we'll never get a third. * we'll never get a third.
*/ */
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, false)); DataNodeTestUtils.injectDataDirFailure(dn3Vol2);
Path file3 = new Path("/test3"); Path file3 = new Path("/test3");
DFSTestUtil.createFile(fs, file3, 1024, (short)3, 1L); DFSTestUtil.createFile(fs, file3, 1024, (short)3, 1L);
DFSTestUtil.waitReplication(fs, file3, (short)2); DFSTestUtil.waitReplication(fs, file3, (short)2);
@ -233,10 +218,8 @@ public class TestDataNodeVolumeFailureReporting {
* restart, so file creation should be able to succeed after * restart, so file creation should be able to succeed after
* restoring the data directories and restarting the datanodes. * restoring the data directories and restarting the datanodes.
*/ */
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true)); DataNodeTestUtils.restoreDataDirFromFailure(
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true)); dn1Vol1, dn2Vol1, dn3Vol1, dn3Vol2);
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, true));
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, true));
cluster.restartDataNodes(); cluster.restartDataNodes();
cluster.waitActive(); cluster.waitActive();
Path file4 = new Path("/test4"); Path file4 = new Path("/test4");
@ -275,8 +258,7 @@ public class TestDataNodeVolumeFailureReporting {
// third healthy so one node in the pipeline will not fail). // third healthy so one node in the pipeline will not fail).
File dn1Vol1 = new File(dataDir, "data"+(2*0+1)); File dn1Vol1 = new File(dataDir, "data"+(2*0+1));
File dn2Vol1 = new File(dataDir, "data"+(2*1+1)); File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false)); DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1);
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false));
Path file1 = new Path("/test1"); Path file1 = new Path("/test1");
DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L); DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L);
@ -323,14 +305,7 @@ public class TestDataNodeVolumeFailureReporting {
// Make the first two volume directories on the first two datanodes // Make the first two volume directories on the first two datanodes
// non-accessible. // non-accessible.
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn1Vol2, dn2Vol1, dn2Vol2);
false));
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol2,
false));
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1,
false));
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol2,
false));
// Create file1 and wait for 3 replicas (ie all DNs can still store a block). // Create file1 and wait for 3 replicas (ie all DNs can still store a block).
// Then assert that all DNs are up, despite the volume failures. // Then assert that all DNs are up, despite the volume failures.
@ -380,8 +355,8 @@ public class TestDataNodeVolumeFailureReporting {
File dn1Vol2 = new File(dataDir, "data"+(2*0+2)); File dn1Vol2 = new File(dataDir, "data"+(2*0+2));
File dn2Vol1 = new File(dataDir, "data"+(2*1+1)); File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
File dn2Vol2 = new File(dataDir, "data"+(2*1+2)); File dn2Vol2 = new File(dataDir, "data"+(2*1+2));
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false)); DataNodeTestUtils.injectDataDirFailure(dn1Vol1);
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false)); DataNodeTestUtils.injectDataDirFailure(dn2Vol1);
Path file1 = new Path("/test1"); Path file1 = new Path("/test1");
DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L); DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L);
@ -449,8 +424,7 @@ public class TestDataNodeVolumeFailureReporting {
// Replace failed volume with healthy volume and run reconfigure DataNode. // Replace failed volume with healthy volume and run reconfigure DataNode.
// The failed volume information should be cleared. // The failed volume information should be cleared.
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true)); DataNodeTestUtils.restoreDataDirFromFailure(dn1Vol1, dn2Vol1);
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true));
reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2); reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2); reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);

View File

@ -76,10 +76,6 @@ public class TestDataNodeVolumeFailureToleration {
@After @After
public void tearDown() throws Exception { public void tearDown() throws Exception {
for (int i = 0; i < 3; i++) {
FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true);
FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true);
}
cluster.shutdown(); cluster.shutdown();
} }
@ -152,7 +148,7 @@ public class TestDataNodeVolumeFailureToleration {
// Fail a volume on the 2nd DN // Fail a volume on the 2nd DN
File dn2Vol1 = new File(dataDir, "data"+(2*1+1)); File dn2Vol1 = new File(dataDir, "data"+(2*1+1));
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false)); DataNodeTestUtils.injectDataDirFailure(dn2Vol1);
// Should only get two replicas (the first DN and the 3rd) // Should only get two replicas (the first DN and the 3rd)
Path file1 = new Path("/test1"); Path file1 = new Path("/test1");
@ -165,7 +161,7 @@ public class TestDataNodeVolumeFailureToleration {
// If we restore the volume we should still only be able to get // If we restore the volume we should still only be able to get
// two replicas since the DN is still considered dead. // two replicas since the DN is still considered dead.
assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true)); DataNodeTestUtils.restoreDataDirFromFailure(dn2Vol1);
Path file2 = new Path("/test2"); Path file2 = new Path("/test2");
DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L); DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L);
DFSTestUtil.waitReplication(fs, file2, (short)2); DFSTestUtil.waitReplication(fs, file2, (short)2);