From 01c0bcb176e22ddefbc8086e382dd1ebd105f9c6 Mon Sep 17 00:00:00 2001 From: cnauroth Date: Mon, 23 Mar 2015 16:29:51 -0700 Subject: [PATCH] HDFS-7917. Use file to replace data dirs in test to simulate a disk failure. Contributed by Lei (Eddy) Xu. (cherry picked from commit 2c238ae4e00371ef76582b007bb0e20ac8455d9c) --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../server/datanode/DataNodeTestUtils.java | 61 ++++++++++++++++++- .../datanode/TestDataNodeHotSwapVolumes.java | 29 +++------ .../datanode/TestDataNodeVolumeFailure.java | 11 +--- .../TestDataNodeVolumeFailureReporting.java | 46 +++----------- .../TestDataNodeVolumeFailureToleration.java | 8 +-- 6 files changed, 88 insertions(+), 70 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 9981d4fdcbb..febec024272 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -469,6 +469,9 @@ Release 2.7.0 - UNRELEASED HDFS-7962. Remove duplicated logs in BlockManager. (yliu) + HDFS-7917. Use file to replace data dirs in test to simulate a disk failure. + (Lei (Eddy) Xu via cnauroth) + OPTIMIZATIONS HDFS-7454. Reduce memory footprint for AclEntries in NameNode. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java index fd51e523d9c..f9a2ba139bf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/DataNodeTestUtils.java @@ -40,7 +40,9 @@ import com.google.common.base.Preconditions; * Utility class for accessing package-private DataNode information during tests. * */ -public class DataNodeTestUtils { +public class DataNodeTestUtils { + private static final String DIR_FAILURE_SUFFIX = ".origin"; + public static DatanodeRegistration getDNRegistrationForBP(DataNode dn, String bpid) throws IOException { return dn.getDNRegistrationForBP(bpid); @@ -159,4 +161,61 @@ public class DataNodeTestUtils { final String bpid, final long blkId) { return FsDatasetTestUtil.fetchReplicaInfo(dn.getFSDataset(), bpid, blkId); } + + /** + * It injects disk failures to data dirs by replacing these data dirs with + * regular files. + * + * @param dirs data directories. + * @throws IOException on I/O error. + */ + public static void injectDataDirFailure(File... dirs) throws IOException { + for (File dir : dirs) { + File renamedTo = new File(dir.getPath() + DIR_FAILURE_SUFFIX); + if (renamedTo.exists()) { + throw new IOException(String.format( + "Can not inject failure to dir: %s because %s exists.", + dir, renamedTo)); + } + if (!dir.renameTo(renamedTo)) { + throw new IOException(String.format("Failed to rename %s to %s.", + dir, renamedTo)); + } + if (!dir.createNewFile()) { + throw new IOException(String.format( + "Failed to create file %s to inject disk failure.", dir)); + } + } + } + + /** + * Restore the injected data dir failures. + * + * @see {@link #injectDataDirFailures}. + * @param dirs data directories. + * @throws IOException + */ + public static void restoreDataDirFromFailure(File... 
dirs) + throws IOException { + for (File dir : dirs) { + File renamedDir = new File(dir.getPath() + DIR_FAILURE_SUFFIX); + if (renamedDir.exists()) { + if (dir.exists()) { + if (!dir.isFile()) { + throw new IOException( + "Injected failure data dir is supposed to be file: " + dir); + } + if (!dir.delete()) { + throw new IOException( + "Failed to delete injected failure data dir: " + dir); + } + } + if (!renamedDir.renameTo(dir)) { + throw new IOException(String.format( + "Failed to recover injected failure data dir %s to %s.", + renamedDir, dir)); + } + } + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java index 8ab3dd28101..2f51d457b4f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeHotSwapVolumes.java @@ -26,7 +26,6 @@ import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.BlockMissingException; import org.apache.hadoop.hdfs.DFSConfigKeys; @@ -682,26 +681,18 @@ public class TestDataNodeHotSwapVolumes { failedVolume != null); long used = failedVolume.getDfsUsed(); - try { - assertTrue("Couldn't chmod local vol: " + dirToFail, - FileUtil.setExecutable(dirToFail, false)); - // Call and wait DataNode to detect disk failure. - long lastDiskErrorCheck = dn.getLastDiskErrorCheck(); - dn.checkDiskErrorAsync(); - while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) { - Thread.sleep(100); - } - - createFile(new Path("/test1"), 32, (short)2); - assertEquals(used, failedVolume.getDfsUsed()); - } finally { - // Need to restore the mode on dirToFail. Otherwise, if an Exception - // is thrown above, the following tests can not delete this data directory - // and thus fail to start MiniDFSCluster. - assertTrue("Couldn't restore executable for: " + dirToFail, - FileUtil.setExecutable(dirToFail, true)); + DataNodeTestUtils.injectDataDirFailure(dirToFail); + // Call and wait DataNode to detect disk failure. 
+ long lastDiskErrorCheck = dn.getLastDiskErrorCheck(); + dn.checkDiskErrorAsync(); + while (dn.getLastDiskErrorCheck() == lastDiskErrorCheck) { + Thread.sleep(100); } + createFile(new Path("/test1"), 32, (short)2); + assertEquals(used, failedVolume.getDfsUsed()); + + DataNodeTestUtils.restoreDataDirFromFailure(dirToFail); dn.reconfigurePropertyImpl(DFS_DATANODE_DATA_DIR_KEY, oldDataDir); createFile(new Path("/test2"), 32, (short)2); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java index 9cbad6d8666..0428b814f29 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java @@ -121,10 +121,6 @@ public class TestDataNodeVolumeFailure { if(cluster != null) { cluster.shutdown(); } - for (int i = 0; i < 3; i++) { - FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true); - FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true); - } } /* @@ -159,7 +155,7 @@ public class TestDataNodeVolumeFailure { !deteteBlocks(failedDir) ) { throw new IOException("Could not delete hdfs directory '" + failedDir + "'"); - } + } data_fail.setReadOnly(); failedDir.setReadOnly(); System.out.println("Deleteing " + failedDir.getPath() + "; exist=" + failedDir.exists()); @@ -217,7 +213,7 @@ public class TestDataNodeVolumeFailure { DFSTestUtil.waitReplication(fs, file1, (short) 2); File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1)); - assertTrue(FileUtil.setExecutable(dn0Vol1, false)); + DataNodeTestUtils.injectDataDirFailure(dn0Vol1); DataNode dn0 = cluster.getDataNodes().get(0); long lastDiskErrorCheck = dn0.getLastDiskErrorCheck(); dn0.checkDiskErrorAsync(); @@ -291,8 +287,7 @@ public class TestDataNodeVolumeFailure { // Fail the first volume on both datanodes File dn1Vol1 = new File(dataDir, "data"+(2*0+1)); File dn2Vol1 = new File(dataDir, "data"+(2*1+1)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false)); + DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1); Path file2 = new Path("/test2"); DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java index 9842f25b5ff..aac288aedd7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureReporting.java @@ -34,7 +34,6 @@ import org.apache.commons.logging.impl.Log4JLogger; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.ReconfigurationException; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSTestUtil; @@ -87,19 +86,6 @@ public class TestDataNodeVolumeFailureReporting { @After public void tearDown() throws Exception { - // Restore 
executable permission on all directories where a failure may have - // been simulated by denying execute access. This is based on the maximum - // number of datanodes and the maximum number of storages per data node used - // throughout the tests in this suite. - assumeTrue(!Path.WINDOWS); - int maxDataNodes = 3; - int maxStoragesPerDataNode = 4; - for (int i = 0; i < maxDataNodes; i++) { - for (int j = 1; j <= maxStoragesPerDataNode; j++) { - String subDir = "data" + ((i * maxStoragesPerDataNode) + j); - FileUtil.setExecutable(new File(dataDir, subDir), true); - } - } IOUtils.cleanup(LOG, fs); if (cluster != null) { cluster.shutdown(); @@ -141,8 +127,7 @@ public class TestDataNodeVolumeFailureReporting { * fail. The client does not retry failed nodes even though * perhaps they could succeed because just a single volume failed. */ - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false)); + DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1); /* * Create file1 and wait for 3 replicas (ie all DNs can still @@ -179,7 +164,7 @@ public class TestDataNodeVolumeFailureReporting { * Now fail a volume on the third datanode. We should be able to get * three replicas since we've already identified the other failures. */ - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, false)); + DataNodeTestUtils.injectDataDirFailure(dn3Vol1); Path file2 = new Path("/test2"); DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L); DFSTestUtil.waitReplication(fs, file2, (short)3); @@ -208,7 +193,7 @@ public class TestDataNodeVolumeFailureReporting { * and that it's no longer up. Only wait for two replicas since * we'll never get a third. */ - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, false)); + DataNodeTestUtils.injectDataDirFailure(dn3Vol2); Path file3 = new Path("/test3"); DFSTestUtil.createFile(fs, file3, 1024, (short)3, 1L); DFSTestUtil.waitReplication(fs, file3, (short)2); @@ -233,10 +218,8 @@ public class TestDataNodeVolumeFailureReporting { * restart, so file creation should be able to succeed after * restoring the data directories and restarting the datanodes. */ - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol1, true)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn3Vol2, true)); + DataNodeTestUtils.restoreDataDirFromFailure( + dn1Vol1, dn2Vol1, dn3Vol1, dn3Vol2); cluster.restartDataNodes(); cluster.waitActive(); Path file4 = new Path("/test4"); @@ -275,8 +258,7 @@ public class TestDataNodeVolumeFailureReporting { // third healthy so one node in the pipeline will not fail). File dn1Vol1 = new File(dataDir, "data"+(2*0+1)); File dn2Vol1 = new File(dataDir, "data"+(2*1+1)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false)); + DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn2Vol1); Path file1 = new Path("/test1"); DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L); @@ -323,14 +305,7 @@ public class TestDataNodeVolumeFailureReporting { // Make the first two volume directories on the first two datanodes // non-accessible. 
- assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, - false)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol2, - false)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, - false)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol2, - false)); + DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn1Vol2, dn2Vol1, dn2Vol2); // Create file1 and wait for 3 replicas (ie all DNs can still store a block). // Then assert that all DNs are up, despite the volume failures. @@ -380,8 +355,8 @@ public class TestDataNodeVolumeFailureReporting { File dn1Vol2 = new File(dataDir, "data"+(2*0+2)); File dn2Vol1 = new File(dataDir, "data"+(2*1+1)); File dn2Vol2 = new File(dataDir, "data"+(2*1+2)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, false)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false)); + DataNodeTestUtils.injectDataDirFailure(dn1Vol1); + DataNodeTestUtils.injectDataDirFailure(dn2Vol1); Path file1 = new Path("/test1"); DFSTestUtil.createFile(fs, file1, 1024, (short)2, 1L); @@ -449,8 +424,7 @@ public class TestDataNodeVolumeFailureReporting { // Replace failed volume with healthy volume and run reconfigure DataNode. // The failed volume information should be cleared. - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn1Vol1, true)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true)); + DataNodeTestUtils.restoreDataDirFromFailure(dn1Vol1, dn2Vol1); reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2); reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java index 73dc77c3b2f..5b7ac307c01 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java @@ -76,10 +76,6 @@ public class TestDataNodeVolumeFailureToleration { @After public void tearDown() throws Exception { - for (int i = 0; i < 3; i++) { - FileUtil.setExecutable(new File(dataDir, "data"+(2*i+1)), true); - FileUtil.setExecutable(new File(dataDir, "data"+(2*i+2)), true); - } cluster.shutdown(); } @@ -152,7 +148,7 @@ public class TestDataNodeVolumeFailureToleration { // Fail a volume on the 2nd DN File dn2Vol1 = new File(dataDir, "data"+(2*1+1)); - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, false)); + DataNodeTestUtils.injectDataDirFailure(dn2Vol1); // Should only get two replicas (the first DN and the 3rd) Path file1 = new Path("/test1"); @@ -165,7 +161,7 @@ public class TestDataNodeVolumeFailureToleration { // If we restore the volume we should still only be able to get // two replicas since the DN is still considered dead. - assertTrue("Couldn't chmod local vol", FileUtil.setExecutable(dn2Vol1, true)); + DataNodeTestUtils.restoreDataDirFromFailure(dn2Vol1); Path file2 = new Path("/test2"); DFSTestUtil.createFile(fs, file2, 1024, (short)3, 1L); DFSTestUtil.waitReplication(fs, file2, (short)2);