From 5147e283ad9757ac2cabaf282ae5cbba76826ede Mon Sep 17 00:00:00 2001
From: Matthew Foley
Date: Thu, 30 Jun 2011 18:38:01 +0000
Subject: [PATCH] HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant
 by HDFS-1826. Contributed by Matt Foley.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1141658 13f79535-47bb-0310-9956-ffa450edef68
---
 hdfs/CHANGES.txt                                |  3 +
 .../hdfs/server/namenode/FSEditLog.java         |  5 ++
 .../hadoop/hdfs/server/namenode/FSImage.java    |  7 +-
 .../hdfs/server/namenode/NNStorage.java         |  6 +-
 .../apache/hadoop/hdfs/TestDFSUpgrade.java      | 82 ++++++++++++++++---
 5 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/hdfs/CHANGES.txt b/hdfs/CHANGES.txt
index c98f7220512..48e020aaa66 100644
--- a/hdfs/CHANGES.txt
+++ b/hdfs/CHANGES.txt
@@ -554,6 +554,9 @@ Trunk (unreleased changes)
 
   BUG FIXES
 
+    HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant by HDFS-1826.
+    (mattf)
+
     HDFS-2061. Two minor bugs in BlockManager block report processing. (mattf)
 
     HDFS-1449. Fix test failures - ExtendedBlock must return
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
index 9bdec22a785..b8316db9b18 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
@@ -1228,6 +1228,11 @@ public class FSEditLog implements NNStorageListener {
 
   @Override // NNStorageListener
   public synchronized void errorOccurred(StorageDirectory sd)
       throws IOException {
+    if (editStreams == null) {
+      //errors can occur on storage directories
+      //before edit streams have been set up
+      return;
+    }
     ArrayList<EditLogOutputStream> errorStreams
         = new ArrayList<EditLogOutputStream>();
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
index 5874c27b987..bce5db1d6a3 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
@@ -397,7 +397,12 @@ public class FSImage implements NNStorageListener, Closeable {
       LOG.info("Upgrade of " + sd.getRoot() + " is complete.");
     }
     isUpgradeFinalized = false;
-    storage.reportErrorsOnDirectories(errorSDs);
+    if (!errorSDs.isEmpty()) {
+      storage.reportErrorsOnDirectories(errorSDs);
+      //during upgrade, it's a fatal error to fail any storage directory
+      throw new IOException("Upgrade failed in " + errorSDs.size()
+          + " storage directory(ies), previously logged.");
+    }
     storage.initializeDistributedUpgrade();
     editLog.open();
   }
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
index cef7f015428..392e631e3a4 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
@@ -905,7 +905,7 @@ public class NNStorage extends Storage implements Closeable {
    */
   void reportErrorsOnDirectory(StorageDirectory sd)
       throws IOException {
-    LOG.warn("Error reported on storage directory " + sd);
+    LOG.error("Error reported on storage directory " + sd);
 
     String lsd = listStorageDirectories();
     LOG.debug("current list of storage dirs:" + lsd);
@@ -914,12 +914,12 @@ public class NNStorage extends Storage implements Closeable {
       listener.errorOccurred(sd);
     }
 
-    LOG.info("About to remove corresponding storage: "
+    LOG.warn("About to remove corresponding storage: "
              + sd.getRoot().getAbsolutePath());
     try {
       sd.unlock();
     } catch (Exception e) {
-      LOG.info("Unable to unlock bad storage directory: "
+      LOG.warn("Unable to unlock bad storage directory: "
                + sd.getRoot().getPath(), e);
     }
 
diff --git a/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/TestDFSUpgrade.java b/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/TestDFSUpgrade.java
index 71abbe494e1..f5e20a9247f 100644
--- a/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/TestDFSUpgrade.java
+++ b/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/TestDFSUpgrade.java
@@ -22,18 +22,19 @@ import static org.apache.hadoop.hdfs.server.common.HdfsConstants.NodeType.NAME_NODE;
 
 import java.io.File;
 import java.io.IOException;
-
-import junit.framework.TestCase;
+import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.hdfs.protocol.FSConstants;
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.StorageInfo;
 import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
 import org.apache.hadoop.hdfs.server.namenode.TestParallelImageWrite;
+import org.apache.hadoop.util.StringUtils;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import static org.junit.Assert.*;
@@ -44,8 +45,7 @@ import static org.junit.Assert.*;
  */
 public class TestDFSUpgrade {
 
-  private static final Log LOG = LogFactory.getLog(
-      "org.apache.hadoop.hdfs.TestDFSUpgrade");
+  private static final Log LOG = LogFactory.getLog(TestDFSUpgrade.class.getName());
   private Configuration conf;
   private int testCounter = 0;
   private MiniDFSCluster cluster = null;
@@ -111,11 +111,27 @@ public class TestDFSUpgrade {
     }
   }
 
+
   /**
    * Attempts to start a NameNode with the given operation. Starting
    * the NameNode should throw an exception.
    */
   void startNameNodeShouldFail(StartupOption operation) {
+    startNameNodeShouldFail(operation, null, null);
+  }
+
+  /**
+   * Attempts to start a NameNode with the given operation. Starting
+   * the NameNode should throw an exception.
+   * @param operation - NameNode startup operation
+   * @param exceptionClass - if non-null, will check that the caught exception
+   *     is assignment-compatible with exceptionClass
+   * @param messagePattern - if non-null, will check that a substring of the
+   *     message from the caught exception matches this pattern, via the
+   *     {@link Matcher#find()} method.
+   */
+  void startNameNodeShouldFail(StartupOption operation,
+      Class<? extends Exception> exceptionClass, Pattern messagePattern) {
     try {
       cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
                                                 .startupOption(operation)
@@ -123,9 +139,23 @@
                                                 .format(false)
                                                 .manageDataDfsDirs(false)
                                                 .manageNameDfsDirs(false)
                                                 .build(); // should fail
-      throw new AssertionError("NameNode should have failed to start");
-    } catch (Exception expected) {
-      // expected
+      fail("NameNode should have failed to start");
+
+    } catch (Exception e) {
+      // expect exception
+      if (exceptionClass != null) {
+        assertTrue("Caught exception is not of expected class "
+            + exceptionClass.getSimpleName() + ": "
+            + StringUtils.stringifyException(e),
+            exceptionClass.isInstance(e));
+      }
+      if (messagePattern != null) {
+        assertTrue("Caught exception message string does not match expected pattern \""
+            + messagePattern.pattern() + "\" : "
+            + StringUtils.stringifyException(e),
+            messagePattern.matcher(e.getMessage()).find());
+      }
+      LOG.info("Successfully detected expected NameNode startup failure.");
     }
   }
 
@@ -155,6 +185,11 @@
         .build();
   }
 
+  @BeforeClass
+  public static void initialize() throws Exception {
+    UpgradeUtilities.initialize();
+  }
+
   /**
    * This test attempts to upgrade the NameNode and DataNode under
    * a number of valid and invalid conditions.
@@ -162,8 +197,6 @@
   @Test
   public void testUpgrade() throws Exception {
     File[] baseDirs;
-    UpgradeUtilities.initialize();
-
     StorageInfo storageInfo = null;
     for (int numDirs = 1; numDirs <= 2; numDirs++) {
       conf = new HdfsConfiguration();
@@ -311,6 +344,30 @@
       UpgradeUtilities.createEmptyDirs(nameNodeDirs);
     }
   }
+
+  /*
+   * Stand-alone test to detect failure of one SD during parallel upgrade.
+   * At this time, can only be done with manual hack of {@link FSImage.doUpgrade()}
+   */
+  @Ignore
+  public void testUpgrade4() throws Exception {
+    int numDirs = 4;
+    conf = new HdfsConfiguration();
+    conf.setInt(DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, -1);
+    conf = UpgradeUtilities.initializeStorageStateConf(numDirs, conf);
+    String[] nameNodeDirs = conf.getStrings(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY);
+
+    log("NameNode upgrade with one bad storage dir", numDirs);
+    UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "current");
+    try {
+      // assert("storage dir has been prepared for failure before reaching this point");
+      startNameNodeShouldFail(StartupOption.UPGRADE, IOException.class,
+          Pattern.compile("failed in 1 storage"));
+    } finally {
+      // assert("storage dir shall be returned to normal state before exiting");
+      UpgradeUtilities.createEmptyDirs(nameNodeDirs);
+    }
+  }
 
   @Test(expected=IOException.class)
   public void testUpgradeFromPreUpgradeLVFails() throws IOException {
@@ -320,6 +377,7 @@
     fail("Expected IOException is not thrown");
   }
 
+  @Ignore
   public void test203LayoutVersion() {
     for (int lv : Storage.LAYOUT_VERSIONS_203) {
       assertTrue(Storage.is203LayoutVersion(lv));
@@ -327,7 +385,9 @@
   }
 
   public static void main(String[] args) throws Exception {
-    new TestDFSUpgrade().testUpgrade();
+    TestDFSUpgrade t = new TestDFSUpgrade();
+    TestDFSUpgrade.initialize();
+    t.testUpgrade();
   }
 
 }