HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant by HDFS-1826. Contributed by Matt Foley.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1141658 13f79535-47bb-0310-9956-ffa450edef68
Author: Matthew Foley, 2011-06-30 18:38:01 +00:00
parent f853b52a3b
commit 5147e283ad
5 changed files with 88 additions and 15 deletions

CHANGES.txt

@@ -554,6 +554,9 @@ Trunk (unreleased changes)
   BUG FIXES
 
+    HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant by HDFS-1826.
+    (mattf)
+
     HDFS-2061. Two minor bugs in BlockManager block report processing. (mattf)
 
     HDFS-1449. Fix test failures - ExtendedBlock must return

org/apache/hadoop/hdfs/server/namenode/FSEditLog.java

@@ -1228,6 +1228,11 @@ public class FSEditLog implements NNStorageListener {
   @Override // NNStorageListener
   public synchronized void errorOccurred(StorageDirectory sd)
       throws IOException {
+    if (editStreams == null) {
+      //errors can occur on storage directories
+      //before edit streams have been set up
+      return;
+    }
     ArrayList<EditLogOutputStream> errorStreams
       = new ArrayList<EditLogOutputStream>();
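
The guard above matters because NNStorage notifies every registered NNStorageListener when a storage directory is reported bad (see the NNStorage hunk below), and with the FSImage change below that notification can now arrive while an upgrade is failing, before editLog.open() has created any edit streams. A minimal self-contained sketch of the pattern, using illustrative stand-in names rather than the real Hadoop classes:

import java.util.ArrayList;
import java.util.List;

// Stand-in for a listener whose resources may not exist yet when a storage
// error is reported; mirrors the null check added to FSEditLog.errorOccurred().
class EarlyErrorCallbackSketch {
  private List<String> streams;                  // null until open() is called

  void open() { streams = new ArrayList<>(); }

  void errorOccurred(String storageDir) {
    if (streams == null) {
      // errors can be reported before streams are set up; nothing to disable yet
      return;
    }
    streams.removeIf(s -> s.startsWith(storageDir));
  }

  public static void main(String[] args) {
    EarlyErrorCallbackSketch log = new EarlyErrorCallbackSketch();
    log.errorOccurred("/data/name1");            // safe even though open() never ran
    System.out.println("early error callback handled without a NullPointerException");
  }
}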

org/apache/hadoop/hdfs/server/namenode/FSImage.java

@@ -397,7 +397,12 @@ public class FSImage implements NNStorageListener, Closeable {
       LOG.info("Upgrade of " + sd.getRoot() + " is complete.");
     }
     isUpgradeFinalized = false;
-    storage.reportErrorsOnDirectories(errorSDs);
+    if (!errorSDs.isEmpty()) {
+      storage.reportErrorsOnDirectories(errorSDs);
+      //during upgrade, it's a fatal error to fail any storage directory
+      throw new IOException("Upgrade failed in " + errorSDs.size()
+          + " storage directory(ies), previously logged.");
+    }
     storage.initializeDistributedUpgrade();
     editLog.open();
   }
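
This hunk is the substance of HDFS-1955: HDFS-1826 made doUpgrade() carry on when some storage directories failed, which is tolerable in normal operation but not during an upgrade, where every directory must convert successfully. A short self-contained sketch of that fail-fast rule, with a hypothetical upgradeOneDirectory() standing in for the real per-directory work:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Fail-fast upgrade sketch: attempt every directory, remember the failures,
// then abort if any directory failed instead of continuing with the survivors.
class UpgradeFailFastSketch {
  public static void main(String[] args) throws IOException {
    List<String> errorSDs = new ArrayList<>();
    for (String sd : Arrays.asList("/data/name1", "/data/name2")) {
      try {
        upgradeOneDirectory(sd);       // hypothetical per-directory upgrade step
      } catch (IOException e) {
        errorSDs.add(sd);              // keep going so every failure gets reported
      }
    }
    if (!errorSDs.isEmpty()) {
      // during an upgrade it is a fatal error to fail any storage directory
      throw new IOException("Upgrade failed in " + errorSDs.size()
          + " storage directory(ies), previously logged.");
    }
  }

  static void upgradeOneDirectory(String sd) throws IOException {
    // placeholder: the real code writes the new image and rotates current/previous
  }
}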

org/apache/hadoop/hdfs/server/namenode/NNStorage.java

@@ -905,7 +905,7 @@ public class NNStorage extends Storage implements Closeable {
    */
   void reportErrorsOnDirectory(StorageDirectory sd)
       throws IOException {
-    LOG.warn("Error reported on storage directory " + sd);
+    LOG.error("Error reported on storage directory " + sd);
 
     String lsd = listStorageDirectories();
     LOG.debug("current list of storage dirs:" + lsd);
@@ -914,12 +914,12 @@
       listener.errorOccurred(sd);
     }
 
-    LOG.info("About to remove corresponding storage: "
+    LOG.warn("About to remove corresponding storage: "
         + sd.getRoot().getAbsolutePath());
     try {
       sd.unlock();
     } catch (Exception e) {
-      LOG.info("Unable to unlock bad storage directory: "
+      LOG.warn("Unable to unlock bad storage directory: "
          + sd.getRoot().getPath(), e);
     }

org/apache/hadoop/hdfs/TestDFSUpgrade.java

@@ -22,18 +22,19 @@ import static org.apache.hadoop.hdfs.server.common.HdfsConstants.NodeType.NAME_NODE
 import java.io.File;
 import java.io.IOException;
+import java.util.regex.Pattern;
 
-import junit.framework.TestCase;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.hdfs.protocol.FSConstants;
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.StorageInfo;
 import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
 import org.apache.hadoop.hdfs.server.namenode.TestParallelImageWrite;
+import org.apache.hadoop.util.StringUtils;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import static org.junit.Assert.*;
@@ -44,8 +45,7 @@ import static org.junit.Assert.*;
  */
 public class TestDFSUpgrade {
-  private static final Log LOG = LogFactory.getLog(
-      "org.apache.hadoop.hdfs.TestDFSUpgrade");
+  private static final Log LOG = LogFactory.getLog(TestDFSUpgrade.class.getName());
   private Configuration conf;
   private int testCounter = 0;
   private MiniDFSCluster cluster = null;
@@ -111,11 +111,27 @@ public class TestDFSUpgrade {
     }
   }
 
   /**
    * Attempts to start a NameNode with the given operation. Starting
    * the NameNode should throw an exception.
    */
   void startNameNodeShouldFail(StartupOption operation) {
+    startNameNodeShouldFail(operation, null, null);
+  }
+
+  /**
+   * Attempts to start a NameNode with the given operation. Starting
+   * the NameNode should throw an exception.
+   * @param operation - NameNode startup operation
+   * @param exceptionClass - if non-null, will check that the caught exception
+   *     is assignment-compatible with exceptionClass
+   * @param messagePattern - if non-null, will check that a substring of the
+   *     message from the caught exception matches this pattern, via the
+   *     {@link Matcher#find()} method.
+   */
+  void startNameNodeShouldFail(StartupOption operation,
+      Class<? extends Exception> exceptionClass, Pattern messagePattern) {
     try {
       cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
                                                 .startupOption(operation)
@@ -123,9 +139,23 @@
                                                 .manageDataDfsDirs(false)
                                                 .manageNameDfsDirs(false)
                                                 .build(); // should fail
-      throw new AssertionError("NameNode should have failed to start");
-    } catch (Exception expected) {
-      // expected
+      fail("NameNode should have failed to start");
+    } catch (Exception e) {
+      // expect exception
+      if (exceptionClass != null) {
+        assertTrue("Caught exception is not of expected class "
+            + exceptionClass.getSimpleName() + ": "
+            + StringUtils.stringifyException(e),
+            exceptionClass.isInstance(e));
+      }
+      if (messagePattern != null) {
+        assertTrue("Caught exception message string does not match expected pattern \""
+            + messagePattern.pattern() + "\" : "
+            + StringUtils.stringifyException(e),
+            messagePattern.matcher(e.getMessage()).find());
+      }
+      LOG.info("Successfully detected expected NameNode startup failure.");
     }
   }
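
The two extra checks above are exercised later in this diff by testUpgrade4, which passes IOException.class and Pattern.compile("failed in 1 storage"). Note that Matcher.find() only requires the pattern to match a substring of the message, not the whole message. A stand-alone sketch of the same two checks, using the exception message introduced in the FSImage hunk:

import java.io.IOException;
import java.util.regex.Pattern;

// Mirrors the overload's checks: exception class compatibility plus a
// substring match of the message against a regex via Matcher.find().
class ExpectedFailureCheckSketch {
  public static void main(String[] args) {
    Exception e = new IOException(
        "Upgrade failed in 1 storage directory(ies), previously logged.");
    Class<? extends Exception> exceptionClass = IOException.class;
    Pattern messagePattern = Pattern.compile("failed in 1 storage");

    if (!exceptionClass.isInstance(e)) {
      throw new AssertionError("caught exception is not a " + exceptionClass.getSimpleName());
    }
    if (!messagePattern.matcher(e.getMessage()).find()) {
      throw new AssertionError("message does not match " + messagePattern.pattern());
    }
    System.out.println("failure matched expected class and message pattern");
  }
}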
@@ -155,6 +185,11 @@ public class TestDFSUpgrade {
         .build();
   }
 
+  @BeforeClass
+  public static void initialize() throws Exception {
+    UpgradeUtilities.initialize();
+  }
+
   /**
    * This test attempts to upgrade the NameNode and DataNode under
    * a number of valid and invalid conditions.
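
Moving UpgradeUtilities.initialize() into a @BeforeClass method prepares the expensive fixture once for the whole class instead of inside testUpgrade(); note that JUnit only invokes @BeforeClass methods under a test runner, which is why the main() hunk at the end of this diff calls TestDFSUpgrade.initialize() explicitly. A minimal JUnit 4 sketch of the same pattern (class and field names here are illustrative):

import static org.junit.Assert.assertTrue;

import org.junit.BeforeClass;
import org.junit.Test;

// One-time fixture sketch: @BeforeClass runs once before all tests in the class.
public class OneTimeSetupSketch {
  private static boolean initialized = false;

  @BeforeClass
  public static void initialize() throws Exception {
    initialized = true;              // stands in for UpgradeUtilities.initialize()
  }

  @Test
  public void usesSharedFixture() {
    assertTrue("fixture should be initialized once per class", initialized);
  }

  // Outside a JUnit runner, call initialize() by hand, as the updated main()
  // in this commit does.
  public static void main(String[] args) throws Exception {
    initialize();
    new OneTimeSetupSketch().usesSharedFixture();
  }
}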
@@ -162,8 +197,6 @@ public class TestDFSUpgrade {
   @Test
   public void testUpgrade() throws Exception {
     File[] baseDirs;
-    UpgradeUtilities.initialize();
-
     StorageInfo storageInfo = null;
     for (int numDirs = 1; numDirs <= 2; numDirs++) {
       conf = new HdfsConfiguration();
@@ -311,6 +344,30 @@ public class TestDFSUpgrade {
       UpgradeUtilities.createEmptyDirs(nameNodeDirs);
     }
   }
 
+  /*
+   * Stand-alone test to detect failure of one SD during parallel upgrade.
+   * At this time, can only be done with manual hack of {@link FSImage.doUpgrade()}
+   */
+  @Ignore
+  public void testUpgrade4() throws Exception {
+    int numDirs = 4;
+    conf = new HdfsConfiguration();
+    conf.setInt(DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, -1);
+    conf = UpgradeUtilities.initializeStorageStateConf(numDirs, conf);
+    String[] nameNodeDirs = conf.getStrings(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY);
+
+    log("NameNode upgrade with one bad storage dir", numDirs);
+    UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "current");
+    try {
+      // assert("storage dir has been prepared for failure before reaching this point");
+      startNameNodeShouldFail(StartupOption.UPGRADE, IOException.class,
+          Pattern.compile("failed in 1 storage"));
+    } finally {
+      // assert("storage dir shall be returned to normal state before exiting");
+      UpgradeUtilities.createEmptyDirs(nameNodeDirs);
+    }
+  }
+
   @Test(expected=IOException.class)
   public void testUpgradeFromPreUpgradeLVFails() throws IOException {
@@ -320,6 +377,7 @@ public class TestDFSUpgrade {
     fail("Expected IOException is not thrown");
   }
 
+  @Ignore
   public void test203LayoutVersion() {
     for (int lv : Storage.LAYOUT_VERSIONS_203) {
       assertTrue(Storage.is203LayoutVersion(lv));
@@ -327,7 +385,9 @@ public class TestDFSUpgrade {
   }
 
   public static void main(String[] args) throws Exception {
-    new TestDFSUpgrade().testUpgrade();
+    TestDFSUpgrade t = new TestDFSUpgrade();
+    TestDFSUpgrade.initialize();
+    t.testUpgrade();
   }
 }