HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant by HDFS-1826. Contributed by Matt Foley.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1141658 13f79535-47bb-0310-9956-ffa450edef68
commit 5147e283ad
parent f853b52a3b
CHANGES.txt
@@ -554,6 +554,9 @@ Trunk (unreleased changes)
 
   BUG FIXES
 
+    HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant by HDFS-1826.
+    (mattf)
+
     HDFS-2061. Two minor bugs in BlockManager block report processing. (mattf)
 
     HDFS-1449. Fix test failures - ExtendedBlock must return
FSEditLog.java
@@ -1228,6 +1228,11 @@ public class FSEditLog implements NNStorageListener {
 
   @Override // NNStorageListener
   public synchronized void errorOccurred(StorageDirectory sd)
       throws IOException {
+    if (editStreams == null) {
+      //errors can occur on storage directories
+      //before edit streams have been set up
+      return;
+    }
     ArrayList<EditLogOutputStream> errorStreams
       = new ArrayList<EditLogOutputStream>();
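Note: the FSEditLog change is an ordering guard. NNStorage can report a
directory error before FSEditLog has opened its edit streams, so
errorOccurred() must tolerate a null editStreams instead of failing with a
NullPointerException. A minimal stand-alone sketch of the pattern (the
GuardedListener class and its names are illustrative, not the Hadoop API):

import java.util.ArrayList;
import java.util.List;

class GuardedListener {
  private List<String> streams; // null until setUp() runs

  void setUp() {
    streams = new ArrayList<String>();
    streams.add("edits-stream-0");
  }

  // May be invoked at any point in the lifecycle, including before setUp().
  void errorOccurred(String dir) {
    if (streams == null) {
      // errors can arrive before streams exist; nothing to clean up yet
      return;
    }
    streams.remove("edits-stream-0"); // drop the stream backed by the bad dir
  }

  public static void main(String[] args) {
    GuardedListener l = new GuardedListener();
    l.errorOccurred("/data/dir0"); // early callback: safe no-op, not an NPE
    l.setUp();
    l.errorOccurred("/data/dir0"); // normal path: affected stream removed
  }
}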
FSImage.java
@@ -397,7 +397,12 @@ public class FSImage implements NNStorageListener, Closeable {
       LOG.info("Upgrade of " + sd.getRoot() + " is complete.");
     }
     isUpgradeFinalized = false;
+    if (!errorSDs.isEmpty()) {
       storage.reportErrorsOnDirectories(errorSDs);
+      //during upgrade, it's a fatal error to fail any storage directory
+      throw new IOException("Upgrade failed in " + errorSDs.size()
+          + " storage directory(ies), previously logged.");
+    }
     storage.initializeDistributedUpgrade();
     editLog.open();
   }
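The FSImage change above is the core of the fix: outside of upgrade, a bad
storage directory is merely dropped from service, but during upgrade any
failed directory must abort the whole operation. Reduced to its essentials
(a hedged sketch, not the actual FSImage code; names are illustrative):

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

class UpgradePolicy {
  // During upgrade, any failed storage directory is fatal: report it,
  // then abort with an IOException rather than continuing degraded.
  static void failIfAnyDirFailed(List<String> errorDirs) throws IOException {
    if (!errorDirs.isEmpty()) {
      throw new IOException("Upgrade failed in " + errorDirs.size()
          + " storage directory(ies), previously logged.");
    }
  }

  public static void main(String[] args) throws IOException {
    failIfAnyDirFailed(Arrays.<String>asList());     // no errors: proceeds
    failIfAnyDirFailed(Arrays.asList("/data/dir0")); // one error: throws
  }
}

The exception message is what testUpgrade4 in TestDFSUpgrade (below)
matches with Pattern.compile("failed in 1 storage").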
NNStorage.java
@@ -905,7 +905,7 @@ public class NNStorage extends Storage implements Closeable {
    */
   void reportErrorsOnDirectory(StorageDirectory sd)
       throws IOException {
-    LOG.warn("Error reported on storage directory " + sd);
+    LOG.error("Error reported on storage directory " + sd);
 
     String lsd = listStorageDirectories();
     LOG.debug("current list of storage dirs:" + lsd);
@@ -914,12 +914,12 @@ public class NNStorage extends Storage implements Closeable {
       listener.errorOccurred(sd);
     }
 
-    LOG.info("About to remove corresponding storage: "
+    LOG.warn("About to remove corresponding storage: "
         + sd.getRoot().getAbsolutePath());
     try {
       sd.unlock();
     } catch (Exception e) {
-      LOG.info("Unable to unlock bad storage directory: "
+      LOG.warn("Unable to unlock bad storage directory: "
          + sd.getRoot().getPath(), e);
     }
TestDFSUpgrade.java
@@ -22,18 +22,19 @@ import static org.apache.hadoop.hdfs.server.common.HdfsConstants.NodeType.NAME_NODE;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.regex.Pattern;
 
-import junit.framework.TestCase;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.hdfs.protocol.FSConstants;
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.StorageInfo;
 import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
 import org.apache.hadoop.hdfs.server.namenode.TestParallelImageWrite;
+import org.apache.hadoop.util.StringUtils;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 import static org.junit.Assert.*;
@@ -44,8 +45,7 @@ import static org.junit.Assert.*;
  */
 public class TestDFSUpgrade {
 
-  private static final Log LOG = LogFactory.getLog(
-      "org.apache.hadoop.hdfs.TestDFSUpgrade");
+  private static final Log LOG = LogFactory.getLog(TestDFSUpgrade.class.getName());
   private Configuration conf;
   private int testCounter = 0;
   private MiniDFSCluster cluster = null;
@@ -111,11 +111,27 @@ public class TestDFSUpgrade {
 
     }
   }
 
   /**
    * Attempts to start a NameNode with the given operation. Starting
    * the NameNode should throw an exception.
    */
   void startNameNodeShouldFail(StartupOption operation) {
+    startNameNodeShouldFail(operation, null, null);
+  }
+
+  /**
+   * Attempts to start a NameNode with the given operation. Starting
+   * the NameNode should throw an exception.
+   * @param operation - NameNode startup operation
+   * @param exceptionClass - if non-null, will check that the caught exception
+   *     is assignment-compatible with exceptionClass
+   * @param messagePattern - if non-null, will check that a substring of the
+   *     message from the caught exception matches this pattern, via the
+   *     {@link Matcher#find()} method.
+   */
+  void startNameNodeShouldFail(StartupOption operation,
+      Class<? extends Exception> exceptionClass, Pattern messagePattern) {
     try {
       cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
         .startupOption(operation)
@@ -123,9 +139,23 @@ public class TestDFSUpgrade {
         .manageDataDfsDirs(false)
         .manageNameDfsDirs(false)
         .build(); // should fail
-      throw new AssertionError("NameNode should have failed to start");
-    } catch (Exception expected) {
-      // expected
+      fail("NameNode should have failed to start");
+    } catch (Exception e) {
+      // expect exception
+      if (exceptionClass != null) {
+        assertTrue("Caught exception is not of expected class "
+            + exceptionClass.getSimpleName() + ": "
+            + StringUtils.stringifyException(e),
+            exceptionClass.isInstance(e));
+      }
+      if (messagePattern != null) {
+        assertTrue("Caught exception message string does not match expected pattern \""
+            + messagePattern.pattern() + "\" : "
+            + StringUtils.stringifyException(e),
+            messagePattern.matcher(e.getMessage()).find());
+      }
+      LOG.info("Successfully detected expected NameNode startup failure.");
     }
   }
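The delegating one-argument overload keeps existing call sites compiling,
while new tests can pin down the exact failure mode. A self-contained
sketch of the same verify-class-and-message idiom (the ExpectFailure class
and its names are illustrative, not the Hadoop test API):

import java.util.regex.Pattern;

class ExpectFailure {
  static void expect(Runnable action,
      Class<? extends Exception> exceptionClass, Pattern messagePattern) {
    try {
      action.run();
      throw new AssertionError("action should have failed");
    } catch (Exception e) {
      if (exceptionClass != null && !exceptionClass.isInstance(e)) {
        throw new AssertionError("wrong exception class: " + e);
      }
      if (messagePattern != null
          && !messagePattern.matcher(e.getMessage()).find()) {
        throw new AssertionError("message does not match \""
            + messagePattern.pattern() + "\": " + e.getMessage());
      }
    }
  }

  public static void main(String[] args) {
    expect(new Runnable() {
      public void run() {
        throw new RuntimeException("Upgrade failed in 1 storage directory(ies)");
      }
    }, RuntimeException.class, Pattern.compile("failed in 1 storage"));
    System.out.println("expected failure detected and verified");
  }
}

Note that Matcher.find() looks for the pattern anywhere in the message, so
a test only needs a stable substring such as "failed in 1 storage".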
@@ -155,6 +185,11 @@ public class TestDFSUpgrade {
       .build();
   }
 
+  @BeforeClass
+  public static void initialize() throws Exception {
+    UpgradeUtilities.initialize();
+  }
+
   /**
    * This test attempts to upgrade the NameNode and DataNode under
    * a number of valid and invalid conditions.
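Moving UpgradeUtilities.initialize() into a @BeforeClass method lets the
JUnit 4 runner perform the one-time setup exactly once before any @Test,
instead of inside testUpgrade() itself. A hand-rolled main(), however,
bypasses the runner, which is why the final hunk below makes main() call
initialize() explicitly. A minimal lifecycle sketch (illustrative class,
not the Hadoop test):

import org.junit.BeforeClass;
import org.junit.Test;

public class LifecycleSketch {
  static boolean initialized = false;

  @BeforeClass
  public static void initialize() {
    initialized = true; // one-time expensive setup goes here
  }

  @Test
  public void testSomething() {
    if (!initialized) {
      throw new IllegalStateException("initialize() was never called");
    }
  }

  public static void main(String[] args) throws Exception {
    // No JUnit runner here, so drive the lifecycle by hand:
    LifecycleSketch.initialize();
    new LifecycleSketch().testSomething();
  }
}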
@@ -162,8 +197,6 @@ public class TestDFSUpgrade {
   @Test
   public void testUpgrade() throws Exception {
     File[] baseDirs;
-    UpgradeUtilities.initialize();
-
     StorageInfo storageInfo = null;
     for (int numDirs = 1; numDirs <= 2; numDirs++) {
       conf = new HdfsConfiguration();
@@ -312,6 +345,30 @@ public class TestDFSUpgrade {
     }
   }
 
+  /*
+   * Stand-alone test to detect failure of one SD during parallel upgrade.
+   * At this time, can only be done with manual hack of {@link FSImage.doUpgrade()}
+   */
+  @Ignore
+  public void testUpgrade4() throws Exception {
+    int numDirs = 4;
+    conf = new HdfsConfiguration();
+    conf.setInt(DFSConfigKeys.DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, -1);
+    conf = UpgradeUtilities.initializeStorageStateConf(numDirs, conf);
+    String[] nameNodeDirs = conf.getStrings(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY);
+
+    log("NameNode upgrade with one bad storage dir", numDirs);
+    UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "current");
+    try {
+      // assert("storage dir has been prepared for failure before reaching this point");
+      startNameNodeShouldFail(StartupOption.UPGRADE, IOException.class,
+          Pattern.compile("failed in 1 storage"));
+    } finally {
+      // assert("storage dir shall be returned to normal state before exiting");
+      UpgradeUtilities.createEmptyDirs(nameNodeDirs);
+    }
+  }
+
   @Test(expected=IOException.class)
   public void testUpgradeFromPreUpgradeLVFails() throws IOException {
     // Upgrade from versions prior to Storage#LAST_UPGRADABLE_LAYOUT_VERSION
@@ -320,6 +377,7 @@ public class TestDFSUpgrade {
     fail("Expected IOException is not thrown");
   }
 
+  @Ignore
   public void test203LayoutVersion() {
     for (int lv : Storage.LAYOUT_VERSIONS_203) {
       assertTrue(Storage.is203LayoutVersion(lv));
@@ -327,7 +385,9 @@ public class TestDFSUpgrade {
   }
 
   public static void main(String[] args) throws Exception {
-    new TestDFSUpgrade().testUpgrade();
+    TestDFSUpgrade t = new TestDFSUpgrade();
+    TestDFSUpgrade.initialize();
+    t.testUpgrade();
   }
 }