From 7bd41f031f3852ac44b911a71d1d57ea3629b134 Mon Sep 17 00:00:00 2001 From: Matthew Foley Date: Thu, 30 Jun 2011 23:21:58 +0000 Subject: [PATCH] HDFS-2011. Removal and restoration of storage directories on checkpointing failure doesn't work properly. Contributed by Ravi Prakash. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1141748 13f79535-47bb-0310-9956-ffa450edef68 --- hdfs/CHANGES.txt | 3 + .../namenode/EditLogFileOutputStream.java | 32 ++++++--- .../hdfs/server/namenode/NNStorage.java | 6 ++ .../hdfs/server/namenode/TestCheckpoint.java | 67 +++++++++++++++++++ 4 files changed, 98 insertions(+), 10 deletions(-) diff --git a/hdfs/CHANGES.txt b/hdfs/CHANGES.txt index 3acf54faad0..988b8634959 100644 --- a/hdfs/CHANGES.txt +++ b/hdfs/CHANGES.txt @@ -556,6 +556,9 @@ Trunk (unreleased changes) BUG FIXES + HDFS-2011. Removal and restoration of storage directories on checkpointing + failure doesn't work properly. (Ravi Prakash via mattf) + HDFS-1955. FSImage.doUpgrade() was made too fault-tolerant by HDFS-1826. (mattf) diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java index 08364ac83e1..bf196a23692 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java @@ -122,19 +122,31 @@ class EditLogFileOutputStream extends EditLogOutputStream { public void close() throws IOException { // close should have been called after all pending transactions // have been flushed & synced. 
- int bufSize = bufCurrent.size(); - if (bufSize != 0) { - throw new IOException("FSEditStream has " + bufSize - + " bytes still to be flushed and cannot " + "be closed."); + // if already closed, just skip + if(bufCurrent != null) + { + int bufSize = bufCurrent.size(); + if (bufSize != 0) { + throw new IOException("FSEditStream has " + bufSize + + " bytes still to be flushed and cannot " + "be closed."); + } + bufCurrent.close(); + bufCurrent = null; + } + + if(bufReady != null) { + bufReady.close(); + bufReady = null; } - bufCurrent.close(); - bufReady.close(); // remove the last INVALID marker from transaction log. - fc.truncate(fc.position()); - fp.close(); - - bufCurrent = bufReady = null; + if (fc != null && fc.isOpen()) { + fc.truncate(fc.position()); + fc.close(); + } + if (fp != null) { + fp.close(); + } } /** diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java index 392e631e3a4..57fdef96552 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java @@ -511,6 +511,12 @@ public class NNStorage extends Storage implements Closeable { // Close any edits stream associated with this dir and remove directory LOG.warn("incrementCheckpointTime failed on " + sd.getRoot().getPath() + ";type="+sd.getStorageDirType()); + try { + reportErrorsOnDirectory(sd); + } catch (IOException ioe) { + LOG.error("Failed to report and remove NN storage directory " + + sd.getRoot().getPath(), ioe); + } } } } diff --git a/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java b/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java index 2187a9ef59f..0845cacf08b 100644 --- a/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java +++ b/hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java @@ -21,6 +21,7 
@@ import junit.framework.TestCase; import java.io.*; import java.net.InetSocketAddress; import java.net.URI; +import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Iterator; @@ -137,6 +138,72 @@ public class TestCheckpoint extends TestCase { resurrectNameDir(first); // put back namedir } + /** + * Tests EditLogFileOutputStream doesn't throw NullPointerException on being + * closed twice. + * See https://issues.apache.org/jira/browse/HDFS-2011 + */ + public void testEditLogFileOutputStreamCloses() + throws IOException,NullPointerException { + System.out.println("Testing EditLogFileOutputStream doesn't throw " + + "NullPointerException on being closed twice"); + File editLogStreamFile = null; + try { + editLogStreamFile = new File(System.getProperty("test.build.data","/tmp"), + "editLogStream.dat"); + EditLogFileOutputStream editLogStream = + new EditLogFileOutputStream(editLogStreamFile, 0); + editLogStream.close(); + //Closing it twice should not throw a NullPointerException + editLogStream.close(); + } finally { + if (editLogStreamFile != null) + // Cleanup the editLogStream.dat file we created + editLogStreamFile.delete(); + } + System.out.println("Successfully tested EditLogFileOutputStream doesn't " + + "throw NullPointerException on being closed twice"); + } + + /** + * Checks that an IOException in NNStorage.setCheckpointTimeInStorage is handled + * correctly (by removing the storage directory) + * See https://issues.apache.org/jira/browse/HDFS-2011 + */ + public void testSetCheckpointTimeInStorageHandlesIOException() throws Exception { + System.out.println("Check IOException handled correctly by setCheckpointTimeInStorage"); + NNStorage nnStorage = new NNStorage(new HdfsConfiguration()); + ArrayList fsImageDirs = new ArrayList(); + ArrayList editsDirs = new ArrayList(); + File filePath = + new File(System.getProperty("test.build.data","/tmp"), "storageDirToCheck"); + assertTrue("Couldn't create directory 
storageDirToCheck", + filePath.exists() || filePath.mkdirs()); + try { + fsImageDirs.add(filePath.toURI()); + editsDirs.add(filePath.toURI()); + // Initialize NNStorage + nnStorage.setStorageDirectories(fsImageDirs, editsDirs); + assertTrue("List of storage directories didn't have storageDirToCheck.", + nnStorage.getEditsDirectories().iterator().next(). + toString().indexOf("storageDirToCheck") != -1); + assertTrue("List of removed storage directories wasn't empty", + nnStorage.getRemovedStorageDirs().isEmpty()); + } finally { + // Delete storage directory to cause IOException in setCheckpointTimeInStorage + assertTrue("Couldn't remove directory " + filePath.getAbsolutePath(), + filePath.delete()); + } + // Just call setCheckpointTimeInStorage using any random number + nnStorage.setCheckpointTimeInStorage(1); + List listRsd = nnStorage.getRemovedStorageDirs(); + assertTrue("Removed directory wasn't what was expected", + listRsd.size() > 0 && listRsd.get(listRsd.size() - 1).getRoot(). + toString().indexOf("storageDirToCheck") != -1); + System.out.println("Successfully checked IOException is handled correctly " + + "by setCheckpointTimeInStorage"); + } + /* * Simulate namenode crashing after rolling edit log. */