From 5a5473b29f8892ac380b6d8c468d54a3a51f316e Mon Sep 17 00:00:00 2001
From: Eli Collins
Date: Sun, 7 Oct 2012 21:35:51 +0000
Subject: [PATCH] HDFS-4006. TestCheckpoint#testSecondaryHasVeryOutOfDateImage
 occasionally fails due to unexpected exit. Contributed by Todd Lipcon

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1395387 13f79535-47bb-0310-9956-ffa450edef68
---
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt   |  3 +++
 .../server/namenode/SecondaryNameNode.java    | 26 ++++++++++++++++---
 .../hdfs/server/namenode/TestCheckpoint.java  | 24 ++++++++++++++---
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 3aa7c25db84..9b65d1ec542 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -300,6 +300,9 @@ Release 2.0.3-alpha - Unreleased
     HDFS-3999. HttpFS OPEN operation expects len parameter, it should be
     length. (tucu)
 
+    HDFS-4006. TestCheckpoint#testSecondaryHasVeryOutOfDateImage
+    occasionally fails due to unexpected exit. (todd via eli)
+
 Release 2.0.2-alpha - 2012-09-07
 
   INCOMPATIBLE CHANGES
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
index 9a44b1ec8a3..9d0629c3bff 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
@@ -78,6 +78,7 @@ import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Time;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
 
 /**********************************************************
@@ -122,6 +123,8 @@ public class SecondaryNameNode implements Runnable {
   private CheckpointConf checkpointConf;
   private FSNamesystem namesystem;
 
+  private Thread checkpointThread;
+
 
   @Override
   public String toString() {
@@ -277,6 +280,15 @@ public class SecondaryNameNode implements Runnable {
    */
   public void shutdown() {
     shouldRun = false;
+    if (checkpointThread != null) {
+      checkpointThread.interrupt();
+      try {
+        checkpointThread.join(10000);
+      } catch (InterruptedException e) {
+        LOG.info("Interrupted waiting to join on checkpointer thread");
+        Thread.currentThread().interrupt(); // maintain status
+      }
+    }
     try {
       if (infoServer != null) infoServer.stop();
     } catch (Exception e) {
@@ -586,12 +598,20 @@ public class SecondaryNameNode implements Runnable {
       terminate(ret);
     }
 
-    // Create a never ending deamon
-    Daemon checkpointThread = new Daemon(secondary);
-    checkpointThread.start();
+    secondary.startCheckpointThread();
   }
 
 
+  public void startCheckpointThread() {
+    Preconditions.checkState(checkpointThread == null,
+        "Should not already have a thread");
+    Preconditions.checkState(shouldRun, "shouldRun should be true");
+
+    checkpointThread = new Daemon(this);
+    checkpointThread.start();
+  }
+
+
   /**
    * Container for parsed command-line options.
    */
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
index 826564e63f5..8a91de7acdb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
@@ -30,6 +30,8 @@ import static org.junit.Assert.fail;
 import java.io.File;
 import java.io.IOException;
 import java.lang.management.ManagementFactory;
+import java.lang.management.ThreadInfo;
+import java.lang.management.ThreadMXBean;
 import java.net.InetSocketAddress;
 import java.net.URI;
 import java.util.ArrayList;
@@ -74,6 +76,7 @@ import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
 import org.apache.hadoop.test.GenericTestUtils.LogCapturer;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.log4j.Level;
+import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 import org.mockito.ArgumentMatcher;
@@ -115,6 +118,22 @@ public class TestCheckpoint {
     CheckpointFaultInjector.instance = faultInjector;
   }
 
+  @After
+  public void checkForSNNThreads() {
+    ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();
+
+    ThreadInfo[] infos = threadBean.getThreadInfo(threadBean.getAllThreadIds(), 20);
+    for (ThreadInfo info : infos) {
+      if (info == null) continue;
+      LOG.info("Check thread: " + info.getThreadName());
+      if (info.getThreadName().contains("SecondaryNameNode")) {
+        fail("Leaked thread: " + info + "\n" +
+            Joiner.on("\n").join(info.getStackTrace()));
+      }
+    }
+    LOG.info("--------");
+  }
+
   static void checkFile(FileSystem fileSys, Path name, int repl)
     throws IOException {
     assertTrue(fileSys.exists(name));
@@ -1738,7 +1757,7 @@ public class TestCheckpoint {
   /**
    * Test that the 2NN triggers a checkpoint after the configurable interval
    */
-  @Test
+  @Test(timeout=30000)
   public void testCheckpointTriggerOnTxnCount() throws Exception {
     MiniDFSCluster cluster = null;
     SecondaryNameNode secondary = null;
@@ -1752,8 +1771,7 @@ public class TestCheckpoint {
           .format(true).build();
       FileSystem fs = cluster.getFileSystem();
       secondary = startSecondaryNameNode(conf);
-      Thread t = new Thread(secondary);
-      t.start();
+      secondary.startCheckpointThread();
       final NNStorage storage = secondary.getFSImage().getStorage();
 
       // 2NN should checkpoint at startup