From baf4dc811421ec74064750741eaa30364b8cefc0 Mon Sep 17 00:00:00 2001 From: Todd Lipcon Date: Tue, 14 May 2013 15:37:18 +0000 Subject: [PATCH] HADOOP-9220. Unnecessary transition to standby in ActiveStandbyElector. Contributed by Tom White and Todd Lipcon. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1482402 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop-common/CHANGES.txt | 3 +++ .../hadoop/ha/ActiveStandbyElector.java | 20 ++++++++++--------- .../org/apache/hadoop/ha/DummyHAService.java | 2 ++ .../hadoop/ha/TestZKFailoverController.java | 9 +++++++-- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index ea25de15289..fcd1b212d88 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -165,6 +165,9 @@ Release 2.0.5-beta - UNRELEASED HADOOP-9307. BufferedFSInputStream.read returns wrong results after certain seeks. (todd) + HADOOP-9220. Unnecessary transition to standby in ActiveStandbyElector. + (tom and todd via todd) + Release 2.0.4-alpha - 2013-04-25 INCOMPATIBLE CHANGES diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java index 42bbed3084f..857335c0f03 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java @@ -159,6 +159,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { private int createRetryCount = 0; private int statRetryCount = 0; private ZooKeeper zkClient; + private WatcherWithClientRef watcher; private ConnectionState zkConnectionState = ConnectionState.TERMINATED; private final ActiveStandbyElectorCallback appClient; @@ -246,6 +247,11 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { if (data == null) { throw new HadoopIllegalArgumentException("data cannot be null"); } + + if (wantToBeInElection) { + LOG.info("Already in election. Not re-connecting."); + return; + } appData = new byte[data.length]; System.arraycopy(data, 0, appData, 0, data.length); @@ -615,7 +621,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { // watcher after constructing ZooKeeper, we may miss that event. Instead, // we construct the watcher first, and have it block any events it receives // before we can set its ZooKeeper reference. - WatcherWithClientRef watcher = new WatcherWithClientRef(); + watcher = new WatcherWithClientRef(); ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher); watcher.setZooKeeperRef(zk); @@ -753,6 +759,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { e); } zkClient = null; + watcher = null; } zkClient = getNewZooKeeper(); LOG.debug("Created new connection for " + this); @@ -765,12 +772,14 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { LOG.debug("Terminating ZK connection for " + this); ZooKeeper tempZk = zkClient; zkClient = null; + watcher = null; try { tempZk.close(); } catch(InterruptedException e) { LOG.warn(e); } zkConnectionState = ConnectionState.TERMINATED; + wantToBeInElection = false; } private void reset() { @@ -914,7 +923,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { private void monitorLockNodeAsync() { zkClient.exists(zkLockFilePath, - new WatcherWithClientRef(zkClient), this, + watcher, this, zkClient); } @@ -1015,13 +1024,6 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { * Latch used to wait until the reference to ZooKeeper is set. */ private CountDownLatch hasSetZooKeeper = new CountDownLatch(1); - - private WatcherWithClientRef() { - } - - private WatcherWithClientRef(ZooKeeper zk) { - setZooKeeperRef(zk); - } /** * Waits for the next event from ZooKeeper to arrive. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/DummyHAService.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/DummyHAService.java index 0985af18c68..e9189e27243 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/DummyHAService.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/DummyHAService.java @@ -49,6 +49,7 @@ class DummyHAService extends HAServiceTarget { DummySharedResource sharedResource; public int fenceCount = 0; + public int activeTransitionCount = 0; static ArrayList instances = Lists.newArrayList(); int index; @@ -139,6 +140,7 @@ class DummyHAService extends HAServiceTarget { @Override public void transitionToActive(StateChangeRequestInfo req) throws ServiceFailedException, AccessControlException, IOException { + activeTransitionCount++; checkUnreachable(); if (failToBecomeActive) { throw new ServiceFailedException("injected failure"); diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java index ed44d8a0336..404a8a6e890 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java @@ -418,7 +418,7 @@ public class TestZKFailoverController extends ClientBaseWithFixes { } } - @Test(timeout=15000) + @Test(timeout=25000) public void testGracefulFailover() throws Exception { try { cluster.start(); @@ -426,11 +426,16 @@ public class TestZKFailoverController extends ClientBaseWithFixes { cluster.waitForActiveLockHolder(0); cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.waitForActiveLockHolder(1); + cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover(); cluster.waitForActiveLockHolder(0); - + + Thread.sleep(10000); // allow to quiesce + assertEquals(0, cluster.getService(0).fenceCount); assertEquals(0, cluster.getService(1).fenceCount); + assertEquals(2, cluster.getService(0).activeTransitionCount); + assertEquals(1, cluster.getService(1).activeTransitionCount); } finally { cluster.stop(); }