HADOOP-8245. Fix flakiness in TestZKFailoverController. Contributed by Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-3042@1309554 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2012-04-04 19:21:01 +00:00
parent 30e1b3bba8
commit b74d742785
9 changed files with 140 additions and 68 deletions

View File

@ -9,3 +9,5 @@ HADOOP-8220. ZKFailoverController doesn't handle failure to become active correc
HADOOP-8228. Auto HA: Refactor tests and add stress tests. (todd) HADOOP-8228. Auto HA: Refactor tests and add stress tests. (todd)
HADOOP-8215. Security support for ZK Failover controller (todd) HADOOP-8215. Security support for ZK Failover controller (todd)
HADOOP-8245. Fix flakiness in TestZKFailoverController (todd)

View File

@ -240,8 +240,6 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
public synchronized void joinElection(byte[] data) public synchronized void joinElection(byte[] data)
throws HadoopIllegalArgumentException { throws HadoopIllegalArgumentException {
LOG.debug("Attempting active election");
if (data == null) { if (data == null) {
throw new HadoopIllegalArgumentException("data cannot be null"); throw new HadoopIllegalArgumentException("data cannot be null");
} }
@ -249,6 +247,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
appData = new byte[data.length]; appData = new byte[data.length];
System.arraycopy(data, 0, appData, 0, data.length); System.arraycopy(data, 0, appData, 0, data.length);
LOG.debug("Attempting active election for " + this);
joinElectionInternal(); joinElectionInternal();
} }
@ -272,6 +271,9 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
*/ */
public synchronized void ensureParentZNode() public synchronized void ensureParentZNode()
throws IOException, InterruptedException { throws IOException, InterruptedException {
Preconditions.checkState(!wantToBeInElection,
"ensureParentZNode() may not be called while in the election");
String pathParts[] = znodeWorkingDir.split("/"); String pathParts[] = znodeWorkingDir.split("/");
Preconditions.checkArgument(pathParts.length >= 1 && Preconditions.checkArgument(pathParts.length >= 1 &&
"".equals(pathParts[0]), "".equals(pathParts[0]),
@ -305,6 +307,9 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
*/ */
public synchronized void clearParentZNode() public synchronized void clearParentZNode()
throws IOException, InterruptedException { throws IOException, InterruptedException {
Preconditions.checkState(!wantToBeInElection,
"clearParentZNode() may not be called while in the election");
try { try {
LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK..."); LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
@ -393,7 +398,8 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
String name) { String name) {
if (isStaleClient(ctx)) return; if (isStaleClient(ctx)) return;
LOG.debug("CreateNode result: " + rc + " for path: " + path LOG.debug("CreateNode result: " + rc + " for path: " + path
+ " connectionState: " + zkConnectionState); + " connectionState: " + zkConnectionState +
" for " + this);
Code code = Code.get(rc); Code code = Code.get(rc);
if (isSuccess(code)) { if (isSuccess(code)) {
@ -449,8 +455,13 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
public synchronized void processResult(int rc, String path, Object ctx, public synchronized void processResult(int rc, String path, Object ctx,
Stat stat) { Stat stat) {
if (isStaleClient(ctx)) return; if (isStaleClient(ctx)) return;
assert wantToBeInElection :
"Got a StatNode result after quitting election";
LOG.debug("StatNode result: " + rc + " for path: " + path LOG.debug("StatNode result: " + rc + " for path: " + path
+ " connectionState: " + zkConnectionState); + " connectionState: " + zkConnectionState + " for " + this);
Code code = Code.get(rc); Code code = Code.get(rc);
if (isSuccess(code)) { if (isSuccess(code)) {
@ -517,7 +528,8 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
if (isStaleClient(zk)) return; if (isStaleClient(zk)) return;
LOG.debug("Watcher event type: " + eventType + " with state:" LOG.debug("Watcher event type: " + eventType + " with state:"
+ event.getState() + " for path:" + event.getPath() + event.getState() + " for path:" + event.getPath()
+ " connectionState: " + zkConnectionState); + " connectionState: " + zkConnectionState
+ " for " + this);
if (eventType == Event.EventType.None) { if (eventType == Event.EventType.None) {
// the connection state has changed // the connection state has changed
@ -528,7 +540,8 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
// be undone // be undone
ConnectionState prevConnectionState = zkConnectionState; ConnectionState prevConnectionState = zkConnectionState;
zkConnectionState = ConnectionState.CONNECTED; zkConnectionState = ConnectionState.CONNECTED;
if (prevConnectionState == ConnectionState.DISCONNECTED) { if (prevConnectionState == ConnectionState.DISCONNECTED &&
wantToBeInElection) {
monitorActiveStatus(); monitorActiveStatus();
} }
break; break;
@ -600,12 +613,14 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
} }
private void fatalError(String errorMessage) { private void fatalError(String errorMessage) {
LOG.fatal(errorMessage);
reset(); reset();
appClient.notifyFatalError(errorMessage); appClient.notifyFatalError(errorMessage);
} }
private void monitorActiveStatus() { private void monitorActiveStatus() {
LOG.debug("Monitoring active leader"); assert wantToBeInElection;
LOG.debug("Monitoring active leader for " + this);
statRetryCount = 0; statRetryCount = 0;
monitorLockNodeAsync(); monitorLockNodeAsync();
} }
@ -688,7 +703,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
int connectionRetryCount = 0; int connectionRetryCount = 0;
boolean success = false; boolean success = false;
while(!success && connectionRetryCount < NUM_RETRIES) { while(!success && connectionRetryCount < NUM_RETRIES) {
LOG.debug("Establishing zookeeper connection"); LOG.debug("Establishing zookeeper connection for " + this);
try { try {
createConnection(); createConnection();
success = true; success = true;
@ -703,13 +718,14 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
private void createConnection() throws IOException { private void createConnection() throws IOException {
zkClient = getNewZooKeeper(); zkClient = getNewZooKeeper();
LOG.debug("Created new connection for " + this);
} }
private void terminateConnection() { private void terminateConnection() {
if (zkClient == null) { if (zkClient == null) {
return; return;
} }
LOG.debug("Terminating ZK connection"); LOG.debug("Terminating ZK connection for " + this);
ZooKeeper tempZk = zkClient; ZooKeeper tempZk = zkClient;
zkClient = null; zkClient = null;
try { try {
@ -735,7 +751,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
Stat oldBreadcrumbStat = fenceOldActive(); Stat oldBreadcrumbStat = fenceOldActive();
writeBreadCrumbNode(oldBreadcrumbStat); writeBreadCrumbNode(oldBreadcrumbStat);
LOG.debug("Becoming active"); LOG.debug("Becoming active for " + this);
appClient.becomeActive(); appClient.becomeActive();
state = State.ACTIVE; state = State.ACTIVE;
return true; return true;
@ -838,7 +854,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
private void becomeStandby() { private void becomeStandby() {
if (state != State.STANDBY) { if (state != State.STANDBY) {
LOG.debug("Becoming standby"); LOG.debug("Becoming standby for " + this);
state = State.STANDBY; state = State.STANDBY;
appClient.becomeStandby(); appClient.becomeStandby();
} }
@ -846,7 +862,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
private void enterNeutralMode() { private void enterNeutralMode() {
if (state != State.NEUTRAL) { if (state != State.NEUTRAL) {
LOG.debug("Entering neutral mode"); LOG.debug("Entering neutral mode for " + this);
state = State.NEUTRAL; state = State.NEUTRAL;
appClient.enterNeutralMode(); appClient.enterNeutralMode();
} }
@ -943,8 +959,14 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
@Override @Override
public void process(WatchedEvent event) { public void process(WatchedEvent event) {
ActiveStandbyElector.this.processWatchEvent( try {
zk, event); ActiveStandbyElector.this.processWatchEvent(
zk, event);
} catch (Throwable t) {
fatalError(
"Failed to process watcher event " + event + ": " +
StringUtils.stringifyException(t));
}
} }
} }
@ -972,5 +994,13 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
} }
return false; return false;
} }
@Override
public String toString() {
return "elector id=" + System.identityHashCode(this) +
" appData=" +
((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
" cb=" + appClient;
}
} }

View File

@ -154,6 +154,7 @@ public abstract class ZKFailoverController implements Tool {
try { try {
mainLoop(); mainLoop();
} finally { } finally {
elector.quitElection(true);
healthMonitor.shutdown(); healthMonitor.shutdown();
healthMonitor.join(); healthMonitor.join();
} }
@ -379,6 +380,11 @@ public abstract class ZKFailoverController implements Tool {
throw new RuntimeException("Unable to fence " + target); throw new RuntimeException("Unable to fence " + target);
} }
} }
@Override
public String toString() {
return "Elector callbacks for " + localTarget;
}
} }
/** /**

View File

@ -0,0 +1,64 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ha;
import java.io.File;
import java.util.Set;
import javax.management.ObjectName;
import org.apache.zookeeper.test.ClientBase;
import org.apache.zookeeper.test.JMXEnv;
/**
* A subclass of ZK's ClientBase testing utility, with some fixes
* necessary for running in the Hadoop context.
*/
public class ClientBaseWithFixes extends ClientBase {
/**
* When running on the Jenkins setup, we need to ensure that this
* build directory exists before running the tests.
*/
@Override
public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
}
/**
* ZK seems to have a bug when we muck with its sessions
* behind its back, causing disconnects, etc. This bug
* ends up leaving JMX beans around at the end of the test,
* and ClientBase's teardown method will throw an exception
* if it finds JMX beans leaked. So, clear them out there
* to workaround the ZK bug. See ZOOKEEPER-1438.
*/
@Override
public void tearDown() throws Exception {
Set<ObjectName> names = JMXEnv.ensureAll();
for (ObjectName n : names) {
try {
JMXEnv.conn().unregisterMBean(n);
} catch (Throwable t) {
// ignore
}
}
}
}

View File

@ -389,6 +389,7 @@ public class TestActiveStandbyElector {
*/ */
@Test @Test
public void testStatNodeRetry() { public void testStatNodeRetry() {
elector.joinElection(data);
elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK, elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
(Stat) null); (Stat) null);
elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK, elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
@ -409,6 +410,7 @@ public class TestActiveStandbyElector {
*/ */
@Test @Test
public void testStatNodeError() { public void testStatNodeError() {
elector.joinElection(data);
elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), ZK_LOCK_NAME, elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), ZK_LOCK_NAME,
mockZK, (Stat) null); mockZK, (Stat) null);
Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode(); Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
@ -592,6 +594,8 @@ public class TestActiveStandbyElector {
*/ */
@Test @Test
public void testQuitElection() throws Exception { public void testQuitElection() throws Exception {
elector.joinElection(data);
Mockito.verify(mockZK, Mockito.times(0)).close();
elector.quitElection(true); elector.quitElection(true);
Mockito.verify(mockZK, Mockito.times(1)).close(); Mockito.verify(mockZK, Mockito.times(1)).close();
// no watches added // no watches added

View File

@ -21,7 +21,6 @@ package org.apache.hadoop.ha;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import java.io.File;
import java.util.Collections; import java.util.Collections;
import java.util.UUID; import java.util.UUID;
@ -32,7 +31,6 @@ import org.apache.hadoop.ha.HAZKUtil.ZKAuthInfo;
import org.apache.log4j.Level; import org.apache.log4j.Level;
import org.apache.zookeeper.ZooDefs.Ids; import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.server.ZooKeeperServer; import org.apache.zookeeper.server.ZooKeeperServer;
import org.apache.zookeeper.test.ClientBase;
import org.junit.Test; import org.junit.Test;
import org.mockito.AdditionalMatchers; import org.mockito.AdditionalMatchers;
import org.mockito.Mockito; import org.mockito.Mockito;
@ -42,7 +40,7 @@ import com.google.common.primitives.Ints;
/** /**
* Test for {@link ActiveStandbyElector} using real zookeeper. * Test for {@link ActiveStandbyElector} using real zookeeper.
*/ */
public class TestActiveStandbyElectorRealZK extends ClientBase { public class TestActiveStandbyElectorRealZK extends ClientBaseWithFixes {
static final int NUM_ELECTORS = 2; static final int NUM_ELECTORS = 2;
static { static {
@ -61,8 +59,6 @@ public class TestActiveStandbyElectorRealZK extends ClientBase {
@Override @Override
public void setUp() throws Exception { public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp(); super.setUp();
zkServer = getServer(serverFactory); zkServer = getServer(serverFactory);
@ -244,4 +240,19 @@ public class TestActiveStandbyElectorRealZK extends ClientBase {
checkFatalsAndReset(); checkFatalsAndReset();
} }
@Test(timeout=15000)
public void testDontJoinElectionOnDisconnectAndReconnect() throws Exception {
electors[0].ensureParentZNode();
stopServer();
ActiveStandbyElectorTestUtil.waitForElectorState(
null, electors[0], State.NEUTRAL);
startServer();
waitForServerUp(hostPort, CONNECTION_TIMEOUT);
// Have to sleep to allow time for the clients to reconnect.
Thread.sleep(2000);
Mockito.verify(cbs[0], Mockito.never()).becomeActive();
Mockito.verify(cbs[1], Mockito.never()).becomeActive();
checkFatalsAndReset();
}
} }

View File

@ -19,7 +19,6 @@ package org.apache.hadoop.ha;
import static org.junit.Assert.*; import static org.junit.Assert.*;
import java.io.File;
import java.security.NoSuchAlgorithmException; import java.security.NoSuchAlgorithmException;
import org.apache.commons.logging.impl.Log4JLogger; import org.apache.commons.logging.impl.Log4JLogger;
@ -32,12 +31,11 @@ import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper; import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat; import org.apache.zookeeper.data.Stat;
import org.apache.zookeeper.server.auth.DigestAuthenticationProvider; import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
import org.apache.zookeeper.test.ClientBase;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import org.mockito.Mockito; import org.mockito.Mockito;
public class TestZKFailoverController extends ClientBase { public class TestZKFailoverController extends ClientBaseWithFixes {
private Configuration conf; private Configuration conf;
private MiniZKFCCluster cluster; private MiniZKFCCluster cluster;
@ -63,13 +61,6 @@ public class TestZKFailoverController extends ClientBase {
((Log4JLogger)ActiveStandbyElector.LOG).getLogger().setLevel(Level.ALL); ((Log4JLogger)ActiveStandbyElector.LOG).getLogger().setLevel(Level.ALL);
} }
@Override
public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
}
@Before @Before
public void setupConfAndServices() { public void setupConfAndServices() {
conf = new Configuration(); conf = new Configuration();

View File

@ -17,15 +17,10 @@
*/ */
package org.apache.hadoop.ha; package org.apache.hadoop.ha;
import java.io.File;
import java.util.Random; import java.util.Random;
import java.util.Set;
import javax.management.ObjectName;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.zookeeper.test.ClientBase;
import org.apache.zookeeper.test.JMXEnv;
import org.junit.After; import org.junit.After;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
@ -39,7 +34,7 @@ import org.mockito.stubbing.Answer;
* failovers. While doing so, ensures that a fake "shared resource" * failovers. While doing so, ensures that a fake "shared resource"
* (simulating the shared edits dir) is only owned by one service at a time. * (simulating the shared edits dir) is only owned by one service at a time.
*/ */
public class TestZKFailoverControllerStress extends ClientBase { public class TestZKFailoverControllerStress extends ClientBaseWithFixes {
private static final int STRESS_RUNTIME_SECS = 30; private static final int STRESS_RUNTIME_SECS = 30;
private static final int EXTRA_TIMEOUT_SECS = 10; private static final int EXTRA_TIMEOUT_SECS = 10;
@ -47,13 +42,6 @@ public class TestZKFailoverControllerStress extends ClientBase {
private Configuration conf; private Configuration conf;
private MiniZKFCCluster cluster; private MiniZKFCCluster cluster;
@Override
public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
}
@Before @Before
public void setupConfAndServices() throws Exception { public void setupConfAndServices() throws Exception {
conf = new Configuration(); conf = new Configuration();
@ -67,22 +55,6 @@ public class TestZKFailoverControllerStress extends ClientBase {
cluster.stop(); cluster.stop();
} }
/**
* ZK seems to have a bug when we muck with its sessions
* behind its back, causing disconnects, etc. This bug
* ends up leaving JMX beans around at the end of the test,
* and ClientBase's teardown method will throw an exception
* if it finds JMX beans leaked. So, clear them out there
* to workaround the ZK bug. See ZOOKEEPER-1438.
*/
@After
public void clearZKJMX() throws Exception {
Set<ObjectName> names = JMXEnv.ensureAll();
for (ObjectName n : names) {
JMXEnv.conn().unregisterMBean(n);
}
}
/** /**
* Simply fail back and forth between two services for the * Simply fail back and forth between two services for the
* configured amount of time, via expiring their ZK sessions. * configured amount of time, via expiring their ZK sessions.

View File

@ -19,12 +19,12 @@ package org.apache.hadoop.hdfs.server.namenode.ha;
import static org.junit.Assert.*; import static org.junit.Assert.*;
import java.io.File;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ha.ClientBaseWithFixes;
import org.apache.hadoop.ha.NodeFencer; import org.apache.hadoop.ha.NodeFencer;
import org.apache.hadoop.ha.ZKFailoverController; import org.apache.hadoop.ha.ZKFailoverController;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
@ -36,7 +36,6 @@ import org.apache.hadoop.hdfs.tools.DFSZKFailoverController;
import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.MultithreadedTestUtil.TestContext; import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
import org.apache.hadoop.test.MultithreadedTestUtil.TestingThread; import org.apache.hadoop.test.MultithreadedTestUtil.TestingThread;
import org.apache.zookeeper.test.ClientBase;
import org.junit.After; import org.junit.After;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
@ -44,20 +43,13 @@ import org.junit.Test;
import com.google.common.base.Supplier; import com.google.common.base.Supplier;
public class TestDFSZKFailoverController extends ClientBase { public class TestDFSZKFailoverController extends ClientBaseWithFixes {
private Configuration conf; private Configuration conf;
private MiniDFSCluster cluster; private MiniDFSCluster cluster;
private TestContext ctx; private TestContext ctx;
private ZKFCThread thr1, thr2; private ZKFCThread thr1, thr2;
private FileSystem fs; private FileSystem fs;
@Override
public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
}
@Before @Before
public void setup() throws Exception { public void setup() throws Exception {
conf = new Configuration(); conf = new Configuration();