HADOOP-8245. Fix flakiness in TestZKFailoverController. Contributed by Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-3042@1309554 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2012-04-04 19:21:01 +00:00
parent 30e1b3bba8
commit b74d742785
9 changed files with 140 additions and 68 deletions

View File

@ -9,3 +9,5 @@ HADOOP-8220. ZKFailoverController doesn't handle failure to become active correc
HADOOP-8228. Auto HA: Refactor tests and add stress tests. (todd)
HADOOP-8215. Security support for ZK Failover controller (todd)
HADOOP-8245. Fix flakiness in TestZKFailoverController (todd)

View File

@ -240,8 +240,6 @@ public ActiveStandbyElector(String zookeeperHostPorts,
public synchronized void joinElection(byte[] data)
throws HadoopIllegalArgumentException {
LOG.debug("Attempting active election");
if (data == null) {
throw new HadoopIllegalArgumentException("data cannot be null");
}
@ -249,6 +247,7 @@ public synchronized void joinElection(byte[] data)
appData = new byte[data.length];
System.arraycopy(data, 0, appData, 0, data.length);
LOG.debug("Attempting active election for " + this);
joinElectionInternal();
}
@ -272,6 +271,9 @@ public synchronized boolean parentZNodeExists()
*/
public synchronized void ensureParentZNode()
throws IOException, InterruptedException {
Preconditions.checkState(!wantToBeInElection,
"ensureParentZNode() may not be called while in the election");
String pathParts[] = znodeWorkingDir.split("/");
Preconditions.checkArgument(pathParts.length >= 1 &&
"".equals(pathParts[0]),
@ -305,6 +307,9 @@ public synchronized void ensureParentZNode()
*/
public synchronized void clearParentZNode()
throws IOException, InterruptedException {
Preconditions.checkState(!wantToBeInElection,
"clearParentZNode() may not be called while in the election");
try {
LOG.info("Recursively deleting " + znodeWorkingDir + " from ZK...");
@ -393,7 +398,8 @@ public synchronized void processResult(int rc, String path, Object ctx,
String name) {
if (isStaleClient(ctx)) return;
LOG.debug("CreateNode result: " + rc + " for path: " + path
+ " connectionState: " + zkConnectionState);
+ " connectionState: " + zkConnectionState +
" for " + this);
Code code = Code.get(rc);
if (isSuccess(code)) {
@ -449,8 +455,13 @@ public synchronized void processResult(int rc, String path, Object ctx,
public synchronized void processResult(int rc, String path, Object ctx,
Stat stat) {
if (isStaleClient(ctx)) return;
assert wantToBeInElection :
"Got a StatNode result after quitting election";
LOG.debug("StatNode result: " + rc + " for path: " + path
+ " connectionState: " + zkConnectionState);
+ " connectionState: " + zkConnectionState + " for " + this);
Code code = Code.get(rc);
if (isSuccess(code)) {
@ -517,7 +528,8 @@ synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
if (isStaleClient(zk)) return;
LOG.debug("Watcher event type: " + eventType + " with state:"
+ event.getState() + " for path:" + event.getPath()
+ " connectionState: " + zkConnectionState);
+ " connectionState: " + zkConnectionState
+ " for " + this);
if (eventType == Event.EventType.None) {
// the connection state has changed
@ -528,7 +540,8 @@ synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
// be undone
ConnectionState prevConnectionState = zkConnectionState;
zkConnectionState = ConnectionState.CONNECTED;
if (prevConnectionState == ConnectionState.DISCONNECTED) {
if (prevConnectionState == ConnectionState.DISCONNECTED &&
wantToBeInElection) {
monitorActiveStatus();
}
break;
@ -600,12 +613,14 @@ protected synchronized ZooKeeper getNewZooKeeper() throws IOException {
}
private void fatalError(String errorMessage) {
LOG.fatal(errorMessage);
reset();
appClient.notifyFatalError(errorMessage);
}
private void monitorActiveStatus() {
LOG.debug("Monitoring active leader");
assert wantToBeInElection;
LOG.debug("Monitoring active leader for " + this);
statRetryCount = 0;
monitorLockNodeAsync();
}
@ -688,7 +703,7 @@ private boolean reEstablishSession() {
int connectionRetryCount = 0;
boolean success = false;
while(!success && connectionRetryCount < NUM_RETRIES) {
LOG.debug("Establishing zookeeper connection");
LOG.debug("Establishing zookeeper connection for " + this);
try {
createConnection();
success = true;
@ -703,13 +718,14 @@ private boolean reEstablishSession() {
private void createConnection() throws IOException {
zkClient = getNewZooKeeper();
LOG.debug("Created new connection for " + this);
}
private void terminateConnection() {
if (zkClient == null) {
return;
}
LOG.debug("Terminating ZK connection");
LOG.debug("Terminating ZK connection for " + this);
ZooKeeper tempZk = zkClient;
zkClient = null;
try {
@ -735,7 +751,7 @@ private boolean becomeActive() {
Stat oldBreadcrumbStat = fenceOldActive();
writeBreadCrumbNode(oldBreadcrumbStat);
LOG.debug("Becoming active");
LOG.debug("Becoming active for " + this);
appClient.becomeActive();
state = State.ACTIVE;
return true;
@ -838,7 +854,7 @@ public byte[] run() throws KeeperException, InterruptedException {
private void becomeStandby() {
if (state != State.STANDBY) {
LOG.debug("Becoming standby");
LOG.debug("Becoming standby for " + this);
state = State.STANDBY;
appClient.becomeStandby();
}
@ -846,7 +862,7 @@ private void becomeStandby() {
private void enterNeutralMode() {
if (state != State.NEUTRAL) {
LOG.debug("Entering neutral mode");
LOG.debug("Entering neutral mode for " + this);
state = State.NEUTRAL;
appClient.enterNeutralMode();
}
@ -943,8 +959,14 @@ private WatcherWithClientRef(ZooKeeper zk) {
@Override
public void process(WatchedEvent event) {
ActiveStandbyElector.this.processWatchEvent(
zk, event);
try {
ActiveStandbyElector.this.processWatchEvent(
zk, event);
} catch (Throwable t) {
fatalError(
"Failed to process watcher event " + event + ": " +
StringUtils.stringifyException(t));
}
}
}
@ -972,5 +994,13 @@ private static boolean shouldRetry(Code code) {
}
return false;
}
@Override
public String toString() {
return "elector id=" + System.identityHashCode(this) +
" appData=" +
((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
" cb=" + appClient;
}
}

View File

@ -154,6 +154,7 @@ private int doRun(String[] args)
try {
mainLoop();
} finally {
elector.quitElection(true);
healthMonitor.shutdown();
healthMonitor.join();
}
@ -379,6 +380,11 @@ public void fenceOldActive(byte[] data) {
throw new RuntimeException("Unable to fence " + target);
}
}
@Override
public String toString() {
return "Elector callbacks for " + localTarget;
}
}
/**

View File

@ -0,0 +1,64 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ha;
import java.io.File;
import java.util.Set;
import javax.management.ObjectName;
import org.apache.zookeeper.test.ClientBase;
import org.apache.zookeeper.test.JMXEnv;
/**
* A subclass of ZK's ClientBase testing utility, with some fixes
* necessary for running in the Hadoop context.
*/
public class ClientBaseWithFixes extends ClientBase {
/**
* When running on the Jenkins setup, we need to ensure that this
* build directory exists before running the tests.
*/
@Override
public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
}
/**
* ZK seems to have a bug when we muck with its sessions
* behind its back, causing disconnects, etc. This bug
* ends up leaving JMX beans around at the end of the test,
* and ClientBase's teardown method will throw an exception
* if it finds JMX beans leaked. So, clear them out there
* to workaround the ZK bug. See ZOOKEEPER-1438.
*/
@Override
public void tearDown() throws Exception {
Set<ObjectName> names = JMXEnv.ensureAll();
for (ObjectName n : names) {
try {
JMXEnv.conn().unregisterMBean(n);
} catch (Throwable t) {
// ignore
}
}
}
}

View File

@ -389,6 +389,7 @@ public void testCreateNodeResultRetryNoNode() {
*/
@Test
public void testStatNodeRetry() {
elector.joinElection(data);
elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
(Stat) null);
elector.processResult(Code.CONNECTIONLOSS.intValue(), ZK_LOCK_NAME, mockZK,
@ -409,6 +410,7 @@ public void testStatNodeRetry() {
*/
@Test
public void testStatNodeError() {
elector.joinElection(data);
elector.processResult(Code.RUNTIMEINCONSISTENCY.intValue(), ZK_LOCK_NAME,
mockZK, (Stat) null);
Mockito.verify(mockApp, Mockito.times(0)).enterNeutralMode();
@ -592,6 +594,8 @@ public void testSuccessiveStandbyCalls() {
*/
@Test
public void testQuitElection() throws Exception {
elector.joinElection(data);
Mockito.verify(mockZK, Mockito.times(0)).close();
elector.quitElection(true);
Mockito.verify(mockZK, Mockito.times(1)).close();
// no watches added

View File

@ -21,7 +21,6 @@
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.util.Collections;
import java.util.UUID;
@ -32,7 +31,6 @@
import org.apache.log4j.Level;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.server.ZooKeeperServer;
import org.apache.zookeeper.test.ClientBase;
import org.junit.Test;
import org.mockito.AdditionalMatchers;
import org.mockito.Mockito;
@ -42,7 +40,7 @@
/**
* Test for {@link ActiveStandbyElector} using real zookeeper.
*/
public class TestActiveStandbyElectorRealZK extends ClientBase {
public class TestActiveStandbyElectorRealZK extends ClientBaseWithFixes {
static final int NUM_ELECTORS = 2;
static {
@ -61,8 +59,6 @@ public class TestActiveStandbyElectorRealZK extends ClientBase {
@Override
public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
zkServer = getServer(serverFactory);
@ -244,4 +240,19 @@ public void testHandleSessionExpirationOfStandby() throws Exception {
checkFatalsAndReset();
}
@Test(timeout=15000)
public void testDontJoinElectionOnDisconnectAndReconnect() throws Exception {
electors[0].ensureParentZNode();
stopServer();
ActiveStandbyElectorTestUtil.waitForElectorState(
null, electors[0], State.NEUTRAL);
startServer();
waitForServerUp(hostPort, CONNECTION_TIMEOUT);
// Have to sleep to allow time for the clients to reconnect.
Thread.sleep(2000);
Mockito.verify(cbs[0], Mockito.never()).becomeActive();
Mockito.verify(cbs[1], Mockito.never()).becomeActive();
checkFatalsAndReset();
}
}

View File

@ -19,7 +19,6 @@
import static org.junit.Assert.*;
import java.io.File;
import java.security.NoSuchAlgorithmException;
import org.apache.commons.logging.impl.Log4JLogger;
@ -32,12 +31,11 @@
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;
import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
import org.apache.zookeeper.test.ClientBase;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
public class TestZKFailoverController extends ClientBase {
public class TestZKFailoverController extends ClientBaseWithFixes {
private Configuration conf;
private MiniZKFCCluster cluster;
@ -63,13 +61,6 @@ public class TestZKFailoverController extends ClientBase {
((Log4JLogger)ActiveStandbyElector.LOG).getLogger().setLevel(Level.ALL);
}
@Override
public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
}
@Before
public void setupConfAndServices() {
conf = new Configuration();

View File

@ -17,15 +17,10 @@
*/
package org.apache.hadoop.ha;
import java.io.File;
import java.util.Random;
import java.util.Set;
import javax.management.ObjectName;
import org.apache.hadoop.conf.Configuration;
import org.apache.zookeeper.test.ClientBase;
import org.apache.zookeeper.test.JMXEnv;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@ -39,7 +34,7 @@
* failovers. While doing so, ensures that a fake "shared resource"
* (simulating the shared edits dir) is only owned by one service at a time.
*/
public class TestZKFailoverControllerStress extends ClientBase {
public class TestZKFailoverControllerStress extends ClientBaseWithFixes {
private static final int STRESS_RUNTIME_SECS = 30;
private static final int EXTRA_TIMEOUT_SECS = 10;
@ -47,13 +42,6 @@ public class TestZKFailoverControllerStress extends ClientBase {
private Configuration conf;
private MiniZKFCCluster cluster;
@Override
public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
}
@Before
public void setupConfAndServices() throws Exception {
conf = new Configuration();
@ -67,22 +55,6 @@ public void stopCluster() throws Exception {
cluster.stop();
}
/**
* ZK seems to have a bug when we muck with its sessions
* behind its back, causing disconnects, etc. This bug
* ends up leaving JMX beans around at the end of the test,
* and ClientBase's teardown method will throw an exception
* if it finds JMX beans leaked. So, clear them out there
* to workaround the ZK bug. See ZOOKEEPER-1438.
*/
@After
public void clearZKJMX() throws Exception {
Set<ObjectName> names = JMXEnv.ensureAll();
for (ObjectName n : names) {
JMXEnv.conn().unregisterMBean(n);
}
}
/**
* Simply fail back and forth between two services for the
* configured amount of time, via expiring their ZK sessions.

View File

@ -19,12 +19,12 @@
import static org.junit.Assert.*;
import java.io.File;
import java.util.concurrent.TimeoutException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ha.ClientBaseWithFixes;
import org.apache.hadoop.ha.NodeFencer;
import org.apache.hadoop.ha.ZKFailoverController;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
@ -36,7 +36,6 @@
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
import org.apache.hadoop.test.MultithreadedTestUtil.TestingThread;
import org.apache.zookeeper.test.ClientBase;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@ -44,20 +43,13 @@
import com.google.common.base.Supplier;
public class TestDFSZKFailoverController extends ClientBase {
public class TestDFSZKFailoverController extends ClientBaseWithFixes {
private Configuration conf;
private MiniDFSCluster cluster;
private TestContext ctx;
private ZKFCThread thr1, thr2;
private FileSystem fs;
@Override
public void setUp() throws Exception {
// build.test.dir is used by zookeeper
new File(System.getProperty("build.test.dir", "build")).mkdirs();
super.setUp();
}
@Before
public void setup() throws Exception {
conf = new Configuration();