HADOOP-9126. FormatZK and ZKFC startup can fail due to zkclient connection establishment delay. Contributed by Rakesh R and Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1419831 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2012-12-10 22:09:20 +00:00
parent 7ba12a628a
commit 22a78a75b4
4 changed files with 127 additions and 30 deletions

View File

@ -476,6 +476,9 @@ Release 2.0.3-alpha - Unreleased
HADOOP-6762. Exception while doing RPC I/O closes channel
(Sam Rash and todd via todd)
HADOOP-9126. FormatZK and ZKFC startup can fail due to zkclient connection
establishment delay. (Rakesh R and todd via todd)
Release 2.0.2-alpha - 2012-09-07
INCOMPATIBLE CHANGES

View File

@ -21,6 +21,8 @@
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
@ -45,6 +47,7 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
/**
*
@ -205,7 +208,7 @@ public ActiveStandbyElector(String zookeeperHostPorts,
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
List<ZKAuthInfo> authInfo,
ActiveStandbyElectorCallback app) throws IOException,
HadoopIllegalArgumentException {
HadoopIllegalArgumentException, KeeperException {
if (app == null || acl == null || parentZnodeName == null
|| zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
throw new HadoopIllegalArgumentException("Invalid argument");
@ -602,10 +605,24 @@ synchronized void processWatchEvent(ZooKeeper zk, WatchedEvent event) {
*
* @return new zookeeper client instance
* @throws IOException
* @throws KeeperException with a ConnectionLoss code if no connection event arrives within the session timeout
*/
protected synchronized ZooKeeper getNewZooKeeper() throws IOException {
ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, null);
zk.register(new WatcherWithClientRef(zk));
protected synchronized ZooKeeper getNewZooKeeper() throws IOException,
KeeperException {
// Unfortunately, the ZooKeeper constructor connects to ZooKeeper and
// may trigger the Connected event immediately. So, if we register the
// watcher after constructing ZooKeeper, we may miss that event. Instead,
// we construct the watcher first, and have it queue any events it receives
// before we can set its ZooKeeper reference.
WatcherWithClientRef watcher = new WatcherWithClientRef();
ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
watcher.setZooKeeperRef(zk);
// Wait for the asynchronous success/failure. This may throw an exception
// if we don't connect within the session timeout.
watcher.waitForZKConnectionEvent(zkSessionTimeout);
for (ZKAuthInfo auth : zkAuthInfo) {
zk.addAuthInfo(auth.getScheme(), auth.getAuth());
}
@ -710,13 +727,16 @@ private boolean reEstablishSession() {
} catch(IOException e) {
LOG.warn(e);
sleepFor(5000);
} catch(KeeperException e) {
LOG.warn(e);
sleepFor(5000);
}
++connectionRetryCount;
}
return success;
}
private void createConnection() throws IOException {
private void createConnection() throws IOException, KeeperException {
if (zkClient != null) {
try {
zkClient.close();
@ -973,14 +993,76 @@ private synchronized boolean isStaleClient(Object ctx) {
* events.
*/
private final class WatcherWithClientRef implements Watcher {
private final ZooKeeper zk;
private ZooKeeper zk;
/**
* Latch fired whenever any event arrives. This is used in order
* to wait for the Connected event when the client is first created.
*/
private CountDownLatch hasReceivedEvent = new CountDownLatch(1);
/**
* If any events arrive before the reference to ZooKeeper is set,
* they get queued up and later forwarded when the reference is
* available.
*/
private final List<WatchedEvent> queuedEvents = Lists.newLinkedList();
/**
 * Creates a watcher that is not yet bound to a ZooKeeper client.
 * While the client reference is unset, events passed to
 * {@code process} are added to {@code queuedEvents} instead of being
 * forwarded; {@code setZooKeeperRef} supplies the reference later.
 */
private WatcherWithClientRef() {
}
/**
 * Creates a watcher that is bound to the given client from the start.
 * With a non-null reference, {@code process} forwards events directly
 * rather than queueing them.
 *
 * @param zk the ZooKeeper client this watcher is associated with
 */
private WatcherWithClientRef(ZooKeeper zk) {
this.zk = zk;
}
/**
 * Blocks until this watcher has forwarded at least one ZooKeeper event,
 * or until the supplied timeout runs out.
 *
 * On timeout the failure is logged, the ZooKeeper client is closed while
 * holding this watcher's monitor, and a ConnectionLoss error is raised.
 *
 * @param connectionTimeoutMs maximum time to wait, in milliseconds
 * @throws KeeperException with code ConnectionLoss if no event was
 *         forwarded before the timeout expired
 * @throws IOException if the calling thread is interrupted inside this
 *         method; the thread's interrupt status is restored first
 */
private void waitForZKConnectionEvent(int connectionTimeoutMs)
    throws KeeperException, IOException {
  try {
    boolean eventSeen =
        hasReceivedEvent.await(connectionTimeoutMs, TimeUnit.MILLISECONDS);
    if (eventSeen) {
      // The latch was released by a forwarded event; nothing more to do.
      return;
    }

    LOG.error("Connection timed out: couldn't connect to ZooKeeper in "
        + connectionTimeoutMs + " milliseconds");
    synchronized (this) {
      zk.close();
    }
    throw KeeperException.create(Code.CONNECTIONLOSS);
  } catch (InterruptedException ie) {
    // Preserve the interrupt for callers before translating it.
    Thread.currentThread().interrupt();
    throw new IOException(
        "Interrupted when connecting to zookeeper server", ie);
  }
}
/**
 * Binds this watcher to its ZooKeeper client and replays any events that
 * were queued while the reference was still unset.
 *
 * Must be called exactly once per watcher; a second call fails the
 * precondition check with an IllegalStateException.
 *
 * NOTE(review): this method takes the watcher's monitor and is invoked
 * from getNewZooKeeper(), which is synchronized on the elector, while
 * process() takes the watcher's monitor and then calls the synchronized
 * processWatchEvent() on the elector. If those two paths run on different
 * threads the locks are acquired in opposite order -- confirm this cannot
 * deadlock.
 *
 * @param zk the client instance that owns this watcher
 */
private synchronized void setZooKeeperRef(ZooKeeper zk) {
Preconditions.checkState(this.zk == null,
"zk already set -- must be set exactly once");
// Set the reference before replaying, since forwardEvent passes it on.
this.zk = zk;
// Deliver, in arrival order, everything process() had to queue.
for (WatchedEvent e : queuedEvents) {
forwardEvent(e);
}
queuedEvents.clear();
}
@Override
public void process(WatchedEvent event) {
public synchronized void process(WatchedEvent event) {
if (zk != null) {
forwardEvent(event);
} else {
queuedEvents.add(event);
}
}
private void forwardEvent(WatchedEvent event) {
hasReceivedEvent.countDown();
try {
ActiveStandbyElector.this.processWatchEvent(
zk, event);
@ -1024,5 +1106,4 @@ public String toString() {
((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
" cb=" + appClient;
}
}

View File

@ -180,7 +180,15 @@ public Integer run() {
private int doRun(String[] args)
throws HadoopIllegalArgumentException, IOException, InterruptedException {
initZK();
try {
initZK();
} catch (KeeperException ke) {
LOG.fatal("Unable to start failover controller. Unable to connect "
+ "to ZooKeeper quorum at " + zkQuorum + ". Please check the "
+ "configured value for " + ZK_QUORUM_KEY + " and ensure that "
+ "ZooKeeper is running.");
return ERR_CODE_NO_ZK;
}
if (args.length > 0) {
if ("-formatZK".equals(args[0])) {
boolean force = false;
@ -199,24 +207,12 @@ private int doRun(String[] args)
badArg(args[0]);
}
}
try {
if (!elector.parentZNodeExists()) {
LOG.fatal("Unable to start failover controller. " +
"Parent znode does not exist.\n" +
"Run with -formatZK flag to initialize ZooKeeper.");
return ERR_CODE_NO_PARENT_ZNODE;
}
} catch (IOException ioe) {
if (ioe.getCause() instanceof KeeperException.ConnectionLossException) {
LOG.fatal("Unable to start failover controller. Unable to connect " +
"to ZooKeeper quorum at " + zkQuorum + ". Please check the " +
"configured value for " + ZK_QUORUM_KEY + " and ensure that " +
"ZooKeeper is running.");
return ERR_CODE_NO_ZK;
} else {
throw ioe;
}
if (!elector.parentZNodeExists()) {
LOG.fatal("Unable to start failover controller. "
+ "Parent znode does not exist.\n"
+ "Run with -formatZK flag to initialize ZooKeeper.");
return ERR_CODE_NO_PARENT_ZNODE;
}
try {
@ -310,7 +306,8 @@ protected void startRPC() throws IOException {
}
private void initZK() throws HadoopIllegalArgumentException, IOException {
private void initZK() throws HadoopIllegalArgumentException, IOException,
KeeperException {
zkQuorum = conf.get(ZK_QUORUM_KEY);
int zkTimeout = conf.getInt(ZK_SESSION_TIMEOUT_KEY,
ZK_SESSION_TIMEOUT_DEFAULT);

View File

@ -42,6 +42,7 @@
import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
import org.apache.hadoop.ha.ActiveStandbyElector.ActiveNotFoundException;
import org.apache.hadoop.ha.HAZKUtil.ZKAuthInfo;
import org.apache.hadoop.test.GenericTestUtils;
public class TestActiveStandbyElector {
@ -56,7 +57,8 @@ class ActiveStandbyElectorTester extends ActiveStandbyElector {
private int sleptFor = 0;
ActiveStandbyElectorTester(String hostPort, int timeout, String parent,
List<ACL> acl, ActiveStandbyElectorCallback app) throws IOException {
List<ACL> acl, ActiveStandbyElectorCallback app) throws IOException,
KeeperException {
super(hostPort, timeout, parent, acl,
Collections.<ZKAuthInfo>emptyList(), app);
}
@ -83,7 +85,7 @@ protected void sleepFor(int ms) {
ActiveStandbyElector.BREADCRUMB_FILENAME;
@Before
public void init() throws IOException {
public void init() throws IOException, KeeperException {
count = 0;
mockZK = Mockito.mock(ZooKeeper.class);
mockApp = Mockito.mock(ActiveStandbyElectorCallback.class);
@ -705,4 +707,18 @@ public void testEnsureBaseNodeFails() throws Exception {
Mockito.eq(ZK_PARENT_NAME), Mockito.<byte[]>any(),
Mockito.eq(Ids.OPEN_ACL_UNSAFE), Mockito.eq(CreateMode.PERSISTENT));
}
/**
 * Verifies that creating an elector fails with a ZooKeeper ConnectionLoss
 * error when the connection is not established. The elector is pointed at
 * 127.0.0.1 with a 2000 ms timeout; the test presumes no ZooKeeper server
 * answers there.
 */
@Test
public void testWithoutZKServer() throws Exception {
  KeeperException connectFailure = null;
  try {
    new ActiveStandbyElector("127.0.0.1", 2000, ZK_PARENT_NAME,
        Ids.OPEN_ACL_UNSAFE, Collections.<ZKAuthInfo> emptyList(), mockApp);
  } catch (KeeperException ke) {
    connectFailure = ke;
  }

  // Construction must not succeed when the connection cannot be made.
  if (connectFailure == null) {
    Assert.fail("Did not throw zookeeper connection loss exceptions!");
  }
  GenericTestUtils.assertExceptionContains("ConnectionLoss", connectFailure);
}
}