From b34825b0cb1b48a1ab368ba6498d63649feb7458 Mon Sep 17 00:00:00 2001 From: Xuan Date: Tue, 2 Jun 2015 10:28:14 -0700 Subject: [PATCH] YARN-3753. RM failed to come up with "java.io.IOException: Wait for ZKClient creation timed out". Contributed by Jian He --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../recovery/ZKRMStateStore.java | 7 +++-- ...TestZKRMStateStoreZKClientConnections.java | 26 ++++++++++++------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 67e06283823..12f200d5ea5 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -142,6 +142,9 @@ Release 2.7.1 - UNRELEASED YARN-3725. App submission via REST API is broken in secure mode due to Timeline DT service address is empty. (Zhijie Shen via wangda) + YARN-3753. RM failed to come up with "java.io.IOException: Wait for + ZKClient creation timed out”. (Jian He via xgong) + Release 2.7.0 - 2015-04-20 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java index a116e061e72..82ac2c13a49 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java @@ -104,6 +104,8 @@ public class ZKRMStateStore extends RMStateStore { private String zkHostPort = null; private int zkSessionTimeout; + // wait time for zkClient to re-establish connection with zk-server. + private long zkResyncWaitTime; @VisibleForTesting long zkRetryInterval; @@ -234,6 +236,7 @@ public class ZKRMStateStore extends RMStateStore { conf.getLong(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS, YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS); } + zkResyncWaitTime = zkRetryInterval * numRetries; zkAcl = RMZKUtils.getZKAcls(conf); zkAuths = RMZKUtils.getZKAuths(conf); @@ -1081,11 +1084,11 @@ public class ZKRMStateStore extends RMStateStore { long startTime = System.currentTimeMillis(); synchronized (ZKRMStateStore.this) { while (zkClient == null) { - ZKRMStateStore.this.wait(zkSessionTimeout); + ZKRMStateStore.this.wait(zkResyncWaitTime); if (zkClient != null) { break; } - if (System.currentTimeMillis() - startTime > zkSessionTimeout) { + if (System.currentTimeMillis() - startTime > zkResyncWaitTime) { throw new IOException("Wait for ZKClient creation timed out"); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java index 62dc5efd394..7f07364b863 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java @@ -170,10 +170,10 @@ public class TestZKRMStateStoreZKClientConnections extends throws Exception { TestZKClient zkClientTester = new TestZKClient(); - String path = "/test"; + final String path = "/test"; YarnConfiguration conf = new YarnConfiguration(); conf.setInt(YarnConfiguration.RM_ZK_TIMEOUT_MS, ZK_TIMEOUT_MS); - ZKRMStateStore store = + final ZKRMStateStore store = (ZKRMStateStore) zkClientTester.getRMStateStore(conf); TestDispatcher dispatcher = new TestDispatcher(); store.setRMDispatcher(dispatcher); @@ -185,14 +185,20 @@ public class TestZKRMStateStoreZKClientConnections extends store.setDataWithRetries(path, "newBytes".getBytes(), 0); stopServer(); + final AtomicBoolean isSucceeded = new AtomicBoolean(false); zkClientTester.watcher.waitForDisconnected(ZK_OP_WAIT_TIME); - try { - store.getDataWithRetries(path, true); - fail("Expected ZKClient time out exception"); - } catch (Exception e) { - assertTrue(e.getMessage().contains( - "Wait for ZKClient creation timed out")); - } + Thread thread = new Thread() { + @Override + public void run() { + try { + store.getDataWithRetries(path, true); + isSucceeded.set(true); + } catch (Exception e) { + isSucceeded.set(false); + } + } + }; + thread.start(); // ZKRMStateStore Session restored startServer(); @@ -206,6 +212,8 @@ public class TestZKRMStateStoreZKClientConnections extends fail(error); } assertEquals("newBytes", new String(ret)); + thread.join(); + assertTrue(isSucceeded.get()); } @Test(timeout = 20000)