diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 78c8f6ce183..557becb79c7 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -102,6 +102,9 @@ Release 2.5.0 - UNRELEASED
YARN-596. Use scheduling policies throughout the queue hierarchy to decide
which containers to preempt (Wei Yan via Sandy Ryza)
+ YARN-2054. Better defaults for YARN ZK configs for retries and retry-interval
+ when HA is enabled. (kasha)
+
OPTIMIZATIONS
BUG FIXES
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 48763b3120e..18b18bc8d93 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -324,11 +324,11 @@ public class YarnConfiguration extends Configuration {
public static final String RM_ZK_ADDRESS = RM_ZK_PREFIX + "address";
public static final String RM_ZK_NUM_RETRIES = RM_ZK_PREFIX + "num-retries";
- public static final int DEFAULT_ZK_RM_NUM_RETRIES = 500;
+ public static final int DEFAULT_ZK_RM_NUM_RETRIES = 1000;
public static final String RM_ZK_RETRY_INTERVAL_MS =
RM_ZK_PREFIX + "retry-interval-ms";
- public static final long DEFAULT_RM_ZK_RETRY_INTERVAL_MS = 2000;
+ public static final long DEFAULT_RM_ZK_RETRY_INTERVAL_MS = 1000;
public static final String RM_ZK_TIMEOUT_MS = RM_ZK_PREFIX + "timeout-ms";
public static final int DEFAULT_RM_ZK_TIMEOUT_MS = 10000;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 08d13366848..944677a8c6a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -309,14 +309,17 @@
Number of times RM tries to connect to ZooKeeper.
yarn.resourcemanager.zk-num-retries
- 500
+ 1000
Retry interval in milliseconds when connecting to ZooKeeper.
+ When HA is enabled, the value here is NOT used. It is generated
+ automatically from yarn.resourcemanager.zk-timeout-ms and
+ yarn.resourcemanager.zk-num-retries.
yarn.resourcemanager.zk-retry-interval-ms
- 2000
+ 1000
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
index 9b15bb21e7e..6026e94f9b8 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java
@@ -90,7 +90,9 @@ public class ZKRMStateStore extends RMStateStore {
private String zkHostPort = null;
private int zkSessionTimeout;
- private long zkRetryInterval;
+
+ @VisibleForTesting
+ long zkRetryInterval;
private List zkAcl;
private List zkAuths;
@@ -199,9 +201,14 @@ public class ZKRMStateStore extends RMStateStore {
zkSessionTimeout =
conf.getInt(YarnConfiguration.RM_ZK_TIMEOUT_MS,
YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS);
- zkRetryInterval =
- conf.getLong(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS,
- YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS);
+
+ if (HAUtil.isHAEnabled(conf)) {
+ zkRetryInterval = zkSessionTimeout / numRetries;
+ } else {
+ zkRetryInterval =
+ conf.getLong(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS,
+ YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS);
+ }
zkAcl = RMZKUtils.getZKAcls(conf);
zkAuths = RMZKUtils.getZKAuths(conf);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java
index 7f4d66f5966..8dc3628e2a6 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java
@@ -41,6 +41,7 @@ import java.security.NoSuchAlgorithmException;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.atomic.AtomicBoolean;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@@ -203,7 +204,7 @@ public class TestZKRMStateStoreZKClientConnections extends
LOG.error(error, e);
fail(error);
}
- Assert.assertEquals("newBytes", new String(ret));
+ assertEquals("newBytes", new String(ret));
}
@Test(timeout = 20000)
@@ -232,7 +233,7 @@ public class TestZKRMStateStoreZKClientConnections extends
try {
byte[] ret = store.getDataWithRetries(path, false);
- Assert.assertEquals("bytes", new String(ret));
+ assertEquals("bytes", new String(ret));
} catch (Exception e) {
String error = "New session creation failed";
LOG.error(error, e);
@@ -281,4 +282,24 @@ public class TestZKRMStateStoreZKClientConnections extends
zkClientTester.getRMStateStore(conf);
}
+
+ @Test
+ public void testZKRetryInterval() throws Exception { // checks store.zkRetryInterval before and after RM_HA_ENABLED is set to true
+ TestZKClient zkClientTester = new TestZKClient();
+ YarnConfiguration conf = new YarnConfiguration();
+
+ ZKRMStateStore store =
+ (ZKRMStateStore) zkClientTester.getRMStateStore(conf);
+ assertEquals(YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS, // RM_HA_ENABLED not set explicitly here: expect the default retry interval
+ store.zkRetryInterval);
+ store.stop();
+
+ conf.setBoolean(YarnConfiguration.RM_HA_ENABLED, true); // second pass: same conf, HA switched on
+ store =
+ (ZKRMStateStore) zkClientTester.getRMStateStore(conf);
+ assertEquals(YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS / // HA on: expect default timeout / default retry count (int division)
+ YarnConfiguration.DEFAULT_ZK_RM_NUM_RETRIES,
+ store.zkRetryInterval);
+ store.stop();
+ }
}