diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 78c8f6ce183..557becb79c7 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -102,6 +102,9 @@ Release 2.5.0 - UNRELEASED YARN-596. Use scheduling policies throughout the queue hierarchy to decide which containers to preempt (Wei Yan via Sandy Ryza) + YARN-2054. Better defaults for YARN ZK configs for retries and retry-interval + when HA is enabled. (kasha) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 48763b3120e..18b18bc8d93 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -324,11 +324,11 @@ public class YarnConfiguration extends Configuration { public static final String RM_ZK_ADDRESS = RM_ZK_PREFIX + "address"; public static final String RM_ZK_NUM_RETRIES = RM_ZK_PREFIX + "num-retries"; - public static final int DEFAULT_ZK_RM_NUM_RETRIES = 500; + public static final int DEFAULT_ZK_RM_NUM_RETRIES = 1000; public static final String RM_ZK_RETRY_INTERVAL_MS = RM_ZK_PREFIX + "retry-interval-ms"; - public static final long DEFAULT_RM_ZK_RETRY_INTERVAL_MS = 2000; + public static final long DEFAULT_RM_ZK_RETRY_INTERVAL_MS = 1000; public static final String RM_ZK_TIMEOUT_MS = RM_ZK_PREFIX + "timeout-ms"; public static final int DEFAULT_RM_ZK_TIMEOUT_MS = 10000; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 08d13366848..944677a8c6a 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -309,14 +309,17 @@ Number of times RM tries to connect to ZooKeeper. yarn.resourcemanager.zk-num-retries - 500 + 1000 Retry interval in milliseconds when connecting to ZooKeeper. + When HA is enabled, the value here is NOT used. It is generated + automatically from yarn.resourcemanager.zk-timeout-ms and + yarn.resourcemanager.zk-num-retries. yarn.resourcemanager.zk-retry-interval-ms - 2000 + 1000 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java index 9b15bb21e7e..6026e94f9b8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java @@ -90,7 +90,9 @@ public class ZKRMStateStore extends RMStateStore { private String zkHostPort = null; private int zkSessionTimeout; - private long zkRetryInterval; + + @VisibleForTesting + long zkRetryInterval; private List zkAcl; private List zkAuths; @@ -199,9 +201,14 @@ public class ZKRMStateStore extends RMStateStore { zkSessionTimeout = conf.getInt(YarnConfiguration.RM_ZK_TIMEOUT_MS, YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS); - zkRetryInterval = - conf.getLong(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS, - YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS); + + if (HAUtil.isHAEnabled(conf)) { + zkRetryInterval = zkSessionTimeout / 
numRetries; + } else { + zkRetryInterval = + conf.getLong(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS); + } zkAcl = RMZKUtils.getZKAcls(conf); zkAuths = RMZKUtils.getZKAuths(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java index 7f4d66f5966..8dc3628e2a6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestZKRMStateStoreZKClientConnections.java @@ -41,6 +41,7 @@ import java.security.NoSuchAlgorithmException; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.atomic.AtomicBoolean; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -203,7 +204,7 @@ public class TestZKRMStateStoreZKClientConnections extends LOG.error(error, e); fail(error); } - Assert.assertEquals("newBytes", new String(ret)); + assertEquals("newBytes", new String(ret)); } @Test(timeout = 20000) @@ -232,7 +233,7 @@ public class TestZKRMStateStoreZKClientConnections extends try { byte[] ret = store.getDataWithRetries(path, false); - Assert.assertEquals("bytes", new String(ret)); + assertEquals("bytes", new String(ret)); } catch (Exception e) { String error = "New session creation failed"; LOG.error(error, e); @@ -281,4 +282,24 @@ public class 
TestZKRMStateStoreZKClientConnections extends zkClientTester.getRMStateStore(conf); } + + @Test + public void testZKRetryInterval() throws Exception { + TestZKClient zkClientTester = new TestZKClient(); + YarnConfiguration conf = new YarnConfiguration(); + + ZKRMStateStore store = + (ZKRMStateStore) zkClientTester.getRMStateStore(conf); + assertEquals(YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS, + store.zkRetryInterval); + store.stop(); + + conf.setBoolean(YarnConfiguration.RM_HA_ENABLED, true); + store = + (ZKRMStateStore) zkClientTester.getRMStateStore(conf); + assertEquals(YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS / + YarnConfiguration.DEFAULT_ZK_RM_NUM_RETRIES, + store.zkRetryInterval); + store.stop(); + } }