YARN-2054. Better defaults for YARN ZK configs for retries and retry-interval when HA is enabled. (kasha)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1598632 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Karthik Kambatla 2014-05-30 15:25:27 +00:00
parent 452b37a2d7
commit 4c4ebe2353
5 changed files with 44 additions and 10 deletions

View File

@ -102,6 +102,9 @@ Release 2.5.0 - UNRELEASED
YARN-596. Use scheduling policies throughout the queue hierarchy to decide
which containers to preempt (Wei Yan via Sandy Ryza)
YARN-2054. Better defaults for YARN ZK configs for retries and retry-interval
when HA is enabled. (kasha)
OPTIMIZATIONS
BUG FIXES

View File

@ -324,11 +324,11 @@ public class YarnConfiguration extends Configuration {
public static final String RM_ZK_ADDRESS = RM_ZK_PREFIX + "address";
public static final String RM_ZK_NUM_RETRIES = RM_ZK_PREFIX + "num-retries";
public static final int DEFAULT_ZK_RM_NUM_RETRIES = 500;
public static final int DEFAULT_ZK_RM_NUM_RETRIES = 1000;
public static final String RM_ZK_RETRY_INTERVAL_MS =
RM_ZK_PREFIX + "retry-interval-ms";
public static final long DEFAULT_RM_ZK_RETRY_INTERVAL_MS = 2000;
public static final long DEFAULT_RM_ZK_RETRY_INTERVAL_MS = 1000;
public static final String RM_ZK_TIMEOUT_MS = RM_ZK_PREFIX + "timeout-ms";
public static final int DEFAULT_RM_ZK_TIMEOUT_MS = 10000;

View File

@ -309,14 +309,17 @@
<property>
<description>Number of times RM tries to connect to ZooKeeper.</description>
<name>yarn.resourcemanager.zk-num-retries</name>
<value>500</value>
<value>1000</value>
</property>
<property>
<description>Retry interval in milliseconds when connecting to ZooKeeper.
When HA is enabled, the value here is NOT used. It is generated
automatically from yarn.resourcemanager.zk-timeout-ms and
yarn.resourcemanager.zk-num-retries.
</description>
<name>yarn.resourcemanager.zk-retry-interval-ms</name>
<value>2000</value>
<value>1000</value>
</property>
<property>

View File

@ -90,7 +90,9 @@ public class ZKRMStateStore extends RMStateStore {
private String zkHostPort = null;
private int zkSessionTimeout;
private long zkRetryInterval;
@VisibleForTesting
long zkRetryInterval;
private List<ACL> zkAcl;
private List<ZKUtil.ZKAuthInfo> zkAuths;
@ -199,9 +201,14 @@ public synchronized void initInternal(Configuration conf) throws Exception {
zkSessionTimeout =
conf.getInt(YarnConfiguration.RM_ZK_TIMEOUT_MS,
YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS);
zkRetryInterval =
conf.getLong(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS,
YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS);
if (HAUtil.isHAEnabled(conf)) {
zkRetryInterval = zkSessionTimeout / numRetries;
} else {
zkRetryInterval =
conf.getLong(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS,
YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS);
}
zkAcl = RMZKUtils.getZKAcls(conf);
zkAuths = RMZKUtils.getZKAuths(conf);

View File

@ -41,6 +41,7 @@
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.atomic.AtomicBoolean;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@ -203,7 +204,7 @@ public void testZKClientDisconnectAndReconnect()
LOG.error(error, e);
fail(error);
}
Assert.assertEquals("newBytes", new String(ret));
assertEquals("newBytes", new String(ret));
}
@Test(timeout = 20000)
@ -232,7 +233,7 @@ public void testZKSessionTimeout() throws Exception {
try {
byte[] ret = store.getDataWithRetries(path, false);
Assert.assertEquals("bytes", new String(ret));
assertEquals("bytes", new String(ret));
} catch (Exception e) {
String error = "New session creation failed";
LOG.error(error, e);
@ -281,4 +282,24 @@ public void testZKAuths() throws Exception {
zkClientTester.getRMStateStore(conf);
}
/**
 * Checks which retry interval the {@code ZKRMStateStore} ends up with, first
 * for a freshly created configuration and then after the RM HA flag has been
 * switched on in that same configuration.
 */
@Test
public void testZKRetryInterval() throws Exception {
  TestZKClient tester = new TestZKClient();
  YarnConfiguration yarnConf = new YarnConfiguration();

  // Fresh configuration (this test has not touched the HA flag yet):
  // the store is expected to carry the default retry interval.
  ZKRMStateStore zkStore = (ZKRMStateStore) tester.getRMStateStore(yarnConf);
  long defaultInterval = YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS;
  assertEquals(defaultInterval, zkStore.zkRetryInterval);
  zkStore.stop();

  // HA switched on: the expected interval is the default session timeout
  // divided (integer division) by the default number of retries.
  yarnConf.setBoolean(YarnConfiguration.RM_HA_ENABLED, true);
  zkStore = (ZKRMStateStore) tester.getRMStateStore(yarnConf);
  long derivedInterval = YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS
      / YarnConfiguration.DEFAULT_ZK_RM_NUM_RETRIES;
  assertEquals(derivedInterval, zkStore.zkRetryInterval);
  zkStore.stop();
}
}