Merge -c 1514135 from trunk to branch-2 to fix YARN-1056. Remove dual use of string 'resourcemanager' in yarn.resourcemanager.connect.{max.wait.secs|retry_interval.secs}. Contributed by Karthik Kambatla.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1514136 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Arun Murthy 2013-08-15 02:36:58 +00:00
parent 777a48855b
commit 77a60701c1
5 changed files with 55 additions and 46 deletions

View File

@ -860,6 +860,10 @@ Release 2.1.0-beta - 2013-08-06
YARN-1043. Push all metrics consistently. (Jian He via acmurthy) YARN-1043. Push all metrics consistently. (Jian He via acmurthy)
YARN-1056. Remove dual use of string 'resourcemanager' in
yarn.resourcemanager.connect.{max.wait.secs|retry_interval.secs}
(Karthik Kambatla via acmurthy)
Release 2.0.5-alpha - 06/06/2013 Release 2.0.5-alpha - 06/06/2013
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -259,7 +259,7 @@ public class YarnConfiguration extends Configuration {
/** URI for FileSystemRMStateStore */ /** URI for FileSystemRMStateStore */
public static final String FS_RM_STATE_STORE_URI = public static final String FS_RM_STATE_STORE_URI =
RM_PREFIX + "fs.rm-state-store.uri"; RM_PREFIX + "fs.state-store.uri";
/** The maximum number of completed applications RM keeps. */ /** The maximum number of completed applications RM keeps. */
public static final String RM_MAX_COMPLETED_APPLICATIONS = public static final String RM_MAX_COMPLETED_APPLICATIONS =
@ -655,19 +655,17 @@ public class YarnConfiguration extends Configuration {
public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS = public static final long DEFAULT_NM_PROCESS_KILL_WAIT_MS =
2000; 2000;
/** Max time to wait to establish a connection to RM /** Max time to wait to establish a connection to RM */
*/ public static final String RESOURCEMANAGER_CONNECT_MAX_WAIT_MS =
public static final String RESOURCEMANAGER_CONNECT_MAX_WAIT_SECS = RM_PREFIX + "connect.max-wait.ms";
RM_PREFIX + "resourcemanager.connect.max.wait.secs"; public static final int DEFAULT_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS =
public static final int DEFAULT_RESOURCEMANAGER_CONNECT_MAX_WAIT_SECS = 15 * 60 * 1000;
15*60;
/** Time interval between each attempt to connect to RM /** Time interval between each attempt to connect to RM */
*/ public static final String RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS =
public static final String RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS = RM_PREFIX + "connect.retry-interval.ms";
RM_PREFIX + "resourcemanager.connect.retry_interval.secs"; public static final long DEFAULT_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS
public static final long DEFAULT_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS = 30 * 1000;
= 30;
/** /**
* CLASSPATH for YARN applications. A comma-separated list of CLASSPATH * CLASSPATH for YARN applications. A comma-separated list of CLASSPATH

View File

@ -35,14 +35,10 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.io.retry.RetryProxy; import org.apache.hadoop.io.retry.RetryProxy;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.ipc.YarnRPC; import org.apache.hadoop.yarn.ipc.YarnRPC;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import com.google.common.annotations.VisibleForTesting; import com.google.common.annotations.VisibleForTesting;
@ -79,38 +75,36 @@ public class RMProxy<T> {
public static RetryPolicy createRetryPolicy(Configuration conf) { public static RetryPolicy createRetryPolicy(Configuration conf) {
long rmConnectWaitMS = long rmConnectWaitMS =
conf.getInt( conf.getInt(
YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_SECS, YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_MAX_WAIT_SECS) YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS);
* 1000;
long rmConnectionRetryIntervalMS = long rmConnectionRetryIntervalMS =
conf.getLong( conf.getLong(
YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS, YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
YarnConfiguration YarnConfiguration
.DEFAULT_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS) .DEFAULT_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS);
* 1000;
if (rmConnectionRetryIntervalMS < 0) { if (rmConnectionRetryIntervalMS < 0) {
throw new YarnRuntimeException("Invalid Configuration. " + throw new YarnRuntimeException("Invalid Configuration. " +
YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS + YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS +
" should not be negative."); " should not be negative.");
} }
boolean waitForEver = (rmConnectWaitMS == -1000); boolean waitForEver = (rmConnectWaitMS == -1);
if (waitForEver) { if (waitForEver) {
return RetryPolicies.RETRY_FOREVER; return RetryPolicies.RETRY_FOREVER;
} else { } else {
if (rmConnectWaitMS < 0) { if (rmConnectWaitMS < 0) {
throw new YarnRuntimeException("Invalid Configuration. " throw new YarnRuntimeException("Invalid Configuration. "
+ YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_SECS + YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS
+ " can be -1, but can not be other negative numbers"); + " can be -1, but can not be other negative numbers");
} }
// try connect once // try connect once
if (rmConnectWaitMS < rmConnectionRetryIntervalMS) { if (rmConnectWaitMS < rmConnectionRetryIntervalMS) {
LOG.warn(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_SECS LOG.warn(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS
+ " is smaller than " + " is smaller than "
+ YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS + YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS
+ ". Only try connect once."); + ". Only try connect once.");
rmConnectWaitMS = 0; rmConnectWaitMS = 0;
} }

View File

@ -140,6 +140,20 @@
<value>1000</value> <value>1000</value>
</property> </property>
<property>
<description>Maximum time, in milliseconds, to wait to establish a
connection to the ResourceManager.</description>
<name>yarn.resourcemanager.connect.max-wait.ms</name>
<value>900000</value>
</property>
<property>
<description>Interval, in milliseconds, between attempts to connect to
the ResourceManager.</description>
<name>yarn.resourcemanager.connect.retry-interval.ms</name>
<value>30000</value>
</property>
<property> <property>
<description>The maximum number of application attempts. It's a global <description>The maximum number of application attempts. It's a global
setting for all application masters. Each application master can specify setting for all application masters. Each application master can specify
@ -249,7 +263,7 @@
RM state will be stored. This must be supplied when using RM state will be stored. This must be supplied when using
org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
as the value for yarn.resourcemanager.store.class</description> as the value for yarn.resourcemanager.store.class</description>
<name>yarn.resourcemanager.fs.rm-state-store.uri</name> <name>yarn.resourcemanager.fs.state-store.uri</name>
<value>${hadoop.tmp.dir}/yarn/system/rmstore</value> <value>${hadoop.tmp.dir}/yarn/system/rmstore</value>
<!--value>hdfs://localhost:9000/rmstore</value--> <!--value>hdfs://localhost:9000/rmstore</value-->
</property> </property>

View File

@ -957,15 +957,14 @@ public class TestNodeStatusUpdater {
@Test (timeout = 150000) @Test (timeout = 150000)
public void testNMConnectionToRM() throws Exception { public void testNMConnectionToRM() throws Exception {
final long delta = 50000; final long delta = 50000;
final long connectionWaitSecs = 5; final long connectionWaitMs = 5000;
final long connectionRetryIntervalSecs = 1; final long connectionRetryIntervalMs = 1000;
//Waiting for rmStartIntervalMS, RM will be started //Waiting for rmStartIntervalMS, RM will be started
final long rmStartIntervalMS = 2*1000; final long rmStartIntervalMS = 2*1000;
conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_SECS, conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
connectionWaitSecs); connectionWaitMs);
conf.setLong(YarnConfiguration conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS, connectionRetryIntervalMs);
connectionRetryIntervalSecs);
//Test NM try to connect to RM Several times, but finally fail //Test NM try to connect to RM Several times, but finally fail
NodeManagerWithCustomNodeStatusUpdater nmWithUpdater; NodeManagerWithCustomNodeStatusUpdater nmWithUpdater;
@ -987,15 +986,15 @@ public class TestNodeStatusUpdater {
} catch(Exception e) { } catch(Exception e) {
long t = System.currentTimeMillis(); long t = System.currentTimeMillis();
long duration = t - waitStartTime; long duration = t - waitStartTime;
boolean waitTimeValid = (duration >= connectionWaitSecs * 1000) boolean waitTimeValid = (duration >= connectionWaitMs)
&& (duration < (connectionWaitSecs * 1000 + delta)); && (duration < (connectionWaitMs + delta));
if(!waitTimeValid) { if(!waitTimeValid) {
//either the exception was too early, or it had a different cause. //either the exception was too early, or it had a different cause.
//reject with the inner stack trace //reject with the inner stack trace
throw new Exception("NM should have tried re-connecting to RM during " + throw new Exception("NM should have tried re-connecting to RM during " +
"period of at least " + connectionWaitSecs + " seconds, but " + "period of at least " + connectionWaitMs + " ms, but " +
"stopped retrying within " + (connectionWaitSecs + delta/1000) + "stopped retrying within " + (connectionWaitMs + delta) +
" seconds: " + e, e); " ms: " + e, e);
} }
} }
@ -1149,14 +1148,14 @@ public class TestNodeStatusUpdater {
@Test(timeout = 200000) @Test(timeout = 200000)
public void testNodeStatusUpdaterRetryAndNMShutdown() public void testNodeStatusUpdaterRetryAndNMShutdown()
throws Exception { throws Exception {
final long connectionWaitSecs = 1; final long connectionWaitSecs = 1000;
final long connectionRetryIntervalSecs = 1; final long connectionRetryIntervalMs = 1000;
YarnConfiguration conf = createNMConfig(); YarnConfiguration conf = createNMConfig();
conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_SECS, conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
connectionWaitSecs); connectionWaitSecs);
conf.setLong(YarnConfiguration conf.setLong(YarnConfiguration
.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_SECS, .RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
connectionRetryIntervalSecs); connectionRetryIntervalMs);
conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, 5000); conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, 5000);
CyclicBarrier syncBarrier = new CyclicBarrier(2); CyclicBarrier syncBarrier = new CyclicBarrier(2);
nm = new MyNodeManager2(syncBarrier, conf); nm = new MyNodeManager2(syncBarrier, conf);