YARN-10212. Create separate configuration for max global AM attempts. Contributed by Bilwa S T

(cherry picked from commit 57659422abbf6d9bf52e6e27fca775254bb77a56)
This commit is contained in:
Jonathan Hung 2020-04-09 10:23:05 -07:00
parent e4331a73c9
commit e1dd78143b
6 changed files with 70 additions and 18 deletions

View File

@ -502,13 +502,20 @@ public class YarnConfiguration extends Configuration {
public static final int DEFAULT_RM_ADMIN_CLIENT_THREAD_COUNT = 1;
/**
* The maximum number of application attempts.
* It's a global setting for all application masters.
* The maximum number of application attempts for
* an application, if unset by user.
*/
public static final String RM_AM_MAX_ATTEMPTS =
RM_PREFIX + "am.max-attempts";
public static final int DEFAULT_RM_AM_MAX_ATTEMPTS = 2;
/**
* The maximum number of application attempts.
* It's a global setting for all application masters.
*/
public static final String GLOBAL_RM_AM_MAX_ATTEMPTS =
RM_PREFIX + "am.global.max-attempts";
/** The keytab for the resource manager.*/
public static final String RM_KEYTAB =
RM_PREFIX + "keytab";

View File

@ -327,11 +327,10 @@
</property>
<property>
<description>The maximum number of application attempts. It's a global
setting for all application masters. Each application master can specify
its individual maximum number of application attempts via the API, but the
individual number cannot be more than the global upper bound. If it is,
the resourcemanager will override it. The default number is set to 2, to
<description>The default maximum number of application attempts, if unset by
the user. Each application master can specify its individual maximum number of application
attempts via the API, but the individual number cannot be more than the global upper bound in
yarn.resourcemanager.am.global.max-attempts. The default number is set to 2, to
allow at least one retry for AM.</description>
<name>yarn.resourcemanager.am.max-attempts</name>
<value>2</value>
@ -4534,4 +4533,18 @@
<name>yarn.webapp.enable-rest-app-submissions</name>
<value>true</value>
</property>
<property>
<description>
The maximum number of application attempts. It's a global
setting for all application masters. Each application master can specify
its individual maximum number of application attempts via the API, but the
individual number cannot be more than the global upper bound. If it is,
the resourcemanager will override it. The default value is the value of
yarn.resourcemanager.am.max-attempts.
</description>
<name>yarn.resourcemanager.am.global.max-attempts</name>
<value></value>
</property>
</configuration>

View File

@ -613,12 +613,20 @@ public class ResourceManager extends CompositeService
// sanity check for configurations
protected static void validateConfigs(Configuration conf) {
// validate max-attempts
int globalMaxAppAttempts =
conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
int rmMaxAppAttempts = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
if (rmMaxAppAttempts <= 0) {
throw new YarnRuntimeException("Invalid rm am max attempts configuration"
+ ", " + YarnConfiguration.RM_AM_MAX_ATTEMPTS
+ "=" + rmMaxAppAttempts + ", it should be a positive integer.");
}
int globalMaxAppAttempts = conf.getInt(
YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS,
conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS));
if (globalMaxAppAttempts <= 0) {
throw new YarnRuntimeException("Invalid global max attempts configuration"
+ ", " + YarnConfiguration.RM_AM_MAX_ATTEMPTS
+ ", " + YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS
+ "=" + globalMaxAppAttempts + ", it should be a positive integer.");
}

View File

@ -453,11 +453,20 @@ public class RMAppImpl implements RMApp, Recoverable {
this.applicationPriority = Priority.newInstance(0);
}
int globalMaxAppAttempts = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
int globalMaxAppAttempts = conf.getInt(
YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS,
conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS));
int rmMaxAppAttempts = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
int individualMaxAppAttempts = submissionContext.getMaxAppAttempts();
if (individualMaxAppAttempts <= 0 ||
individualMaxAppAttempts > globalMaxAppAttempts) {
if (individualMaxAppAttempts <= 0) {
this.maxAppAttempts = rmMaxAppAttempts;
LOG.warn("The specific max attempts: " + individualMaxAppAttempts
+ " for application: " + applicationId.getId()
+ " is invalid, because it is less than or equal to zero."
+ " Use the rm max attempts instead.");
} else if (individualMaxAppAttempts > globalMaxAppAttempts) {
this.maxAppAttempts = globalMaxAppAttempts;
LOG.warn("The specific max attempts: " + individualMaxAppAttempts
+ " for application: " + applicationId.getId()
@ -1211,8 +1220,9 @@ public class RMAppImpl implements RMApp, Recoverable {
+ " failed due to " + failedEvent.getDiagnosticMsg()
+ ". Failing the application.";
} else if (this.isNumAttemptsBeyondThreshold) {
int globalLimit = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
int globalLimit = conf.getInt(YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS,
conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS,
YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS));
msg = String.format(
"Application %s failed %d times%s%s due to %s. Failing the application.",
getApplicationId(),

View File

@ -980,17 +980,20 @@ public class TestAppManager extends AppManagerTestBase{
@Test (timeout = 30000)
public void testRMAppSubmitMaxAppAttempts() throws Exception {
int[] globalMaxAppAttempts = new int[] { 10, 1 };
int[] rmAmMaxAttempts = new int[] { 8, 1 };
int[][] individualMaxAppAttempts = new int[][]{
new int[]{ 9, 10, 11, 0 },
new int[]{ 1, 10, 0, -1 }};
int[][] expectedNums = new int[][]{
new int[]{ 9, 10, 10, 10 },
new int[]{ 9, 10, 10, 8 },
new int[]{ 1, 1, 1, 1 }};
for (int i = 0; i < globalMaxAppAttempts.length; ++i) {
for (int j = 0; j < individualMaxAppAttempts.length; ++j) {
ResourceScheduler scheduler = mockResourceScheduler();
Configuration conf = new Configuration();
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, globalMaxAppAttempts[i]);
conf.setInt(YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS,
globalMaxAppAttempts[i]);
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, rmAmMaxAttempts[i]);
ApplicationMasterService masterService =
new ApplicationMasterService(rmContext, scheduler);
TestRMAppManager appMonitor = new TestRMAppManager(rmContext,

View File

@ -237,7 +237,7 @@ public class TestResourceManager {
@Test (timeout = 30000)
public void testResourceManagerInitConfigValidation() throws Exception {
Configuration conf = new YarnConfiguration();
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, -1);
conf.setInt(YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS, -1);
try {
resourceManager = new MockRM(conf);
fail("Exception is expected because the global max attempts" +
@ -247,6 +247,17 @@ public class TestResourceManager {
if (!e.getMessage().startsWith(
"Invalid global max attempts configuration")) throw e;
}
Configuration yarnConf = new YarnConfiguration();
yarnConf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, -1);
try {
resourceManager = new MockRM(yarnConf);
fail("Exception is expected because AM max attempts" +
" is negative.");
} catch (YarnRuntimeException e) {
// Exception is expected.
if (!e.getMessage().startsWith(
"Invalid rm am max attempts configuration")) throw e;
}
}
@Test