From 6271a2852ea70c54589ce554e6bfad2eb703fe86 Mon Sep 17 00:00:00 2001 From: Jonathan Hung Date: Thu, 9 Apr 2020 10:23:05 -0700 Subject: [PATCH] YARN-10212. Create separate configuration for max global AM attempts. Contributed by Bilwa S T (cherry picked from commit 57659422abbf6d9bf52e6e27fca775254bb77a56) (cherry picked from commit e3a52804b03d646f15048c078f8c5292d5cbecfa) (cherry picked from commit 54599b177c46ed511e096909bed0c4f17bca1fe0) --- .../hadoop/yarn/conf/YarnConfiguration.java | 11 +++++++-- .../src/main/resources/yarn-default.xml | 23 +++++++++++++++---- .../resourcemanager/ResourceManager.java | 14 ++++++++--- .../resourcemanager/rmapp/RMAppImpl.java | 20 ++++++++++++---- .../resourcemanager/TestAppManager.java | 7 ++++-- .../resourcemanager/TestResourceManager.java | 13 ++++++++++- 6 files changed, 70 insertions(+), 18 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index bb92b5b82d6..6e9297a5830 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -486,12 +486,19 @@ public class YarnConfiguration extends Configuration { public static final int DEFAULT_RM_ADMIN_CLIENT_THREAD_COUNT = 1; /** - * The maximum number of application attempts. - * It's a global setting for all application masters. + * The maximum number of application attempts for + * an application, if unset by user. */ public static final String RM_AM_MAX_ATTEMPTS = RM_PREFIX + "am.max-attempts"; public static final int DEFAULT_RM_AM_MAX_ATTEMPTS = 2; + + /** + * The maximum number of application attempts. + * It's a global setting for all application masters. 
+ */ + public static final String GLOBAL_RM_AM_MAX_ATTEMPTS = + RM_PREFIX + "am.global.max-attempts"; /** The keytab for the resource manager.*/ public static final String RM_KEYTAB = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 156e59bee9a..a6d0d08c15c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -322,11 +322,10 @@ - The maximum number of application attempts. It's a global - setting for all application masters. Each application master can specify - its individual maximum number of application attempts via the API, but the - individual number cannot be more than the global upper bound. If it is, - the resourcemanager will override it. The default number is set to 2, to + The default maximum number of application attempts, if unset by + the user. Each application master can specify its individual maximum number of application + attempts via the API, but the individual number cannot be more than the global upper bound in + yarn.resourcemanager.am.global.max-attempts. The default number is set to 2, to allow at least one retry for AM. yarn.resourcemanager.am.max-attempts 2 @@ -3896,4 +3895,18 @@ yarn.webapp.enable-rest-app-submissions true + + + + The maximum number of application attempts. It's a global + setting for all application masters. Each application master can specify + its individual maximum number of application attempts via the API, but the + individual number cannot be more than the global upper bound. If it is, + the resourcemanager will override it. If unset, it defaults to the value of + yarn.resourcemanager.am.max-attempts. 
+ + yarn.resourcemanager.am.global.max-attempts + + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index e368e262edf..2fef798a468 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -583,12 +583,20 @@ public class ResourceManager extends CompositeService // sanity check for configurations protected static void validateConfigs(Configuration conf) { // validate max-attempts - int globalMaxAppAttempts = - conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, + int rmMaxAppAttempts = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS); + if (rmMaxAppAttempts <= 0) { + throw new YarnRuntimeException("Invalid rm am max attempts configuration" + + ", " + YarnConfiguration.RM_AM_MAX_ATTEMPTS + + "=" + rmMaxAppAttempts + ", it should be a positive integer."); + } + int globalMaxAppAttempts = conf.getInt( + YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS, + conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, + YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)); if (globalMaxAppAttempts <= 0) { throw new YarnRuntimeException("Invalid global max attempts configuration" - + ", " + YarnConfiguration.RM_AM_MAX_ATTEMPTS + + ", " + YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS + "=" + globalMaxAppAttempts + ", it should be a positive integer."); } diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index f176e81c9ba..7bb142febae 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -452,11 +452,20 @@ public class RMAppImpl implements RMApp, Recoverable { this.applicationPriority = Priority.newInstance(0); } - int globalMaxAppAttempts = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, + int globalMaxAppAttempts = conf.getInt( + YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS, + conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, + YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)); + int rmMaxAppAttempts = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS); int individualMaxAppAttempts = submissionContext.getMaxAppAttempts(); - if (individualMaxAppAttempts <= 0 || - individualMaxAppAttempts > globalMaxAppAttempts) { + if (individualMaxAppAttempts <= 0) { + this.maxAppAttempts = rmMaxAppAttempts; + LOG.warn("The specific max attempts: " + individualMaxAppAttempts + + " for application: " + applicationId.getId() + + " is invalid, because it is less than or equal to zero." 
+ + " Use the rm max attempts instead."); + } else if (individualMaxAppAttempts > globalMaxAppAttempts) { this.maxAppAttempts = globalMaxAppAttempts; LOG.warn("The specific max attempts: " + individualMaxAppAttempts + " for application: " + applicationId.getId() @@ -1208,8 +1217,9 @@ public class RMAppImpl implements RMApp, Recoverable { + " failed due to " + failedEvent.getDiagnosticMsg() + ". Failing the application."; } else if (this.isNumAttemptsBeyondThreshold) { - int globalLimit = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, - YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS); + int globalLimit = conf.getInt(YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS, + conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, + YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS)); msg = String.format( "Application %s failed %d times%s%s due to %s. Failing the application.", getApplicationId(), diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestAppManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestAppManager.java index d240b3d9e54..8c6866bcccb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestAppManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestAppManager.java @@ -903,17 +903,20 @@ public class TestAppManager{ @Test (timeout = 30000) public void testRMAppSubmitMaxAppAttempts() throws Exception { int[] globalMaxAppAttempts = new int[] { 10, 1 }; + int[] rmAmMaxAttempts = new int[] { 8, 1 }; int[][] individualMaxAppAttempts = new int[][]{ new int[]{ 9, 10, 11, 0 }, new int[]{ 1, 10, 0, -1 }}; int[][] expectedNums = new int[][]{ - new 
int[]{ 9, 10, 10, 10 }, + new int[]{ 9, 10, 10, 8 }, new int[]{ 1, 1, 1, 1 }}; for (int i = 0; i < globalMaxAppAttempts.length; ++i) { for (int j = 0; j < individualMaxAppAttempts.length; ++j) { ResourceScheduler scheduler = mockResourceScheduler(); Configuration conf = new Configuration(); - conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, globalMaxAppAttempts[i]); + conf.setInt(YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS, + globalMaxAppAttempts[i]); + conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, rmAmMaxAttempts[i]); ApplicationMasterService masterService = new ApplicationMasterService(rmContext, scheduler); TestRMAppManager appMonitor = new TestRMAppManager(rmContext, diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java index 254de9a6111..86e0e054e34 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestResourceManager.java @@ -230,7 +230,7 @@ public class TestResourceManager { @Test (timeout = 30000) public void testResourceManagerInitConfigValidation() throws Exception { Configuration conf = new YarnConfiguration(); - conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, -1); + conf.setInt(YarnConfiguration.GLOBAL_RM_AM_MAX_ATTEMPTS, -1); try { resourceManager = new MockRM(conf); fail("Exception is expected because the global max attempts" + @@ -240,6 +240,17 @@ public class TestResourceManager { if (!e.getMessage().startsWith( "Invalid global max attempts configuration")) throw 
e; } + Configuration yarnConf = new YarnConfiguration(); + yarnConf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, -1); + try { + resourceManager = new MockRM(yarnConf); + fail("Exception is expected because AM max attempts" + + " is negative."); + } catch (YarnRuntimeException e) { + // Exception is expected. + if (!e.getMessage().startsWith( + "Invalid rm am max attempts configuration")) throw e; + } } @Test