From 86c250a54a586b1db098121c0c052cb3580fe5a4 Mon Sep 17 00:00:00 2001 From: slfan1989 <55643692+slfan1989@users.noreply.github.com> Date: Tue, 30 May 2023 01:37:08 +0800 Subject: [PATCH] YARN-7720. Race condition between second app attempt and UAM timeout when first attempt node is down. (#5672) --- .../hadoop/yarn/conf/YarnConfiguration.java | 3 ++- .../yarn/util/AbstractLivelinessMonitor.java | 2 +- .../src/main/resources/yarn-default.xml | 7 +++-- .../yarn/conf/TestYarnConfiguration.java | 19 +++++++++++++ .../AMRMProxyTokenSecretManager.java | 20 +++++++++----- .../resourcemanager/ResourceManager.java | 27 +++++++++++++++++++ .../rmapp/attempt/AMLivelinessMonitor.java | 14 ++++++++-- .../security/AMRMTokenSecretManager.java | 19 +++++++++---- 8 files changed, 94 insertions(+), 17 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 123a6de6a40..b6601b835d6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -394,7 +394,8 @@ public class YarnConfiguration extends Configuration { /** The expiry interval for application master reporting.*/ public static final String RM_AM_EXPIRY_INTERVAL_MS = YARN_PREFIX + "am.liveness-monitor.expiry-interval-ms"; - public static final int DEFAULT_RM_AM_EXPIRY_INTERVAL_MS = 600000; + public static final long DEFAULT_RM_AM_EXPIRY_INTERVAL_MS = + TimeUnit.MINUTES.toMillis(15); /** How long to wait until a node manager is considered dead.*/ public static final String RM_NM_EXPIRY_INTERVAL_MS = diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java index 3a9c0e8d8f8..0ae6c47d0ec 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java @@ -83,7 +83,7 @@ public abstract class AbstractLivelinessMonitor extends AbstractService { protected abstract void expire(O ob); - protected void setExpireInterval(int expireInterval) { + protected void setExpireInterval(long expireInterval) { this.expireInterval = expireInterval; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index d89f048ed3c..1f0982aede0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -169,9 +169,12 @@ - The expiry interval for application master reporting. + + The expiry interval for application master reporting. + The default is 900000 ms, or 15m. + yarn.am.liveness-monitor.expiry-interval-ms - 600000 + 900000 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/conf/TestYarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/conf/TestYarnConfiguration.java index e4547a9163d..3b76d41af79 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/conf/TestYarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/conf/TestYarnConfiguration.java @@ -19,7 +19,9 @@ package org.apache.hadoop.yarn.conf; import java.net.InetSocketAddress; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.math.NumberUtils; import org.junit.jupiter.api.Test; import org.apache.hadoop.yarn.webapp.util.WebAppUtils; @@ -247,4 +249,21 @@ public class TestYarnConfiguration { assertNull(conf.get( HAUtil.addSuffix(YarnConfiguration.NM_LOCALIZER_ADDRESS, "rm1"))); } + + @Test + void checkRmAmExpiryIntervalSetting() throws Exception { + YarnConfiguration conf = new YarnConfiguration(); + + // 30m, 1800000ms + conf.set(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, "30m"); + long rmAmExpiryIntervalMS = conf.getTimeDuration(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS, TimeUnit.MILLISECONDS); + assertEquals(1800000, rmAmExpiryIntervalMS); + + // 10m, 600000ms + conf.set(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, "600000"); + String rmAmExpiryIntervalMS1 = conf.get(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS); + assertTrue(NumberUtils.isDigits(rmAmExpiryIntervalMS1)); + assertEquals(600000, Long.parseLong(rmAmExpiryIntervalMS1)); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/AMRMProxyTokenSecretManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/AMRMProxyTokenSecretManager.java index d1468c41115..ea4e9794013 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/AMRMProxyTokenSecretManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/AMRMProxyTokenSecretManager.java @@ -24,9 +24,12 @@ import java.util.HashSet; import java.util.Set; import java.util.Timer; import java.util.TimerTask; +import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; + +import org.apache.commons.lang3.math.NumberUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -88,12 +91,17 @@ public class AMRMProxyTokenSecretManager extends YarnConfiguration.DEFAULT_RM_AMRM_TOKEN_MASTER_KEY_ROLLING_INTERVAL_SECS) * 1000; // Adding delay = 1.5 * expiry interval makes sure that all active AMs get // the updated shared-key. - this.activationDelay = - (long) (conf.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, - YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS) * 1.5); - LOG.info("AMRMTokenKeyRollingInterval: " + this.rollingInterval - + "ms and AMRMTokenKeyActivationDelay: " + this.activationDelay - + " ms"); + String rmAmExpiryIntervalMS = conf.get(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS); + if (NumberUtils.isDigits(rmAmExpiryIntervalMS)) { + this.activationDelay = (long) (conf.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS) * 1.5); + } else { + this.activationDelay = (long) (conf.getTimeDuration( + YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS, TimeUnit.MILLISECONDS) * 1.5); + } + LOG.info("AMRMTokenKeyRollingInterval: {} ms and AMRMTokenKeyActivationDelay: {} ms.", + this.rollingInterval, this.activationDelay); if (rollingInterval <= activationDelay * 2) { throw new IllegalArgumentException( YarnConfiguration.RM_AMRM_TOKEN_MASTER_KEY_ROLLING_INTERVAL_SECS diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index da500add4cb..52aa466b3f6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager; +import org.apache.commons.lang3.math.NumberUtils; import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.hadoop.classification.VisibleForTesting; import com.sun.jersey.spi.container.servlet.ServletContainer; @@ -703,6 +704,32 @@ public class ResourceManager extends CompositeService + ", " + YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS + "=" + heartbeatIntvl); } + + if (HAUtil.isFederationEnabled(conf)) { + /* + * In Yarn Federation, we need UAMs in secondary sub-clusters to stay + * alive when the next attempt AM in home sub-cluster gets launched. If + * the previous AM died because the node is lost after NM timeout. It will + * already be too late if AM timeout is even shorter. + */ + String rmAmExpiryIntervalMS = conf.get(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS); + long amExpireIntvl; + if (NumberUtils.isDigits(rmAmExpiryIntervalMS)) { + amExpireIntvl = conf.getLong(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS); + } else { + amExpireIntvl = conf.getTimeDuration(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS, TimeUnit.MILLISECONDS); + } + + if (amExpireIntvl <= expireIntvl) { + throw new YarnRuntimeException("When Yarn Federation is enabled, " + + "AM expiry interval should be no less than NM expiry interval, " + + YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS + "=" + amExpireIntvl + + ", " + YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS + "=" + + expireIntvl); + } + } } /** diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java index 7006e500b98..d85b044b84d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java @@ -18,6 +18,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt; +import org.apache.commons.lang3.math.NumberUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.conf.YarnConfiguration; @@ -27,6 +28,8 @@ import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.util.AbstractLivelinessMonitor; import org.apache.hadoop.yarn.util.Clock; +import java.util.concurrent.TimeUnit; + public class AMLivelinessMonitor extends AbstractLivelinessMonitor { private EventHandler dispatcher; @@ -43,8 +46,15 @@ public class AMLivelinessMonitor extends AbstractLivelinessMonitor