From 1828ba00be40721ae798e2288a75a5e7d187b643 Mon Sep 17 00:00:00 2001 From: Xuan Date: Mon, 7 Sep 2015 18:09:35 -0700 Subject: [PATCH] YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when state-store error occurs. Contributed by Jian He (cherry picked from commit 9f7fcb54e798cf4fda1ea7972dd96491976e1857) --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../hadoop/yarn/conf/YarnConfiguration.java | 2 +- .../src/main/resources/yarn-default.xml | 5 ++++- .../resourcemanager/recovery/RMStateStore.java | 15 +++++++++------ 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 15d082bb4f6..fdf65bbcda1 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -21,6 +21,9 @@ Release 2.6.2 - UNRELEASED BUG FIXES + YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when + state-store error occurs. (Jian He via xgong) + Release 2.6.1 - 2015-09-23 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 471297c18ab..d7f24ac886f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -381,7 +381,7 @@ public class YarnConfiguration extends Configuration { public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false; public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast"; - public static final boolean DEFAULT_YARN_FAIL_FAST = true; + public static final boolean DEFAULT_YARN_FAIL_FAST = false; public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast"; diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index fa2e5ccb319..9b56d992acc 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -286,9 +286,12 @@ Should YARN fail fast if it encounters any errors. + This is a global config for all other components, including RM, NM, etc. + If no value is set for component-specific config (e.g. yarn.resourcemanager.fail-fast), + this value will be the default. yarn.fail-fast - true + false diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index 37855f7f57b..fa912ca3903 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -42,6 +42,7 @@ import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl; +import org.apache.hadoop.yarn.conf.HAUtil; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.Dispatcher; @@ -822,17 +823,19 @@ public abstract class RMStateStore 
extends AbstractService { */ protected void notifyStoreOperationFailed(Exception failureCause) { LOG.error("State store operation failed ", failureCause); - if (failureCause instanceof StoreFencedException) { + if (HAUtil.isHAEnabled(getConfig())) { + LOG.warn("State-store fenced ! Transitioning RM to standby"); Thread standByTransitionThread = new Thread(new StandByTransitionThread()); standByTransitionThread.setName("StandByTransitionThread Handler"); standByTransitionThread.start(); + } else if (YarnConfiguration.shouldRMFailFast(getConfig())) { + LOG.fatal("Fail RM now due to state-store error!"); + rmDispatcher.getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, + failureCause)); } else { - if (YarnConfiguration.shouldRMFailFast(getConfig())) { - rmDispatcher.getEventHandler().handle( - new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, - failureCause)); - } + LOG.warn("Skip the state-store error."); } }