diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 2bfab93944b..7aede89b72d 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -837,6 +837,9 @@ Release 2.7.2 - UNRELEASED
YARN-4105. Capacity Scheduler headroom for DRF is wrong (Chang Li via
jlowe)
+ YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when
+ state-store error occurs. (Jian He via xgong)
+
Release 2.7.1 - 2015-07-06
INCOMPATIBLE CHANGES
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 60ed0ed0141..9d03470d51e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -402,7 +402,7 @@ public class YarnConfiguration extends Configuration {
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast";
- public static final boolean DEFAULT_YARN_FAIL_FAST = true;
+ public static final boolean DEFAULT_YARN_FAIL_FAST = false;
public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast";
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 436bfb04e63..59bfb569865 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -349,9 +349,12 @@
Should YARN fail fast if it encounters any errors.
+ This is a global config for all other components, including RM, NM, etc.
+ If no value is set for a component-specific config (e.g. yarn.resourcemanager.fail-fast),
+ this value will be used as the default.
yarn.fail-fast
- true
+ false
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
index b7f1e6c72f1..aa5caf96e9a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java
@@ -45,6 +45,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ReservationId;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
+import org.apache.hadoop.yarn.conf.HAUtil;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher;
@@ -1013,18 +1014,20 @@ public abstract class RMStateStore extends AbstractService {
*/
protected void notifyStoreOperationFailed(Exception failureCause) {
LOG.error("State store operation failed ", failureCause);
- if (failureCause instanceof StoreFencedException) {
+ if (HAUtil.isHAEnabled(getConfig())) {
+ LOG.warn("State-store fenced ! Transitioning RM to standby");
updateFencedState();
Thread standByTransitionThread =
new Thread(new StandByTransitionThread());
standByTransitionThread.setName("StandByTransitionThread Handler");
standByTransitionThread.start();
+ } else if (YarnConfiguration.shouldRMFailFast(getConfig())) {
+ LOG.fatal("Fail RM now due to state-store error!");
+ rmDispatcher.getEventHandler().handle(
+ new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
+ failureCause));
} else {
- if (YarnConfiguration.shouldRMFailFast(getConfig())) {
- rmDispatcher.getEventHandler().handle(
- new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
- failureCause));
- }
+ LOG.warn("Skip the state-store error.");
}
}