YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when
state-store error occurs. Contributed by Jian He
This commit is contained in:
Xuan 2015-09-07 17:45:47 -07:00
parent 6a50689705
commit 9b78e6e33d
4 changed files with 17 additions and 8 deletions

View File

@ -889,6 +889,9 @@ Release 2.7.2 - UNRELEASED
YARN-4105. Capacity Scheduler headroom for DRF is wrong (Chang Li via YARN-4105. Capacity Scheduler headroom for DRF is wrong (Chang Li via
jlowe) jlowe)
YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when
state-store error occurs. (Jian He via xgong)
Release 2.7.1 - 2015-07-06 Release 2.7.1 - 2015-07-06
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -402,7 +402,7 @@ public class YarnConfiguration extends Configuration {
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false; public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast"; public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast";
public static final boolean DEFAULT_YARN_FAIL_FAST = true; public static final boolean DEFAULT_YARN_FAIL_FAST = false;
public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast"; public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast";

View File

@ -349,9 +349,12 @@
<property> <property>
<description>Should YARN fail fast if it encounters any errors. <description>Should YARN fail fast if it encounters any errors.
This is a global config for all components, including RM, NM, etc.
If no value is set for a component-specific config (e.g., yarn.resourcemanager.fail-fast),
this value will be used as the default.
</description> </description>
<name>yarn.fail-fast</name> <name>yarn.fail-fast</name>
<value>true</value> <value>false</value>
</property> </property>
<property> <property>

View File

@ -45,6 +45,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ReservationId; import org.apache.hadoop.yarn.api.records.ReservationId;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
import org.apache.hadoop.yarn.conf.HAUtil;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.Dispatcher;
@ -1013,18 +1014,20 @@ public abstract class RMStateStore extends AbstractService {
*/ */
protected void notifyStoreOperationFailed(Exception failureCause) { protected void notifyStoreOperationFailed(Exception failureCause) {
LOG.error("State store operation failed ", failureCause); LOG.error("State store operation failed ", failureCause);
if (failureCause instanceof StoreFencedException) { if (HAUtil.isHAEnabled(getConfig())) {
LOG.warn("State-store fenced ! Transitioning RM to standby");
updateFencedState(); updateFencedState();
Thread standByTransitionThread = Thread standByTransitionThread =
new Thread(new StandByTransitionThread()); new Thread(new StandByTransitionThread());
standByTransitionThread.setName("StandByTransitionThread Handler"); standByTransitionThread.setName("StandByTransitionThread Handler");
standByTransitionThread.start(); standByTransitionThread.start();
} else { } else if (YarnConfiguration.shouldRMFailFast(getConfig())) {
if (YarnConfiguration.shouldRMFailFast(getConfig())) { LOG.fatal("Fail RM now due to state-store error!");
rmDispatcher.getEventHandler().handle( rmDispatcher.getEventHandler().handle(
new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
failureCause)); failureCause));
} } else {
LOG.warn("Skip the state-store error.");
} }
} }