YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when
state-store error occurs. Contributed by Jian He (cherry picked from commit 9b78e6e33d)
(cherry picked from commit a0b7ef15d0)
This commit is contained in:
parent
96b9455c6f
commit
b55fb0ac44
|
@ -76,6 +76,9 @@ Release 2.7.2 - UNRELEASED
|
|||
YARN-4105. Capacity Scheduler headroom for DRF is wrong (Chang Li via
|
||||
jlowe)
|
||||
|
||||
YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when
|
||||
state-store error occurs. (Jian He via xgong)
|
||||
|
||||
Release 2.7.1 - 2015-07-06
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -402,7 +402,7 @@ public class YarnConfiguration extends Configuration {
|
|||
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
|
||||
|
||||
public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast";
|
||||
public static final boolean DEFAULT_YARN_FAIL_FAST = true;
|
||||
public static final boolean DEFAULT_YARN_FAIL_FAST = false;
|
||||
|
||||
public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast";
|
||||
|
||||
|
|
|
@ -298,9 +298,12 @@
|
|||
|
||||
<property>
|
||||
<description>Should YARN fail fast if it encounters any errors.
|
||||
This is a global config for all other components, including RM, NM, etc.
|
||||
If no value is set for a component-specific config (e.g. yarn.resourcemanager.fail-fast),
|
||||
this value will be the default.
|
||||
</description>
|
||||
<name>yarn.fail-fast</name>
|
||||
<value>true</value>
|
||||
<value>false</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
|
|
|
@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
|||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
|
||||
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
|
||||
import org.apache.hadoop.yarn.conf.HAUtil;
|
||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||
|
@ -857,18 +858,20 @@ public abstract class RMStateStore extends AbstractService {
|
|||
*/
|
||||
protected void notifyStoreOperationFailed(Exception failureCause) {
|
||||
LOG.error("State store operation failed ", failureCause);
|
||||
if (failureCause instanceof StoreFencedException) {
|
||||
if (HAUtil.isHAEnabled(getConfig())) {
|
||||
LOG.warn("State-store fenced ! Transitioning RM to standby");
|
||||
updateFencedState();
|
||||
Thread standByTransitionThread =
|
||||
new Thread(new StandByTransitionThread());
|
||||
standByTransitionThread.setName("StandByTransitionThread Handler");
|
||||
standByTransitionThread.start();
|
||||
} else if (YarnConfiguration.shouldRMFailFast(getConfig())) {
|
||||
LOG.fatal("Fail RM now due to state-store error!");
|
||||
rmDispatcher.getEventHandler().handle(
|
||||
new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
|
||||
failureCause));
|
||||
} else {
|
||||
if (YarnConfiguration.shouldRMFailFast(getConfig())) {
|
||||
rmDispatcher.getEventHandler().handle(
|
||||
new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
|
||||
failureCause));
|
||||
}
|
||||
LOG.warn("Skip the state-store error.");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue