YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when state-store error occurs. Contributed by Jian He
(cherry picked from commit 9f7fcb54e798cf4fda1ea7972dd96491976e1857)
This commit is contained in:
parent
d27f09c936
commit
1828ba00be
|
@ -21,6 +21,9 @@ Release 2.6.2 - UNRELEASED
|
||||||
|
|
||||||
BUG FIXES
|
BUG FIXES
|
||||||
|
|
||||||
|
YARN-4087. Followup fixes after YARN-2019 regarding RM behavior when
|
||||||
|
state-store error occurs. (Jian He via xgong)
|
||||||
|
|
||||||
Release 2.6.1 - 2015-09-23
|
Release 2.6.1 - 2015-09-23
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -381,7 +381,7 @@ public class YarnConfiguration extends Configuration {
|
||||||
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
|
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
|
||||||
|
|
||||||
public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast";
|
public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast";
|
||||||
public static final boolean DEFAULT_YARN_FAIL_FAST = true;
|
public static final boolean DEFAULT_YARN_FAIL_FAST = false;
|
||||||
|
|
||||||
public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast";
|
public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast";
|
||||||
|
|
||||||
|
|
|
@ -286,9 +286,12 @@
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>Should YARN fail fast if it encounters any errors.
|
<description>Should YARN fail fast if it encounters any errors.
|
||||||
|
This is a global config for all other components, including RM, NM, etc.
|
||||||
|
If no value is set for a component-specific config (e.g. yarn.resourcemanager.fail-fast),
|
||||||
|
this value will be the default.
|
||||||
</description>
|
</description>
|
||||||
<name>yarn.fail-fast</name>
|
<name>yarn.fail-fast</name>
|
||||||
<value>true</value>
|
<value>false</value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
|
|
|
@ -42,6 +42,7 @@ import org.apache.hadoop.yarn.api.records.Container;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
|
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
|
||||||
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
|
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
|
||||||
|
import org.apache.hadoop.yarn.conf.HAUtil;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
||||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||||
|
@ -822,17 +823,19 @@ public abstract class RMStateStore extends AbstractService {
|
||||||
*/
|
*/
|
||||||
protected void notifyStoreOperationFailed(Exception failureCause) {
|
protected void notifyStoreOperationFailed(Exception failureCause) {
|
||||||
LOG.error("State store operation failed ", failureCause);
|
LOG.error("State store operation failed ", failureCause);
|
||||||
if (failureCause instanceof StoreFencedException) {
|
if (HAUtil.isHAEnabled(getConfig())) {
|
||||||
|
LOG.warn("State-store fenced ! Transitioning RM to standby");
|
||||||
Thread standByTransitionThread =
|
Thread standByTransitionThread =
|
||||||
new Thread(new StandByTransitionThread());
|
new Thread(new StandByTransitionThread());
|
||||||
standByTransitionThread.setName("StandByTransitionThread Handler");
|
standByTransitionThread.setName("StandByTransitionThread Handler");
|
||||||
standByTransitionThread.start();
|
standByTransitionThread.start();
|
||||||
} else {
|
} else if (YarnConfiguration.shouldRMFailFast(getConfig())) {
|
||||||
if (YarnConfiguration.shouldRMFailFast(getConfig())) {
|
LOG.fatal("Fail RM now due to state-store error!");
|
||||||
rmDispatcher.getEventHandler().handle(
|
rmDispatcher.getEventHandler().handle(
|
||||||
new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
|
new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
|
||||||
failureCause));
|
failureCause));
|
||||||
}
|
} else {
|
||||||
|
LOG.warn("Skip the state-store error.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue