YARN-2019. Retrospect on decision of making RM crashed if any exception throw in ZKRMStateStore. Contributed by Jian He.
(cherry picked from commit ee98d6354b)
This commit is contained in:
parent
be2334ba3a
commit
6772c3f4dd
|
@ -89,6 +89,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
YARN-2003. Support for Application priority : Changes in RM and Capacity
|
YARN-2003. Support for Application priority : Changes in RM and Capacity
|
||||||
Scheduler. (Sunil G via wangda)
|
Scheduler. (Sunil G via wangda)
|
||||||
|
|
||||||
|
YARN-2019. Retrospect on decision of making RM crashed if any exception throw
|
||||||
|
in ZKRMStateStore. (Jian He via junping_du)
|
||||||
|
|
||||||
IMPROVEMENTS
|
IMPROVEMENTS
|
||||||
|
|
||||||
YARN-644. Basic null check is not performed on passed in arguments before
|
YARN-644. Basic null check is not performed on passed in arguments before
|
||||||
|
|
|
@ -401,6 +401,11 @@ public class YarnConfiguration extends Configuration {
|
||||||
public static final String RECOVERY_ENABLED = RM_PREFIX + "recovery.enabled";
|
public static final String RECOVERY_ENABLED = RM_PREFIX + "recovery.enabled";
|
||||||
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
|
public static final boolean DEFAULT_RM_RECOVERY_ENABLED = false;
|
||||||
|
|
||||||
|
public static final String YARN_FAIL_FAST = YARN_PREFIX + "fail-fast";
|
||||||
|
public static final boolean DEFAULT_YARN_FAIL_FAST = true;
|
||||||
|
|
||||||
|
public static final String RM_FAIL_FAST = RM_PREFIX + "fail-fast";
|
||||||
|
|
||||||
@Private
|
@Private
|
||||||
public static final String RM_WORK_PRESERVING_RECOVERY_ENABLED = RM_PREFIX
|
public static final String RM_WORK_PRESERVING_RECOVERY_ENABLED = RM_PREFIX
|
||||||
+ "work-preserving-recovery.enabled";
|
+ "work-preserving-recovery.enabled";
|
||||||
|
@ -2018,6 +2023,12 @@ public class YarnConfiguration extends Configuration {
|
||||||
YARN_HTTP_POLICY_DEFAULT));
|
YARN_HTTP_POLICY_DEFAULT));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static boolean shouldRMFailFast(Configuration conf) {
|
||||||
|
return conf.getBoolean(YarnConfiguration.RM_FAIL_FAST,
|
||||||
|
conf.getBoolean(YarnConfiguration.YARN_FAIL_FAST,
|
||||||
|
YarnConfiguration.DEFAULT_YARN_FAIL_FAST));
|
||||||
|
}
|
||||||
|
|
||||||
@Private
|
@Private
|
||||||
public static String getClusterId(Configuration conf) {
|
public static String getClusterId(Configuration conf) {
|
||||||
String clusterId = conf.get(YarnConfiguration.RM_CLUSTER_ID);
|
String clusterId = conf.get(YarnConfiguration.RM_CLUSTER_ID);
|
||||||
|
|
|
@ -323,6 +323,22 @@
|
||||||
<value>false</value>
|
<value>false</value>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Should RM fail fast if it encounters any errors. By default, it
|
||||||
|
points to ${yarn.fail-fast}. Errors include:
|
||||||
|
1) exceptions when state-store write/read operations fail.
|
||||||
|
</description>
|
||||||
|
<name>yarn.resourcemanager.fail-fast</name>
|
||||||
|
<value>${yarn.fail-fast}</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>Should YARN fail fast if it encounters any errors.
|
||||||
|
</description>
|
||||||
|
<name>yarn.fail-fast</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>Enable RM work preserving recovery. This configuration is private
|
<description>Enable RM work preserving recovery. This configuration is private
|
||||||
to YARN for experimenting the feature.
|
to YARN for experimenting the feature.
|
||||||
|
|
|
@ -44,6 +44,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
|
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
|
||||||
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
|
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl;
|
||||||
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
||||||
import org.apache.hadoop.yarn.event.Dispatcher;
|
import org.apache.hadoop.yarn.event.Dispatcher;
|
||||||
import org.apache.hadoop.yarn.event.EventHandler;
|
import org.apache.hadoop.yarn.event.EventHandler;
|
||||||
|
@ -855,6 +856,7 @@ public abstract class RMStateStore extends AbstractService {
|
||||||
* @param failureCause the exception due to which the operation failed
|
* @param failureCause the exception due to which the operation failed
|
||||||
*/
|
*/
|
||||||
protected void notifyStoreOperationFailed(Exception failureCause) {
|
protected void notifyStoreOperationFailed(Exception failureCause) {
|
||||||
|
LOG.error("State store operation failed ", failureCause);
|
||||||
if (failureCause instanceof StoreFencedException) {
|
if (failureCause instanceof StoreFencedException) {
|
||||||
updateFencedState();
|
updateFencedState();
|
||||||
Thread standByTransitionThread =
|
Thread standByTransitionThread =
|
||||||
|
@ -862,8 +864,11 @@ public abstract class RMStateStore extends AbstractService {
|
||||||
standByTransitionThread.setName("StandByTransitionThread Handler");
|
standByTransitionThread.setName("StandByTransitionThread Handler");
|
||||||
standByTransitionThread.start();
|
standByTransitionThread.start();
|
||||||
} else {
|
} else {
|
||||||
rmDispatcher.getEventHandler().handle(
|
if (YarnConfiguration.shouldRMFailFast(getConfig())) {
|
||||||
new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, failureCause));
|
rmDispatcher.getEventHandler().handle(
|
||||||
|
new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED,
|
||||||
|
failureCause));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue