diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java index d568d6a3eab..5025c7cb8d9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestRMFailover.java @@ -54,6 +54,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.HATestUtil; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; import org.apache.hadoop.yarn.server.resourcemanager.RMCriticalThreadUncaughtExceptionHandler; import org.apache.hadoop.yarn.server.resourcemanager.MockRM; +import org.apache.hadoop.yarn.server.resourcemanager.RMFatalEvent; +import org.apache.hadoop.yarn.server.resourcemanager.RMFatalEventType; import org.apache.hadoop.yarn.server.webproxy.WebAppProxyServer; import org.apache.hadoop.yarn.webapp.YarnWebParams; import org.junit.After; @@ -200,7 +202,8 @@ public class TestRMFailover extends ClientBaseWithFixes { // so it transitions to standby. ResourceManager rm = cluster.getResourceManager( cluster.getActiveRMIndex()); - rm.handleTransitionToStandByInNewThread(); + rm.getRMContext().getDispatcher().getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.STATE_STORE_FENCED, "test")); verifyRMTransitionToStandby(rm); verifyConnections(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ActiveStandbyElectorBasedElectorService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ActiveStandbyElectorBasedElectorService.java index 751eedd9afb..b59bc25f5a9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ActiveStandbyElectorBasedElectorService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ActiveStandbyElectorBasedElectorService.java @@ -108,8 +108,9 @@ public class ActiveStandbyElectorBasedElectorService extends AbstractService elector.ensureParentZNode(); if (!isParentZnodeSafe(clusterId)) { - notifyFatalError(electionZNode + " znode has invalid data! "+ - "Might need formatting!"); + notifyFatalError(String.format("invalid data in znode, %s, " + + "which may require the state store to be reformatted", + electionZNode)); } super.serviceInit(conf); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java index 74c87a2392b..75717652d97 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/AdminService.java @@ -304,13 +304,12 @@ public class AdminService extends CompositeService implements // call all refresh*s for active RM to get the updated configurations. refreshAll(); } catch (Exception e) { - LOG.error("RefreshAll failed so firing fatal event", e); rmContext .getDispatcher() .getEventHandler() .handle( new RMFatalEvent(RMFatalEventType.TRANSITION_TO_ACTIVE_FAILED, - e)); + e, "failure to refresh configuration settings")); throw new ServiceFailedException( "Error on refreshAll during transition to Active", e); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMCriticalThreadUncaughtExceptionHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMCriticalThreadUncaughtExceptionHandler.java index a67f81a8f7f..c863889f12a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMCriticalThreadUncaughtExceptionHandler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMCriticalThreadUncaughtExceptionHandler.java @@ -23,7 +23,7 @@ import java.lang.Thread.UncaughtExceptionHandler; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; -import org.apache.hadoop.yarn.conf.HAUtil; +import org.apache.hadoop.yarn.exceptions.YarnException; /** * This class either shuts down {@link ResourceManager} or transitions the @@ -45,14 +45,19 @@ public class RMCriticalThreadUncaughtExceptionHandler @Override public void uncaughtException(Thread t, Throwable e) { - LOG.fatal("Critical thread " + t.getName() + " crashed!", e); + Exception ex; - if (HAUtil.isHAEnabled(rmContext.getYarnConfiguration())) { - rmContext.getResourceManager().handleTransitionToStandByInNewThread(); + if (e instanceof Exception) { + ex = (Exception)e; } else { - rmContext.getDispatcher().getEventHandler().handle( - new RMFatalEvent(RMFatalEventType.CRITICAL_THREAD_CRASH, - new Exception(e))); + ex = new YarnException(e); } + + RMFatalEvent event = + new RMFatalEvent(RMFatalEventType.CRITICAL_THREAD_CRASH, ex, + String.format("a critical thread, %s, that exited unexpectedly", + t.getName())); + + rmContext.getDispatcher().getEventHandler().handle(event); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEvent.java index 59e6236c897..899377d7f62 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEvent.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEvent.java @@ -20,18 +20,73 @@ package org.apache.hadoop.yarn.server.resourcemanager; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.event.AbstractEvent; +/** + * Event that indicates a non-recoverable error for the resource manager. + */ public class RMFatalEvent extends AbstractEvent { - private String cause; + private final Exception cause; + private final String message; - public RMFatalEvent(RMFatalEventType rmFatalEventType, String cause) { + /** + * Create a new event of the given type with the given cause. + * @param rmFatalEventType The {@link RMFatalEventType} of the event + * @param message a text description of the reason for the event + */ + public RMFatalEvent(RMFatalEventType rmFatalEventType, String message) { + this(rmFatalEventType, null, message); + } + + /** + * Create a new event of the given type around the given source + * {@link Exception}. + * @param rmFatalEventType The {@link RMFatalEventType} of the event + * @param cause the source exception + */ + public RMFatalEvent(RMFatalEventType rmFatalEventType, Exception cause) { + this(rmFatalEventType, cause, null); + } + + /** + * Create a new event of the given type around the given source + * {@link Exception} with the given cause. + * @param rmFatalEventType The {@link RMFatalEventType} of the event + * @param cause the source exception + * @param message a text description of the reason for the event + */ + public RMFatalEvent(RMFatalEventType rmFatalEventType, Exception cause, + String message) { super(rmFatalEventType); this.cause = cause; + this.message = message; } - public RMFatalEvent(RMFatalEventType rmFatalEventType, Exception cause) { - super(rmFatalEventType); - this.cause = StringUtils.stringifyException(cause); + /** + * Get a text description of the reason for the event. If a cause was, that + * {@link Exception} will be converted to a {@link String} and included in + * the result. + * @return a text description of the reason for the event + */ + public String getExplanation() { + StringBuilder sb = new StringBuilder(); + + if (message != null) { + sb.append(message); + + if (cause != null) { + sb.append(": "); + } + } + + if (cause != null) { + sb.append(StringUtils.stringifyException(cause)); + } + + return sb.toString(); } - public String getCause() {return this.cause;} + @Override + public String toString() { + return String.format("RMFatalEvent of type %s, caused by %s", + getType().name(), getExplanation()); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEventType.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEventType.java index b6f6b3c1184..69b285facfa 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEventType.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMFatalEventType.java @@ -23,6 +23,7 @@ import org.apache.hadoop.classification.InterfaceAudience; @InterfaceAudience.Private public enum RMFatalEventType { // Source <- Store + STATE_STORE_FENCED, STATE_STORE_OP_FAILED, // Source <- Embedded Elector diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index 81c3f1b8fa3..75d6df292d2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -811,15 +811,45 @@ public class ResourceManager extends CompositeService implements Recoverable { } @Private - public static class RMFatalEventDispatcher - implements EventHandler { - + private class RMFatalEventDispatcher implements EventHandler { @Override public void handle(RMFatalEvent event) { - LOG.fatal("Received a " + RMFatalEvent.class.getName() + " of type " + - event.getType().name() + ". Cause:\n" + event.getCause()); + LOG.error("Received " + event); - ExitUtil.terminate(1, event.getCause()); + if (HAUtil.isHAEnabled(getConfig())) { + // If we're in an HA config, the right answer is always to go into + // standby. + LOG.warn("Transitioning the resource manager to standby."); + handleTransitionToStandByInNewThread(); + } else { + // If we're stand-alone, we probably want to shut down, but the if and + // how depends on the event. + switch(event.getType()) { + case STATE_STORE_FENCED: + LOG.fatal("State store fenced even though the resource manager " + + "is not configured for high availability. Shutting down this " + + "resource manager to protect the integrity of the state store."); + ExitUtil.terminate(1, event.getExplanation()); + break; + case STATE_STORE_OP_FAILED: + if (YarnConfiguration.shouldRMFailFast(getConfig())) { + LOG.fatal("Shutting down the resource manager because a state " + + "store operation failed, and the resource manager is " + + "configured to fail fast. See the yarn.fail-fast and " + + "yarn.resourcemanager.fail-fast properties."); + ExitUtil.terminate(1, event.getExplanation()); + } else { + LOG.warn("Ignoring state store operation failure because the " + + "resource manager is not configured to fail fast. See the " + + "yarn.fail-fast and yarn.resourcemanager.fail-fast " + + "properties."); + } + break; + default: + LOG.fatal("Shutting down the resource manager."); + ExitUtil.terminate(1, event.getExplanation()); + } + } } } @@ -827,7 +857,7 @@ public class ResourceManager extends CompositeService implements Recoverable { * Transition to standby state in a new thread. The transition operation is * asynchronous to avoid deadlock caused by cyclic dependency. */ - public void handleTransitionToStandByInNewThread() { + private void handleTransitionToStandByInNewThread() { Thread standByTransitionThread = new Thread(activeServices.standByTransitionRunnable); standByTransitionThread.setName("StandByTransitionThread"); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java index 5e3cf22da5a..975847cfdff 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/RMStateStore.java @@ -1129,18 +1129,18 @@ public abstract class RMStateStore extends AbstractService { Exception failureCause) { boolean isFenced = false; LOG.error("State store operation failed ", failureCause); + if (HAUtil.isHAEnabled(getConfig())) { - LOG.warn("State-store fenced ! Transitioning RM to standby"); + rmDispatcher.getEventHandler().handle( + new RMFatalEvent(RMFatalEventType.STATE_STORE_FENCED, + failureCause)); isFenced = true; - resourceManager.handleTransitionToStandByInNewThread(); - } else if (YarnConfiguration.shouldRMFailFast(getConfig())) { - LOG.fatal("Fail RM now due to state-store error!"); + } else { rmDispatcher.getEventHandler().handle( new RMFatalEvent(RMFatalEventType.STATE_STORE_OP_FAILED, failureCause)); - } else { - LOG.warn("Skip the state-store error."); } + return isFenced; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestMemoryRMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestMemoryRMStateStore.java index 89b9e2b46db..cb278c06e61 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestMemoryRMStateStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/TestMemoryRMStateStore.java @@ -43,6 +43,7 @@ public class TestMemoryRMStateStore { store.init(conf); ResourceManager mockRM = mock(ResourceManager.class); store.setResourceManager(mockRM); + store.setRMDispatcher(new RMStateStoreTestBase.TestDispatcher()); RMDelegationTokenIdentifier mockTokenId = mock(RMDelegationTokenIdentifier.class); store.removeRMDelegationToken(mockTokenId); @@ -58,6 +59,7 @@ public class TestMemoryRMStateStore { }; store.init(conf); store.setResourceManager(mockRM); + store.setRMDispatcher(new RMStateStoreTestBase.TestDispatcher()); store.removeRMDelegationToken(mockTokenId); assertTrue("RMStateStore should have been in fenced state", store.isFencedState());