From e665c0a9dd29248ac0cc22ec5a0c830ada13df60 Mon Sep 17 00:00:00 2001 From: Eric Yang Date: Mon, 23 Jul 2018 12:57:01 -0400 Subject: [PATCH] YARN-8360. Improve YARN service restart policy and node manager auto restart policy. Contributed by Suma Shivaprasad (cherry picked from commit 84d7bf1eeff6b9418361afa4aa713e5e6f771365) --- .../component/AlwaysRestartPolicy.java | 5 ++ .../component/ComponentRestartPolicy.java | 2 + .../service/component/NeverRestartPolicy.java | 5 ++ .../component/OnFailureRestartPolicy.java | 5 ++ .../provider/AbstractProviderService.java | 29 ++++---- .../hadoop/yarn/service/ServiceTestUtils.java | 2 +- .../containerlaunch/TestAbstractLauncher.java | 66 +++++++++++++++++++ 7 files changed, 101 insertions(+), 13 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/AlwaysRestartPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/AlwaysRestartPolicy.java index 704ab14d0ec..505120d8c25 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/AlwaysRestartPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/AlwaysRestartPolicy.java @@ -79,4 +79,9 @@ public final class AlwaysRestartPolicy implements ComponentRestartPolicy { @Override public boolean shouldTerminate(Component component) { return false; } + + @Override public boolean allowContainerRetriesForInstance( + ComponentInstance componentInstance) { + return true; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/ComponentRestartPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/ComponentRestartPolicy.java index 23b0fb9e2c8..c5adffebcc8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/ComponentRestartPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/ComponentRestartPolicy.java @@ -42,4 +42,6 @@ public interface ComponentRestartPolicy { boolean shouldTerminate(Component component); + boolean allowContainerRetriesForInstance(ComponentInstance componentInstance); + } \ No newline at end of file diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/NeverRestartPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/NeverRestartPolicy.java index ace1f8940e7..cd44a585680 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/NeverRestartPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/NeverRestartPolicy.java @@ -79,4 +79,9 @@ public final class NeverRestartPolicy implements ComponentRestartPolicy { } return true; } + + @Override public boolean allowContainerRetriesForInstance( + ComponentInstance componentInstance) { + return false; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/OnFailureRestartPolicy.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/OnFailureRestartPolicy.java index 39fba2afd01..b939ba0428f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/OnFailureRestartPolicy.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/OnFailureRestartPolicy.java @@ -84,4 +84,9 @@ public final class OnFailureRestartPolicy implements ComponentRestartPolicy { } return true; } + + @Override public boolean allowContainerRetriesForInstance( + ComponentInstance componentInstance) { + return true; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/provider/AbstractProviderService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/provider/AbstractProviderService.java index 6d213c86652..46e3a7ff6e5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/provider/AbstractProviderService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/provider/AbstractProviderService.java @@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.ApplicationConstants; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.service.api.records.Service; +import org.apache.hadoop.yarn.service.component.ComponentRestartPolicy; import org.apache.hadoop.yarn.service.conf.YarnServiceConf; import org.apache.hadoop.yarn.service.conf.YarnServiceConstants; import org.apache.hadoop.yarn.service.containerlaunch.ContainerLaunchService; @@ -116,18 +117,22 @@ public abstract class AbstractProviderService implements ProviderService, public void buildContainerRetry(AbstractLauncher launcher, Configuration yarnConf, - ContainerLaunchService.ComponentLaunchContext compLaunchContext) { + ContainerLaunchService.ComponentLaunchContext compLaunchContext, + ComponentInstance instance) { // By default retry forever every 30 seconds - launcher.setRetryContext( - YarnServiceConf.getInt(CONTAINER_RETRY_MAX, - DEFAULT_CONTAINER_RETRY_MAX, - compLaunchContext.getConfiguration(), yarnConf), - YarnServiceConf.getInt(CONTAINER_RETRY_INTERVAL, - DEFAULT_CONTAINER_RETRY_INTERVAL, - compLaunchContext.getConfiguration(), yarnConf), - YarnServiceConf.getLong(CONTAINER_FAILURES_VALIDITY_INTERVAL, - DEFAULT_CONTAINER_FAILURES_VALIDITY_INTERVAL, - compLaunchContext.getConfiguration(), yarnConf)); + + ComponentRestartPolicy restartPolicy = instance.getComponent() + .getRestartPolicyHandler(); + if (restartPolicy.allowContainerRetriesForInstance(instance)) { + launcher.setRetryContext(YarnServiceConf + .getInt(CONTAINER_RETRY_MAX, DEFAULT_CONTAINER_RETRY_MAX, + compLaunchContext.getConfiguration(), yarnConf), YarnServiceConf + .getInt(CONTAINER_RETRY_INTERVAL, DEFAULT_CONTAINER_RETRY_INTERVAL, + compLaunchContext.getConfiguration(), yarnConf), YarnServiceConf + .getLong(CONTAINER_FAILURES_VALIDITY_INTERVAL, + DEFAULT_CONTAINER_FAILURES_VALIDITY_INTERVAL, + compLaunchContext.getConfiguration(), yarnConf)); + } } public void buildContainerLaunchContext(AbstractLauncher launcher, @@ -161,6 +166,6 @@ public abstract class AbstractProviderService implements ProviderService, yarnConf, container, compLaunchContext, tokensForSubstitution); // Setup container retry settings - buildContainerRetry(launcher, yarnConf, compLaunchContext); + buildContainerRetry(launcher, yarnConf, compLaunchContext, instance); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/ServiceTestUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/ServiceTestUtils.java index 3d1412dfe71..170c20b84b9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/ServiceTestUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/ServiceTestUtils.java @@ -115,7 +115,7 @@ public class ServiceTestUtils { exampleApp.setName(serviceName); exampleApp.setVersion("v1"); exampleApp.addComponent( - createComponent("terminating-comp1", 2, "sleep " + "1000", + createComponent("terminating-comp1", 2, "sleep 1000", Component.RestartPolicyEnum.NEVER, null)); exampleApp.addComponent( createComponent("terminating-comp2", 2, "sleep 1000", diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/containerlaunch/TestAbstractLauncher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/containerlaunch/TestAbstractLauncher.java index f4f1a50e439..108078ca7f0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/containerlaunch/TestAbstractLauncher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/containerlaunch/TestAbstractLauncher.java @@ -19,13 +19,33 @@ package org.apache.hadoop.yarn.service.containerlaunch; import org.apache.hadoop.yarn.service.ServiceContext; +import org.apache.hadoop.yarn.service.api.records.Configuration; +import org.apache.hadoop.yarn.service.component.AlwaysRestartPolicy; +import org.apache.hadoop.yarn.service.component.Component; +import org.apache.hadoop.yarn.service.component.NeverRestartPolicy; +import org.apache.hadoop.yarn.service.component.OnFailureRestartPolicy; +import org.apache.hadoop.yarn.service.component.instance.ComponentInstance; +import org.apache.hadoop.yarn.service.provider.defaultImpl + .DefaultProviderService; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import java.io.IOException; +import static org.apache.hadoop.fi.FiConfig.getConfig; +import static org.apache.hadoop.yarn.service.conf.YarnServiceConf + .DEFAULT_CONTAINER_FAILURES_VALIDITY_INTERVAL; +import static org.apache.hadoop.yarn.service.conf.YarnServiceConf + .DEFAULT_CONTAINER_RETRY_INTERVAL; +import static org.apache.hadoop.yarn.service.conf.YarnServiceConf + .DEFAULT_CONTAINER_RETRY_MAX; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.reset; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyZeroInteractions; +import static org.mockito.Mockito.when; /** * Tests for {@link AbstractLauncher}. @@ -51,4 +71,50 @@ public class TestAbstractLauncher { Assert.assertEquals("s1:t1:ro,s2:t2:ro", dockerContainerMounts); } + + @Test + public void testContainerRetries() throws Exception { + + DefaultProviderService providerService = new DefaultProviderService(); + AbstractLauncher mockLauncher = mock(AbstractLauncher.class); + ContainerLaunchService.ComponentLaunchContext componentLaunchContext = + mock(ContainerLaunchService.ComponentLaunchContext.class); + + ComponentInstance componentInstance = mock(ComponentInstance.class); + + //Never Restart Policy + Component component = mock(Component.class); + when(componentInstance.getComponent()).thenReturn(component); + + when(component.getRestartPolicyHandler()).thenReturn(NeverRestartPolicy + .getInstance()); + + providerService.buildContainerRetry(mockLauncher, getConfig(), + componentLaunchContext, componentInstance); + verifyZeroInteractions(mockLauncher); + + + //OnFailure restart policy + when(component.getRestartPolicyHandler()).thenReturn(OnFailureRestartPolicy + .getInstance()); + when(componentLaunchContext.getConfiguration()).thenReturn(new + Configuration()); + providerService.buildContainerRetry(mockLauncher, getConfig(), + componentLaunchContext, componentInstance); + verify(mockLauncher).setRetryContext(DEFAULT_CONTAINER_RETRY_MAX, + DEFAULT_CONTAINER_RETRY_INTERVAL, + DEFAULT_CONTAINER_FAILURES_VALIDITY_INTERVAL); + + reset(mockLauncher); + + //Always restart policy + when(component.getRestartPolicyHandler()).thenReturn(AlwaysRestartPolicy + .getInstance()); + providerService.buildContainerRetry(mockLauncher, getConfig(), + componentLaunchContext, componentInstance); + + verify(mockLauncher).setRetryContext(DEFAULT_CONTAINER_RETRY_MAX, + DEFAULT_CONTAINER_RETRY_INTERVAL, + DEFAULT_CONTAINER_FAILURES_VALIDITY_INTERVAL); + } }