From b4e8ae591d03ccb2e04e764ac2f1fcd637eaf76a Mon Sep 17 00:00:00 2001 From: Karthik Kambatla Date: Fri, 19 Dec 2014 16:00:24 -0800 Subject: [PATCH] YARN-2675. containersKilled metrics is not updated when the container is killed during localization. (Zhihai Xu via kasha) (cherry picked from commit 954fb8581ec6d7d389ac5d6f94061760a29bc309) --- hadoop-yarn-project/CHANGES.txt | 3 + .../container/ContainerImpl.java | 164 ++++++++++++------ .../metrics/NodeManagerMetrics.java | 16 ++ .../container/TestContainer.java | 27 +++ 4 files changed, 159 insertions(+), 51 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index ed671f7ebe5..73bae44e716 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -218,6 +218,9 @@ Release 2.7.0 - UNRELEASED YARN-2964. RM prematurely cancels tokens for jobs that submit jobs (oozie) (Jian He via jlowe) + YARN-2675. containersKilled metrics is not updated when the container is killed + during localization. (Zhihai Xu via kasha) + Release 2.6.0 - 2014-11-18 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java index 58b1e2299e7..f0c506dcfa9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java @@ -159,9 +159,6 @@ public class ContainerImpl implements Container { this.diagnostics.append(diagnostics); } - private static final ContainerDoneTransition CONTAINER_DONE_TRANSITION = - new ContainerDoneTransition(); - private static final ContainerDiagnosticsUpdateTransition UPDATE_DIAGNOSTICS_TRANSITION = new ContainerDiagnosticsUpdateTransition(); @@ -202,7 +199,7 @@ public class ContainerImpl implements Container { .addTransition(ContainerState.LOCALIZATION_FAILED, ContainerState.DONE, ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP, - CONTAINER_DONE_TRANSITION) + new LocalizationFailedToDoneTransition()) .addTransition(ContainerState.LOCALIZATION_FAILED, ContainerState.LOCALIZATION_FAILED, ContainerEventType.UPDATE_DIAGNOSTICS_MSG, @@ -254,7 +251,7 @@ public class ContainerImpl implements Container { // From CONTAINER_EXITED_WITH_SUCCESS State .addTransition(ContainerState.EXITED_WITH_SUCCESS, ContainerState.DONE, ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP, - CONTAINER_DONE_TRANSITION) + new ExitedWithSuccessToDoneTransition()) .addTransition(ContainerState.EXITED_WITH_SUCCESS, ContainerState.EXITED_WITH_SUCCESS, ContainerEventType.UPDATE_DIAGNOSTICS_MSG, @@ -266,7 +263,7 @@ public class ContainerImpl implements Container { // From EXITED_WITH_FAILURE State .addTransition(ContainerState.EXITED_WITH_FAILURE, ContainerState.DONE, ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP, - CONTAINER_DONE_TRANSITION) + new ExitedWithFailureToDoneTransition()) .addTransition(ContainerState.EXITED_WITH_FAILURE, ContainerState.EXITED_WITH_FAILURE, ContainerEventType.UPDATE_DIAGNOSTICS_MSG, @@ -301,7 +298,7 @@ public class ContainerImpl implements Container { .addTransition(ContainerState.KILLING, ContainerState.DONE, ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP, - CONTAINER_DONE_TRANSITION) + new KillingToDoneTransition()) // Handle a launched container during killing stage is a no-op // as cleanup container is always handled after launch container event // in the container launcher @@ -313,7 +310,7 @@ public class ContainerImpl implements Container { .addTransition(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, ContainerState.DONE, ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP, - CONTAINER_DONE_TRANSITION) + new ContainerCleanedupAfterKillToDoneTransition()) .addTransition(ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, ContainerState.CONTAINER_CLEANEDUP_AFTER_KILL, ContainerEventType.UPDATE_DIAGNOSTICS_MSG, @@ -463,47 +460,6 @@ public class ContainerImpl implements Container { } } - @SuppressWarnings("fallthrough") - private void finished() { - ApplicationId applicationId = - containerId.getApplicationAttemptId().getApplicationId(); - switch (getContainerState()) { - case EXITED_WITH_SUCCESS: - metrics.endRunningContainer(); - metrics.completedContainer(); - NMAuditLogger.logSuccess(user, - AuditConstants.FINISH_SUCCESS_CONTAINER, "ContainerImpl", - applicationId, containerId); - break; - case EXITED_WITH_FAILURE: - if (wasLaunched) { - metrics.endRunningContainer(); - } - // fall through - case LOCALIZATION_FAILED: - metrics.failedContainer(); - NMAuditLogger.logFailure(user, - AuditConstants.FINISH_FAILED_CONTAINER, "ContainerImpl", - "Container failed with state: " + getContainerState(), - applicationId, containerId); - break; - case CONTAINER_CLEANEDUP_AFTER_KILL: - if (wasLaunched) { - metrics.endRunningContainer(); - } - // fall through - case NEW: - metrics.killedContainer(); - NMAuditLogger.logSuccess(user, - AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl", - applicationId, - containerId); - } - - metrics.releaseContainer(this.resource); - sendFinishedEvents(); - } - @SuppressWarnings("unchecked") private void sendFinishedEvents() { // Inform the application @@ -611,7 +567,13 @@ public class ContainerImpl implements Container { } else if (container.recoveredAsKilled && container.recoveredStatus == RecoveredContainerStatus.REQUESTED) { // container was killed but never launched - container.finished(); + container.metrics.killedContainer(); + NMAuditLogger.logSuccess(container.user, + AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl", + container.containerId.getApplicationAttemptId().getApplicationId(), + container.containerId); + container.metrics.releaseContainer(container.resource); + container.sendFinishedEvents(); return ContainerState.DONE; } @@ -994,7 +956,8 @@ public class ContainerImpl implements Container { @Override @SuppressWarnings("unchecked") public void transition(ContainerImpl container, ContainerEvent event) { - container.finished(); + container.metrics.releaseContainer(container.resource); + container.sendFinishedEvents(); //if the current state is NEW it means the CONTAINER_INIT was never // sent for the event, thus no need to send the CONTAINER_STOP if (container.getCurrentState() @@ -1016,6 +979,105 @@ public class ContainerImpl implements Container { container.exitCode = killEvent.getContainerExitStatus(); container.addDiagnostics(killEvent.getDiagnostic(), "\n"); container.addDiagnostics("Container is killed before being launched.\n"); + container.metrics.killedContainer(); + NMAuditLogger.logSuccess(container.user, + AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl", + container.containerId.getApplicationAttemptId().getApplicationId(), + container.containerId); + super.transition(container, event); + } + } + + /** + * Handle the following transition: + * - LOCALIZATION_FAILED -> DONE upon CONTAINER_RESOURCES_CLEANEDUP + */ + static class LocalizationFailedToDoneTransition extends + ContainerDoneTransition { + @Override + public void transition(ContainerImpl container, ContainerEvent event) { + container.metrics.failedContainer(); + NMAuditLogger.logFailure(container.user, + AuditConstants.FINISH_FAILED_CONTAINER, "ContainerImpl", + "Container failed with state: " + container.getContainerState(), + container.containerId.getApplicationAttemptId().getApplicationId(), + container.containerId); + super.transition(container, event); + } + } + + /** + * Handle the following transition: + * - EXITED_WITH_SUCCESS -> DONE upon CONTAINER_RESOURCES_CLEANEDUP + */ + static class ExitedWithSuccessToDoneTransition extends + ContainerDoneTransition { + @Override + public void transition(ContainerImpl container, ContainerEvent event) { + container.metrics.endRunningContainer(); + container.metrics.completedContainer(); + NMAuditLogger.logSuccess(container.user, + AuditConstants.FINISH_SUCCESS_CONTAINER, "ContainerImpl", + container.containerId.getApplicationAttemptId().getApplicationId(), + container.containerId); + super.transition(container, event); + } + } + + /** + * Handle the following transition: + * - EXITED_WITH_FAILURE -> DONE upon CONTAINER_RESOURCES_CLEANEDUP + */ + static class ExitedWithFailureToDoneTransition extends + ContainerDoneTransition { + @Override + public void transition(ContainerImpl container, ContainerEvent event) { + if (container.wasLaunched) { + container.metrics.endRunningContainer(); + } + container.metrics.failedContainer(); + NMAuditLogger.logFailure(container.user, + AuditConstants.FINISH_FAILED_CONTAINER, "ContainerImpl", + "Container failed with state: " + container.getContainerState(), + container.containerId.getApplicationAttemptId().getApplicationId(), + container.containerId); + super.transition(container, event); + } + } + + /** + * Handle the following transition: + * - KILLING -> DONE upon CONTAINER_RESOURCES_CLEANEDUP + */ + static class KillingToDoneTransition extends + ContainerDoneTransition { + @Override + public void transition(ContainerImpl container, ContainerEvent event) { + container.metrics.killedContainer(); + NMAuditLogger.logSuccess(container.user, + AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl", + container.containerId.getApplicationAttemptId().getApplicationId(), + container.containerId); + super.transition(container, event); + } + } + + /** + * Handle the following transition: + * CONTAINER_CLEANEDUP_AFTER_KILL -> DONE upon CONTAINER_RESOURCES_CLEANEDUP + */ + static class ContainerCleanedupAfterKillToDoneTransition extends + ContainerDoneTransition { + @Override + public void transition(ContainerImpl container, ContainerEvent event) { + if (container.wasLaunched) { + container.metrics.endRunningContainer(); + } + container.metrics.killedContainer(); + NMAuditLogger.logSuccess(container.user, + AuditConstants.FINISH_KILLED_CONTAINER, "ContainerImpl", + container.containerId.getApplicationAttemptId().getApplicationId(), + container.containerId); super.transition(container, event); } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java index beaafe192a4..3615feefa93 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java @@ -27,6 +27,8 @@ import org.apache.hadoop.metrics2.lib.MutableRate; import org.apache.hadoop.metrics2.source.JvmMetrics; import org.apache.hadoop.yarn.api.records.Resource; +import com.google.common.annotations.VisibleForTesting; + @Metrics(about="Metrics for node manager", context="yarn") public class NodeManagerMetrics { @Metric MutableCounterInt containersLaunched; @@ -127,4 +129,18 @@ public class NodeManagerMetrics { return containersRunning.value(); } + @VisibleForTesting + public int getKilledContainers() { + return containersKilled.value(); + } + + @VisibleForTesting + public int getFailedContainers() { + return containersFailed.value(); + } + + @VisibleForTesting + public int getCompletedContainers() { + return containersCompleted.value(); + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java index c28d691a99d..2834e30247e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java @@ -173,13 +173,20 @@ public class TestContainer { wc = new WrappedContainer(13, 314159265358979L, 4344, "yak"); wc.initContainer(); wc.localizeResources(); + int running = metrics.getRunningContainers(); wc.launchContainer(); + assertEquals(running + 1, metrics.getRunningContainers()); reset(wc.localizerBus); wc.containerKilledOnRequest(); assertEquals(ContainerState.EXITED_WITH_FAILURE, wc.c.getContainerState()); assertNull(wc.c.getLocalizedResources()); verifyCleanupCall(wc); + int failed = metrics.getFailedContainers(); + wc.containerResourcesCleanup(); + assertEquals(ContainerState.DONE, wc.c.getContainerState()); + assertEquals(failed + 1, metrics.getFailedContainers()); + assertEquals(running, metrics.getRunningContainers()); } finally { if (wc != null) { @@ -219,13 +226,20 @@ public class TestContainer { wc = new WrappedContainer(11, 314159265358979L, 4344, "yak"); wc.initContainer(); wc.localizeResources(); + int running = metrics.getRunningContainers(); wc.launchContainer(); + assertEquals(running + 1, metrics.getRunningContainers()); reset(wc.localizerBus); wc.containerSuccessful(); assertEquals(ContainerState.EXITED_WITH_SUCCESS, wc.c.getContainerState()); assertNull(wc.c.getLocalizedResources()); verifyCleanupCall(wc); + int completed = metrics.getCompletedContainers(); + wc.containerResourcesCleanup(); + assertEquals(ContainerState.DONE, wc.c.getContainerState()); + assertEquals(completed + 1, metrics.getCompletedContainers()); + assertEquals(running, metrics.getRunningContainers()); } finally { if (wc != null) { @@ -319,12 +333,14 @@ public class TestContainer { try { wc = new WrappedContainer(13, 314159265358979L, 4344, "yak"); assertEquals(ContainerState.NEW, wc.c.getContainerState()); + int killed = metrics.getKilledContainers(); wc.killContainer(); assertEquals(ContainerState.DONE, wc.c.getContainerState()); assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, wc.c.cloneAndGetContainerStatus().getExitStatus()); assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics() .contains("KillRequest")); + assertEquals(killed + 1, metrics.getKilledContainers()); } finally { if (wc != null) { wc.finished(); @@ -345,6 +361,10 @@ public class TestContainer { wc.c.cloneAndGetContainerStatus().getExitStatus()); assertTrue(wc.c.cloneAndGetContainerStatus().getDiagnostics() .contains("KillRequest")); + int killed = metrics.getKilledContainers(); + wc.containerResourcesCleanup(); + assertEquals(ContainerState.DONE, wc.c.getContainerState()); + assertEquals(killed + 1, metrics.getKilledContainers()); } finally { if (wc != null) { wc.finished(); @@ -365,6 +385,10 @@ public class TestContainer { assertEquals(ContainerState.LOCALIZATION_FAILED, wc.c.getContainerState()); assertNull(wc.c.getLocalizedResources()); verifyCleanupCall(wc); + int failed = metrics.getFailedContainers(); + wc.containerResourcesCleanup(); + assertEquals(ContainerState.DONE, wc.c.getContainerState()); + assertEquals(failed + 1, metrics.getFailedContainers()); } finally { if (wc != null) { wc.finished(); @@ -389,8 +413,11 @@ public class TestContainer { wc.c.getContainerState()); assertNull(wc.c.getLocalizedResources()); verifyCleanupCall(wc); + int killed = metrics.getKilledContainers(); wc.c.handle(new ContainerEvent(wc.c.getContainerId(), ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP)); + assertEquals(ContainerState.DONE, wc.c.getContainerState()); + assertEquals(killed + 1, metrics.getKilledContainers()); assertEquals(0, metrics.getRunningContainers()); } finally { if (wc != null) {