From 404b3896c91e6a089018195cc7449763c8687b05 Mon Sep 17 00:00:00 2001 From: Vinod Kumar Vavilapalli Date: Sat, 1 Jun 2013 22:02:44 +0000 Subject: [PATCH] YARN-733. Fixed TestNMClient from failing occasionally. Contributed by Zhijie Shen. svn merge --ignore-ancestry -c 1488618 ../../trunk/ git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1488619 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-yarn-project/CHANGES.txt | 3 + .../hadoop/yarn/client/NMClientImpl.java | 11 ++++ .../hadoop/yarn/client/TestNMClient.java | 56 ++++++++++--------- 3 files changed, 45 insertions(+), 25 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 902554372ab..fa4c47308bf 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -377,6 +377,9 @@ Release 2.1.0-beta - UNRELEASED YARN-578. Fixed NM to use SecureIOUtils for reading and aggregating logs. (Omkar Vinit Joshi via vinodkv) + YARN-733. Fixed TestNMClient from failing occasionally. (Zhijie Shen via + vinodkv) + BREAKDOWN OF HADOOP-8562 SUBTASKS AND RELATED JIRAS YARN-158. Yarn creating package-info.java must not depend on sh. diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java index 4cf12b07d6a..1a564f4a487 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/main/java/org/apache/hadoop/yarn/client/NMClientImpl.java @@ -64,6 +64,17 @@ * continue to run even after this client is stopped and till the application * runs at which point ResourceManager will forcefully kill them. *

+ * + *

+ * Note that the blocking APIs ensure the RPC calls to NodeManager + * are executed immediately, and the responses are received before these APIs + * return. However, when {@link #startContainer} or {@link #stopContainer} + * returns, NodeManager may still need some time to either start + * or stop the container because of its asynchronous implementation. Therefore, + * {@link #getContainerStatus} is likely to return a transit container status + * if it is executed immediately after {@link #startContainer} or + * {@link #stopContainer}. + *

*/ public class NMClientImpl extends AbstractService implements NMClient { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java index 34ca1ae754d..8e1c3926f51 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/TestNMClient.java @@ -20,8 +20,8 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; @@ -228,7 +228,7 @@ private Set allocateContainers( } private void testContainerManagement(NMClientImpl nmClient, - Set containers) throws IOException { + Set containers) throws YarnRemoteException, IOException { int size = containers.size(); int i = 0; for (Container container : containers) { @@ -271,17 +271,9 @@ private void testContainerManagement(NMClientImpl nmClient, // leave one container unclosed if (++i < size) { - try { - ContainerStatus status = nmClient.getContainerStatus(container.getId(), - container.getNodeId(), container.getContainerToken()); - // verify the container is started and in good shape - assertEquals(container.getId(), status.getContainerId()); - assertEquals(ContainerState.RUNNING, status.getState()); - assertEquals("", status.getDiagnostics()); - assertEquals(-1000, status.getExitStatus()); - } catch (YarnRemoteException e) { - fail("Exception is not expected"); - } + // NodeManager may still need some time to make the container started + testGetContainerStatus(container, i, ContainerState.RUNNING, "", + -1000); try { nmClient.stopContainer(container.getId(), container.getNodeId(), @@ -291,18 +283,8 @@ private void testContainerManagement(NMClientImpl nmClient, } // getContainerStatus can be called after stopContainer - try { - ContainerStatus status = nmClient.getContainerStatus( - container.getId(), container.getNodeId(), - container.getContainerToken()); - assertEquals(container.getId(), status.getContainerId()); - assertEquals(ContainerState.RUNNING, status.getState()); - assertTrue("" + i, status.getDiagnostics().contains( - "Container killed by the ApplicationMaster.")); - assertEquals(-1000, status.getExitStatus()); - } catch (YarnRemoteException e) { - fail("Exception is not expected"); - } + testGetContainerStatus(container, i, ContainerState.COMPLETE, + "Container killed by the ApplicationMaster.", 143); } } } @@ -315,4 +297,28 @@ private void sleep(int sleepTime) { } } + private void testGetContainerStatus(Container container, int index, + ContainerState state, String diagnostics, int exitStatus) + throws YarnRemoteException, IOException { + while (true) { + try { + ContainerStatus status = nmClient.getContainerStatus( + container.getId(), container.getNodeId(), + container.getContainerToken()); + // NodeManager may still need some time to get the stable + // container status + if (status.getState() == state) { + assertEquals(container.getId(), status.getContainerId()); + assertTrue("" + index + ": " + status.getDiagnostics(), + status.getDiagnostics().contains(diagnostics)); + assertEquals(exitStatus, status.getExitStatus()); + break; + } + Thread.sleep(100); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + }