From 2fae1c5b0534deb3d1249a901d0530c69a57e333 Mon Sep 17 00:00:00 2001 From: Eric Badger Date: Fri, 24 Jul 2020 23:08:31 +0000 Subject: [PATCH] YARN-4771. Some containers can be skipped during log aggregation after NM restart. Contributed by Jason Lowe and Jim Brennan. (cherry picked from commit ac5f21dbef0f0ad4210e4027f53877760fa606a5) --- .../nodemanager/NodeStatusUpdaterImpl.java | 11 +++++---- .../nodemanager/TestNodeStatusUpdater.java | 24 +++++++++---------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java index af59e34c254..0b40dd533a0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java @@ -737,8 +737,13 @@ public class NodeStatusUpdaterImpl extends AbstractService implements while (i.hasNext()) { Entry mapEntry = i.next(); ContainerId cid = mapEntry.getKey(); - if (mapEntry.getValue() < currentTime) { - if (!context.getContainers().containsKey(cid)) { + if (mapEntry.getValue() >= currentTime) { + break; + } + if (!context.getContainers().containsKey(cid)) { + ApplicationId appId = + cid.getApplicationAttemptId().getApplicationId(); + if (isApplicationStopped(appId)) { i.remove(); try { context.getNMStateStore().removeContainer(cid); @@ -746,8 +751,6 @@ public class NodeStatusUpdaterImpl extends AbstractService implements LOG.error("Unable to remove container " + cid + " in store", e); } } - } else { - break; } } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java index 533cf2a6c7b..0d73a724076 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java @@ -901,9 +901,8 @@ public class TestNodeStatusUpdater extends NodeManagerTestBase { public void testRecentlyFinishedContainers() throws Exception { NodeManager nm = new NodeManager(); YarnConfiguration conf = new YarnConfiguration(); - conf.set( - NodeStatusUpdaterImpl.YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS, - "10000"); + conf.setInt(NodeStatusUpdaterImpl. + YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS, 1); nm.init(conf); NodeStatusUpdaterImpl nodeStatusUpdater = (NodeStatusUpdaterImpl) nm.getNodeStatusUpdater(); @@ -918,18 +917,17 @@ public class TestNodeStatusUpdater extends NodeManagerTestBase { nodeStatusUpdater.addCompletedContainer(cId); Assert.assertTrue(nodeStatusUpdater.isContainerRecentlyStopped(cId)); + // verify container remains even after expiration if app + // is still active nm.getNMContext().getContainers().remove(cId); - long time1 = System.currentTimeMillis(); - int waitInterval = 15; - while (waitInterval-- > 0 - && nodeStatusUpdater.isContainerRecentlyStopped(cId)) { - nodeStatusUpdater.removeVeryOldStoppedContainersFromCache(); - Thread.sleep(1000); - } - long time2 = System.currentTimeMillis(); - // By this time the container will be removed from cache. need to verify. + Thread.sleep(10); + nodeStatusUpdater.removeVeryOldStoppedContainersFromCache(); + Assert.assertTrue(nodeStatusUpdater.isContainerRecentlyStopped(cId)); + + // complete the application and verify container is removed + nm.getNMContext().getApplications().remove(appId); + nodeStatusUpdater.removeVeryOldStoppedContainersFromCache(); Assert.assertFalse(nodeStatusUpdater.isContainerRecentlyStopped(cId)); - Assert.assertTrue((time2 - time1) >= 10000 && (time2 - time1) <= 250000); } @Test(timeout = 90000)