From 01de8fd9d650a41aba850043f78e565de759a83e Mon Sep 17 00:00:00 2001
From: Szilard Nemeth
Date: Fri, 17 Dec 2021 14:53:06 +0100
Subject: [PATCH] YARN-6862. Nodemanager resource usage metrics sometimes are negative. Contributed by Benjamin Teke

---
Review notes (text between the "---" separator and the first "diff --git"
line is not part of the commit message):

 * ContainersMonitorImpl: when the virtual or physical memory usage read
   from the process tree is negative (UNAVAILABLE, -1), the container is
   logged and skipped via "continue", so the value is not aggregated.
 * MockMemoryResourceCalculatorProcessTree (new test mock): returns
   UNAVAILABLE from its first virtual-memory read and its second RSS read,
   and positive values afterwards.
 * TestContainersMonitorResourceChange: the body of the existing CPU test
   becomes the shared helper testContainerMonitoringInvalidResources(),
   which is now also run against the memory mock.
 * Indentation in this patch was reconstructed using the 2-space indent /
   4-space continuation convention; if a context line does not match the
   target tree exactly, apply with "git apply --ignore-whitespace".

 .../monitor/ContainersMonitorImpl.java        |  8 ++
 .../MockCPUResourceCalculatorProcessTree.java | 10 +++
 ...ckMemoryResourceCalculatorProcessTree.java | 89 +++++++++++++++++++
 .../MockResourceCalculatorProcessTree.java    |  6 ++
 .../TestContainersMonitorResourceChange.java  | 24 +++--
 5 files changed, 131 insertions(+), 6 deletions(-)
 create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockMemoryResourceCalculatorProcessTree.java

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
index a7bf73fdfa1..d7da7d7eaea 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
@@ -540,6 +540,14 @@ public class ContainersMonitorImpl extends AbstractService implements
           pTree.updateProcessTree();    // update process-tree
           long currentVmemUsage = pTree.getVirtualMemorySize();
           long currentPmemUsage = pTree.getRssMemorySize();
+          if (currentVmemUsage < 0 || currentPmemUsage < 0) {
+            // YARN-6862/YARN-5021 If the container just exited or for
+            // another reason the physical/virtual memory is UNAVAILABLE (-1)
+            // the values shouldn't be aggregated.
+            LOG.info("Skipping monitoring container {} because "
+                + "memory usage is not available.", containerId);
+            continue;
+          }
 
           // if machine has 6 cores and 3 are used,
           // cpuUsagePercentPerCore should be 300%
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockCPUResourceCalculatorProcessTree.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockCPUResourceCalculatorProcessTree.java
index eb35c917bc8..49161f3085d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockCPUResourceCalculatorProcessTree.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockCPUResourceCalculatorProcessTree.java
@@ -56,6 +56,16 @@ public class MockCPUResourceCalculatorProcessTree
     return true;
   }
 
+  @Override
+  public long getVirtualMemorySize(int olderThanAge) {
+    return 0;
+  }
+
+  @Override
+  public long getRssMemorySize(int olderThanAge) {
+    return 0;
+  }
+
   @Override
   public float getCpuUsagePercent() {
     long cpu = this.cpuPercentage;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockMemoryResourceCalculatorProcessTree.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockMemoryResourceCalculatorProcessTree.java
new file mode 100644
index 00000000000..ea45ac437ca
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockMemoryResourceCalculatorProcessTree.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
+
+import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
+
+/**
+ * Mock class to obtain resource usage (Memory).
+ */
+public class MockMemoryResourceCalculatorProcessTree extends ResourceCalculatorProcessTree {
+  private final long memorySize = 500000000L;
+
+  private long rssMemorySize = memorySize;
+  private long virtualMemorySize = ResourceCalculatorProcessTree.UNAVAILABLE;
+
+  /**
+   * Constructor for MockMemoryResourceCalculatorProcessTree with specified root
+   * process.
+   * @param root root process of the tree, passed on to the superclass
+   */
+  public MockMemoryResourceCalculatorProcessTree(String root) {
+    super(root);
+  }
+
+  @Override
+  public void updateProcessTree() {
+  }
+
+  @Override
+  public String getProcessTreeDump() {
+    return "";
+  }
+
+  @Override
+  public long getCumulativeCpuTime() {
+    return 0;
+  }
+
+  @Override
+  public boolean checkPidPgrpidForMatch() {
+    return true;
+  }
+
+  @Override
+  public long getRssMemorySize(int olderThanAge) {
+    long rssMemory = this.rssMemorySize;
+    // First getter call will return with 500000000, and second call will
+    // return -1, rest of the calls will return a valid value.
+    if (rssMemory == memorySize) {
+      this.rssMemorySize = ResourceCalculatorProcessTree.UNAVAILABLE;
+    }
+    if (rssMemory == ResourceCalculatorProcessTree.UNAVAILABLE) {
+      this.rssMemorySize = 2 * memorySize;
+    }
+    return rssMemory;
+  }
+
+  @Override
+  public long getVirtualMemorySize(int olderThanAge) {
+    long virtualMemory = this.virtualMemorySize;
+    // First getter call will return with -1, and rest of the calls will
+    // return a valid value.
+    if (virtualMemory == ResourceCalculatorProcessTree.UNAVAILABLE) {
+      this.virtualMemorySize = 3 * memorySize;
+    }
+    return virtualMemory;
+  }
+
+  @Override
+  public float getCpuUsagePercent() {
+    return 0;
+  }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockResourceCalculatorProcessTree.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockResourceCalculatorProcessTree.java
index ff2a570a660..801895993fa 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockResourceCalculatorProcessTree.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockResourceCalculatorProcessTree.java
@@ -51,10 +51,16 @@ public class MockResourceCalculatorProcessTree extends ResourceCalculatorProcess
     this.rssMemorySize = rssMemorySize;
   }
 
+  @Override
   public long getRssMemorySize() {
     return this.rssMemorySize;
   }
 
+  @Override
+  public long getVirtualMemorySize() {
+    return 0;
+  }
+
   @Override
   public float getCpuUsagePercent() {
     return 0;
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
index 8aee532e414..a36b192e44b 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java
@@ -268,13 +268,24 @@ public class TestContainersMonitorResourceChange {
 
   @Test
   public void testContainersCPUResourceForDefaultValue() throws Exception {
+    testContainerMonitoringInvalidResources(
+        MockCPUResourceCalculatorProcessTree.class.getCanonicalName());
+  }
+
+  @Test
+  public void testContainersMemoryResourceUnavailable() throws Exception {
+    testContainerMonitoringInvalidResources(
+        MockMemoryResourceCalculatorProcessTree.class.getCanonicalName());
+  }
+
+  private void testContainerMonitoringInvalidResources(
+      String processTreeClassName) throws Exception {
     Configuration newConf = new Configuration(conf);
-    // set container monitor interval to be 20s
+    // set container monitor interval to be 20ms
     newConf.setLong(YarnConfiguration.NM_CONTAINER_MON_INTERVAL_MS, 20L);
     containersMonitor = createContainersMonitor(executor, dispatcher, context);
     newConf.set(YarnConfiguration.NM_CONTAINER_MON_PROCESS_TREE,
-        MockCPUResourceCalculatorProcessTree.class.getCanonicalName());
-    // set container monitor interval to be 20ms
+        processTreeClassName);
     containersMonitor.init(newConf);
     containersMonitor.start();
 
@@ -291,7 +302,7 @@ public class TestContainersMonitorResourceChange {
         0, containersMonitor.getContainersUtilization()
             .compareTo(ResourceUtilization.newInstance(0, 0, 0.0f)));
 
-    // Verify the container utilization value. Since atleast one round is done,
+    // Verify the container utilization value. Since at least one round is done,
     // we can expect a non-zero value for container utilization as
     // MockCPUResourceCalculatorProcessTree#getCpuUsagePercent will return 50.
     waitForContainerResourceUtilizationChange(containersMonitor, 100);
@@ -310,12 +321,13 @@ public class TestContainersMonitorResourceChange {
       }
 
       LOG.info(
-          "Monitor thread is waiting for resource utlization change.");
+          "Monitor thread is waiting for resource utilization change.");
       Thread.sleep(WAIT_MS_PER_LOOP);
       timeWaiting += WAIT_MS_PER_LOOP;
     }
 
-    assertTrue("Resource utilization is not changed from second run onwards",
+    assertTrue("Resource utilization is not changed after "
+            + timeoutMsecs / WAIT_MS_PER_LOOP + " updates",
         0 != containersMonitor.getContainersUtilization()
             .compareTo(ResourceUtilization.newInstance(0, 0, 0.0f)));
   }