YARN-6862. Nodemanager resource usage metrics sometimes are negative. Contributed by Benjamin Teke

This commit is contained in:
Szilard Nemeth 2021-12-17 14:51:41 +01:00
parent e7b1f87665
commit 357423b57a
5 changed files with 131 additions and 6 deletions

View File

@ -537,6 +537,14 @@ public class ContainersMonitorImpl extends AbstractService implements
pTree.updateProcessTree(); // update process-tree
long currentVmemUsage = pTree.getVirtualMemorySize();
long currentPmemUsage = pTree.getRssMemorySize();
if (currentVmemUsage < 0 || currentPmemUsage < 0) {
// YARN-6862/YARN-5021 If the container just exited or for
// another reason the physical/virtual memory is UNAVAILABLE (-1)
// the values shouldn't be aggregated.
LOG.info("Skipping monitoring container {} because "
+ "memory usage is not available.", containerId);
continue;
}
// if machine has 6 cores and 3 are used,
// cpuUsagePercentPerCore should be 300%

View File

@ -56,6 +56,16 @@ public class MockCPUResourceCalculatorProcessTree
return true;
}
/**
 * Reports a constant virtual memory size of zero.
 *
 * @param olderThanAge ignored by this mock
 * @return always {@code 0}
 */
@Override
public long getVirtualMemorySize(int olderThanAge) {
  return 0L;
}
/**
 * Reports a constant RSS memory size of zero.
 *
 * @param olderThanAge ignored by this mock
 * @return always {@code 0}
 */
@Override
public long getRssMemorySize(int olderThanAge) {
  return 0L;
}
@Override
public float getCpuUsagePercent() {
long cpu = this.cpuPercentage;

View File

@ -0,0 +1,89 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
/**
 * Test double for {@link ResourceCalculatorProcessTree} that hands out a
 * scripted sequence of memory readings, including
 * {@link ResourceCalculatorProcessTree#UNAVAILABLE}.
 *
 * <p>All CPU related getters report zero and the process tree itself is
 * never refreshed.
 */
public class MockMemoryResourceCalculatorProcessTree
    extends ResourceCalculatorProcessTree {

  /** Base value from which every valid reading of this mock is derived. */
  private static final long BASE_MEMORY_SIZE = 500000000L;

  /** Value the next RSS memory query will return. */
  private long nextRssMemory = BASE_MEMORY_SIZE;

  /** Value the next virtual memory query will return. */
  private long nextVirtualMemory = ResourceCalculatorProcessTree.UNAVAILABLE;

  /**
   * Creates the mock for the given root process.
   *
   * @param root identifier of the root process, handed unchanged to the
   *             superclass constructor
   */
  public MockMemoryResourceCalculatorProcessTree(String root) {
    super(root);
  }

  /** Does nothing; the readings of this mock are scripted. */
  @Override
  public void updateProcessTree() {
  }

  /**
   * @return always the empty string
   */
  @Override
  public String getProcessTreeDump() {
    return "";
  }

  /**
   * @return always {@code 0}
   */
  @Override
  public long getCumulativeCpuTime() {
    return 0L;
  }

  /**
   * @return always {@code true}
   */
  @Override
  public boolean checkPidPgrpidForMatch() {
    return true;
  }

  /**
   * Returns the scripted RSS reading and advances the script.
   *
   * <p>The first call yields 500000000, the second call yields
   * {@link ResourceCalculatorProcessTree#UNAVAILABLE}, and every later call
   * yields 1000000000.
   *
   * @param olderThanAge ignored by this mock
   * @return the reading scheduled for this call
   */
  @Override
  public long getRssMemorySize(int olderThanAge) {
    final long reported = nextRssMemory;
    if (reported == BASE_MEMORY_SIZE) {
      // The initial valid reading is followed by an unavailable one.
      nextRssMemory = ResourceCalculatorProcessTree.UNAVAILABLE;
    } else if (reported == ResourceCalculatorProcessTree.UNAVAILABLE) {
      // After the unavailable reading the mock stays at a valid value.
      nextRssMemory = 2 * BASE_MEMORY_SIZE;
    }
    return reported;
  }

  /**
   * Returns the scripted virtual memory reading and advances the script.
   *
   * <p>The first call yields
   * {@link ResourceCalculatorProcessTree#UNAVAILABLE}; every later call
   * yields 1500000000.
   *
   * @param olderThanAge ignored by this mock
   * @return the reading scheduled for this call
   */
  @Override
  public long getVirtualMemorySize(int olderThanAge) {
    if (nextVirtualMemory != ResourceCalculatorProcessTree.UNAVAILABLE) {
      return nextVirtualMemory;
    }
    // Only the very first query reports an unavailable value.
    nextVirtualMemory = 3 * BASE_MEMORY_SIZE;
    return ResourceCalculatorProcessTree.UNAVAILABLE;
  }

  /**
   * @return always {@code 0}
   */
  @Override
  public float getCpuUsagePercent() {
    return 0.0f;
  }
}

View File

@ -51,10 +51,16 @@ public class MockResourceCalculatorProcessTree extends ResourceCalculatorProcess
this.rssMemorySize = rssMemorySize;
}
/**
 * Returns the RSS memory size currently stored in this mock.
 *
 * @return the value of the {@code rssMemorySize} field
 */
@Override
public long getRssMemorySize() {
  final long currentRssMemory = rssMemorySize;
  return currentRssMemory;
}
/**
 * Reports a constant virtual memory size of zero.
 *
 * @return always {@code 0}
 */
@Override
public long getVirtualMemorySize() {
  return 0L;
}
@Override
public float getCpuUsagePercent() {
return 0;

View File

@ -282,13 +282,24 @@ public class TestContainersMonitorResourceChange {
/**
 * Runs the shared invalid-resource monitoring scenario with
 * {@code MockCPUResourceCalculatorProcessTree} as the process tree class.
 *
 * @throws Exception if the shared scenario fails
 */
@Test
public void testContainersCPUResourceForDefaultValue() throws Exception {
  final String processTreeClass =
      MockCPUResourceCalculatorProcessTree.class.getCanonicalName();
  testContainerMonitoringInvalidResources(processTreeClass);
}
/**
 * Runs the shared invalid-resource monitoring scenario with
 * {@code MockMemoryResourceCalculatorProcessTree} as the process tree class.
 *
 * @throws Exception if the shared scenario fails
 */
@Test
public void testContainersMemoryResourceUnavailable() throws Exception {
  final String processTreeClass =
      MockMemoryResourceCalculatorProcessTree.class.getCanonicalName();
  testContainerMonitoringInvalidResources(processTreeClass);
}
private void testContainerMonitoringInvalidResources(
String processTreeClassName) throws Exception {
Configuration newConf = new Configuration(conf);
// set container monitor interval to be 20s
// set container monitor interval to be 20ms
newConf.setLong(YarnConfiguration.NM_CONTAINER_MON_INTERVAL_MS, 20L);
containersMonitor = createContainersMonitor(executor, dispatcher, context);
newConf.set(YarnConfiguration.NM_CONTAINER_MON_PROCESS_TREE,
MockCPUResourceCalculatorProcessTree.class.getCanonicalName());
// set container monitor interval to be 20ms
processTreeClassName);
containersMonitor.init(newConf);
containersMonitor.start();
@ -305,7 +316,7 @@ public class TestContainersMonitorResourceChange {
0, containersMonitor.getContainersUtilization()
.compareTo(ResourceUtilization.newInstance(0, 0, 0.0f)));
// Verify the container utilization value. Since atleast one round is done,
// Verify the container utilization value. Since at least one round is done,
// we can expect a non-zero value for container utilization as
// MockCPUResourceCalculatorProcessTree#getCpuUsagePercent will return 50.
waitForContainerResourceUtilizationChange(containersMonitor, 100);
@ -324,12 +335,13 @@ public class TestContainersMonitorResourceChange {
}
LOG.info(
"Monitor thread is waiting for resource utlization change.");
"Monitor thread is waiting for resource utilization change.");
Thread.sleep(WAIT_MS_PER_LOOP);
timeWaiting += WAIT_MS_PER_LOOP;
}
assertTrue("Resource utilization is not changed from second run onwards",
assertTrue("Resource utilization is not changed after " +
timeoutMsecs / WAIT_MS_PER_LOOP + " updates",
0 != containersMonitor.getContainersUtilization()
.compareTo(ResourceUtilization.newInstance(0, 0, 0.0f)));
}