YARN-3534. Collect memory/cpu usage on the node. (Inigo Goiri via kasha)

This commit is contained in:
Karthik Kambatla 2015-08-16 06:24:16 -07:00
parent 8dfec7a197
commit def12933b3
9 changed files with 220 additions and 9 deletions

View File

@ -170,6 +170,8 @@ Release 2.8.0 - UNRELEASED
YARN-4023. Publish Application Priority to TimelineServer. (Sunil G YARN-4023. Publish Application Priority to TimelineServer. (Sunil G
via rohithsharmaks) via rohithsharmaks)
YARN-3534. Collect memory/cpu usage on the node. (Inigo Goiri via kasha)
IMPROVEMENTS IMPROVEMENTS
YARN-644. Basic null check is not performed on passed in arguments before YARN-644. Basic null check is not performed on passed in arguments before

View File

@ -960,12 +960,21 @@ private static void addDeprecatedKeys() {
public static final int DEFAULT_NM_WEBAPP_HTTPS_PORT = 8044; public static final int DEFAULT_NM_WEBAPP_HTTPS_PORT = 8044;
public static final String DEFAULT_NM_WEBAPP_HTTPS_ADDRESS = "0.0.0.0:" public static final String DEFAULT_NM_WEBAPP_HTTPS_ADDRESS = "0.0.0.0:"
+ DEFAULT_NM_WEBAPP_HTTPS_PORT; + DEFAULT_NM_WEBAPP_HTTPS_PORT;
/**
 * Configuration key for how often to monitor the resources of a node, in
 * milliseconds (see the "-ms" suffix of the key).
 */
public static final String NM_RESOURCE_MON_INTERVAL_MS =
NM_PREFIX + "resource-monitor.interval-ms";
/** Default node resource monitoring interval: 3000 milliseconds. */
public static final int DEFAULT_NM_RESOURCE_MON_INTERVAL_MS = 3000;
/** How often to monitor containers.*/ /** How often to monitor containers.*/
public final static String NM_CONTAINER_MON_INTERVAL_MS = public final static String NM_CONTAINER_MON_INTERVAL_MS =
NM_PREFIX + "container-monitor.interval-ms"; NM_PREFIX + "container-monitor.interval-ms";
@Deprecated
public final static int DEFAULT_NM_CONTAINER_MON_INTERVAL_MS = 3000; public final static int DEFAULT_NM_CONTAINER_MON_INTERVAL_MS = 3000;
/**
 * Configuration key naming the class that calculates the current resource
 * utilization.
 */
public static final String NM_MON_RESOURCE_CALCULATOR =
NM_PREFIX + "resource-calculator.class";
/** Class that calculates containers current resource utilization.*/ /** Class that calculates containers current resource utilization.*/
public static final String NM_CONTAINER_MON_RESOURCE_CALCULATOR = public static final String NM_CONTAINER_MON_RESOURCE_CALCULATOR =
NM_PREFIX + "container-monitor.resource-calculator.class"; NM_PREFIX + "container-monitor.resource-calculator.class";

View File

@ -1235,13 +1235,26 @@
</property> </property>
<property> <property>
<description>How often to monitor containers.</description> <description>How often to monitor the node and the containers.</description>
<name>yarn.nodemanager.container-monitor.interval-ms</name> <name>yarn.nodemanager.resource-monitor.interval-ms</name>
<value>3000</value> <value>3000</value>
</property> </property>
<property> <property>
<description>Class that calculates containers current resource utilization.</description> <description>Class that calculates current resource utilization.</description>
<name>yarn.nodemanager.resource-calculator.class</name>
</property>
<property>
<description>How often to monitor containers. If not set, the value for
yarn.nodemanager.resource-monitor.interval-ms will be used.</description>
<name>yarn.nodemanager.container-monitor.interval-ms</name>
</property>
<property>
<description>Class that calculates the containers' current resource utilization.
If not set, the value for yarn.nodemanager.resource-calculator.class will
be used.</description>
<name>yarn.nodemanager.container-monitor.resource-calculator.class</name> <name>yarn.nodemanager.container-monitor.resource-calculator.class</name>
</property> </property>

View File

@ -19,7 +19,15 @@
package org.apache.hadoop.yarn.server.nodemanager; package org.apache.hadoop.yarn.server.nodemanager;
import org.apache.hadoop.service.Service; import org.apache.hadoop.service.Service;
import org.apache.hadoop.yarn.server.api.records.ResourceUtilization;
/**
* Interface for monitoring the resources of a node.
*/
public interface NodeResourceMonitor extends Service { public interface NodeResourceMonitor extends Service {
/**
 * Report the most recent <em>resource utilization</em> known for the node.
 * @return the node's <em>resource utilization</em>.
 */
ResourceUtilization getUtilization();
} }

View File

@ -18,13 +18,153 @@
package org.apache.hadoop.yarn.server.nodemanager; package org.apache.hadoop.yarn.server.nodemanager;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.AbstractService; import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.api.records.ResourceUtilization;
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
/**
* Implementation of the node resource monitor. It periodically tracks the
* resource utilization of the node and reports it to the NM.
*/
public class NodeResourceMonitorImpl extends AbstractService implements public class NodeResourceMonitorImpl extends AbstractService implements
NodeResourceMonitor { NodeResourceMonitor {
/** Logger shared by this service and its monitoring thread. */
final static Log LOG = LogFactory
.getLog(NodeResourceMonitorImpl.class);
/**
 * Time to wait between two samples of the node resource utilization.
 * Read from the configuration in serviceInit and passed to Thread.sleep
 * by the monitoring thread, so it is expressed in milliseconds.
 */
private long monitoringInterval;
/**
 * Thread that samples the node resource utilization. It is created in the
 * constructor and only started in serviceStart when monitoring is enabled.
 */
private MonitoringThread monitoringThread;
/**
 * Plugin used to read memory and CPU figures. When it is null, isEnabled
 * reports false and the monitoring thread is not started.
 */
private ResourceCalculatorPlugin resourceCalculatorPlugin;
/**
 * Current <em>resource utilization</em> of the node.
 * <p>
 * Written by the monitoring thread and read by callers of
 * {@link #getUtilization()} on other threads, so it is volatile to make
 * each new sample visible to readers. It stays null until the first sample
 * has been stored.
 */
private volatile ResourceUtilization nodeUtilization;
/**
* Initialize the node resource monitor.
*/
public NodeResourceMonitorImpl() { public NodeResourceMonitorImpl() {
super(NodeResourceMonitorImpl.class.getName()); super(NodeResourceMonitorImpl.class.getName());
this.monitoringThread = new MonitoringThread();
} }
/**
 * Initialize the service with the proper parameters.
 * <p>
 * Reads the monitoring interval and instantiates the resource calculator
 * plugin configured for the node. The plugin may end up null, in which case
 * {@link #isEnabled()} reports false and no monitoring is performed.
 *
 * @param conf configuration to read the monitor settings from.
 * @throws Exception if the parent initialization fails.
 */
@Override
protected void serviceInit(Configuration conf) throws Exception {
  this.monitoringInterval =
      conf.getLong(YarnConfiguration.NM_RESOURCE_MON_INTERVAL_MS,
          YarnConfiguration.DEFAULT_NM_RESOURCE_MON_INTERVAL_MS);

  Class<? extends ResourceCalculatorPlugin> clazz =
      conf.getClass(YarnConfiguration.NM_MON_RESOURCE_CALCULATOR, null,
          ResourceCalculatorPlugin.class);
  this.resourceCalculatorPlugin =
      ResourceCalculatorPlugin.getResourceCalculatorPlugin(clazz, conf);

  LOG.info(" Using ResourceCalculatorPlugin : "
      + this.resourceCalculatorPlugin);

  // Chain to the parent like serviceStart/serviceStop do, so the base
  // service sees the same lifecycle callbacks as the other phases.
  super.serviceInit(conf);
}
/**
 * Tell whether node resource monitoring can run.
 * Monitoring needs a resource calculator plugin; when none is available an
 * informational message is logged on every call.
 * @return <em>true</em> if a resource calculator plugin is available,
 *         <em>false</em> otherwise.
 */
private boolean isEnabled() {
  if (resourceCalculatorPlugin != null) {
    return true;
  }
  LOG.info("ResourceCalculatorPlugin is unavailable on this system. "
      + this.getClass().getName() + " is disabled.");
  return false;
}
/**
 * Launch the monitoring thread, provided monitoring is enabled, and then
 * let the parent service complete its own start-up.
 * @throws Exception if the parent start-up fails.
 */
@Override
protected void serviceStart() throws Exception {
  boolean canMonitor = isEnabled();
  if (canMonitor) {
    monitoringThread.start();
  }
  super.serviceStart();
}
/**
 * Stop the thread that does the node resource utilization monitoring.
 * <p>
 * The monitoring thread is interrupted and then given up to 10 seconds to
 * finish. If the stopping thread is itself interrupted while waiting, the
 * wait is abandoned and the interrupt status is restored so that callers
 * further up the stack can still observe the interruption.
 *
 * @throws Exception if the parent shutdown fails.
 */
@Override
protected void serviceStop() throws Exception {
  if (this.isEnabled()) {
    this.monitoringThread.interrupt();
    try {
      this.monitoringThread.join(10 * 1000);
    } catch (InterruptedException e) {
      LOG.warn("Could not wait for the thread to join");
      // Catching InterruptedException clears the flag; set it again
      // instead of silently swallowing the interruption.
      Thread.currentThread().interrupt();
    }
  }
  super.serviceStop();
}
/**
 * Background thread that samples the resource utilization of this node at
 * a fixed interval and stores the latest sample in the enclosing monitor.
 */
private class MonitoringThread extends Thread {
  /**
   * Create the daemon thread used for node resource monitoring.
   */
  public MonitoringThread() {
    super("Node Resource Monitor");
    this.setDaemon(true);
  }

  /**
   * Sample the node utilization, sleep for the monitoring interval, and
   * repeat until the thread is interrupted while sleeping.
   */
  @Override
  public void run() {
    boolean interrupted = false;
    while (!interrupted) {
      // Publish the latest node utilization sample
      nodeUtilization = sampleUtilization();

      try {
        Thread.sleep(monitoringInterval);
      } catch (InterruptedException e) {
        LOG.warn(NodeResourceMonitorImpl.class.getName()
            + " is interrupted. Exiting.");
        interrupted = true;
      }
    }
  }

  /**
   * Read the current memory and CPU figures from the resource calculator.
   * @return a new utilization record with used physical and virtual memory
   *         in MB and the CPU usage as reported by the plugin.
   */
  private ResourceUtilization sampleUtilization() {
    // Used memory = total - available, for physical then virtual memory
    long usedPhysicalBytes =
        resourceCalculatorPlugin.getPhysicalMemorySize()
            - resourceCalculatorPlugin.getAvailablePhysicalMemorySize();
    long usedVirtualBytes =
        resourceCalculatorPlugin.getVirtualMemorySize()
            - resourceCalculatorPlugin.getAvailableVirtualMemorySize();
    // Original note: "1 CPU at 100% is 1".
    // NOTE(review): confirm getCpuUsage() really reports on that scale.
    float cpuUsage = resourceCalculatorPlugin.getCpuUsage();

    int usedPhysicalMb = (int) (usedPhysicalBytes >> 20); // B -> MB
    int usedVirtualMb = (int) (usedVirtualBytes >> 20); // B -> MB
    return ResourceUtilization.newInstance(
        usedPhysicalMb, usedVirtualMb, cpuUsage);
  }
}
/**
 * Return the latest <em>resource utilization</em> sample of the node.
 * No sample exists until the monitoring thread has stored one, so this may
 * be null (for example when monitoring is disabled and the thread never
 * runs).
 * @return the latest <em>resource utilization</em> of the node.
 */
@Override
public ResourceUtilization getUtilization() {
  return nodeUtilization;
}
} }

View File

@ -100,10 +100,14 @@ public ContainersMonitorImpl(ContainerExecutor exec,
protected void serviceInit(Configuration conf) throws Exception { protected void serviceInit(Configuration conf) throws Exception {
this.monitoringInterval = this.monitoringInterval =
conf.getLong(YarnConfiguration.NM_CONTAINER_MON_INTERVAL_MS, conf.getLong(YarnConfiguration.NM_CONTAINER_MON_INTERVAL_MS,
YarnConfiguration.DEFAULT_NM_CONTAINER_MON_INTERVAL_MS); conf.getLong(YarnConfiguration.NM_RESOURCE_MON_INTERVAL_MS,
YarnConfiguration.DEFAULT_NM_RESOURCE_MON_INTERVAL_MS));
Class<? extends ResourceCalculatorPlugin> clazz = Class<? extends ResourceCalculatorPlugin> clazz =
conf.getClass(YarnConfiguration.NM_CONTAINER_MON_RESOURCE_CALCULATOR, null, conf.getClass(YarnConfiguration.NM_CONTAINER_MON_RESOURCE_CALCULATOR,
conf.getClass(
YarnConfiguration.NM_MON_RESOURCE_CALCULATOR, null,
ResourceCalculatorPlugin.class),
ResourceCalculatorPlugin.class); ResourceCalculatorPlugin.class);
this.resourceCalculatorPlugin = this.resourceCalculatorPlugin =
ResourceCalculatorPlugin.getResourceCalculatorPlugin(clazz, conf); ResourceCalculatorPlugin.getResourceCalculatorPlugin(clazz, conf);

View File

@ -0,0 +1,35 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
import org.junit.Test;
/**
 * Smoke test for the node resource monitor.
 */
public class TestNodeResourceMonitor extends BaseContainerManagerTest {

  /**
   * Build the test on top of the base container manager test fixture.
   * @throws UnsupportedFileSystemException declared by this constructor;
   *         presumably raised by the superclass constructor.
   */
  public TestNodeResourceMonitor() throws UnsupportedFileSystemException {
    // The superclass no-argument constructor is invoked implicitly.
  }

  /**
   * Check that a node resource monitor can be constructed. The instance is
   * neither initialized nor started here.
   */
  @Test
  public void testNodeResourceMonitor() {
    NodeResourceMonitor monitor = new NodeResourceMonitorImpl();
  }
}

View File

@ -125,7 +125,7 @@ public TestContainerLaunch() throws UnsupportedFileSystemException {
@Before @Before
public void setup() throws IOException { public void setup() throws IOException {
conf.setClass( conf.setClass(
YarnConfiguration.NM_CONTAINER_MON_RESOURCE_CALCULATOR, YarnConfiguration.NM_MON_RESOURCE_CALCULATOR,
LinuxResourceCalculatorPlugin.class, ResourceCalculatorPlugin.class); LinuxResourceCalculatorPlugin.class, ResourceCalculatorPlugin.class);
super.setup(); super.setup();
} }

View File

@ -86,7 +86,7 @@ public TestContainersMonitor() throws UnsupportedFileSystemException {
@Before @Before
public void setup() throws IOException { public void setup() throws IOException {
conf.setClass( conf.setClass(
YarnConfiguration.NM_CONTAINER_MON_RESOURCE_CALCULATOR, YarnConfiguration.NM_MON_RESOURCE_CALCULATOR,
LinuxResourceCalculatorPlugin.class, ResourceCalculatorPlugin.class); LinuxResourceCalculatorPlugin.class, ResourceCalculatorPlugin.class);
conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, true); conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, true);
super.setup(); super.setup();