YARN-9923. Introduce HealthReporter interface to support multiple health checker files. Contributed by Adam Antal

This commit is contained in:
Szilard Nemeth 2019-12-15 17:28:04 +01:00
parent 7a87007545
commit 631dbbc6f2
36 changed files with 908 additions and 484 deletions

View File

@ -1966,24 +1966,37 @@ public class YarnConfiguration extends Configuration {
*/ */
public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0; public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0;
/** The health checker scripts. */
public static final String NM_HEALTH_CHECK_SCRIPTS =
NM_PREFIX + "health-checker.scripts";
public static final String[] DEFAULT_NM_HEALTH_CHECK_SCRIPTS = {"script"};
/** Frequency of running node health script.*/ /** Frequency of running node health script.*/
public static final String NM_HEALTH_CHECK_INTERVAL_MS = public static final String NM_HEALTH_CHECK_INTERVAL_MS =
NM_PREFIX + "health-checker.interval-ms"; NM_PREFIX + "health-checker.interval-ms";
public static final long DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS = 10 * 60 * 1000; public static final long DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS = 10 * 60 * 1000;
/** Health check time out period for all scripts.*/
public static final String NM_HEALTH_CHECK_TIMEOUT_MS =
NM_PREFIX + "health-checker.timeout-ms";
public static final long DEFAULT_NM_HEALTH_CHECK_TIMEOUT_MS =
2 * DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS;
/** Health check script time out period.*/ /** Health check script time out period.*/
public static final String NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS = public static final String NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE =
NM_PREFIX + "health-checker.script.timeout-ms"; NM_PREFIX + "health-checker.%s.timeout-ms";
public static final long DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS =
2 * DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS;
/** The health check script to run.*/ /** The health check script to run.*/
public static final String NM_HEALTH_CHECK_SCRIPT_PATH = public static final String NM_HEALTH_CHECK_SCRIPT_PATH_TEMPLATE =
NM_PREFIX + "health-checker.script.path"; NM_PREFIX + "health-checker.%s.path";
/** The arguments to pass to the health check script.*/ /** The arguments to pass to the health check script.*/
public static final String NM_HEALTH_CHECK_SCRIPT_OPTS = public static final String NM_HEALTH_CHECK_SCRIPT_OPTS_TEMPLATE =
NM_PREFIX + "health-checker.script.opts"; NM_PREFIX + "health-checker.%s.opts";
/** Frequency of running node health script. */
public static final String NM_HEALTH_CHECK_SCRIPT_INTERVAL_MS_TEMPLATE =
NM_PREFIX + "health-checker.%s.interval-ms";
/** The JVM options used on forking ContainerLocalizer process /** The JVM options used on forking ContainerLocalizer process
by container executor. */ by container executor. */

View File

@ -1617,27 +1617,21 @@
</property> </property>
<property> <property>
<description>Frequency of running node health script.</description> <description>The nodemanager health check scripts to run.</description>
<name>yarn.nodemanager.health-checker.interval-ms</name> <name>yarn.nodemanager.health-checker.scripts</name>
<value>600000</value> <value>script</value>
</property> </property>
<property> <property>
<description>Script time out period.</description> <description>Health check script time out period.</description>
<name>yarn.nodemanager.health-checker.script.timeout-ms</name> <name>yarn.nodemanager.health-checker.timeout-ms</name>
<value>1200000</value> <value>1200000</value>
</property> </property>
<property> <property>
<description>The health check script to run.</description> <description>Frequency of running node health scripts.</description>
<name>yarn.nodemanager.health-checker.script.path</name> <name>yarn.nodemanager.health-checker.interval-ms</name>
<value></value> <value>600000</value>
</property>
<property>
<description>The arguments to pass to the health check script.</description>
<name>yarn.nodemanager.health-checker.script.opts</name>
<value></value>
</property> </property>
<property> <property>

View File

@ -29,9 +29,11 @@ import java.util.Timer;
import java.util.TimerTask; import java.util.TimerTask;
import com.google.common.annotations.VisibleForTesting; import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.util.DiskChecker.DiskErrorException; import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.DiskValidator; import org.apache.hadoop.util.DiskValidator;
import org.apache.hadoop.util.DiskValidatorFactory; import org.apache.hadoop.util.DiskValidatorFactory;
import org.apache.hadoop.yarn.server.nodemanager.health.HealthReporter;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -42,7 +44,6 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalDirAllocator; import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
@ -54,7 +55,8 @@ import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
* directories of a node. This specifically manages nodemanager-local-dirs and * directories of a node. This specifically manages nodemanager-local-dirs and
* nodemanager-log-dirs by periodically checking their health. * nodemanager-log-dirs by periodically checking their health.
*/ */
public class LocalDirsHandlerService extends AbstractService { public class LocalDirsHandlerService extends AbstractService
implements HealthReporter {
private static final Logger LOG = private static final Logger LOG =
LoggerFactory.getLogger(LocalDirsHandlerService.class); LoggerFactory.getLogger(LocalDirsHandlerService.class);
@ -426,6 +428,11 @@ public class LocalDirsHandlerService extends AbstractService {
} }
@Override
public String getHealthReport() {
return getDisksHealthReport(false);
}
/** /**
* The minimum fraction of number of disks needed to be healthy for a node to * The minimum fraction of number of disks needed to be healthy for a node to
* be considered healthy in terms of disks is configured using * be considered healthy in terms of disks is configured using
@ -457,10 +464,20 @@ public class LocalDirsHandlerService extends AbstractService {
return true; return true;
} }
@Override
public boolean isHealthy() {
return areDisksHealthy();
}
public long getLastDisksCheckTime() { public long getLastDisksCheckTime() {
return lastDisksCheckTime; return lastDisksCheckTime;
} }
@Override
public long getLastHealthReportTime() {
return getLastDisksCheckTime();
}
public boolean isGoodLocalDir(String path) { public boolean isGoodLocalDir(String path) {
return isInGoodDirs(getLocalDirs(), path); return isInGoodDirs(getLocalDirs(), path);
} }

View File

@ -1,123 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.util.NodeHealthScriptRunner;
import java.util.Arrays;
import java.util.Collections;
/**
* The class which provides functionality of checking the health of the node and
* reporting back to the service for which the health checker has been asked to
* report.
*/
public class NodeHealthCheckerService extends CompositeService {
private NodeHealthScriptRunner nodeHealthScriptRunner;
private LocalDirsHandlerService dirsHandler;
private Exception nodeHealthException;
private long nodeHealthExceptionReportTime;
static final String SEPARATOR = ";";
public NodeHealthCheckerService(NodeHealthScriptRunner scriptRunner,
LocalDirsHandlerService dirHandlerService) {
super(NodeHealthCheckerService.class.getName());
nodeHealthScriptRunner = scriptRunner;
dirsHandler = dirHandlerService;
nodeHealthException = null;
nodeHealthExceptionReportTime = 0;
}
@Override
protected void serviceInit(Configuration conf) throws Exception {
if (nodeHealthScriptRunner != null) {
addService(nodeHealthScriptRunner);
}
addService(dirsHandler);
super.serviceInit(conf);
}
/**
* @return the reporting string of health of the node
*/
String getHealthReport() {
String scriptReport = Strings.emptyToNull(
nodeHealthScriptRunner == null ? null :
nodeHealthScriptRunner.getHealthReport());
String discReport =
Strings.emptyToNull(
dirsHandler.getDisksHealthReport(false));
String exceptionReport = Strings.emptyToNull(
nodeHealthException == null ? null :
nodeHealthException.getMessage());
return Joiner.on(SEPARATOR).skipNulls()
.join(scriptReport, discReport, exceptionReport);
}
/**
* @return <em>true</em> if the node is healthy
*/
boolean isHealthy() {
boolean scriptHealthy = nodeHealthScriptRunner == null ||
nodeHealthScriptRunner.isHealthy();
return nodeHealthException == null &&
scriptHealthy && dirsHandler.areDisksHealthy();
}
/**
* @return when the last time the node health status is reported
*/
long getLastHealthReportTime() {
return Collections.max(Arrays.asList(
dirsHandler.getLastDisksCheckTime(),
nodeHealthScriptRunner == null ? 0 :
nodeHealthScriptRunner.getLastReportedTime(),
nodeHealthExceptionReportTime));
}
/**
* @return the disk handler
*/
public LocalDirsHandlerService getDiskHandler() {
return dirsHandler;
}
/**
* @return the node health script runner
*/
NodeHealthScriptRunner getNodeHealthScriptRunner() {
return nodeHealthScriptRunner;
}
/**
* Report an exception to mark the node as unhealthy.
* @param ex the exception that makes the node unhealthy
*/
void reportException(Exception ex) {
nodeHealthException = ex;
nodeHealthExceptionReportTime = System.currentTimeMillis();
}
}

View File

@ -33,7 +33,7 @@ import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.JvmPauseMonitor; import org.apache.hadoop.util.JvmPauseMonitor;
import org.apache.hadoop.util.NodeHealthScriptRunner; import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.ShutdownHookManager; import org.apache.hadoop.util.ShutdownHookManager;
@ -347,27 +347,6 @@ public class NodeManager extends CompositeService
} }
} }
public static NodeHealthScriptRunner getNodeHealthScriptRunner(Configuration conf) {
String nodeHealthScript =
conf.get(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH);
if(!NodeHealthScriptRunner.shouldRun(nodeHealthScript)) {
LOG.info("Node Manager health check script is not available "
+ "or doesn't have execute permission, so not "
+ "starting the node health script runner.");
return null;
}
long nmCheckintervalTime = conf.getLong(
YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS,
YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS);
long scriptTimeout = conf.getLong(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS,
YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS);
String[] scriptArgs = conf.getStrings(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_OPTS, new String[] {});
return new NodeHealthScriptRunner(nodeHealthScript,
nmCheckintervalTime, scriptTimeout, scriptArgs);
}
@VisibleForTesting @VisibleForTesting
protected ResourcePluginManager createResourcePluginManager() { protected ResourcePluginManager createResourcePluginManager() {
return new ResourcePluginManager(); return new ResourcePluginManager();
@ -431,12 +410,9 @@ public class NodeManager extends CompositeService
// NodeManager level dispatcher // NodeManager level dispatcher
this.dispatcher = createNMDispatcher(); this.dispatcher = createNMDispatcher();
nodeHealthChecker = this.nodeHealthChecker = new NodeHealthCheckerService(dirsHandler);
new NodeHealthCheckerService(
getNodeHealthScriptRunner(conf), dirsHandler);
addService(nodeHealthChecker); addService(nodeHealthChecker);
((NMContext)context).setContainerExecutor(exec); ((NMContext)context).setContainerExecutor(exec);
((NMContext)context).setDeletionService(del); ((NMContext)context).setDeletionService(del);

View File

@ -86,6 +86,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider;

View File

@ -0,0 +1,61 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.health;
/**
* Simple {@link HealthReporter} implementation which reports whether a fatal
* exception has happened in the NodeManager.
*
* See the <code>reportException</code> call of
* {@link org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl}
*/
public class ExceptionReporter implements HealthReporter {
private Exception nodeHealthException;
private long nodeHealthExceptionReportTime;
ExceptionReporter() {
this.nodeHealthException = null;
this.nodeHealthExceptionReportTime = 0;
}
@Override
public synchronized boolean isHealthy() {
return nodeHealthException == null;
}
@Override
public synchronized String getHealthReport() {
return nodeHealthException == null ? null :
nodeHealthException.getMessage();
}
@Override
public synchronized long getLastHealthReportTime() {
return nodeHealthExceptionReportTime;
}
/**
* Report an exception to mark the node as unhealthy.
* @param ex the exception that makes the node unhealthy
*/
public synchronized void reportException(Exception ex) {
nodeHealthException = ex;
nodeHealthExceptionReportTime = System.currentTimeMillis();
}
}

View File

@ -0,0 +1,65 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.health;
/**
* Interface providing information about the health of a service.
*
* Associated pieces of information:
* <ul>
* <li>whether the service is healthy ({@link #isHealthy()})</li>
* <li>report of the healthiness ({@link #getHealthReport()})</li>
* <li>latest timestamp of the health check
* ({@link #getLastHealthReportTime()})</li>
* </ul>
*
* Classes implementing this interface are used in
* {@link NodeHealthCheckerService}.
*
* Developers are discouraged to implement new Java-based health scripts,
* they should rather try to implement it as a script and use the
* {@link NodeHealthScriptRunner} implementation.
*
* @see TimedHealthReporterService
* @see org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService
*/
public interface HealthReporter {
/**
* Gets whether the node is healthy or not.
*
* @return true if node is healthy
*/
boolean isHealthy();
/**
* Returns output from health check. If node is healthy then an empty string
* is returned.
*
* @return output from health check
*/
String getHealthReport();
/**
* Returns time stamp when node health check was last run.
*
* @return timestamp when node health script was last run
*/
long getLastHealthReportTime();
}

View File

@ -0,0 +1,164 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.health;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.service.Service;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
/**
* This class provides functionality of checking the health of a node and
* reporting back to the service for which the health checker has been asked to
* report.
*
* It is a {@link CompositeService}: every {@link Service} must be registered
* first in serviceInit, and should also implement the {@link HealthReporter}
* interface - otherwise an exception is thrown.
*
* Calling functions of HealthReporter merge its dependent
* services' reports.
*
* @see HealthReporter
* @see LocalDirsHandlerService
* @see TimedHealthReporterService
*/
public class NodeHealthCheckerService extends CompositeService
implements HealthReporter {
public static final Logger LOG =
LoggerFactory.getLogger(NodeHealthCheckerService.class);
private static final int MAX_SCRIPTS = 4;
private List<HealthReporter> reporters;
private LocalDirsHandlerService dirsHandler;
private ExceptionReporter exceptionReporter;
public static final String SEPARATOR = ";";
public NodeHealthCheckerService(
LocalDirsHandlerService dirHandlerService) {
super(NodeHealthCheckerService.class.getName());
this.reporters = new ArrayList<>();
this.dirsHandler = dirHandlerService;
this.exceptionReporter = new ExceptionReporter();
}
@Override
protected void serviceInit(Configuration conf) throws Exception {
reporters.add(exceptionReporter);
addHealthReporter(dirsHandler);
String[] configuredScripts = conf.getTrimmedStrings(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPTS,
YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_SCRIPTS);
if (configuredScripts.length > MAX_SCRIPTS) {
throw new IllegalArgumentException("Due to performance reasons " +
"running more than " + MAX_SCRIPTS + "scripts is not allowed.");
}
for (String configuredScript : configuredScripts) {
addHealthReporter(NodeHealthScriptRunner.newInstance(
configuredScript, conf));
}
super.serviceInit(conf);
}
/**
* Adds a {@link Service} implementing the {@link HealthReporter} interface,
* if that service has not been added to this {@link CompositeService} yet.
*
* @param service to add
* @throws Exception if not a {@link HealthReporter}
* implementation is provided to this function
*/
@VisibleForTesting
void addHealthReporter(Service service) throws Exception {
if (service != null) {
if (getServices().stream()
.noneMatch(x -> x.getName().equals(service.getName()))) {
if (!(service instanceof HealthReporter)) {
throw new Exception("Attempted to add service to " +
"NodeHealthCheckerService that does not implement " +
"HealthReporter.");
}
reporters.add((HealthReporter) service);
addService(service);
} else {
LOG.debug("Omitting duplicate service: {}.", service.getName());
}
}
}
/**
* Joining the health reports of the dependent services.
*
* @return the report string about the health of the node
*/
@Override
public String getHealthReport() {
ArrayList<String> reports = reporters.stream()
.map(reporter -> Strings.emptyToNull(reporter.getHealthReport()))
.collect(Collectors.toCollection(ArrayList::new));
return Joiner.on(SEPARATOR).skipNulls().join(reports);
}
/**
* @return <em>true</em> if the node is healthy
*/
@Override
public boolean isHealthy() {
return reporters.stream().allMatch(HealthReporter::isHealthy);
}
/**
* @return when the last time the node health status is reported
*/
@Override
public long getLastHealthReportTime() {
Optional<Long> max = reporters.stream()
.map(HealthReporter::getLastHealthReportTime).max(Long::compareTo);
return max.orElse(0L);
}
/**
* @return the disk handler
*/
public LocalDirsHandlerService getDiskHandler() {
return dirsHandler;
}
/**
* Propagating an exception to {@link ExceptionReporter}.
* @param exception the exception to propagate
*/
public void reportException(Exception exception) {
exceptionReporter.reportException(exception);
}
}

View File

@ -16,7 +16,7 @@
* limitations under the License. * limitations under the License.
*/ */
package org.apache.hadoop.util; package org.apache.hadoop.yarn.server.nodemanager.health;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -27,50 +27,94 @@ import java.util.TimerTask;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.util.Shell.ExitCodeException; import org.apache.hadoop.util.Shell.ExitCodeException;
import org.apache.hadoop.util.Shell.ShellCommandExecutor; import org.apache.hadoop.util.Shell.ShellCommandExecutor;
import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
/** /**
*
* The class which provides functionality of checking the health of the node * The class which provides functionality of checking the health of the node
* using the configured node health script and reporting back to the service * using the configured node health script and reporting back to the service
* for which the health checker has been asked to report. * for which the health checker has been asked to report.
*/ */
public class NodeHealthScriptRunner extends AbstractService { public class NodeHealthScriptRunner extends TimedHealthReporterService {
private static final Logger LOG = private static final Logger LOG =
LoggerFactory.getLogger(NodeHealthScriptRunner.class); LoggerFactory.getLogger(NodeHealthScriptRunner.class);
/** Absolute path to the health script. */ /** Absolute path to the health script. */
private String nodeHealthScript; private String nodeHealthScript;
/** Delay after which node health script to be executed */ /** Time after which the script should be timed out. */
private long intervalTime;
/** Time after which the script should be timedout */
private long scriptTimeout; private long scriptTimeout;
/** Timer used to schedule node health monitoring script execution */ /** ShellCommandExecutor used to execute monitoring script. */
private Timer nodeHealthScriptScheduler; private ShellCommandExecutor commandExecutor = null;
/** ShellCommandExecutor used to execute monitoring script */ /** Pattern used for searching in the output of the node health script. */
ShellCommandExecutor shexec = null; private static final String ERROR_PATTERN = "ERROR";
/** Pattern used for searching in the output of the node health script */ /** Time out error message. */
static private final String ERROR_PATTERN = "ERROR"; static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG =
"Node health script timed out";
/** Time out error message */ private NodeHealthScriptRunner(String scriptName, long checkInterval,
public static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG = "Node health script timed out"; long timeout, String[] scriptArgs) {
super(NodeHealthScriptRunner.class.getName(), checkInterval);
this.nodeHealthScript = scriptName;
this.scriptTimeout = timeout;
setTimerTask(new NodeHealthMonitorExecutor(scriptArgs));
}
private boolean isHealthy; public static NodeHealthScriptRunner newInstance(String scriptName,
Configuration conf) {
String nodeHealthScriptsConfig = String.format(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH_TEMPLATE, scriptName);
String nodeHealthScript = conf.get(nodeHealthScriptsConfig);
if (!shouldRun(scriptName, nodeHealthScript)) {
return null;
}
private String healthReport; // Determine check interval ms
String checkIntervalMsConfig = String.format(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_INTERVAL_MS_TEMPLATE,
scriptName);
long checkIntervalMs = conf.getLong(checkIntervalMsConfig, 0L);
if (checkIntervalMs == 0L) {
checkIntervalMs = conf.getLong(
YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS,
YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS);
}
if (checkIntervalMs < 0) {
throw new IllegalArgumentException("The node health-checker's " +
"interval-ms can not be set to a negative number.");
}
private long lastReportedTime; // Determine time out
String scriptTimeoutConfig = String.format(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE,
scriptName);
long scriptTimeout = conf.getLong(scriptTimeoutConfig, 0L);
if (scriptTimeout == 0L) {
scriptTimeout = conf.getLong(
YarnConfiguration.NM_HEALTH_CHECK_TIMEOUT_MS,
YarnConfiguration.DEFAULT_NM_HEALTH_CHECK_TIMEOUT_MS);
}
if (scriptTimeout <= 0) {
throw new IllegalArgumentException("The node health-checker's " +
"timeout can only be set to a positive number.");
}
private TimerTask timer; // Determine script arguments
String scriptArgsConfig = String.format(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_OPTS_TEMPLATE,
scriptName);
String[] scriptArgs = conf.getStrings(scriptArgsConfig, new String[]{});
return new NodeHealthScriptRunner(nodeHealthScript,
checkIntervalMs, scriptTimeout, scriptArgs);
}
private enum HealthCheckerExitStatus { private enum HealthCheckerExitStatus {
SUCCESS, SUCCESS,
@ -84,19 +128,17 @@ public class NodeHealthScriptRunner extends AbstractService {
/** /**
* Class which is used by the {@link Timer} class to periodically execute the * Class which is used by the {@link Timer} class to periodically execute the
* node health script. * node health script.
*
*/ */
private class NodeHealthMonitorExecutor extends TimerTask { private class NodeHealthMonitorExecutor extends TimerTask {
private String exceptionStackTrace = "";
String exceptionStackTrace = ""; NodeHealthMonitorExecutor(String[] args) {
public NodeHealthMonitorExecutor(String[] args) {
ArrayList<String> execScript = new ArrayList<String>(); ArrayList<String> execScript = new ArrayList<String>();
execScript.add(nodeHealthScript); execScript.add(nodeHealthScript);
if (args != null) { if (args != null) {
execScript.addAll(Arrays.asList(args)); execScript.addAll(Arrays.asList(args));
} }
shexec = new ShellCommandExecutor(execScript commandExecutor = new ShellCommandExecutor(execScript
.toArray(new String[execScript.size()]), null, null, scriptTimeout); .toArray(new String[execScript.size()]), null, null, scriptTimeout);
} }
@ -104,18 +146,18 @@ public class NodeHealthScriptRunner extends AbstractService {
public void run() { public void run() {
HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS; HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS;
try { try {
shexec.execute(); commandExecutor.execute();
} catch (ExitCodeException e) { } catch (ExitCodeException e) {
// ignore the exit code of the script // ignore the exit code of the script
status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE; status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE;
// On Windows, we will not hit the Stream closed IOException // On Windows, we will not hit the Stream closed IOException
// thrown by stdout buffered reader for timeout event. // thrown by stdout buffered reader for timeout event.
if (Shell.WINDOWS && shexec.isTimedOut()) { if (Shell.WINDOWS && commandExecutor.isTimedOut()) {
status = HealthCheckerExitStatus.TIMED_OUT; status = HealthCheckerExitStatus.TIMED_OUT;
} }
} catch (Exception e) { } catch (Exception e) {
LOG.warn("Caught exception : " + e.getMessage()); LOG.warn("Caught exception : " + e.getMessage());
if (!shexec.isTimedOut()) { if (!commandExecutor.isTimedOut()) {
status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION; status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION;
} else { } else {
status = HealthCheckerExitStatus.TIMED_OUT; status = HealthCheckerExitStatus.TIMED_OUT;
@ -123,7 +165,7 @@ public class NodeHealthScriptRunner extends AbstractService {
exceptionStackTrace = StringUtils.stringifyException(e); exceptionStackTrace = StringUtils.stringifyException(e);
} finally { } finally {
if (status == HealthCheckerExitStatus.SUCCESS) { if (status == HealthCheckerExitStatus.SUCCESS) {
if (hasErrors(shexec.getOutput())) { if (hasErrors(commandExecutor.getOutput())) {
status = HealthCheckerExitStatus.FAILED; status = HealthCheckerExitStatus.FAILED;
} }
} }
@ -141,7 +183,8 @@ public class NodeHealthScriptRunner extends AbstractService {
* The node is marked unhealthy if * The node is marked unhealthy if
* <ol> * <ol>
* <li>The node health script times out</li> * <li>The node health script times out</li>
* <li>The node health scripts output has a line which begins with ERROR</li> * <li>The node health scripts output has a line which begins
* with ERROR</li>
* <li>An exception is thrown while executing the script</li> * <li>An exception is thrown while executing the script</li>
* </ol> * </ol>
* If the script throws {@link IOException} or {@link ExitCodeException} the * If the script throws {@link IOException} or {@link ExitCodeException} the
@ -151,23 +194,23 @@ public class NodeHealthScriptRunner extends AbstractService {
* @param status * @param status
*/ */
void reportHealthStatus(HealthCheckerExitStatus status) { void reportHealthStatus(HealthCheckerExitStatus status) {
long now = System.currentTimeMillis();
switch (status) { switch (status) {
case SUCCESS: case SUCCESS:
setHealthStatus(true, "", now);
break;
case TIMED_OUT:
setHealthStatus(false, NODE_HEALTH_SCRIPT_TIMED_OUT_MSG);
break;
case FAILED_WITH_EXCEPTION:
setHealthStatus(false, exceptionStackTrace);
break;
case FAILED_WITH_EXIT_CODE: case FAILED_WITH_EXIT_CODE:
// see Javadoc above - we don't report bad health intentionally // see Javadoc above - we don't report bad health intentionally
setHealthStatus(true, "", now); setHealthyWithoutReport();
break;
case TIMED_OUT:
setUnhealthyWithReport(NODE_HEALTH_SCRIPT_TIMED_OUT_MSG);
break;
case FAILED_WITH_EXCEPTION:
setUnhealthyWithReport(exceptionStackTrace);
break; break;
case FAILED: case FAILED:
setHealthStatus(false, shexec.getOutput()); setUnhealthyWithReport(commandExecutor.getOutput());
break;
default:
LOG.warn("Unknown HealthCheckerExitStatus - ignored.");
break; break;
} }
} }
@ -175,8 +218,7 @@ public class NodeHealthScriptRunner extends AbstractService {
/** /**
* Method to check if the output string has line which begins with ERROR. * Method to check if the output string has line which begins with ERROR.
* *
* @param output * @param output the output of the node health script to process
* string
* @return true if output string has error pattern in it. * @return true if output string has error pattern in it.
*/ */
private boolean hasErrors(String output) { private boolean hasErrors(String output) {
@ -190,116 +232,21 @@ public class NodeHealthScriptRunner extends AbstractService {
} }
} }
public NodeHealthScriptRunner(String scriptName, long chkInterval, long timeout,
String[] scriptArgs) {
super(NodeHealthScriptRunner.class.getName());
this.lastReportedTime = System.currentTimeMillis();
this.isHealthy = true;
this.healthReport = "";
this.nodeHealthScript = scriptName;
this.intervalTime = chkInterval;
this.scriptTimeout = timeout;
this.timer = new NodeHealthMonitorExecutor(scriptArgs);
}
/*
* Method which initializes the values for the script path and interval time.
*/
@Override @Override
protected void serviceInit(Configuration conf) throws Exception { public void serviceStop() throws Exception {
super.serviceInit(conf); if (commandExecutor != null) {
} Process p = commandExecutor.getProcess();
/**
* Method used to start the Node health monitoring.
*
*/
@Override
protected void serviceStart() throws Exception {
nodeHealthScriptScheduler = new Timer("NodeHealthMonitor-Timer", true);
// Start the timer task immediately and
// then periodically at interval time.
nodeHealthScriptScheduler.scheduleAtFixedRate(timer, 0, intervalTime);
super.serviceStart();
}
/**
* Method used to terminate the node health monitoring service.
*
*/
@Override
protected void serviceStop() {
if (nodeHealthScriptScheduler != null) {
nodeHealthScriptScheduler.cancel();
}
if (shexec != null) {
Process p = shexec.getProcess();
if (p != null) { if (p != null) {
p.destroy(); p.destroy();
} }
} }
super.serviceStop();
} }
/** /**
* Gets the if the node is healthy or not * Method used to determine whether the {@link NodeHealthScriptRunner}
* * should be started or not.<p>
* @return true if node is healthy * Returns true if following conditions are met:
*/
public boolean isHealthy() {
return isHealthy;
}
/**
* Sets if the node is healthy or not considering disks' health also.
*
* @param isHealthy
* if or not node is healthy
*/
private synchronized void setHealthy(boolean isHealthy) {
this.isHealthy = isHealthy;
}
/**
* Returns output from health script. if node is healthy then an empty string
* is returned.
*
* @return output from health script
*/
public String getHealthReport() {
return healthReport;
}
/**
* Sets the health report from the node health script. Also set the disks'
* health info obtained from DiskHealthCheckerService.
*
* @param healthReport
*/
private synchronized void setHealthReport(String healthReport) {
this.healthReport = healthReport;
}
/**
* Returns time stamp when node health script was last run.
*
* @return timestamp when node health script was last run
*/
public long getLastReportedTime() {
return lastReportedTime;
}
/**
* Sets the last run time of the node health script.
*
* @param lastReportedTime
*/
private synchronized void setLastReportedTime(long lastReportedTime) {
this.lastReportedTime = lastReportedTime;
}
/**
* Method used to determine if or not node health monitoring service should be
* started or not. Returns true if following conditions are met:
* *
* <ol> * <ol>
* <li>Path to Node health check script is not empty</li> * <li>Path to Node health check script is not empty</li>
@ -308,32 +255,23 @@ public class NodeHealthScriptRunner extends AbstractService {
* *
* @return true if node health monitoring service can be started. * @return true if node health monitoring service can be started.
*/ */
public static boolean shouldRun(String healthScript) { static boolean shouldRun(String script, String healthScript) {
if (healthScript == null || healthScript.trim().isEmpty()) { if (healthScript == null || healthScript.trim().isEmpty()) {
LOG.info("Missing location for the node health check script \"{}\".",
script);
return false; return false;
} }
File f = new File(healthScript); File f = new File(healthScript);
return f.exists() && FileUtil.canExecute(f); if (!f.exists()) {
} LOG.warn("File {} for script \"{}\" does not exist.",
healthScript, script);
private synchronized void setHealthStatus(boolean isHealthy, String output) { return false;
LOG.info("health status being set as " + output); }
this.setHealthy(isHealthy); if (!FileUtil.canExecute(f)) {
this.setHealthReport(output); LOG.warn("File {} for script \"{}\" can not be executed.",
} healthScript, script);
return false;
private synchronized void setHealthStatus(boolean isHealthy, String output, }
long time) { return true;
LOG.info("health status being set as " + output);
this.setHealthStatus(isHealthy, output);
this.setLastReportedTime(time);
}
/**
* Used only by tests to access the timer task directly
* @return the timer task
*/
public TimerTask getTimerTask() {
return timer;
} }
} }

View File

@ -0,0 +1,146 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.health;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.service.AbstractService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Timer;
import java.util.TimerTask;
/**
* A {@link HealthReporter} skeleton for regularly checking a specific
* {@link TimerTask} and obtaining information about it.
*
* @see NodeHealthScriptRunner
*/
public abstract class TimedHealthReporterService extends AbstractService
implements HealthReporter {
private static final Logger LOG =
LoggerFactory.getLogger(TimedHealthReporterService.class);
private boolean isHealthy;
private String healthReport;
private long lastReportedTime;
private Timer timer;
private TimerTask task;
private long intervalMs;
TimedHealthReporterService(String name, long intervalMs) {
super(name);
this.isHealthy = true;
this.healthReport = "";
this.lastReportedTime = System.currentTimeMillis();
this.intervalMs = intervalMs;
}
@VisibleForTesting
void setTimerTask(TimerTask timerTask) {
task = timerTask;
}
@VisibleForTesting
TimerTask getTimerTask() {
return task;
}
/**
* Method used to start the health monitoring.
*/
@Override
public void serviceStart() throws Exception {
if (task == null) {
throw new Exception("Health reporting task hasn't been set!");
}
timer = new Timer("HealthReporterService-Timer", true);
timer.scheduleAtFixedRate(task, 0, intervalMs);
super.serviceStart();
}
/**
* Method used to terminate the health monitoring service.
*/
@Override
protected void serviceStop() throws Exception {
if (timer != null) {
timer.cancel();
}
super.serviceStop();
}
@Override
public boolean isHealthy() {
return isHealthy;
}
/**
* Sets if the node is healthy or not.
*
* @param healthy whether the node is healthy
*/
protected synchronized void setHealthy(boolean healthy) {
this.isHealthy = healthy;
}
@Override
public String getHealthReport() {
return healthReport;
}
/**
* Sets the health report from the node health check. Also set the disks'
* health info obtained from DiskHealthCheckerService.
*
* @param report report String
*/
private synchronized void setHealthReport(String report) {
this.healthReport = report;
}
@Override
public long getLastHealthReportTime() {
return lastReportedTime;
}
/**
* Sets the last run time of the node health check.
*
* @param lastReportedTime last reported time in long
*/
private synchronized void setLastReportedTime(long lastReportedTime) {
this.lastReportedTime = lastReportedTime;
}
synchronized void setHealthyWithoutReport() {
this.setHealthy(true);
this.setHealthReport("");
this.setLastReportedTime(System.currentTimeMillis());
}
synchronized void setUnhealthyWithReport(String output) {
LOG.info("Health status being set as: \"" + output + "\".");
this.setHealthy(false);
this.setHealthReport(output);
this.setLastReportedTime(System.currentTimeMillis());
}
}

View File

@ -38,6 +38,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRe
import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.NodeStatus;
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils;

View File

@ -37,6 +37,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatRe
import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl; import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl;
import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl; import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;

View File

@ -45,6 +45,7 @@ import org.apache.hadoop.yarn.server.api.ResourceTracker;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest; import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
@ -102,8 +103,8 @@ public class TestEventFlow {
DeletionService del = new DeletionService(exec); DeletionService del = new DeletionService(exec);
Dispatcher dispatcher = new AsyncDispatcher(); Dispatcher dispatcher = new AsyncDispatcher();
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( NodeHealthCheckerService healthChecker =
NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); new NodeHealthCheckerService(dirsHandler);
healthChecker.init(conf); healthChecker.init(conf);
NodeManagerMetrics metrics = NodeManagerMetrics.create(); NodeManagerMetrics metrics = NodeManagerMetrics.create();
NodeStatusUpdater nodeStatusUpdater = NodeStatusUpdater nodeStatusUpdater =

View File

@ -64,6 +64,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
import org.apache.hadoop.yarn.server.nodemanager.containermanager.deletion.task.FileDeletionMatcher; import org.apache.hadoop.yarn.server.nodemanager.containermanager.deletion.task.FileDeletionMatcher;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.util.Records; import org.apache.hadoop.yarn.util.Records;
import org.junit.After; import org.junit.After;
import org.junit.Assert; import org.junit.Assert;

View File

@ -86,6 +86,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManag
import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils;

View File

@ -66,6 +66,7 @@ import org.apache.hadoop.yarn.ipc.YarnRPC;
import org.apache.hadoop.yarn.security.NMTokenIdentifier; import org.apache.hadoop.yarn.security.NMTokenIdentifier;
import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.TestContainerManager;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.util.ConverterUtils;
import org.junit.After; import org.junit.After;

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.server.nodemanager; package org.apache.hadoop.yarn.server.nodemanager;
import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse; import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock; import static org.mockito.Mockito.mock;
@ -107,6 +108,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Ap
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
@ -1994,4 +1996,21 @@ public class TestNodeStatusUpdater extends NodeManagerTestBase {
} }
}; };
} }
@Test
public void testExceptionReported() {
nm = new NodeManager();
YarnConfiguration conf = new YarnConfiguration();
nm.init(conf);
NodeStatusUpdater nodeStatusUpdater = nm.getNodeStatusUpdater();
NodeHealthCheckerService nodeHealthChecker = nm.getNodeHealthChecker();
assertThat(nodeHealthChecker.isHealthy()).isTrue();
String message = "exception message";
Exception e = new Exception(message);
nodeStatusUpdater.reportException(e);
assertThat(nodeHealthChecker.isHealthy()).isFalse();
assertThat(nodeHealthChecker.getHealthReport()).isEqualTo(message);
}
} }

View File

@ -56,6 +56,7 @@ import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeAction;
import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.NodeStatus;
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider;
import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils;
import org.junit.After; import org.junit.After;

View File

@ -50,6 +50,7 @@ import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeAction;
import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.NodeStatus;
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider; import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider;
import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils;
import org.junit.After; import org.junit.After;

View File

@ -75,7 +75,7 @@ import org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.LocalRMInterface; import org.apache.hadoop.yarn.server.nodemanager.LocalRMInterface;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
@ -218,8 +218,7 @@ public abstract class BaseContainerManagerTest {
delSrvc.init(conf); delSrvc.init(conf);
dirsHandler = new LocalDirsHandlerService(); dirsHandler = new LocalDirsHandlerService();
nodeHealthChecker = new NodeHealthCheckerService( nodeHealthChecker = new NodeHealthCheckerService(dirsHandler);
NodeManager.getNodeHealthScriptRunner(conf), dirsHandler);
nodeHealthChecker.init(conf); nodeHealthChecker.init(conf);
containerManager = createContainerManager(delSrvc); containerManager = createContainerManager(delSrvc);
((NMContext)context).setContainerManager(containerManager); ((NMContext)context).setContainerManager(containerManager);

View File

@ -85,8 +85,7 @@ import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService; import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
@ -157,8 +156,7 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
delSrvc.init(conf); delSrvc.init(conf);
exec = createContainerExecutor(); exec = createContainerExecutor();
dirsHandler = new LocalDirsHandlerService(); dirsHandler = new LocalDirsHandlerService();
nodeHealthChecker = new NodeHealthCheckerService( nodeHealthChecker = new NodeHealthCheckerService(dirsHandler);
NodeManager.getNodeHealthScriptRunner(conf), dirsHandler);
nodeHealthChecker.init(conf); nodeHealthChecker.init(conf);
} }

View File

@ -30,7 +30,6 @@ import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase; import org.apache.hadoop.yarn.server.nodemanager.NodeManagerTestBase;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
@ -44,6 +43,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerChain;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.*; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.deviceframework.*;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.util.resource.ResourceUtils;
import org.apache.hadoop.yarn.util.resource.TestResourceUtils; import org.apache.hadoop.yarn.util.resource.TestResourceUtils;

View File

@ -0,0 +1,41 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.nodemanager.health;
import org.junit.Test;
import static org.assertj.core.api.Assertions.assertThat;
/**
* Tests for the {@link ExceptionReporter} class.
*/
public class TestExceptionReporter {
@Test
public void testUnhealthy() {
ExceptionReporter reporter = new ExceptionReporter();
assertThat(reporter.isHealthy()).isTrue();
assertThat(reporter.getLastHealthReportTime()).isZero();
String message = "test";
Exception exception = new Exception(message);
reporter.reportException(exception);
assertThat(reporter.isHealthy()).isFalse();
assertThat(reporter.getHealthReport()).isEqualTo(message);
assertThat(reporter.getLastHealthReportTime()).isNotEqualTo(0);
}
}

View File

@ -16,12 +16,15 @@
* limitations under the License. * limitations under the License.
*/ */
package org.apache.hadoop.yarn.server.nodemanager; package org.apache.hadoop.yarn.server.nodemanager.health;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.PrintWriter; import java.io.PrintWriter;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -31,7 +34,6 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.NodeHealthScriptRunner;
import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factories.RecordFactory;
@ -42,58 +44,73 @@ import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.fail;
import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.spy; import static org.mockito.Mockito.spy;
public class TestNodeHealthService { /**
* Test class for {@link NodeHealthCheckerService}.
*/
public class TestNodeHealthCheckerService {
private static volatile Logger LOG = private static final Logger LOG =
LoggerFactory.getLogger(TestNodeHealthService.class); LoggerFactory.getLogger(TestNodeHealthCheckerService.class);
protected static File testRootDir = new File("target", private static final File TEST_ROOT_DIR = new File("target",
TestNodeHealthService.class.getName() + "-localDir").getAbsoluteFile(); TestNodeHealthCheckerService.class.getName() + "-localDir")
.getAbsoluteFile();
final static File nodeHealthConfigFile = new File(testRootDir, private static final File NODE_HEALTH_CONFIG_FILE = new File(TEST_ROOT_DIR,
"modified-mapred-site.xml"); "modified-mapred-site.xml");
private File nodeHealthscriptFile = new File(testRootDir, private File nodeHealthScriptFile = new File(TEST_ROOT_DIR,
Shell.appendScriptExtension("failingscript")); Shell.appendScriptExtension("failingscript"));
@Before @Before
public void setup() { public void setup() {
testRootDir.mkdirs(); TEST_ROOT_DIR.mkdirs();
} }
@After @After
public void tearDown() throws Exception { public void tearDown() throws Exception {
if (testRootDir.exists()) { if (TEST_ROOT_DIR.exists()) {
FileContext.getLocalFSFileContext().delete( FileContext.getLocalFSFileContext().delete(
new Path(testRootDir.getAbsolutePath()), true); new Path(TEST_ROOT_DIR.getAbsolutePath()), true);
} }
} }
private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable) private void writeNodeHealthScriptFile() throws IOException,
throws IOException { InterruptedException {
PrintWriter pw = null; try (PrintWriter pw = new PrintWriter(
try { new FileOutputStream(nodeHealthScriptFile))) {
FileUtil.setWritable(nodeHealthscriptFile, true); FileUtil.chmod(nodeHealthScriptFile.getCanonicalPath(), "u+rwx");
FileUtil.setReadable(nodeHealthscriptFile, true); pw.println("");
pw = new PrintWriter(new FileOutputStream(nodeHealthscriptFile));
pw.println(scriptStr);
pw.flush(); pw.flush();
} finally {
pw.close();
} }
FileUtil.setExecutable(nodeHealthscriptFile, setExecutable);
} }
private Configuration getConfForNodeHealthScript() { private Configuration getConfForNodeHealthScript(String scriptName) {
Configuration conf = new Configuration(); Configuration conf = new Configuration();
conf.set(YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH, conf.set(YarnConfiguration.NM_HEALTH_CHECK_SCRIPTS, scriptName);
nodeHealthscriptFile.getAbsolutePath()); String timeoutConfig =
conf.setLong(YarnConfiguration.NM_HEALTH_CHECK_INTERVAL_MS, 500); String.format(
conf.setLong( YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE,
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS, 1000); scriptName);
conf.setLong(timeoutConfig, 1000L);
String intervalConfig =
String.format(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_INTERVAL_MS_TEMPLATE,
scriptName);
conf.setLong(intervalConfig, 500L);
String pathConfig =
String.format(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH_TEMPLATE,
scriptName);
conf.set(pathConfig, nodeHealthScriptFile.getAbsolutePath());
return conf; return conf;
} }
@ -109,16 +126,22 @@ public class TestNodeHealthService {
RecordFactory factory = RecordFactoryProvider.getRecordFactory(null); RecordFactory factory = RecordFactoryProvider.getRecordFactory(null);
NodeHealthStatus healthStatus = NodeHealthStatus healthStatus =
factory.newRecordInstance(NodeHealthStatus.class); factory.newRecordInstance(NodeHealthStatus.class);
Configuration conf = getConfForNodeHealthScript(); String scriptName = "test";
conf.writeXml(new FileOutputStream(nodeHealthConfigFile)); Configuration conf = getConfForNodeHealthScript(scriptName);
conf.addResource(nodeHealthConfigFile.getName()); conf.writeXml(new FileOutputStream(NODE_HEALTH_CONFIG_FILE));
writeNodeHealthScriptFile("", true); conf.addResource(NODE_HEALTH_CONFIG_FILE.getName());
writeNodeHealthScriptFile();
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
NodeHealthScriptRunner nodeHealthScriptRunner = NodeHealthScriptRunner nodeHealthScriptRunner =
spy(NodeManager.getNodeHealthScriptRunner(conf)); NodeHealthScriptRunner.newInstance(scriptName, conf);
NodeHealthCheckerService nodeHealthChecker = new NodeHealthCheckerService( if (nodeHealthScriptRunner == null) {
nodeHealthScriptRunner, dirsHandler); fail("Should have created NodeHealthScriptRunner instance");
}
nodeHealthScriptRunner = spy(nodeHealthScriptRunner);
NodeHealthCheckerService nodeHealthChecker =
new NodeHealthCheckerService(dirsHandler);
nodeHealthChecker.addHealthReporter(nodeHealthScriptRunner);
nodeHealthChecker.init(conf); nodeHealthChecker.init(conf);
doReturn(true).when(nodeHealthScriptRunner).isHealthy(); doReturn(true).when(nodeHealthScriptRunner).isHealthy();
@ -174,4 +197,63 @@ public class TestNodeHealthService {
.getDisksHealthReport(false)) .getDisksHealthReport(false))
))); )));
} }
private abstract class HealthReporterService extends AbstractService
implements HealthReporter {
HealthReporterService() {
super(HealthReporterService.class.getName());
}
}
@Test
public void testCustomHealthReporter() throws Exception {
String healthReport = "dummy health report";
HealthReporterService customHealthReporter = new HealthReporterService() {
private int counter = 0;
@Override
public boolean isHealthy() {
return counter++ % 2 == 0;
}
@Override
public String getHealthReport() {
return healthReport;
}
@Override
public long getLastHealthReportTime() {
return Long.MAX_VALUE;
}
};
Configuration conf = new Configuration();
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
NodeHealthCheckerService nodeHealthChecker =
new NodeHealthCheckerService(dirsHandler);
nodeHealthChecker.addHealthReporter(customHealthReporter);
nodeHealthChecker.init(conf);
assertThat(nodeHealthChecker.isHealthy()).isTrue();
assertThat(nodeHealthChecker.isHealthy()).isFalse();
assertThat(nodeHealthChecker.getHealthReport()).isEqualTo(healthReport);
assertThat(nodeHealthChecker.getLastHealthReportTime())
.isEqualTo(Long.MAX_VALUE);
}
@Test
public void testExceptionReported() {
Configuration conf = new Configuration();
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
NodeHealthCheckerService nodeHealthChecker =
new NodeHealthCheckerService(dirsHandler);
nodeHealthChecker.init(conf);
assertThat(nodeHealthChecker.isHealthy()).isTrue();
String message = "An exception was thrown.";
Exception exception = new Exception(message);
nodeHealthChecker.reportException(exception);
assertThat(nodeHealthChecker.isHealthy()).isFalse();
assertThat(nodeHealthChecker.getHealthReport()).isEqualTo(message);
}
} }

View File

@ -16,7 +16,7 @@
* limitations under the License. * limitations under the License.
*/ */
package org.apache.hadoop.util; package org.apache.hadoop.yarn.server.nodemanager.health;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
@ -28,14 +28,22 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.junit.After; import org.junit.After;
import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
/**
* Test class for {@link NodeHealthScriptRunner}.
*/
public class TestNodeHealthScriptRunner { public class TestNodeHealthScriptRunner {
protected static File testRootDir = new File("target", private static File testRootDir = new File("target",
TestNodeHealthScriptRunner.class.getName() + TestNodeHealthScriptRunner.class.getName() +
"-localDir").getAbsoluteFile(); "-localDir").getAbsoluteFile();
@ -55,8 +63,8 @@ public class TestNodeHealthScriptRunner {
} }
} }
private void writeNodeHealthScriptFile(String scriptStr, boolean setExecutable) private void writeNodeHealthScriptFile(String scriptStr,
throws IOException { boolean setExecutable) throws IOException {
PrintWriter pw = null; PrintWriter pw = null;
try { try {
FileUtil.setWritable(nodeHealthscriptFile, true); FileUtil.setWritable(nodeHealthscriptFile, true);
@ -70,20 +78,46 @@ public class TestNodeHealthScriptRunner {
FileUtil.setExecutable(nodeHealthscriptFile, setExecutable); FileUtil.setExecutable(nodeHealthscriptFile, setExecutable);
} }
private NodeHealthScriptRunner createNodeHealthScript() {
String scriptName = "custom";
YarnConfiguration conf = new YarnConfiguration();
conf.set(YarnConfiguration.NM_HEALTH_CHECK_SCRIPTS, scriptName);
String timeoutConfig =
String.format(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS_TEMPLATE,
scriptName);
conf.setLong(timeoutConfig, 1000L);
String intervalConfig =
String.format(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_INTERVAL_MS_TEMPLATE,
scriptName);
conf.setLong(intervalConfig, 500L);
String pathConfig =
String.format(
YarnConfiguration.NM_HEALTH_CHECK_SCRIPT_PATH_TEMPLATE,
scriptName);
conf.set(pathConfig, nodeHealthscriptFile.getAbsolutePath());
return NodeHealthScriptRunner.newInstance("custom", conf);
}
@Test @Test
public void testNodeHealthScriptShouldRun() throws IOException { public void testNodeHealthScriptShouldRun() throws IOException {
Assert.assertFalse("Node health script should start", assertFalse("Node health script should start",
NodeHealthScriptRunner.shouldRun( NodeHealthScriptRunner.shouldRun("script",
nodeHealthscriptFile.getAbsolutePath())); nodeHealthscriptFile.getAbsolutePath()));
writeNodeHealthScriptFile("", false); writeNodeHealthScriptFile("", false);
// Node health script should not start if the node health script is not // Node health script should not start if the node health script is not
// executable. // executable.
Assert.assertFalse("Node health script should start", assertFalse("Node health script should start",
NodeHealthScriptRunner.shouldRun( NodeHealthScriptRunner.shouldRun("script",
nodeHealthscriptFile.getAbsolutePath())); nodeHealthscriptFile.getAbsolutePath()));
writeNodeHealthScriptFile("", true); writeNodeHealthScriptFile("", true);
Assert.assertTrue("Node health script should start", assertTrue("Node health script should start",
NodeHealthScriptRunner.shouldRun( NodeHealthScriptRunner.shouldRun("script",
nodeHealthscriptFile.getAbsolutePath())); nodeHealthscriptFile.getAbsolutePath()));
} }
@ -92,54 +126,53 @@ public class TestNodeHealthScriptRunner {
String errorScript = "echo ERROR\n echo \"Tracker not healthy\""; String errorScript = "echo ERROR\n echo \"Tracker not healthy\"";
String normalScript = "echo \"I am all fine\""; String normalScript = "echo \"I am all fine\"";
String timeOutScript = String timeOutScript =
Shell.WINDOWS ? "@echo off\nping -n 4 127.0.0.1 >nul\necho \"I am fine\"" Shell.WINDOWS ?
: "sleep 4\necho \"I am fine\""; "@echo off\nping -n 4 127.0.0.1 >nul\necho \"I am fine\""
: "sleep 4\necho \"I am fine\"";
String exitCodeScript = "exit 127"; String exitCodeScript = "exit 127";
Configuration conf = new Configuration(); Configuration conf = new Configuration();
writeNodeHealthScriptFile(normalScript, true); writeNodeHealthScriptFile(normalScript, true);
NodeHealthScriptRunner nodeHealthScriptRunner = new NodeHealthScriptRunner( NodeHealthScriptRunner nodeHealthScriptRunner = createNodeHealthScript();
nodeHealthscriptFile.getAbsolutePath(),
500, 1000, new String[] {});
nodeHealthScriptRunner.init(conf); nodeHealthScriptRunner.init(conf);
TimerTask timerTask = nodeHealthScriptRunner.getTimerTask(); TimerTask timerTask = nodeHealthScriptRunner.getTimerTask();
timerTask.run(); timerTask.run();
// Normal Script runs successfully // Normal Script runs successfully
Assert.assertTrue("Node health status reported unhealthy", assertTrue("Node health status reported unhealthy",
nodeHealthScriptRunner.isHealthy()); nodeHealthScriptRunner.isHealthy());
Assert.assertEquals("", nodeHealthScriptRunner.getHealthReport()); assertTrue(nodeHealthScriptRunner.getHealthReport().isEmpty());
// Error script. // Error script.
writeNodeHealthScriptFile(errorScript, true); writeNodeHealthScriptFile(errorScript, true);
// Run timer // Run timer
timerTask.run(); timerTask.run();
Assert.assertFalse("Node health status reported healthy", assertFalse("Node health status reported healthy",
nodeHealthScriptRunner.isHealthy()); nodeHealthScriptRunner.isHealthy());
Assert.assertTrue( assertTrue(
nodeHealthScriptRunner.getHealthReport().contains("ERROR")); nodeHealthScriptRunner.getHealthReport().contains("ERROR"));
// Healthy script. // Healthy script.
writeNodeHealthScriptFile(normalScript, true); writeNodeHealthScriptFile(normalScript, true);
timerTask.run(); timerTask.run();
Assert.assertTrue("Node health status reported unhealthy", assertTrue("Node health status reported unhealthy",
nodeHealthScriptRunner.isHealthy()); nodeHealthScriptRunner.isHealthy());
Assert.assertEquals("", nodeHealthScriptRunner.getHealthReport()); assertTrue(nodeHealthScriptRunner.getHealthReport().isEmpty());
// Timeout script. // Timeout script.
writeNodeHealthScriptFile(timeOutScript, true); writeNodeHealthScriptFile(timeOutScript, true);
timerTask.run(); timerTask.run();
Assert.assertFalse("Node health status reported healthy even after timeout", assertFalse("Node health status reported healthy even after timeout",
nodeHealthScriptRunner.isHealthy()); nodeHealthScriptRunner.isHealthy());
Assert.assertEquals( assertEquals(
NodeHealthScriptRunner.NODE_HEALTH_SCRIPT_TIMED_OUT_MSG, NodeHealthScriptRunner.NODE_HEALTH_SCRIPT_TIMED_OUT_MSG,
nodeHealthScriptRunner.getHealthReport()); nodeHealthScriptRunner.getHealthReport());
// Exit code 127 // Exit code 127
writeNodeHealthScriptFile(exitCodeScript, true); writeNodeHealthScriptFile(exitCodeScript, true);
timerTask.run(); timerTask.run();
Assert.assertTrue("Node health status reported unhealthy", assertTrue("Node health status reported unhealthy",
nodeHealthScriptRunner.isHealthy()); nodeHealthScriptRunner.isHealthy());
Assert.assertEquals("", nodeHealthScriptRunner.getHealthReport()); assertEquals("", nodeHealthScriptRunner.getHealthReport());
} }
} }

View File

@ -43,7 +43,6 @@ import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.nativeio.NativeIO; import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.NodeHealthScriptRunner;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
@ -57,7 +56,6 @@ import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
@ -65,6 +63,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Cont
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.webapp.ContainerLogsPage.ContainersLogsBlock; import org.apache.hadoop.yarn.server.nodemanager.webapp.ContainerLogsPage.ContainersLogsBlock;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
@ -79,10 +78,9 @@ import com.google.inject.Module;
public class TestContainerLogsPage { public class TestContainerLogsPage {
private NodeHealthCheckerService createNodeHealthCheckerService(Configuration conf) { private NodeHealthCheckerService createNodeHealthCheckerService() {
NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner(conf);
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
return new NodeHealthCheckerService(scriptRunner, dirsHandler); return new NodeHealthCheckerService(dirsHandler);
} }
@Test(timeout=30000) @Test(timeout=30000)
@ -92,7 +90,7 @@ public class TestContainerLogsPage {
String logdirwithFile = absLogDir.toURI().toString(); String logdirwithFile = absLogDir.toURI().toString();
Configuration conf = new Configuration(); Configuration conf = new Configuration();
conf.set(YarnConfiguration.NM_LOG_DIRS, logdirwithFile); conf.set(YarnConfiguration.NM_LOG_DIRS, logdirwithFile);
NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); NodeHealthCheckerService healthChecker = createNodeHealthCheckerService();
healthChecker.init(conf); healthChecker.init(conf);
LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler();
NMContext nmContext = new NodeManager.NMContext(null, null, dirsHandler, NMContext nmContext = new NodeManager.NMContext(null, null, dirsHandler,
@ -215,7 +213,7 @@ public class TestContainerLogsPage {
"kerberos"); "kerberos");
UserGroupInformation.setConfiguration(conf); UserGroupInformation.setConfiguration(conf);
NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); NodeHealthCheckerService healthChecker = createNodeHealthCheckerService();
healthChecker.init(conf); healthChecker.init(conf);
LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler();
// Add an application and the corresponding containers // Add an application and the corresponding containers

View File

@ -20,14 +20,13 @@ package org.apache.hadoop.yarn.server.nodemanager.webapp;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.util.NodeHealthScriptRunner;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.ResourceView;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.eclipse.jetty.websocket.api.Session; import org.eclipse.jetty.websocket.api.Session;
import org.eclipse.jetty.websocket.api.UpgradeRequest; import org.eclipse.jetty.websocket.api.UpgradeRequest;
@ -105,8 +104,7 @@ public class TestNMContainerWebSocket {
}; };
conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());
NodeHealthCheckerService healthChecker = createNodeHealthCheckerService( NodeHealthCheckerService healthChecker = createNodeHealthCheckerService();
conf);
healthChecker.init(conf); healthChecker.init(conf);
LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler();
conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr);
@ -120,12 +118,9 @@ public class TestNMContainerWebSocket {
} }
} }
private NodeHealthCheckerService createNodeHealthCheckerService( private NodeHealthCheckerService createNodeHealthCheckerService() {
Configuration conf) {
NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner(
conf);
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
return new NodeHealthCheckerService(scriptRunner, dirsHandler); return new NodeHealthCheckerService(dirsHandler);
} }
@Test @Test

View File

@ -28,7 +28,6 @@ import java.io.Writer;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.util.NodeHealthScriptRunner;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerId;
@ -42,19 +41,19 @@ import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.ResourceView;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.junit.After; import org.junit.After;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
@ -79,10 +78,9 @@ public class TestNMWebServer {
FileUtil.fullyDelete(testLogDir); FileUtil.fullyDelete(testLogDir);
} }
private NodeHealthCheckerService createNodeHealthCheckerService(Configuration conf) { private NodeHealthCheckerService createNodeHealthCheckerService() {
NodeHealthScriptRunner scriptRunner = NodeManager.getNodeHealthScriptRunner(conf);
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
return new NodeHealthCheckerService(scriptRunner, dirsHandler); return new NodeHealthCheckerService(dirsHandler);
} }
private int startNMWebAppServer(String webAddr) { private int startNMWebAppServer(String webAddr) {
@ -113,7 +111,7 @@ public class TestNMWebServer {
}; };
conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());
NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); NodeHealthCheckerService healthChecker = createNodeHealthCheckerService();
healthChecker.init(conf); healthChecker.init(conf);
LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler();
conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr);
@ -176,7 +174,7 @@ public class TestNMWebServer {
}; };
conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());
NodeHealthCheckerService healthChecker = createNodeHealthCheckerService(conf); NodeHealthCheckerService healthChecker = createNodeHealthCheckerService();
healthChecker.init(conf); healthChecker.init(conf);
LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler();

View File

@ -47,7 +47,6 @@ import org.apache.hadoop.yarn.logaggregation.ContainerLogFileInfo;
import org.apache.hadoop.yarn.logaggregation.TestContainerLogsUtils; import org.apache.hadoop.yarn.logaggregation.TestContainerLogsUtils;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.ResourceView;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl;
@ -57,6 +56,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.AssignedGpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.AssignedGpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice; import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
@ -141,8 +141,8 @@ public class TestNMWebServices extends JerseyTestBase {
conf.set(YarnConfiguration.YARN_LOG_SERVER_WEBSERVICE_URL, conf.set(YarnConfiguration.YARN_LOG_SERVER_WEBSERVICE_URL,
LOGSERVICEWSADDR); LOGSERVICEWSADDR);
dirsHandler = new LocalDirsHandlerService(); dirsHandler = new LocalDirsHandlerService();
NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( NodeHealthCheckerService healthChecker =
NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); new NodeHealthCheckerService(dirsHandler);
healthChecker.init(conf); healthChecker.init(conf);
aclsManager = new ApplicationACLsManager(conf); aclsManager = new ApplicationACLsManager(conf);
nmContext = new NodeManager.NMContext(null, null, dirsHandler, nmContext = new NodeManager.NMContext(null, null, dirsHandler,

View File

@ -47,13 +47,13 @@ import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.ResourceView;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp;
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.AppsInfo; import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.AppsInfo;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
@ -104,8 +104,8 @@ public class TestNMWebServicesApps extends JerseyTestBase {
conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( NodeHealthCheckerService healthChecker =
NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); new NodeHealthCheckerService(dirsHandler);
healthChecker.init(conf); healthChecker.init(conf);
dirsHandler = healthChecker.getDiskHandler(); dirsHandler = healthChecker.getDiskHandler();
aclsManager = new ApplicationACLsManager(conf); aclsManager = new ApplicationACLsManager(conf);

View File

@ -44,11 +44,11 @@ import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.ResourceView;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices; import org.apache.hadoop.yarn.server.nodemanager.containermanager.AuxServices;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.records.AuxServiceRecord; import org.apache.hadoop.yarn.server.nodemanager.containermanager.records.AuxServiceRecord;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.webapp.GenericExceptionHandler; import org.apache.hadoop.yarn.webapp.GenericExceptionHandler;
@ -124,8 +124,8 @@ public class TestNMWebServicesAuxServices extends JerseyTestBase {
conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( NodeHealthCheckerService healthChecker =
NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); new NodeHealthCheckerService(dirsHandler);
healthChecker.init(conf); healthChecker.init(conf);
dirsHandler = healthChecker.getDiskHandler(); dirsHandler = healthChecker.getDiskHandler();
ApplicationACLsManager aclsManager = new ApplicationACLsManager(conf); ApplicationACLsManager aclsManager = new ApplicationACLsManager(conf);

View File

@ -28,7 +28,6 @@ import static org.junit.Assert.fail;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
@ -48,16 +47,15 @@ import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.ResourceView;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp; import org.apache.hadoop.yarn.server.nodemanager.webapp.WebServer.NMWebApp;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.webapp.GenericExceptionHandler; import org.apache.hadoop.yarn.webapp.GenericExceptionHandler;
import org.apache.hadoop.yarn.webapp.GuiceServletConfig; import org.apache.hadoop.yarn.webapp.GuiceServletConfig;
import org.apache.hadoop.yarn.webapp.JerseyTestBase; import org.apache.hadoop.yarn.webapp.JerseyTestBase;
@ -131,8 +129,8 @@ public class TestNMWebServicesContainers extends JerseyTestBase {
conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
NodeHealthCheckerService healthChecker = new NodeHealthCheckerService( NodeHealthCheckerService healthChecker =
NodeManager.getNodeHealthScriptRunner(conf), dirsHandler); new NodeHealthCheckerService(dirsHandler);
healthChecker.init(conf); healthChecker.init(conf);
dirsHandler = healthChecker.getDiskHandler(); dirsHandler = healthChecker.getDiskHandler();
aclsManager = new ApplicationACLsManager(conf); aclsManager = new ApplicationACLsManager(conf);

View File

@ -26,13 +26,12 @@ import javax.ws.rs.core.MediaType;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.http.JettyUtils; import org.apache.hadoop.http.JettyUtils;
import org.apache.hadoop.util.NodeHealthScriptRunner;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.ResourceView; import org.apache.hadoop.yarn.server.nodemanager.ResourceView;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
import org.junit.After; import org.junit.After;
import org.junit.Before; import org.junit.Before;
@ -54,12 +53,9 @@ public class TestNMWebTerminal {
private WebServer server; private WebServer server;
private int port; private int port;
private NodeHealthCheckerService createNodeHealthCheckerService( private NodeHealthCheckerService createNodeHealthCheckerService() {
Configuration conf) {
NodeHealthScriptRunner scriptRunner = NodeManager
.getNodeHealthScriptRunner(conf);
LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService(); LocalDirsHandlerService dirsHandler = new LocalDirsHandlerService();
return new NodeHealthCheckerService(scriptRunner, dirsHandler); return new NodeHealthCheckerService(dirsHandler);
} }
private int startNMWebAppServer(String webAddr) { private int startNMWebAppServer(String webAddr) {
@ -90,7 +86,7 @@ public class TestNMWebTerminal {
}; };
conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath());
conf.set(YarnConfiguration.NM_LOG_DIRS, TESTLOGDIR.getAbsolutePath()); conf.set(YarnConfiguration.NM_LOG_DIRS, TESTLOGDIR.getAbsolutePath());
healthChecker = createNodeHealthCheckerService(conf); healthChecker = createNodeHealthCheckerService();
healthChecker.init(conf); healthChecker.init(conf);
LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler(); LocalDirsHandlerService dirsHandler = healthChecker.getDiskHandler();
conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr); conf.set(YarnConfiguration.NM_WEBAPP_ADDRESS, webAddr);

View File

@ -70,7 +70,6 @@ import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.Context;
import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.DeletionService;
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl;
@ -80,8 +79,7 @@ import org.apache.hadoop.yarn.server.nodemanager.amrmproxy.RequestInterceptor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl;
import org.apache.hadoop.yarn.server.nodemanager.health.NodeHealthCheckerService;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceTrackerService; import org.apache.hadoop.yarn.server.resourcemanager.ResourceTrackerService;

View File

@ -42,9 +42,9 @@ The following configuration parameters can be used to modify the disk checks:
| `yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage` | Float between 0-100 | The maximum percentage of disk space that may be utilized before a disk is marked as unhealthy by the disk checker service. This check is run for every disk used by the NodeManager. The default value is 90 i.e. 90% of the disk can be used. | | `yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage` | Float between 0-100 | The maximum percentage of disk space that may be utilized before a disk is marked as unhealthy by the disk checker service. This check is run for every disk used by the NodeManager. The default value is 90 i.e. 90% of the disk can be used. |
| `yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb` | Integer | The minimum amount of free space that must be available on the disk for the disk checker service to mark the disk as healthy. This check is run for every disk used by the NodeManager. The default value is 0 i.e. the entire disk can be used. | | `yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb` | Integer | The minimum amount of free space that must be available on the disk for the disk checker service to mark the disk as healthy. This check is run for every disk used by the NodeManager. The default value is 0 i.e. the entire disk can be used. |
###External Health Script ### External Health Script
Users may specify their own health checker script that will be invoked by the health checker service. Users may specify a timeout as well as options to be passed to the script. If the script times out, results in an exception being thrown or outputs a line which begins with the string ERROR, the node is marked as unhealthy. Please note that: Users may specify their own health checker scripts that will be invoked by the health checker service. Users may specify a timeout as well as options to be passed to the script. If the script times out, results in an exception being thrown or outputs a line which begins with the string ERROR, the node is marked as unhealthy. Please note that:
* Exit code other than 0 is **not** considered to be a failure because it might have been caused by a syntax error. Therefore the node will **not** be marked as unhealthy. * Exit code other than 0 is **not** considered to be a failure because it might have been caused by a syntax error. Therefore the node will **not** be marked as unhealthy.
@ -52,15 +52,24 @@ Users may specify their own health checker script that will be invoked by the he
* Specifying a health check script is not mandatory. If no script is specified, only the disk checker status will be used to determine the health of the node. * Specifying a health check script is not mandatory. If no script is specified, only the disk checker status will be used to determine the health of the node.
The following configuration parameters can be used to set the health script: Users can specify up to 4 scripts to run individually with the `yarn.nodemanager.health-checker.scripts` configuration. Also these options can be configured for all scripts (global configurations):
| Configuration Name | Allowed Values | Description | | Configuration Name | Allowed Values | Description |
|:---- |:---- |:---- | |:---- |:---- |:---- |
| `yarn.nodemanager.health-checker.interval-ms` | Postive integer | The interval, in milliseconds, at which health checker service runs; the default value is 10 minutes. | |`yarn.nodemanager.health-checker.script`| String | The keywords for the health checker scripts separated by a comma. The default is "script". |
| `yarn.nodemanager.health-checker.script.timeout-ms` | Postive integer | The timeout for the health script that's executed; the default value is 20 minutes. | | `yarn.nodemanager.health-checker.interval-ms` | Positive integer | The interval, in milliseconds, at which health checker service runs; the default value is 10 minutes. |
| `yarn.nodemanager.health-checker.script.path` | String | Absolute path to the health check script to be run. | | `yarn.nodemanager.health-checker.timeout-ms` | Positive integer | The timeout for the health script that's executed; the default value is 20 minutes. |
| `yarn.nodemanager.health-checker.script.opts` | String | Arguments to be passed to the script when the script is executed. |
The following options can be set for every health checker script. The %s symbol is substituted with each keyword provided in `yarn.nodemanager.health-checker.script`.
| Configuration Name | Allowed Values | Description |
|:---- |:---- |:---- |
| `yarn.nodemanager.health-checker.%s.path` | String | Absolute path to the health check script to be run. Mandatory argument for each script. |
| `yarn.nodemanager.health-checker.%s.opts` | String | Arguments to be passed to the script when the script is executed. Mandatory argument for each script. |
| `yarn.nodemanager.health-checker.%s.interval-ms` | Positive integer | The interval, in milliseconds, at which health checker service runs. |
| `yarn.nodemanager.health-checker.%s.timeout-ms` | Positive integer | The timeout for the health script that's executed. |
The interval and timeout options are not required to be specified. In that case the global configurations will be used.
NodeManager Restart NodeManager Restart
------------------- -------------------