HBASE-7351, HBASE-7399, HBASE-7406. Periodic health check chore (Vandana Ayyalasomayajula)
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1428141 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
60007bbe63
commit
a1079157ec
|
@ -741,7 +741,19 @@ public final class HConstants {
|
|||
Arrays.asList(new String[] { HREGION_LOGDIR_NAME, HREGION_OLDLOGDIR_NAME, CORRUPT_DIR_NAME,
|
||||
toString(META_TABLE_NAME), toString(ROOT_TABLE_NAME), SPLIT_LOGDIR_NAME,
|
||||
HBCK_SIDELINEDIR_NAME, HFILE_ARCHIVE_DIRECTORY }));
|
||||
|
||||
/** Health script related settings. */
|
||||
public static final String HEALTH_SCRIPT_LOC = "hbase.node.health.script.location";
|
||||
public static final String HEALTH_SCRIPT_TIMEOUT = "hbase.node.health.script.timeout";
|
||||
public static final String HEALTH_CHORE_WAKE_FREQ =
|
||||
"hbase.node.health.script.frequency";
|
||||
public static final long DEFAULT_HEALTH_SCRIPT_TIMEOUT = 60000;
|
||||
/**
|
||||
* The maximum number of health check failures a server can encounter consecutively.
|
||||
*/
|
||||
public static final String HEALTH_FAILURE_THRESHOLD =
|
||||
"hbase.node.health.failure.threshold";
|
||||
public static final int DEFAULT_HEALTH_FAILURE_THRESHOLD = 3;
|
||||
|
||||
private HConstants() {
|
||||
// Can't be instantiated with this ctor.
|
||||
}
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
#!/bin/bash
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This is an example script for checking the health of a node (master or region server).
|
||||
# The health chore script should essentially output a message containing "ERROR" at an undesirable
|
||||
# outcome of the checks in the script.
|
||||
|
||||
err=0;
|
||||
|
||||
function check_disks {
  # Checks every ext3 filesystem listed in /etc/fstab against /proc/mounts.
  #   - not mounted at all      -> "<mountpoint>(u)" is appended to the message
  #   - mounted read-only       -> "<mountpoint>(ro)" is appended to the message
  # Prints "disks ok" and returns 0 when nothing was flagged, otherwise prints
  # the collected message and returns 2.
  #
  # 'return' is used instead of 'exit' so that calling this function outside
  # of a command substitution does not terminate the whole script.
  local m fsdev msg_=""

  # Skip commented-out fstab entries so they are not reported as unmounted.
  for m in $(awk '$1 !~ /^#/ && $3~/ext3/ {printf" %s ",$2}' /etc/fstab) ; do
    fsdev=$(awk -v m="$m" '$2==m {print $1}' /proc/mounts)
    if [ -z "$fsdev" ] ; then
      msg_="$msg_ $m(u)"
    else
      # Match "ro" as a whole option anywhere in the comma-separated option
      # list (first, middle, last, or the only option).
      msg_="$msg_$(awk -v m="$m" '$2==m { if ( $4 ~ /(^|,)ro(,|$)/ ) {printf"%s(ro)",$2 } ; }' /proc/mounts)"
    fi
  done

  if [ -z "$msg_" ] ; then
    echo "disks ok" ; return 0
  else
    echo "$msg_" ; return 2
  fi

}
|
||||
|
||||
function check_link {
  # Walks the local SNMP interface table ("if" subtree) and looks for an
  # interface with ifType 6, admin status 1, oper status 1 and a speed of
  # exactly 1000000000. Prints "<ifDescr> ok" when one is found; otherwise
  # prints "check link" and awk exits with status 2. The function exits with
  # the status of the pipeline.
  /usr/bin/snmpwalk -t 5 -Oe -Oq -Os -v 1 -c public localhost if \
    | awk '
        {
          # $1 is split on "." into a column name and an interface index.
          split($1, key, ".")
          col = key[1]
          idx = key[2]
          if      ( col == "ifIndex" )       { ifIndex[idx] = $2 }
          else if ( col == "ifDescr" )       { ifDescr[idx] = $2 }
          else if ( col == "ifType" )        { ifType[idx] = $2 }
          else if ( col == "ifSpeed" )       { ifSpeed[idx] = $2 }
          else if ( col == "ifAdminStatus" ) { ifAdminStatus[idx] = $2 }
          else if ( col == "ifOperStatus" )  { ifOperStatus[idx] = $2 }
        }
        END {
          up = 0
          for ( i in ifIndex ) {
            if ( ifType[i] == 6 && ifAdminStatus[i] == 1 && ifOperStatus[i] == 1 && ifSpeed[i] == 1000000000 ) {
              up = i
            }
          }
          # No matching interface: report it and fail.
          if ( up == 0 ) { print "check link" ; exit 2 }
          print ifDescr[up],"ok"
        }'
  exit $?
}
|
||||
|
||||
# Run each check_<name> function and sort its message into the OK list or the
# error list according to the function's exit status.
for check in disks link ; do
  if msg=$(check_${check}) ; then
    ok_msg="${ok_msg}${msg},"
  else
    err_msg="${err_msg}${msg},"
  fi
done

# Print one summary line: errors first (prefixed with "ERROR"), then the
# checks that passed. The script itself always exits 0.
if [ -n "$err_msg" ] ; then
  echo -n "ERROR $err_msg "
fi
if [ -n "$ok_msg" ] ; then
  echo -n "OK: $ok_msg"
fi
echo
exit 0
|
|
@ -0,0 +1,98 @@
|
|||
/**
|
||||
* Copyright 2011 The Apache Software Foundation
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.Chore;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.Stoppable;
|
||||
import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
|
||||
/**
|
||||
* The Class HealthCheckChore for running health checker regularly.
|
||||
*/
|
||||
public class HealthCheckChore extends Chore {
|
||||
private static Log LOG = LogFactory.getLog(HealthCheckChore.class);
|
||||
private HealthChecker healthChecker;
|
||||
private Configuration config;
|
||||
private int threshold;
|
||||
private int numTimesUnhealthy = 0;
|
||||
private long failureWindow;
|
||||
private long startWindow;
|
||||
|
||||
public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) {
|
||||
super("HealthChecker", sleepTime, stopper);
|
||||
LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime));
|
||||
this.config = conf;
|
||||
String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC);
|
||||
long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT,
|
||||
HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT);
|
||||
healthChecker = new HealthChecker();
|
||||
healthChecker.init(healthCheckScript, scriptTimeout);
|
||||
this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD,
|
||||
HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD);
|
||||
this.failureWindow = this.threshold * sleepTime;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void chore() {
|
||||
HealthReport report = healthChecker.checkHealth();
|
||||
boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS);
|
||||
if (!isHealthy) {
|
||||
boolean needToStop = decideToStop();
|
||||
if (needToStop) {
|
||||
this.stopper.stop("The node reported unhealthy " + threshold
|
||||
+ " number of times consecutively.");
|
||||
}
|
||||
// Always log health report.
|
||||
LOG.info("Health status at " + StringUtils.formatTime(System.currentTimeMillis()) + " : "
|
||||
+ report.getHealthReport());
|
||||
}
|
||||
}
|
||||
|
||||
private boolean decideToStop() {
|
||||
boolean stop = false;
|
||||
if (numTimesUnhealthy == 0) {
|
||||
// First time we are seeing a failure. No need to stop, just
|
||||
// record the time.
|
||||
numTimesUnhealthy++;
|
||||
startWindow = System.currentTimeMillis();
|
||||
} else {
|
||||
if ((System.currentTimeMillis() - startWindow) < failureWindow) {
|
||||
numTimesUnhealthy++;
|
||||
if (numTimesUnhealthy == threshold) {
|
||||
stop = true;
|
||||
} else {
|
||||
stop = false;
|
||||
}
|
||||
} else {
|
||||
// Outside of failure window, so we reset to 1.
|
||||
numTimesUnhealthy = 1;
|
||||
startWindow = System.currentTimeMillis();
|
||||
stop = false;
|
||||
}
|
||||
}
|
||||
return stop;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,128 @@
|
|||
/**
|
||||
* Copyright 2011 The Apache Software Foundation
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.util.Shell.ExitCodeException;
|
||||
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
|
||||
|
||||
/**
|
||||
* A utility for executing an external script that checks the health of
|
||||
* the node. An example script can be found at
|
||||
* <tt>src/main/sh/healthcheck/healthcheck.sh</tt> in the
|
||||
* <tt>hbase-examples</tt> module.
|
||||
*/
|
||||
class HealthChecker {
|
||||
|
||||
private static Log LOG = LogFactory.getLog(HealthChecker.class);
|
||||
private ShellCommandExecutor shexec = null;
|
||||
private String exceptionStackTrace;
|
||||
|
||||
/** Pattern used for searching in the output of the node health script */
|
||||
static private final String ERROR_PATTERN = "ERROR";
|
||||
|
||||
private String healthCheckScript;
|
||||
private long scriptTimeout;
|
||||
|
||||
enum HealthCheckerExitStatus {
|
||||
SUCCESS,
|
||||
TIMED_OUT,
|
||||
FAILED_WITH_EXIT_CODE,
|
||||
FAILED_WITH_EXCEPTION,
|
||||
FAILED
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize.
|
||||
*
|
||||
* @param configuration
|
||||
*/
|
||||
public void init(String location, long timeout) {
|
||||
this.healthCheckScript = location;
|
||||
this.scriptTimeout = timeout;
|
||||
ArrayList<String> execScript = new ArrayList<String>();
|
||||
execScript.add(healthCheckScript);
|
||||
this.shexec = new ShellCommandExecutor(execScript.toArray(new String[execScript.size()]), null,
|
||||
null, scriptTimeout);
|
||||
LOG.info("HealthChecker initialized.");
|
||||
}
|
||||
|
||||
public HealthReport checkHealth() {
|
||||
HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS;
|
||||
try {
|
||||
shexec.execute();
|
||||
} catch (ExitCodeException e) {
|
||||
// ignore the exit code of the script
|
||||
LOG.warn("Caught exception : " + e);
|
||||
status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE;
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Caught exception : " + e);
|
||||
if (!shexec.isTimedOut()) {
|
||||
status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION;
|
||||
exceptionStackTrace = org.apache.hadoop.util.StringUtils.stringifyException(e);
|
||||
} else {
|
||||
status = HealthCheckerExitStatus.TIMED_OUT;
|
||||
}
|
||||
} finally {
|
||||
if (status == HealthCheckerExitStatus.SUCCESS) {
|
||||
if (hasErrors(shexec.getOutput())) {
|
||||
status = HealthCheckerExitStatus.FAILED;
|
||||
}
|
||||
}
|
||||
}
|
||||
return new HealthReport(status, getHealthReport(status));
|
||||
}
|
||||
|
||||
private boolean hasErrors(String output) {
|
||||
String[] splits = output.split("\n");
|
||||
for (String split : splits) {
|
||||
if (split.startsWith(ERROR_PATTERN)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private String getHealthReport(HealthCheckerExitStatus status){
|
||||
String healthReport = null;
|
||||
switch (status) {
|
||||
case SUCCESS:
|
||||
healthReport = "Server is healthy.";
|
||||
break;
|
||||
case TIMED_OUT:
|
||||
healthReport = "Health script timed out";
|
||||
break;
|
||||
case FAILED_WITH_EXCEPTION:
|
||||
healthReport = exceptionStackTrace;
|
||||
break;
|
||||
case FAILED_WITH_EXIT_CODE:
|
||||
healthReport = "Health script failed with exit code.";
|
||||
break;
|
||||
case FAILED:
|
||||
healthReport = shexec.getOutput();
|
||||
break;
|
||||
}
|
||||
return healthReport;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,88 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase;
|
||||
|
||||
import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
|
||||
|
||||
/**
|
||||
* The Class HealthReport containing information about health of the node.
|
||||
*/
|
||||
class HealthReport {
|
||||
|
||||
private HealthCheckerExitStatus status;
|
||||
private String healthReport;
|
||||
|
||||
HealthReport(HealthCheckerExitStatus status, String healthReport) {
|
||||
super();
|
||||
this.status = status;
|
||||
this.healthReport = healthReport;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the status of the region server.
|
||||
*
|
||||
* @return HealthCheckerExitStatus
|
||||
*/
|
||||
HealthCheckerExitStatus getStatus() {
|
||||
return status;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the health report of the region server.
|
||||
*
|
||||
* @return String
|
||||
*/
|
||||
String getHealthReport() {
|
||||
return healthReport;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((healthReport == null) ? 0 : healthReport.hashCode());
|
||||
result = prime * result + ((status == null) ? 0 : status.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (!(obj instanceof HealthReport)) {
|
||||
return false;
|
||||
}
|
||||
HealthReport other = (HealthReport) obj;
|
||||
if (healthReport == null) {
|
||||
if (other.healthReport != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!healthReport.equals(other.healthReport)) {
|
||||
return false;
|
||||
}
|
||||
if (status != other.status) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
|
@ -54,6 +54,7 @@ import org.apache.hadoop.hbase.HColumnDescriptor;
|
|||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.HRegionInfo;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.HealthCheckChore;
|
||||
import org.apache.hadoop.hbase.MasterAdminProtocol;
|
||||
import org.apache.hadoop.hbase.MasterMonitorProtocol;
|
||||
import org.apache.hadoop.hbase.MasterNotRunningException;
|
||||
|
@ -320,6 +321,9 @@ Server {
|
|||
|
||||
private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
|
||||
|
||||
/** The health check chore. */
|
||||
private HealthCheckChore healthCheckChore;
|
||||
|
||||
/**
|
||||
* Initializes the HMaster. The steps are as follows:
|
||||
* <p>
|
||||
|
@ -399,6 +403,13 @@ Server {
|
|||
this.masterCheckCompression = conf.getBoolean("hbase.master.check.compression", true);
|
||||
|
||||
this.metricsMaster = new MetricsMaster( new MetricsMasterWrapperImpl(this));
|
||||
|
||||
// Health checker thread.
|
||||
int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
|
||||
HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
|
||||
if (isHealthCheckerConfigured()) {
|
||||
healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1069,6 +1080,11 @@ Server {
|
|||
this.infoServer.start();
|
||||
}
|
||||
|
||||
// Start the health checker
|
||||
if (this.healthCheckChore != null) {
|
||||
Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker");
|
||||
}
|
||||
|
||||
// Start allowing requests to happen.
|
||||
this.rpcServer.openServer();
|
||||
this.rpcServerOpen = true;
|
||||
|
@ -1104,6 +1120,9 @@ Server {
|
|||
}
|
||||
}
|
||||
if (this.executorService != null) this.executorService.shutdown();
|
||||
if (this.healthCheckChore != null) {
|
||||
this.healthCheckChore.interrupt();
|
||||
}
|
||||
}
|
||||
|
||||
private static Thread getAndStartClusterStatusChore(HMaster master) {
|
||||
|
@ -2429,4 +2448,9 @@ Server {
|
|||
public HFileCleaner getHFileCleaner() {
|
||||
return this.hfileCleaner;
|
||||
}
|
||||
|
||||
private boolean isHealthCheckerConfigured() {
|
||||
String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC);
|
||||
return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -65,6 +65,7 @@ import org.apache.hadoop.hbase.DoNotRetryIOException;
|
|||
import org.apache.hadoop.hbase.FailedSanityCheckException;
|
||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.HealthCheckChore;
|
||||
import org.apache.hadoop.hbase.HRegionInfo;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.KeyValue;
|
||||
|
@ -387,6 +388,9 @@ public class HRegionServer implements ClientProtocol,
|
|||
// reference to the Thrift Server.
|
||||
volatile private HRegionThriftServer thriftServer;
|
||||
|
||||
/** The health check chore. */
|
||||
private HealthCheckChore healthCheckChore;
|
||||
|
||||
/**
|
||||
* The server name the Master sees us as. Its made from the hostname the
|
||||
* master passes us, port, and server startcode. Gets set after registration
|
||||
|
@ -809,6 +813,12 @@ public class HRegionServer implements ClientProtocol,
|
|||
".multiplier", 1000);
|
||||
this.compactionChecker = new CompactionChecker(this,
|
||||
this.threadWakeFrequency * multiplier, this);
|
||||
// Health checker thread.
|
||||
int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
|
||||
HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
|
||||
if (isHealthCheckerConfigured()) {
|
||||
healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
|
||||
}
|
||||
|
||||
this.leases = new Leases(this.threadWakeFrequency);
|
||||
|
||||
|
@ -924,6 +934,9 @@ public class HRegionServer implements ClientProtocol,
|
|||
if (this.hlogRoller != null) this.hlogRoller.interruptIfNecessary();
|
||||
if (this.compactionChecker != null)
|
||||
this.compactionChecker.interrupt();
|
||||
if (this.healthCheckChore != null) {
|
||||
this.healthCheckChore.interrupt();
|
||||
}
|
||||
|
||||
if (this.killed) {
|
||||
// Just skip out w/o closing regions. Used when testing.
|
||||
|
@ -1479,6 +1492,10 @@ public class HRegionServer implements ClientProtocol,
|
|||
handler);
|
||||
Threads.setDaemonThreadRunning(this.compactionChecker.getThread(), n +
|
||||
".compactionChecker", handler);
|
||||
if (this.healthCheckChore != null) {
|
||||
Threads
|
||||
.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker", handler);
|
||||
}
|
||||
|
||||
// Leases is not a Thread. Internally it runs a daemon thread. If it gets
|
||||
// an unhandled exception, it will just exit.
|
||||
|
@ -1703,6 +1720,9 @@ public class HRegionServer implements ClientProtocol,
|
|||
protected void join() {
|
||||
Threads.shutdown(this.compactionChecker.getThread());
|
||||
Threads.shutdown(this.cacheFlusher.getThread());
|
||||
if (this.healthCheckChore != null) {
|
||||
Threads.shutdown(this.healthCheckChore.getThread());
|
||||
}
|
||||
if (this.hlogRoller != null) {
|
||||
Threads.shutdown(this.hlogRoller.getThread());
|
||||
}
|
||||
|
@ -4012,4 +4032,9 @@ public class HRegionServer implements ClientProtocol,
|
|||
this.s = s;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isHealthCheckerConfigured() {
|
||||
String healthScriptLocation = this.conf.get(HConstants.HEALTH_SCRIPT_LOC);
|
||||
return org.apache.commons.lang.StringUtils.isNotBlank(healthScriptLocation);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,143 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.SmallTests;
|
||||
import org.apache.hadoop.hbase.Stoppable;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
|
||||
import org.junit.After;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
@Category(SmallTests.class)
|
||||
public class TestNodeHealthCheckChore {
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(TestNodeHealthCheckChore.class);
|
||||
private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
|
||||
private File healthScriptFile;
|
||||
|
||||
|
||||
@After
|
||||
public void cleanUp() throws IOException {
|
||||
// delete and recreate the test directory, ensuring a clean test dir between tests
|
||||
Path testDir = UTIL.getDataTestDir();
|
||||
FileSystem fs = UTIL.getTestFileSystem();
|
||||
fs.delete(testDir, true);
|
||||
fs.mkdirs(testDir);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHealthChecker() throws Exception {
|
||||
Configuration config = getConfForNodeHealthScript();
|
||||
config.addResource(healthScriptFile.getName());
|
||||
String location = healthScriptFile.getAbsolutePath();
|
||||
long timeout = config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT, 100);
|
||||
|
||||
String normalScript = "echo \"I am all fine\"";
|
||||
createScript(normalScript, true);
|
||||
HealthChecker checker = new HealthChecker();
|
||||
checker.init(location, timeout);
|
||||
HealthReport report = checker.checkHealth();
|
||||
assertTrue(report.getStatus() == HealthCheckerExitStatus.SUCCESS);
|
||||
LOG.info("Health Status:" + checker);
|
||||
|
||||
String errorScript = "echo ERROR\n echo \"Server not healthy\"";
|
||||
createScript(errorScript, true);
|
||||
report = checker.checkHealth();
|
||||
assertTrue(report.getStatus() == HealthCheckerExitStatus.FAILED);
|
||||
LOG.info("Health Status:" + report.getHealthReport());
|
||||
|
||||
String timeOutScript = "sleep 4\n echo\"I am fine\"";
|
||||
createScript(timeOutScript, true);
|
||||
report = checker.checkHealth();
|
||||
assertTrue(report.getStatus() == HealthCheckerExitStatus.TIMED_OUT);
|
||||
LOG.info("Health Status:" + report.getHealthReport());
|
||||
|
||||
healthScriptFile.delete();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRSHealthChore() throws Exception{
|
||||
Stoppable stop = new StoppableImplementation();
|
||||
Configuration conf = getConfForNodeHealthScript();
|
||||
String errorScript = "echo ERROR\n echo \"Server not healthy\"";
|
||||
createScript(errorScript, true);
|
||||
HealthCheckChore rsChore = new HealthCheckChore(100, stop, conf);
|
||||
//Default threshold is three.
|
||||
rsChore.chore();
|
||||
rsChore.chore();
|
||||
assertFalse("Stoppable must not be stopped.", stop.isStopped());
|
||||
rsChore.chore();
|
||||
assertTrue("Stoppable must have been stopped.", stop.isStopped());
|
||||
}
|
||||
|
||||
private void createScript(String scriptStr, boolean setExecutable)
|
||||
throws Exception {
|
||||
healthScriptFile.createNewFile();
|
||||
PrintWriter pw = new PrintWriter(new FileOutputStream(healthScriptFile));
|
||||
pw.println(scriptStr);
|
||||
pw.flush();
|
||||
pw.close();
|
||||
healthScriptFile.setExecutable(setExecutable);
|
||||
}
|
||||
|
||||
private Configuration getConfForNodeHealthScript() {
|
||||
Configuration conf = UTIL.getConfiguration();
|
||||
File tempDir = new File(UTIL.getDataTestDir().toString());
|
||||
tempDir.mkdirs();
|
||||
healthScriptFile = new File(tempDir.getAbsolutePath(), "HealthScript.sh");
|
||||
conf.set(HConstants.HEALTH_SCRIPT_LOC,
|
||||
healthScriptFile.getAbsolutePath());
|
||||
conf.setLong(HConstants.HEALTH_FAILURE_THRESHOLD, 3);
|
||||
conf.setLong(HConstants.HEALTH_SCRIPT_TIMEOUT, 100);
|
||||
return conf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple helper class that just keeps track of whether or not its stopped.
|
||||
*/
|
||||
private static class StoppableImplementation implements Stoppable {
|
||||
private volatile boolean stop = false;
|
||||
|
||||
@Override
|
||||
public void stop(String why) {
|
||||
this.stop = true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isStopped() {
|
||||
return this.stop;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue