HBASE-7351, HBASE-7399, HBASE-7406. Periodic health check chore (Vandana Ayyalasomayajula)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1428141 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andrew Kyle Purtell 2013-01-03 02:12:24 +00:00
parent 60007bbe63
commit a1079157ec
8 changed files with 603 additions and 1 deletions

View File

@ -741,7 +741,19 @@ public final class HConstants {
Arrays.asList(new String[] { HREGION_LOGDIR_NAME, HREGION_OLDLOGDIR_NAME, CORRUPT_DIR_NAME,
toString(META_TABLE_NAME), toString(ROOT_TABLE_NAME), SPLIT_LOGDIR_NAME,
HBCK_SIDELINEDIR_NAME, HFILE_ARCHIVE_DIRECTORY }));
// ---- Node health check script settings ----
/** Configuration key holding the location of the node health script to execute. */
public static final String HEALTH_SCRIPT_LOC = "hbase.node.health.script.location";
/** Configuration key holding the timeout applied to one run of the health script. */
public static final String HEALTH_SCRIPT_TIMEOUT = "hbase.node.health.script.timeout";
/** Configuration key holding how often the health check chore wakes up to run the script. */
public static final String HEALTH_CHORE_WAKE_FREQ =
"hbase.node.health.script.frequency";
/**
* Default for {@link #HEALTH_SCRIPT_TIMEOUT}.
* NOTE(review): presumably milliseconds (i.e. one minute) -- confirm against the executor API.
*/
public static final long DEFAULT_HEALTH_SCRIPT_TIMEOUT = 60000;
/**
* The maximum number of health check failures a server can encounter consecutively.
*/
public static final String HEALTH_FAILURE_THRESHOLD =
"hbase.node.health.failure.threshold";
/** Default for {@link #HEALTH_FAILURE_THRESHOLD}. */
public static final int DEFAULT_HEALTH_FAILURE_THRESHOLD = 3;
/** Private so that this constants holder cannot be instantiated from outside. */
private HConstants() {
// Can't be instantiated with this ctor.
}

View File

@ -0,0 +1,84 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is an example script for checking the health of a node (master or region server).
# The health chore script should output a message with a line beginning with "ERROR" whenever
# one of the checks in the script has an undesirable outcome.
err=0;
# Checks every ext3 mount point listed in /etc/fstab against /proc/mounts.
# Prints "disks ok" and exits 0 when all of them are mounted and none is
# mounted read-only. Otherwise prints the offending mount points, tagged
# "(u)" for not mounted or "(ro)" for read-only, and exits 2.
#
# Note: this function calls `exit`, so it must be run in a subshell (for
# example through command substitution, as the main loop does).
function check_disks {
  # Start from an empty, function-local message so that a msg_ value inherited
  # from the caller's environment cannot turn a healthy result into a failure.
  local msg_=""
  local m fsdev

  for m in $(awk '$3~/ext3/ {printf" %s ",$2}' /etc/fstab) ; do
    # Device currently mounted on this mount point; empty when nothing is mounted.
    fsdev=$(awk -v m="$m" '$2==m {print $1}' /proc/mounts)
    if [ -z "$fsdev" ] ; then
      msg_="$msg_ $m(u)"
    else
      # Append "<mountpoint>(ro)" when the mount options start with "ro,".
      msg_="$msg_$(awk -v m="$m" '$2==m { if ( $4 ~ /^ro,/ ) {printf"%s(ro)",$2 } ; }' /proc/mounts)"
    fi
  done

  if [ -z "$msg_" ] ; then
    echo "disks ok" ; exit 0
  else
    echo "$msg_" ; exit 2
  fi
}
# Checks that at least one network interface reported by the local SNMP agent
# matches the expected type, status and speed.
# Prints "<ifDescr> ok" when such an interface is found; otherwise prints
# "check link" and awk exits with status 2. The function then exits with the
# status of the pipeline, so it must be run in a subshell (for example through
# command substitution).
# NOTE(review): ifType 6 is presumably ethernetCsmacd, status value 1 presumably
# "up", and ifSpeed 1000000000 presumably 1 Gbit/s (IF-MIB) -- confirm. Links of
# any other speed are reported as failures by this check.
function check_link {
# Walk the "if" subtree of the SNMP agent on localhost (SNMP v1, community
# "public", timeout option -t 5). awk stores each reported column in an array
# keyed by the numeric suffix of the OID, then inspects the arrays in END.
/usr/bin/snmpwalk -t 5 -Oe -Oq -Os -v 1 -c public localhost if | \
awk ' {
split($1,a,".") ;
if ( a[1] == "ifIndex" ) { ifIndex[a[2]] = $2 }
if ( a[1] == "ifDescr" ) { ifDescr[a[2]] = $2 }
if ( a[1] == "ifType" ) { ifType[a[2]] = $2 }
if ( a[1] == "ifSpeed" ) { ifSpeed[a[2]] = $2 }
if ( a[1] == "ifAdminStatus" ) { ifAdminStatus[a[2]] = $2 }
if ( a[1] == "ifOperStatus" ) { ifOperStatus[a[2]] = $2 }
}
END {
up=0;
for (i in ifIndex ) {
if ( ifType[i] == 6 && ifAdminStatus[i] == 1 && ifOperStatus[i] == 1 && ifSpeed[i] == 1000000000 ) {
up=i;
}
}
if ( up == 0 ) { print "check link" ; exit 2 }
else { print ifDescr[up],"ok" }
}'
# $? is the status of the pipeline above, i.e. of awk, its last command.
exit $? ;
}
# Run every check in a subshell and sort its output into the success or the
# failure summary depending on the check's exit status.
for name in disks link ; do
  if out=$("check_${name}") ; then
    ok_msg="$ok_msg$out,"
  else
    err_msg="$err_msg$out,"
  fi
done

# Failures are printed first so that the output line starts with "ERROR".
if [ -n "$err_msg" ] ; then
  echo -n "ERROR $err_msg "
fi

if [ -n "$ok_msg" ] ; then
  echo -n "OK: $ok_msg"
fi

# Terminate the summary line. The script itself always exits 0; health is
# signalled through the "ERROR" marker in the output, not the exit status.
echo
exit 0

View File

@ -0,0 +1,98 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
import org.apache.hadoop.util.StringUtils;
/**
 * A {@link Chore} that runs the configured node health script at a fixed interval and
 * asks its {@link Stoppable} to stop once the node has been reported unhealthy
 * {@link HConstants#HEALTH_FAILURE_THRESHOLD} times within one failure window.
 * <p>
 * The failure window is {@code threshold * sleepTime}: failures that are spread over a
 * longer period restart the count instead of accumulating.
 */
public class HealthCheckChore extends Chore {
  private static final Log LOG = LogFactory.getLog(HealthCheckChore.class);
  private HealthChecker healthChecker;
  private Configuration config;
  /** Number of failures inside one failure window that triggers a stop. */
  private int threshold;
  /** Failures counted so far in the current failure window. */
  private int numTimesUnhealthy = 0;
  /** Length of the failure window, in the same unit as {@code sleepTime}. */
  private long failureWindow;
  /** Time (from {@link System#currentTimeMillis()}) at which the current window started. */
  private long startWindow;

  /**
   * Creates the chore and initializes the underlying {@link HealthChecker} from the
   * configuration.
   *
   * @param sleepTime period between two health checks
   * @param stopper the object to stop when the node is considered unhealthy
   * @param conf configuration providing the script location, the script timeout and the
   *          failure threshold
   */
  public HealthCheckChore(int sleepTime, Stoppable stopper, Configuration conf) {
    super("HealthChecker", sleepTime, stopper);
    LOG.info("Health Check Chore runs every " + StringUtils.formatTime(sleepTime));
    this.config = conf;
    String healthCheckScript = this.config.get(HConstants.HEALTH_SCRIPT_LOC);
    long scriptTimeout = this.config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT,
      HConstants.DEFAULT_HEALTH_SCRIPT_TIMEOUT);
    healthChecker = new HealthChecker();
    healthChecker.init(healthCheckScript, scriptTimeout);
    this.threshold = config.getInt(HConstants.HEALTH_FAILURE_THRESHOLD,
      HConstants.DEFAULT_HEALTH_FAILURE_THRESHOLD);
    // Widen before multiplying: both operands are ints, so the product would otherwise be
    // computed in int arithmetic and could overflow before being stored in the long.
    this.failureWindow = (long) this.threshold * sleepTime;
  }

  /**
   * Runs one health check. Nothing happens when the node is healthy; otherwise the
   * failure is counted, the stopper is stopped if the threshold has been reached, and the
   * health report is logged.
   */
  @Override
  protected void chore() {
    HealthReport report = healthChecker.checkHealth();
    boolean isHealthy = (report.getStatus() == HealthCheckerExitStatus.SUCCESS);
    if (!isHealthy) {
      boolean needToStop = decideToStop();
      if (needToStop) {
        this.stopper.stop("The node reported unhealthy " + threshold
            + " number of times consecutively.");
      }
      // Always log health report.
      LOG.info("Health status at " + StringUtils.formatTime(System.currentTimeMillis()) + " : "
          + report.getHealthReport());
    }
  }

  /**
   * Records one more failure and decides whether the stopper has to be stopped.
   * <p>
   * The first failure, or a failure arriving after the current window has expired, opens
   * a new window with a count of one. Any other failure increments the count. The count
   * is compared with {@code >=} after every update so that a threshold of one (or less)
   * stops on the very first failure instead of never being matched.
   *
   * @return true when the number of failures in the current window has reached the
   *         threshold
   */
  private boolean decideToStop() {
    long now = System.currentTimeMillis();
    boolean windowExpired = (now - startWindow) >= failureWindow;
    if (numTimesUnhealthy == 0 || windowExpired) {
      // First failure ever, or first failure outside of the previous window: restart.
      numTimesUnhealthy = 1;
      startWindow = now;
    } else {
      numTimesUnhealthy++;
    }
    return numTimesUnhealthy >= threshold;
  }
}

View File

@ -0,0 +1,128 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.Shell.ExitCodeException;
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
/**
 * A utility for executing an external script that checks the health of
 * the node. An example script can be found at
 * <tt>src/main/sh/healthcheck/healthcheck.sh</tt> in the
 * <tt>hbase-examples</tt> module.
 * <p>
 * The node is considered unhealthy when the script times out, fails to run, exits with a
 * failure, or prints a line that starts with {@code ERROR}.
 */
class HealthChecker {

  private static final Log LOG = LogFactory.getLog(HealthChecker.class);
  private ShellCommandExecutor shexec = null;
  /** Stack trace of the last exception that prevented the script from running. */
  private String exceptionStackTrace;

  /** Pattern used for searching in the output of the node health script */
  static private final String ERROR_PATTERN = "ERROR";

  private String healthCheckScript;
  private long scriptTimeout;

  /** Outcome of one execution of the health script. */
  enum HealthCheckerExitStatus {
    SUCCESS,
    TIMED_OUT,
    FAILED_WITH_EXIT_CODE,
    FAILED_WITH_EXCEPTION,
    FAILED
  }

  /**
   * Initialize the checker. Must be called before {@link #checkHealth()}.
   *
   * @param location path of the health script to execute
   * @param timeout timeout handed to the shell executor that runs the script
   */
  public void init(String location, long timeout) {
    this.healthCheckScript = location;
    this.scriptTimeout = timeout;
    ArrayList<String> execScript = new ArrayList<String>();
    execScript.add(healthCheckScript);
    this.shexec = new ShellCommandExecutor(execScript.toArray(new String[execScript.size()]), null,
        null, scriptTimeout);
    LOG.info("HealthChecker initialized.");
  }

  /**
   * Runs the health script once and classifies the outcome.
   *
   * @return a report carrying the resulting status and a human readable description
   */
  public HealthReport checkHealth() {
    HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS;
    try {
      shexec.execute();
    } catch (ExitCodeException e) {
      // The script ran but reported a failure through its exit code.
      LOG.warn("Caught exception : " + e);
      status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE;
    } catch (IOException e) {
      LOG.warn("Caught exception : " + e);
      status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION;
      exceptionStackTrace = org.apache.hadoop.util.StringUtils.stringifyException(e);
    }

    if (shexec.isTimedOut()) {
      // A script killed on timeout may surface as either exception type above (the
      // killed process can report a failure exit code), so the executor's own flag
      // is consulted no matter which one was caught.
      status = HealthCheckerExitStatus.TIMED_OUT;
    } else if (status == HealthCheckerExitStatus.SUCCESS && hasErrors(shexec.getOutput())) {
      // The script completed normally but flagged a problem in its output.
      status = HealthCheckerExitStatus.FAILED;
    }
    return new HealthReport(status, getHealthReport(status));
  }

  /**
   * Tells whether the script output flags an error.
   *
   * @param output the complete output of the script
   * @return true when at least one line of the output starts with {@link #ERROR_PATTERN}
   */
  private boolean hasErrors(String output) {
    String[] splits = output.split("\n");
    for (String split : splits) {
      if (split.startsWith(ERROR_PATTERN)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Builds the description matching a status.
   *
   * @param status the outcome of the last script execution
   * @return a fixed message, the recorded stack trace, or the script output, depending on
   *         the status
   */
  private String getHealthReport(HealthCheckerExitStatus status){
    String healthReport = null;
    switch (status) {
    case SUCCESS:
      healthReport = "Server is healthy.";
      break;
    case TIMED_OUT:
      healthReport = "Health script timed out";
      break;
    case FAILED_WITH_EXCEPTION:
      healthReport = exceptionStackTrace;
      break;
    case FAILED_WITH_EXIT_CODE:
      healthReport = "Health script failed with exit code.";
      break;
    case FAILED:
      healthReport = shexec.getOutput();
      break;
    }
    return healthReport;
  }
}

View File

@ -0,0 +1,88 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
/**
 * Value holder describing the result of one health check of the node: the status of
 * the check together with its textual report.
 */
class HealthReport {

  private HealthCheckerExitStatus status;
  private String healthReport;

  /**
   * @param status outcome of the health check
   * @param healthReport textual description of the outcome, may be null
   */
  HealthReport(HealthCheckerExitStatus status, String healthReport) {
    this.status = status;
    this.healthReport = healthReport;
  }

  /**
   * Gets the status of the region server.
   *
   * @return HealthCheckerExitStatus
   */
  HealthCheckerExitStatus getStatus() {
    return status;
  }

  /**
   * Gets the health report of the region server.
   *
   * @return String
   */
  String getHealthReport() {
    return healthReport;
  }

  /** Combines the hashes of the report text and of the status; null counts as zero. */
  @Override
  public int hashCode() {
    final int prime = 31;
    int reportHash = 0;
    if (healthReport != null) {
      reportHash = healthReport.hashCode();
    }
    int statusHash = 0;
    if (status != null) {
      statusHash = status.hashCode();
    }
    return prime * (prime + reportHash) + statusHash;
  }

  /** Two reports are equal when both their report text and their status are equal. */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    // instanceof is false for null, so this also rejects a null argument.
    if (!(obj instanceof HealthReport)) {
      return false;
    }
    HealthReport that = (HealthReport) obj;
    boolean sameReport = (healthReport == null)
        ? that.healthReport == null
        : healthReport.equals(that.healthReport);
    return sameReport && status == that.status;
  }
}

View File

@ -54,6 +54,7 @@ import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.HealthCheckChore;
import org.apache.hadoop.hbase.MasterAdminProtocol;
import org.apache.hadoop.hbase.MasterMonitorProtocol;
import org.apache.hadoop.hbase.MasterNotRunningException;
@ -320,6 +321,9 @@ Server {
private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
/** The health check chore. */
private HealthCheckChore healthCheckChore;
/**
* Initializes the HMaster. The steps are as follows:
* <p>
@ -399,6 +403,13 @@ Server {
this.masterCheckCompression = conf.getBoolean("hbase.master.check.compression", true);
this.metricsMaster = new MetricsMaster( new MetricsMasterWrapperImpl(this));
// Health checker thread.
int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
if (isHealthCheckerConfigured()) {
healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
}
}
/**
@ -1069,6 +1080,11 @@ Server {
this.infoServer.start();
}
// Start the health checker
if (this.healthCheckChore != null) {
Threads.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker");
}
// Start allowing requests to happen.
this.rpcServer.openServer();
this.rpcServerOpen = true;
@ -1104,6 +1120,9 @@ Server {
}
}
if (this.executorService != null) this.executorService.shutdown();
if (this.healthCheckChore != null) {
this.healthCheckChore.interrupt();
}
}
private static Thread getAndStartClusterStatusChore(HMaster master) {
@ -2429,4 +2448,9 @@ Server {
public HFileCleaner getHFileCleaner() {
  // Plain accessor for the cleaner instance held by this master.
  return hfileCleaner;
}
/**
 * Tells whether a node health script has been configured.
 *
 * @return true when {@link HConstants#HEALTH_SCRIPT_LOC} is set to a non-blank value
 */
private boolean isHealthCheckerConfigured() {
  return org.apache.commons.lang.StringUtils.isNotBlank(
      this.conf.get(HConstants.HEALTH_SCRIPT_LOC));
}
}

View File

@ -65,6 +65,7 @@ import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.FailedSanityCheckException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HealthCheckChore;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
@ -387,6 +388,9 @@ public class HRegionServer implements ClientProtocol,
// reference to the Thrift Server.
volatile private HRegionThriftServer thriftServer;
/** The health check chore. */
private HealthCheckChore healthCheckChore;
/**
* The server name the Master sees us as. Its made from the hostname the
* master passes us, port, and server startcode. Gets set after registration
@ -809,6 +813,12 @@ public class HRegionServer implements ClientProtocol,
".multiplier", 1000);
this.compactionChecker = new CompactionChecker(this,
this.threadWakeFrequency * multiplier, this);
// Health checker thread.
int sleepTime = this.conf.getInt(HConstants.HEALTH_CHORE_WAKE_FREQ,
HConstants.DEFAULT_THREAD_WAKE_FREQUENCY);
if (isHealthCheckerConfigured()) {
healthCheckChore = new HealthCheckChore(sleepTime, this, getConfiguration());
}
this.leases = new Leases(this.threadWakeFrequency);
@ -924,6 +934,9 @@ public class HRegionServer implements ClientProtocol,
if (this.hlogRoller != null) this.hlogRoller.interruptIfNecessary();
if (this.compactionChecker != null)
this.compactionChecker.interrupt();
if (this.healthCheckChore != null) {
this.healthCheckChore.interrupt();
}
if (this.killed) {
// Just skip out w/o closing regions. Used when testing.
@ -1479,6 +1492,10 @@ public class HRegionServer implements ClientProtocol,
handler);
Threads.setDaemonThreadRunning(this.compactionChecker.getThread(), n +
".compactionChecker", handler);
if (this.healthCheckChore != null) {
Threads
.setDaemonThreadRunning(this.healthCheckChore.getThread(), n + ".healthChecker", handler);
}
// Leases is not a Thread. Internally it runs a daemon thread. If it gets
// an unhandled exception, it will just exit.
@ -1703,6 +1720,9 @@ public class HRegionServer implements ClientProtocol,
protected void join() {
Threads.shutdown(this.compactionChecker.getThread());
Threads.shutdown(this.cacheFlusher.getThread());
if (this.healthCheckChore != null) {
Threads.shutdown(this.healthCheckChore.getThread());
}
if (this.hlogRoller != null) {
Threads.shutdown(this.hlogRoller.getThread());
}
@ -4012,4 +4032,9 @@ public class HRegionServer implements ClientProtocol,
this.s = s;
}
}
/**
 * Tells whether a node health script has been configured.
 *
 * @return true when {@link HConstants#HEALTH_SCRIPT_LOC} is set to a non-blank value
 */
private boolean isHealthCheckerConfigured() {
  return org.apache.commons.lang.StringUtils.isNotBlank(
      this.conf.get(HConstants.HEALTH_SCRIPT_LOC));
}
}

View File

@ -0,0 +1,143 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.SmallTests;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HealthChecker.HealthCheckerExitStatus;
import org.junit.After;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
 * Tests {@link HealthChecker} and {@link HealthCheckChore} against small shell scripts
 * that are generated in the test data directory.
 */
@Category(SmallTests.class)
public class TestNodeHealthCheckChore {

  private static final Log LOG = LogFactory.getLog(TestNodeHealthCheckChore.class);
  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
  /** The script under test; assigned by {@link #getConfForNodeHealthScript()}. */
  private File healthScriptFile;

  @After
  public void cleanUp() throws IOException {
    // delete and recreate the test directory, ensuring a clean test dir between tests
    Path testDir = UTIL.getDataTestDir();
    FileSystem fs = UTIL.getTestFileSystem();
    fs.delete(testDir, true);
    fs.mkdirs(testDir);
  }

  /**
   * Rewrites the script three times and checks that the checker reports, in turn,
   * success, a failure flagged in the output, and a timeout.
   */
  @Test
  public void testHealthChecker() throws Exception {
    Configuration config = getConfForNodeHealthScript();
    config.addResource(healthScriptFile.getName());
    String location = healthScriptFile.getAbsolutePath();
    long timeout = config.getLong(HConstants.HEALTH_SCRIPT_TIMEOUT, 100);

    String normalScript = "echo \"I am all fine\"";
    createScript(normalScript, true);
    HealthChecker checker = new HealthChecker();
    checker.init(location, timeout);
    HealthReport report = checker.checkHealth();
    assertTrue(report.getStatus() == HealthCheckerExitStatus.SUCCESS);
    // Log the report text, not the checker object.
    LOG.info("Health Status:" + report.getHealthReport());

    String errorScript = "echo ERROR\n echo \"Server not healthy\"";
    createScript(errorScript, true);
    report = checker.checkHealth();
    assertTrue(report.getStatus() == HealthCheckerExitStatus.FAILED);
    LOG.info("Health Status:" + report.getHealthReport());

    // Sleeps far longer than the configured script timeout.
    String timeOutScript = "sleep 4\n echo \"I am fine\"";
    createScript(timeOutScript, true);
    report = checker.checkHealth();
    assertTrue(report.getStatus() == HealthCheckerExitStatus.TIMED_OUT);
    LOG.info("Health Status:" + report.getHealthReport());

    healthScriptFile.delete();
  }

  /**
   * Checks that the chore stops its stoppable only once the failing script has been run
   * as many times as the configured failure threshold.
   */
  @Test
  public void testRSHealthChore() throws Exception{
    Stoppable stop = new StoppableImplementation();
    Configuration conf = getConfForNodeHealthScript();
    String errorScript = "echo ERROR\n echo \"Server not healthy\"";
    createScript(errorScript, true);
    HealthCheckChore rsChore = new HealthCheckChore(100, stop, conf);
    //Default threshold is three.
    rsChore.chore();
    rsChore.chore();
    assertFalse("Stoppable must not be stopped.", stop.isStopped());
    rsChore.chore();
    assertTrue("Stoppable must have been stopped.", stop.isStopped());
  }

  /**
   * Writes the given text to the health script file and sets its executable flag.
   *
   * @param scriptStr content of the script
   * @param setExecutable whether the script file is to be made executable
   */
  private void createScript(String scriptStr, boolean setExecutable)
      throws Exception {
    healthScriptFile.createNewFile();
    PrintWriter pw = new PrintWriter(new FileOutputStream(healthScriptFile));
    try {
      pw.println(scriptStr);
      pw.flush();
    } finally {
      // Release the file even when writing fails, so that the clean up can delete it.
      pw.close();
    }
    healthScriptFile.setExecutable(setExecutable);
  }

  /**
   * Points the test configuration at a script inside the test data directory and sets
   * the failure threshold and the script timeout used by the tests.
   *
   * @return the shared test configuration, updated in place
   */
  private Configuration getConfForNodeHealthScript() {
    Configuration conf = UTIL.getConfiguration();
    File tempDir = new File(UTIL.getDataTestDir().toString());
    tempDir.mkdirs();
    healthScriptFile = new File(tempDir.getAbsolutePath(), "HealthScript.sh");
    conf.set(HConstants.HEALTH_SCRIPT_LOC,
      healthScriptFile.getAbsolutePath());
    // The threshold is read back with getInt, so store it as an int.
    conf.setInt(HConstants.HEALTH_FAILURE_THRESHOLD, 3);
    conf.setLong(HConstants.HEALTH_SCRIPT_TIMEOUT, 100);
    return conf;
  }

  /**
   * Simple helper class that just keeps track of whether or not its stopped.
   */
  private static class StoppableImplementation implements Stoppable {
    private volatile boolean stop = false;

    @Override
    public void stop(String why) {
      this.stop = true;
    }

    @Override
    public boolean isStopped() {
      return this.stop;
    }
  }
}