HBASE-8882 Create an Integration Test to Test MTTR

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1501042 13f79535-47bb-0310-9956-ffa450edef68
eclark 2013-07-09 01:44:39 +00:00
parent 26d5bcf418
commit 631b20dfe3
4 changed files with 531 additions and 14 deletions

@@ -74,7 +74,7 @@ public class IntegrationTestRebalanceAndKillServersTargeted extends IngestIntegr
private static final long WAIT_AFTER_BALANCE_MS = 5 * 1000;
@Override
-protected void perform() throws Exception {
+public void perform() throws Exception {
ClusterStatus status = this.cluster.getClusterStatus();
List<ServerName> victimServers = new LinkedList<ServerName>(status.getServers());
int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());

@@ -0,0 +1,458 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.mttr;
import com.google.common.base.Objects;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.IntegrationTestingUtility;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ChaosMonkey;
import org.apache.hadoop.hbase.util.LoadTestTool;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import static junit.framework.Assert.assertEquals;
/**
* Integration test that benchmarks how fast HBase can recover from failures. This test starts
* several threads:
* <ol>
* <li>
* Load Test Tool.<br/>
* This runs so that all RegionServers will have some load and HLogs will be full.
* </li>
* <li>
* Scan thread.<br/>
* This thread runs a very short scan over and over again, recording how long it takes to respond.
* The longest response is assumed to be the time it took to recover.
* </li>
* <li>
* Put thread.<br/>
* This thread is just like the scan thread, except that it does a very small put.
* </li>
* <li>
* Admin thread. <br/>
* This thread will continually go to the master to try and get the cluster status. Just like the
* put and scan threads, the time to respond is recorded.
* </li>
* <li>
* Chaos Monkey thread.<br/>
* This thread runs a ChaosMonkey.Action.
* </li>
* </ol>
* <p/>
* The ChaosMonkey actions currently run are:
* <ul>
* <li>Restart the RegionServer holding meta.</li>
* <li>Restart the RegionServer holding the table the scan and put threads are targeting.</li>
* <li>Move the Regions of the table used by the scan and put threads.</li>
* <li>Restart the master.</li>
* </ul>
* <p/>
* At the end of the test a log line is output at INFO level containing the timing data that was collected.
*/
public class IntegrationTestMTTR {
/**
* Constants.
*/
private static final byte[] FAMILY = Bytes.toBytes("d");
private static final Log LOG = LogFactory.getLog(IntegrationTestMTTR.class);
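// Sleep window handed to the chaos actions, e.g. how long a restarted server stays down.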
private static final long SLEEP_TIME = 60 * 1000l;
/**
* Configurable table names.
*/
private static String tableName;
private static byte[] tableNameBytes;
private static String loadTableName;
/**
* Util to get at the cluster.
*/
private static IntegrationTestingUtility util;
/**
* Executor for test threads.
*/
private static ExecutorService executorService;
/**
* All of the chaos monkey actions used.
*/
private static ChaosMonkey.Action restartRSAction;
private static ChaosMonkey.Action restartMetaAction;
private static ChaosMonkey.Action moveRegionAction;
private static ChaosMonkey.Action restartMasterAction;
/**
* The load test tool used to create load and make sure that HLogs aren't empty.
*/
private static LoadTestTool loadTool;
@BeforeClass
public static void setUp() throws Exception {
// Set up the integration test util
if (util == null) {
util = new IntegrationTestingUtility();
}
// Make sure there are three servers.
util.initializeCluster(3);
// Set up the load test tool.
loadTool = new LoadTestTool();
loadTool.setConf(util.getConfiguration());
// Create executor with enough threads to restart rs's,
// run scans, puts, admin ops and load test tool.
executorService = Executors.newFixedThreadPool(8);
// Set up the tables needed.
setupTables();
// Set up the actions.
setupActions();
}
private static void setupActions() throws IOException {
// Set up the action that will restart a region server holding a region from our table.
// Since the table should only have one region, restarting that server is enough.
restartRSAction = new ChaosMonkey.RestartRsHoldingTable(SLEEP_TIME, tableName);
// Set up the action that will kill the region holding meta.
restartMetaAction = new ChaosMonkey.RestartRsHoldingMeta(SLEEP_TIME);
// Set up the action that will move the regions of our table.
moveRegionAction = new ChaosMonkey.MoveRegionsOfTable(SLEEP_TIME, tableName);
// Kill the master (No sleep time because there is only one master running at this time.)
restartMasterAction = new ChaosMonkey.RestartActiveMaster(0l);
// Give the action the access to the cluster.
ChaosMonkey.ActionContext actionContext = new ChaosMonkey.ActionContext(util);
restartRSAction.init(actionContext);
restartMetaAction.init(actionContext);
moveRegionAction.init(actionContext);
restartMasterAction.init(actionContext);
}
private static void setupTables() throws IOException {
// Get the table name.
tableName = util.getConfiguration()
.get("hbase.IntegrationTestMTTR.tableName", "IntegrationTestMTTR");
tableNameBytes = Bytes.toBytes(tableName);
loadTableName = util.getConfiguration()
.get("hbase.IntegrationTestMTTR.loadTableName", "IntegrationTestMTTRLoadTestTool");
if (util.getHBaseAdmin().tableExists(tableNameBytes)) {
util.deleteTable(tableNameBytes);
}
if (util.getHBaseAdmin().tableExists(loadTableName)) {
util.deleteTable(loadTableName);
}
// Create the table. If this fails then fail everything.
HTableDescriptor tableDescriptor = new HTableDescriptor(tableNameBytes);
// Make the max file size huge so that splits don't happen during the test.
tableDescriptor.setMaxFileSize(2L * 1024 * 1024 * 1024); // Long literal: the int expression 2*1024*1024*1024 would overflow to a negative size.
HColumnDescriptor descriptor = new HColumnDescriptor(FAMILY);
descriptor.setMaxVersions(1);
tableDescriptor.addFamily(descriptor);
util.getHBaseAdmin().createTable(tableDescriptor);
// Setup the table for LoadTestTool
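// -init_only makes LoadTestTool create its table and exit without writing any data.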
int ret = loadTool.run(new String[]{"-tn", loadTableName, "-init_only"});
assertEquals("Failed to initialize LoadTestTool", 0, ret);
}
@AfterClass
public static void after() throws IOException {
// Clean everything up.
util.restoreCluster();
util = null;
// Stop the threads so that we know everything is complete.
executorService.shutdown();
executorService = null;
// Clean up the actions.
moveRegionAction = null;
restartMetaAction = null;
restartRSAction = null;
restartMasterAction = null;
loadTool = null;
}
@Test
public void testRestartRsHoldingTable() throws Exception {
run(new ActionCallable(restartRSAction), "RestartRsHoldingTable");
}
@Test
public void testKillRsHoldingMeta() throws Exception {
run(new ActionCallable(restartMetaAction), "KillRsHoldingMeta");
}
@Test
public void testMoveRegion() throws Exception {
run(new ActionCallable(moveRegionAction), "MoveRegion");
}
@Test
public void testRestartMaster() throws Exception {
run(new ActionCallable(restartMasterAction), "RestartMaster");
}
public void run(Callable<Boolean> monkeyCallable, String testName) throws Exception {
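// A distributed cluster gets more iterations; a local minicluster keeps the run short.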
int maxIters = util.getHBaseClusterInterface().isDistributedCluster() ? 10 : 3;
// Array to keep track of times.
ArrayList<Long> timesPutMs = new ArrayList<Long>(maxIters);
ArrayList<Long> timesScanMs = new ArrayList<Long>(maxIters);
ArrayList<Long> timesAdminMs = new ArrayList<Long>(maxIters);
long start = System.nanoTime();
// We're going to try this multiple times
for (int fullIterations = 0; fullIterations < maxIters; fullIterations++) {
// Create and start executing a callable that will kill the servers
Future<Boolean> monkeyFuture = executorService.submit(monkeyCallable);
// Pass that future to the timing Callables.
Future<Long> putFuture = executorService.submit(new PutCallable(monkeyFuture));
Future<Long> scanFuture = executorService.submit(new ScanCallable(monkeyFuture));
Future<Long> adminFuture = executorService.submit(new AdminCallable(monkeyFuture));
Future<Boolean> loadFuture = executorService.submit(new LoadCallable(monkeyFuture));
monkeyFuture.get();
loadFuture.get();
// Get the values from the futures.
long putTime = TimeUnit.MILLISECONDS.convert(putFuture.get(), TimeUnit.NANOSECONDS);
long scanTime = TimeUnit.MILLISECONDS.convert(scanFuture.get(), TimeUnit.NANOSECONDS);
long adminTime = TimeUnit.MILLISECONDS.convert(adminFuture.get(), TimeUnit.NANOSECONDS);
// Store the times to display later.
timesPutMs.add(putTime);
timesScanMs.add(scanTime);
timesAdminMs.add(adminTime);
// Wait some time for everything to settle down.
Thread.sleep(5000l);
}
long runtimeMs = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
Objects.ToStringHelper helper = Objects.toStringHelper("MTTRResults")
.add("timesPutMs", timesPutMs)
.add("timesScanMs", timesScanMs)
.add("timesAdminMs", timesAdminMs)
.add("totalRuntimeMs", runtimeMs)
.add("name", testName);
// Log the info
LOG.info(helper.toString());
}
/**
* Base class for actions that need to record the time needed to recover from a failure.
*/
public abstract class TimingCallable implements Callable<Long> {
protected final Future future;
public TimingCallable(Future f) {
future = f;
}
@Override
public Long call() throws Exception {
// TODO(eclark): Do more than just max. Full histogram support.
long maxTime = 0;
int numAfterDone = 0;
// Keep trying until the chaos action is done and we've then seen ten successful operations
// (an exception resets the count).
while (numAfterDone < 10) {
long start = System.nanoTime();
try {
boolean result = doAction();
if (result && future.isDone()) {
numAfterDone ++;
}
} catch (Exception e) {
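// Any exception means the cluster hasn't finished recovering; restart the success count.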
numAfterDone = 0;
}
maxTime = Math.max(maxTime, System.nanoTime() - start);
}
return maxTime;
}
protected abstract boolean doAction() throws Exception;
}
/**
* Callable that will keep putting small amounts of data into a table
* until the future supplied returns. It keeps track of the max time.
*/
public class PutCallable extends TimingCallable {
private final HTable table;
public PutCallable(Future f) throws IOException {
super(f);
this.table = new HTable(util.getConfiguration(), tableNameBytes);
}
@Override
protected boolean doAction() throws Exception {
Put p = new Put(Bytes.toBytes(RandomStringUtils.randomAlphanumeric(5)));
p.add(FAMILY, Bytes.toBytes("\0"), Bytes.toBytes(RandomStringUtils.randomAscii(5)));
table.put(p);
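// Flush so the put really goes to the server now instead of sitting in the write buffer.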
table.flushCommits();
return true;
}
}
/**
* Callable that will keep scanning for small amounts of data until the
* supplied future returns. Returns the max time taken to scan.
*/
public class ScanCallable extends TimingCallable {
private final HTable table;
public ScanCallable(Future f) throws IOException {
super(f);
this.table = new HTable(util.getConfiguration(), tableNameBytes);
}
@Override
protected boolean doAction() throws Exception {
ResultScanner rs = null;
try {
Scan s = new Scan();
s.setBatch(2);
s.addFamily(FAMILY);
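// KeyOnlyFilter keeps the response tiny; the scan only measures latency, not data.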
s.setFilter(new KeyOnlyFilter());
s.setMaxVersions(1);
rs = table.getScanner(s);
Result result = rs.next();
return result != null && result.size() > 0;
} finally {
if (rs != null) {
rs.close();
}
}
}
}
/**
* Callable that will keep going to the master for cluster status. Returns the max time taken.
*/
public class AdminCallable extends TimingCallable {
public AdminCallable(Future f) throws IOException {
super(f);
}
@Override
protected boolean doAction() throws Exception {
HBaseAdmin admin = new HBaseAdmin(util.getConfiguration());
try {
ClusterStatus status = admin.getClusterStatus();
return status != null;
} finally {
// Close the admin so repeated attempts don't leak connections.
admin.close();
}
}
}
public class ActionCallable implements Callable<Boolean> {
private final ChaosMonkey.Action action;
public ActionCallable(ChaosMonkey.Action action) {
this.action = action;
}
@Override
public Boolean call() throws Exception {
this.action.perform();
return true;
}
}
/**
* Callable used to make sure the cluster has some load on it.
* It drives LoadTestTool against the load table until the supplied chaos monkey future completes.
*/
public class LoadCallable implements Callable<Boolean> {
private final Future future;
public LoadCallable(Future f) {
future = f;
}
@Override
public Boolean call() throws Exception {
int colsPerKey = 10;
int recordSize = 500;
int numServers = util.getHBaseClusterInterface().getInitialClusterStatus().getServersSize();
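// Scale the key space with the cluster size so every region server sees some writes.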
int numKeys = numServers * 5000;
int writeThreads = 10;
// Loop until the chaos monkey future is done,
// but always run at least once in case the action completes quickly.
do {
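// The -write argument packs colsPerKey:recordSize:writeThreads into one colon-separated value.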
int ret = loadTool.run(new String[]{
"-tn", loadTableName,
"-write", String.format("%d:%d:%d", colsPerKey, recordSize, writeThreads),
"-num_keys", String.valueOf(numKeys),
"-skip_init"
});
assertEquals("Load failed", 0, ret);
} while (!future.isDone());
return true;
}
}
}
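// Usage note (an assumption, not stated in this commit): as an hbase-it integration test,
// this class is intended to run against a real or pseudo-distributed cluster, for example
// through Maven failsafe (mvn verify -Dit.test=IntegrationTestMTTR) or via
// org.apache.hadoop.hbase.IntegrationTestsDriver.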

@@ -29,6 +29,7 @@ import java.util.Queue;
import java.util.Random;
import org.apache.commons.cli.CommandLine;
+import org.apache.commons.lang.math.RandomUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -42,6 +43,7 @@ import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
@@ -110,10 +112,10 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
/**
* Context for Action's
*/
-private static class ActionContext {
+public static class ActionContext {
private IntegrationTestingUtility util;
-ActionContext(IntegrationTestingUtility util) {
+public ActionContext(IntegrationTestingUtility util) {
this.util = util;
}
@@ -141,7 +143,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
protected ServerName[] initialServers;
protected Random random = new Random();
-void init(ActionContext context) throws Exception {
+public void init(ActionContext context) throws IOException {
this.context = context;
cluster = context.getHBaseCluster();
initialStatus = cluster.getInitialClusterStatus();
@@ -149,7 +151,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
}
-protected void perform() throws Exception { };
+public void perform() throws Exception { };
// TODO: perhaps these methods should be elsewhere?
/** Returns current region servers */
@@ -254,7 +256,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
super(sleepTime);
}
@Override
-protected void perform() throws Exception {
+public void perform() throws Exception {
LOG.info("Performing action: Restart active master");
ServerName master = cluster.getClusterStatus().getMaster();
@@ -268,7 +270,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
}
@Override
-protected void perform() throws Exception {
+public void perform() throws Exception {
LOG.info("Performing action: Restart random region server");
ServerName server = selectRandomItem(getCurrentServers());
@@ -276,12 +278,12 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
}
}
-public static class RestartRsHoldingMeta extends RestartRandomRs {
+public static class RestartRsHoldingMeta extends RestartActionBase {
public RestartRsHoldingMeta(long sleepTime) {
super(sleepTime);
}
@Override
-protected void perform() throws Exception {
+public void perform() throws Exception {
LOG.info("Performing action: Restart region server holding META");
ServerName server = cluster.getServerHoldingMeta();
if (server == null) {
@@ -292,6 +294,62 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
}
}
public static class RestartRsHoldingTable extends RestartActionBase {
private final String tableName;
public RestartRsHoldingTable(long sleepTime, String tableName) {
super(sleepTime);
this.tableName = tableName;
}
@Override
public void perform() throws Exception {
HTable table = null;
try {
Configuration conf = context.getHaseIntegrationTestingUtility().getConfiguration();
table = new HTable(conf, tableName);
} catch (IOException e) {
LOG.debug("Error creating HTable used to get list of region locations.", e);
return;
}
Collection<ServerName> serverNames = table.getRegionLocations().values();
// The table was only needed to look up region locations; close it to avoid leaking resources.
table.close();
ServerName[] nameArray = serverNames.toArray(new ServerName[serverNames.size()]);
restartRs(nameArray[random.nextInt(nameArray.length)], sleepTime);
}
}
public static class MoveRegionsOfTable extends Action {
private final long sleepTime;
private final byte[] tableNameBytes;
public MoveRegionsOfTable(long sleepTime, String tableName) {
this.sleepTime = sleepTime;
this.tableNameBytes = Bytes.toBytes(tableName);
}
@Override
public void perform() throws Exception {
HBaseAdmin admin = this.context.getHaseIntegrationTestingUtility().getHBaseAdmin();
List<HRegionInfo> regions = admin.getTableRegions(tableNameBytes);
Collection<ServerName> serversList = admin.getClusterStatus().getServers();
ServerName[] servers = serversList.toArray(new ServerName[serversList.size()]);
for (HRegionInfo regionInfo:regions) {
try {
byte[] destServerName =
Bytes.toBytes(servers[RandomUtils.nextInt(servers.length)].getServerName());
admin.move(regionInfo.getRegionName(), destServerName);
} catch (Exception e) {
LOG.debug("Error moving region", e);
}
}
Thread.sleep(sleepTime);
}
}
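// Both actions above are table-targeted, which is what lets IntegrationTestMTTR aim the
// chaos at the same table its put and scan threads are using.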
/**
* Restarts a ratio of the running regionservers at the same time
*/
@@ -304,7 +362,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
}
@Override
-protected void perform() throws Exception {
+public void perform() throws Exception {
LOG.info(String.format("Performing action: Batch restarting %d%% of region servers",
(int)(ratio * 100)));
List<ServerName> selectedServers = selectRandomItems(getCurrentServers(), ratio);
@@ -346,7 +404,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
}
@Override
-protected void perform() throws Exception {
+public void perform() throws Exception {
LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
(int)(ratio * 100)));
List<ServerName> selectedServers = selectRandomItems(getCurrentServers(), ratio);
@@ -394,7 +452,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
}
@Override
-protected void perform() throws Exception {
+public void perform() throws Exception {
LOG.info("Unbalancing regions");
ClusterStatus status = this.cluster.getClusterStatus();
List<ServerName> victimServers = new LinkedList<ServerName>(status.getServers());
@@ -410,7 +468,7 @@ public class ChaosMonkey extends AbstractHBaseTool implements Stoppable {
public static class ForceBalancerAction extends Action {
@Override
-protected void perform() throws Exception {
+public void perform() throws Exception {
LOG.info("Balancing regions");
forceBalancer();
}

@@ -16,6 +16,7 @@
*/
package org.apache.hadoop.hbase.util;
+import java.io.IOException;
import java.util.Set;
import java.util.TreeSet;
@@ -79,7 +80,7 @@ public abstract class AbstractHBaseTool implements Tool {
}
@Override
-public final int run(String[] args) throws Exception {
+public final int run(String[] args) throws IOException {
if (conf == null) {
LOG.error("Tool configuration is not initialized");
throw new NullPointerException("conf");