HBASE-14625 Chaos Monkey should shut down faster
This commit is contained in:
parent
f4a4c3a489
commit
16af8df41f
|
@ -210,6 +210,11 @@ public class Action {
|
|||
+ " servers to " + toServers.size() + " different servers");
|
||||
Admin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
|
||||
for (byte[] victimRegion : victimRegions) {
|
||||
// Don't keep moving regions if we're
|
||||
// trying to stop the monkey.
|
||||
if (context.isStopping()) {
|
||||
break;
|
||||
}
|
||||
int targetIx = RandomUtils.nextInt(toServers.size());
|
||||
admin.move(victimRegion, Bytes.toBytes(toServers.get(targetIx).getServerName()));
|
||||
}
|
||||
|
@ -249,5 +254,9 @@ public class Action {
|
|||
public HBaseCluster getHBaseCluster() {
|
||||
return util.getHBaseClusterInterface();
|
||||
}
|
||||
|
||||
public boolean isStopping() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,6 +53,11 @@ public class AddColumnAction extends Action {
|
|||
columnDescriptor = new HColumnDescriptor(RandomStringUtils.randomAlphabetic(5));
|
||||
}
|
||||
|
||||
// Don't try the modify if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
LOG.debug("Performing action: Adding " + columnDescriptor + " to " + tableName);
|
||||
|
||||
tableDescriptor.addFamily(columnDescriptor);
|
||||
|
|
|
@ -18,7 +18,9 @@
|
|||
|
||||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||
|
@ -41,29 +43,39 @@ public class BatchRestartRsAction extends RestartActionBaseAction {
|
|||
List<ServerName> selectedServers = PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(),
|
||||
ratio);
|
||||
|
||||
for (ServerName server : selectedServers) {
|
||||
LOG.info("Killing region server:" + server);
|
||||
cluster.killRegionServer(server);
|
||||
}
|
||||
Set<ServerName> killedServers = new HashSet<ServerName>();
|
||||
|
||||
for (ServerName server : selectedServers) {
|
||||
// Don't keep killing servers if we're
|
||||
// trying to stop the monkey.
|
||||
if (context.isStopping()) {
|
||||
break;
|
||||
}
|
||||
LOG.info("Killing region server:" + server);
|
||||
cluster.killRegionServer(server);
|
||||
killedServers.add(server);
|
||||
}
|
||||
|
||||
for (ServerName server : killedServers) {
|
||||
cluster.waitForRegionServerToStop(server, PolicyBasedChaosMonkey.TIMEOUT);
|
||||
}
|
||||
|
||||
LOG.info("Killed " + selectedServers.size() + " region servers. Reported num of rs:"
|
||||
LOG.info("Killed " + killedServers.size() + " region servers. Reported num of rs:"
|
||||
+ cluster.getClusterStatus().getServersSize());
|
||||
|
||||
sleep(sleepTime);
|
||||
|
||||
for (ServerName server : selectedServers) {
|
||||
for (ServerName server : killedServers) {
|
||||
LOG.info("Starting region server:" + server.getHostname());
|
||||
cluster.startRegionServer(server.getHostname(), server.getPort());
|
||||
|
||||
}
|
||||
for (ServerName server : selectedServers) {
|
||||
cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), PolicyBasedChaosMonkey.TIMEOUT);
|
||||
for (ServerName server : killedServers) {
|
||||
cluster.waitForRegionServerToStart(server.getHostname(),
|
||||
server.getPort(),
|
||||
PolicyBasedChaosMonkey.TIMEOUT);
|
||||
}
|
||||
LOG.info("Started " + selectedServers.size() +" region servers. Reported num of rs:"
|
||||
LOG.info("Started " + killedServers.size() +" region servers. Reported num of rs:"
|
||||
+ cluster.getClusterStatus().getServersSize());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -73,6 +73,10 @@ public class ChangeBloomFilterAction extends Action {
|
|||
+ descriptor.getNameAsString() + " of table " + tableName);
|
||||
}
|
||||
|
||||
// Don't try the modify if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
admin.modifyTable(tableName, tableDescriptor);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -74,6 +74,11 @@ public class ChangeCompressionAction extends Action {
|
|||
}
|
||||
}
|
||||
|
||||
// Don't try the modify if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
admin.modifyTable(tableName, tableDescriptor);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -66,6 +66,10 @@ public class ChangeEncodingAction extends Action {
|
|||
+ " to: " + descriptor.getDataBlockEncoding());
|
||||
}
|
||||
|
||||
// Don't try the modify if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
admin.modifyTable(tableName, tableDescriptor);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -61,6 +61,10 @@ public class ChangeVersionsAction extends Action {
|
|||
for(HColumnDescriptor descriptor:columnDescriptors) {
|
||||
descriptor.setVersions(versions, versions);
|
||||
}
|
||||
// Don't try the modify if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
LOG.debug("Performing action: Changing versions on " + tableName.getNameAsString());
|
||||
admin.modifyTable(tableName, tableDescriptor);
|
||||
}
|
||||
|
|
|
@ -68,6 +68,11 @@ public class DecreaseMaxHFileSizeAction extends Action {
|
|||
// Change the table descriptor.
|
||||
htd.setMaxFileSize(newValue);
|
||||
|
||||
// Don't try the modify if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// modify the table.
|
||||
admin.modifyTable(tableName, htd);
|
||||
|
||||
|
|
|
@ -43,6 +43,11 @@ public class FlushTableAction extends Action {
|
|||
HBaseTestingUtility util = context.getHBaseIntegrationTestingUtility();
|
||||
Admin admin = util.getHBaseAdmin();
|
||||
|
||||
// Don't try the flush if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
LOG.info("Performing action: Flush table " + tableName);
|
||||
try {
|
||||
admin.flush(tableName);
|
||||
|
|
|
@ -24,6 +24,10 @@ package org.apache.hadoop.hbase.chaos.actions;
|
|||
public class ForceBalancerAction extends Action {
|
||||
@Override
|
||||
public void perform() throws Exception {
|
||||
// Don't try the flush if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
LOG.info("Balancing regions");
|
||||
forceBalancer();
|
||||
}
|
||||
|
|
|
@ -58,6 +58,12 @@ public class MergeRandomAdjacentRegionsOfTableAction extends Action {
|
|||
HRegionInfo a = regions.get(i++);
|
||||
HRegionInfo b = regions.get(i);
|
||||
LOG.debug("Merging " + a.getRegionNameAsString() + " and " + b.getRegionNameAsString());
|
||||
|
||||
// Don't try the merge if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
admin.mergeRegions(a.getEncodedNameAsBytes(), b.getEncodedNameAsBytes(), false);
|
||||
} catch (Exception ex) {
|
||||
|
|
|
@ -69,6 +69,12 @@ public class MoveRegionsOfTableAction extends Action {
|
|||
|
||||
long start = System.currentTimeMillis();
|
||||
for (HRegionInfo regionInfo:regions) {
|
||||
|
||||
// Don't try the move if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
String destServerName =
|
||||
servers[RandomUtils.nextInt(servers.length)].getServerName();
|
||||
|
|
|
@ -68,6 +68,10 @@ public class RemoveColumnAction extends Action {
|
|||
+ tableName.getNameAsString());
|
||||
tableDescriptor.removeFamily(colDescName);
|
||||
|
||||
// Don't try the modify if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
admin.modifyTable(tableName, tableDescriptor);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,6 +40,11 @@ public class RestartActionBaseAction extends Action {
|
|||
|
||||
void restartMaster(ServerName server, long sleepTime) throws IOException {
|
||||
sleepTime = Math.max(sleepTime, 1000);
|
||||
// Don't try the kill if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
killMaster(server);
|
||||
sleep(sleepTime);
|
||||
startMaster(server);
|
||||
|
@ -47,6 +52,10 @@ public class RestartActionBaseAction extends Action {
|
|||
|
||||
void restartRs(ServerName server, long sleepTime) throws IOException {
|
||||
sleepTime = Math.max(sleepTime, 1000);
|
||||
// Don't try the kill if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
killRs(server);
|
||||
sleep(sleepTime);
|
||||
startRs(server);
|
||||
|
@ -54,6 +63,10 @@ public class RestartActionBaseAction extends Action {
|
|||
|
||||
void restartZKNode(ServerName server, long sleepTime) throws IOException {
|
||||
sleepTime = Math.max(sleepTime, 1000);
|
||||
// Don't try the kill if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
killZKNode(server);
|
||||
sleep(sleepTime);
|
||||
startZKNode(server);
|
||||
|
@ -61,6 +74,10 @@ public class RestartActionBaseAction extends Action {
|
|||
|
||||
void restartDataNode(ServerName server, long sleepTime) throws IOException {
|
||||
sleepTime = Math.max(sleepTime, 1000);
|
||||
// Don't try the kill if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
killDataNode(server);
|
||||
sleep(sleepTime);
|
||||
startDataNode(server);
|
||||
|
|
|
@ -63,7 +63,7 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
|
|||
Queue<ServerName> deadServers = new LinkedList<ServerName>();
|
||||
|
||||
// loop while there are servers to be killed or dead servers to be restarted
|
||||
while (!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) {
|
||||
while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
|
||||
KillOrStart action = KillOrStart.KILL;
|
||||
|
||||
if (serversToBeKilled.isEmpty()) { // no more servers to kill
|
||||
|
|
|
@ -44,6 +44,11 @@ public class SnapshotTableAction extends Action {
|
|||
String snapshotName = tableName + "-it-" + System.currentTimeMillis();
|
||||
Admin admin = util.getHBaseAdmin();
|
||||
|
||||
// Don't try the snapshot if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
LOG.info("Performing action: Snapshot table " + tableName);
|
||||
admin.snapshot(snapshotName, tableName);
|
||||
if (sleepTime > 0) {
|
||||
|
|
|
@ -34,7 +34,10 @@ public class SplitAllRegionOfTableAction extends Action {
|
|||
public void perform() throws Exception {
|
||||
HBaseTestingUtility util = context.getHBaseIntegrationTestingUtility();
|
||||
Admin admin = util.getHBaseAdmin();
|
||||
|
||||
// Don't try the split if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
LOG.info("Performing action: Split all regions of " + tableName);
|
||||
admin.split(tableName);
|
||||
}
|
||||
|
|
|
@ -53,6 +53,10 @@ public class SplitRandomRegionOfTableAction extends Action {
|
|||
LOG.info("Table " + tableName + " doesn't have regions to split");
|
||||
return;
|
||||
}
|
||||
// Don't try the split if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
HRegionInfo region = PolicyBasedChaosMonkey.selectRandomItem(
|
||||
regions.toArray(new HRegionInfo[regions.size()]));
|
||||
|
|
|
@ -42,6 +42,11 @@ public class TruncateTableAction extends Action {
|
|||
HBaseTestingUtility util = context.getHBaseIntegrationTestingUtility();
|
||||
Admin admin = util.getHBaseAdmin();
|
||||
|
||||
// Don't try the truncate if we're stopping
|
||||
if (context.isStopping()) {
|
||||
return;
|
||||
}
|
||||
|
||||
boolean preserveSplits = random.nextBoolean();
|
||||
LOG.info("Performing action: Truncate table " + tableName.getNameAsString() +
|
||||
"preserve splits " + preserveSplits);
|
||||
|
|
|
@ -19,8 +19,10 @@
|
|||
package org.apache.hadoop.hbase.chaos.actions;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang.math.RandomUtils;
|
||||
import org.apache.hadoop.hbase.ClusterStatus;
|
||||
|
@ -51,6 +53,8 @@ public class UnbalanceKillAndRebalanceAction extends Action {
|
|||
public void perform() throws Exception {
|
||||
ClusterStatus status = this.cluster.getClusterStatus();
|
||||
List<ServerName> victimServers = new LinkedList<ServerName>(status.getServers());
|
||||
Set<ServerName> killedServers = new HashSet<ServerName>();
|
||||
|
||||
int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
|
||||
int deadCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
|
||||
Assert.assertTrue((liveCount + deadCount) < victimServers.size());
|
||||
|
@ -62,13 +66,20 @@ public class UnbalanceKillAndRebalanceAction extends Action {
|
|||
unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
|
||||
Thread.sleep(waitForUnbalanceMilliSec);
|
||||
for (int i = 0; i < liveCount; ++i) {
|
||||
// Don't keep killing servers if we're
|
||||
// trying to stop the monkey.
|
||||
if (context.isStopping()) {
|
||||
break;
|
||||
}
|
||||
killRs(targetServers.get(i));
|
||||
killedServers.add(targetServers.get(i));
|
||||
}
|
||||
|
||||
Thread.sleep(waitForKillsMilliSec);
|
||||
forceBalancer();
|
||||
Thread.sleep(waitAfterBalanceMilliSec);
|
||||
for (int i = 0; i < liveCount; ++i) {
|
||||
startRs(targetServers.get(i));
|
||||
for (ServerName server:killedServers) {
|
||||
startRs(server);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,14 +35,29 @@ public abstract class Policy extends StoppableImplementation implements Runnable
|
|||
|
||||
public void init(PolicyContext context) throws Exception {
|
||||
this.context = context;
|
||||
|
||||
// Used to wire up stopping.
|
||||
context.setPolicy(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* A context for a Policy
|
||||
*/
|
||||
public static class PolicyContext extends Action.ActionContext {
|
||||
|
||||
Policy policy = null;
|
||||
|
||||
public PolicyContext(IntegrationTestingUtility util) {
|
||||
super(util);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isStopping() {
|
||||
return policy.isStopped();
|
||||
}
|
||||
|
||||
public void setPolicy(Policy policy) {
|
||||
this.policy = policy;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue