HBASE-22982: region server suspend/resume and graceful rolling restart actions (#592)
* Add chaos monkey action for suspend/resume region servers * Add chaos monkey action for graceful rolling restart * Add these to relevant chaos monkeys Signed-off-by: Balazs Meszaros <meszibalu@apache.org> Signed-off-by: Peter Somogyi <psomogyi@apache.org>
This commit is contained in:
parent
ea24ea7dd5
commit
f0dddd1cc2
@ -97,13 +97,13 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void startRegionServer(String hostname, int port) throws IOException {
|
public void startRegionServer(String hostname, int port) throws IOException {
|
||||||
LOG.info("Starting RS on: " + hostname);
|
LOG.info("Starting RS on: {}", hostname);
|
||||||
clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
|
clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void killRegionServer(ServerName serverName) throws IOException {
|
public void killRegionServer(ServerName serverName) throws IOException {
|
||||||
LOG.info("Aborting RS: " + serverName.getServerName());
|
LOG.info("Aborting RS: {}", serverName.getServerName());
|
||||||
killedRegionServers.add(serverName);
|
killedRegionServers.add(serverName);
|
||||||
clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
|
clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
|
||||||
serverName.getHostname(), serverName.getPort());
|
serverName.getHostname(), serverName.getPort());
|
||||||
@ -116,7 +116,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void stopRegionServer(ServerName serverName) throws IOException {
|
public void stopRegionServer(ServerName serverName) throws IOException {
|
||||||
LOG.info("Stopping RS: " + serverName.getServerName());
|
LOG.info("Stopping RS: {}", serverName.getServerName());
|
||||||
clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
|
clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
|
||||||
serverName.getHostname(), serverName.getPort());
|
serverName.getHostname(), serverName.getPort());
|
||||||
}
|
}
|
||||||
@ -126,22 +126,36 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
|
waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void suspendRegionServer(ServerName serverName) throws IOException {
|
||||||
|
LOG.info("Suspend RS: {}", serverName.getServerName());
|
||||||
|
clusterManager.suspend(ServiceType.HBASE_REGIONSERVER,
|
||||||
|
serverName.getHostname(), serverName.getPort());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void resumeRegionServer(ServerName serverName) throws IOException {
|
||||||
|
LOG.info("Resume RS: {}", serverName.getServerName());
|
||||||
|
clusterManager.resume(ServiceType.HBASE_REGIONSERVER,
|
||||||
|
serverName.getHostname(), serverName.getPort());
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void startZkNode(String hostname, int port) throws IOException {
|
public void startZkNode(String hostname, int port) throws IOException {
|
||||||
LOG.info("Starting ZooKeeper node on: " + hostname);
|
LOG.info("Starting ZooKeeper node on: {}", hostname);
|
||||||
clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
|
clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void killZkNode(ServerName serverName) throws IOException {
|
public void killZkNode(ServerName serverName) throws IOException {
|
||||||
LOG.info("Aborting ZooKeeper node on: " + serverName.getServerName());
|
LOG.info("Aborting ZooKeeper node on: {}", serverName.getServerName());
|
||||||
clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
|
clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
|
||||||
serverName.getHostname(), serverName.getPort());
|
serverName.getHostname(), serverName.getPort());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void stopZkNode(ServerName serverName) throws IOException {
|
public void stopZkNode(ServerName serverName) throws IOException {
|
||||||
LOG.info("Stopping ZooKeeper node: " + serverName.getServerName());
|
LOG.info("Stopping ZooKeeper node: {}", serverName.getServerName());
|
||||||
clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
|
clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
|
||||||
serverName.getHostname(), serverName.getPort());
|
serverName.getHostname(), serverName.getPort());
|
||||||
}
|
}
|
||||||
@ -158,21 +172,21 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void startDataNode(ServerName serverName) throws IOException {
|
public void startDataNode(ServerName serverName) throws IOException {
|
||||||
LOG.info("Starting data node on: " + serverName.getServerName());
|
LOG.info("Starting data node on: {}", serverName.getServerName());
|
||||||
clusterManager.start(ServiceType.HADOOP_DATANODE,
|
clusterManager.start(ServiceType.HADOOP_DATANODE,
|
||||||
serverName.getHostname(), serverName.getPort());
|
serverName.getHostname(), serverName.getPort());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void killDataNode(ServerName serverName) throws IOException {
|
public void killDataNode(ServerName serverName) throws IOException {
|
||||||
LOG.info("Aborting data node on: " + serverName.getServerName());
|
LOG.info("Aborting data node on: {}", serverName.getServerName());
|
||||||
clusterManager.kill(ServiceType.HADOOP_DATANODE,
|
clusterManager.kill(ServiceType.HADOOP_DATANODE,
|
||||||
serverName.getHostname(), serverName.getPort());
|
serverName.getHostname(), serverName.getPort());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void stopDataNode(ServerName serverName) throws IOException {
|
public void stopDataNode(ServerName serverName) throws IOException {
|
||||||
LOG.info("Stopping data node on: " + serverName.getServerName());
|
LOG.info("Stopping data node on: {}", serverName.getServerName());
|
||||||
clusterManager.stop(ServiceType.HADOOP_DATANODE,
|
clusterManager.stop(ServiceType.HADOOP_DATANODE,
|
||||||
serverName.getHostname(), serverName.getPort());
|
serverName.getHostname(), serverName.getPort());
|
||||||
}
|
}
|
||||||
@ -189,21 +203,21 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void startNameNode(ServerName serverName) throws IOException {
|
public void startNameNode(ServerName serverName) throws IOException {
|
||||||
LOG.info("Starting name node on: " + serverName.getServerName());
|
LOG.info("Starting name node on: {}", serverName.getServerName());
|
||||||
clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
|
clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
|
||||||
serverName.getPort());
|
serverName.getPort());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void killNameNode(ServerName serverName) throws IOException {
|
public void killNameNode(ServerName serverName) throws IOException {
|
||||||
LOG.info("Aborting name node on: " + serverName.getServerName());
|
LOG.info("Aborting name node on: {}", serverName.getServerName());
|
||||||
clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
|
clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
|
||||||
serverName.getPort());
|
serverName.getPort());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void stopNameNode(ServerName serverName) throws IOException {
|
public void stopNameNode(ServerName serverName) throws IOException {
|
||||||
LOG.info("Stopping name node on: " + serverName.getServerName());
|
LOG.info("Stopping name node on: {}", serverName.getServerName());
|
||||||
clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
|
clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
|
||||||
serverName.getPort());
|
serverName.getPort());
|
||||||
}
|
}
|
||||||
@ -220,7 +234,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
|
|
||||||
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
|
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
|
LOG.info("Waiting for service: {} to stop: {}", service, serverName.getServerName());
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
|
|
||||||
while ((System.currentTimeMillis() - start) < timeout) {
|
while ((System.currentTimeMillis() - start) < timeout) {
|
||||||
@ -234,7 +248,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
|
|
||||||
private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
|
private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName());
|
LOG.info("Waiting for service: {} to start: ", service, serverName.getServerName());
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
|
|
||||||
while ((System.currentTimeMillis() - start) < timeout) {
|
while ((System.currentTimeMillis() - start) < timeout) {
|
||||||
@ -248,19 +262,19 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void startMaster(String hostname, int port) throws IOException {
|
public void startMaster(String hostname, int port) throws IOException {
|
||||||
LOG.info("Starting Master on: " + hostname + ":" + port);
|
LOG.info("Starting Master on: {}:{}", hostname, port);
|
||||||
clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
|
clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void killMaster(ServerName serverName) throws IOException {
|
public void killMaster(ServerName serverName) throws IOException {
|
||||||
LOG.info("Aborting Master: " + serverName.getServerName());
|
LOG.info("Aborting Master: {}", serverName.getServerName());
|
||||||
clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
|
clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void stopMaster(ServerName serverName) throws IOException {
|
public void stopMaster(ServerName serverName) throws IOException {
|
||||||
LOG.info("Stopping Master: " + serverName.getServerName());
|
LOG.info("Stopping Master: {}", serverName.getServerName());
|
||||||
clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
|
clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -294,7 +308,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
regionLoc = locator.getRegionLocation(startKey, true);
|
regionLoc = locator.getRegionLocation(startKey, true);
|
||||||
}
|
}
|
||||||
if (regionLoc == null) {
|
if (regionLoc == null) {
|
||||||
LOG.warn("Cannot find region server holding region " + Bytes.toStringBinary(regionName));
|
LOG.warn("Cannot find region server holding region {}", Bytes.toStringBinary(regionName));
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return regionLoc.getServerName();
|
return regionLoc.getServerName();
|
||||||
@ -338,15 +352,15 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
//check whether current master has changed
|
//check whether current master has changed
|
||||||
final ServerName initMaster = initial.getMasterName();
|
final ServerName initMaster = initial.getMasterName();
|
||||||
if (!ServerName.isSameAddress(initMaster, current.getMasterName())) {
|
if (!ServerName.isSameAddress(initMaster, current.getMasterName())) {
|
||||||
LOG.info("Restoring cluster - Initial active master : " + initMaster.getAddress() +
|
LOG.info("Restoring cluster - Initial active master : {} has changed to : {}",
|
||||||
" has changed to : " + current.getMasterName().getAddress());
|
initMaster.getAddress(), current.getMasterName().getAddress());
|
||||||
// If initial master is stopped, start it, before restoring the state.
|
// If initial master is stopped, start it, before restoring the state.
|
||||||
// It will come up as a backup master, if there is already an active master.
|
// It will come up as a backup master, if there is already an active master.
|
||||||
try {
|
try {
|
||||||
if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
|
if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
|
||||||
initMaster.getHostname(), initMaster.getPort())) {
|
initMaster.getHostname(), initMaster.getPort())) {
|
||||||
LOG.info("Restoring cluster - starting initial active master at:"
|
LOG.info("Restoring cluster - starting initial active master at:{}",
|
||||||
+ initMaster.getAddress());
|
initMaster.getAddress());
|
||||||
startMaster(initMaster.getHostname(), initMaster.getPort());
|
startMaster(initMaster.getHostname(), initMaster.getPort());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -356,11 +370,11 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
// 3. Start backup masters
|
// 3. Start backup masters
|
||||||
for (ServerName currentBackup : current.getBackupMasterNames()) {
|
for (ServerName currentBackup : current.getBackupMasterNames()) {
|
||||||
if (!ServerName.isSameAddress(currentBackup, initMaster)) {
|
if (!ServerName.isSameAddress(currentBackup, initMaster)) {
|
||||||
LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
|
LOG.info("Restoring cluster - stopping backup master: {}", currentBackup);
|
||||||
stopMaster(currentBackup);
|
stopMaster(currentBackup);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG.info("Restoring cluster - stopping active master: " + current.getMasterName());
|
LOG.info("Restoring cluster - stopping active master: {}", current.getMasterName());
|
||||||
stopMaster(current.getMasterName());
|
stopMaster(current.getMasterName());
|
||||||
waitForActiveAndReadyMaster(); // wait so that active master takes over
|
waitForActiveAndReadyMaster(); // wait so that active master takes over
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
@ -376,8 +390,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
|
if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
|
||||||
backup.getHostname(),
|
backup.getHostname(),
|
||||||
backup.getPort())) {
|
backup.getPort())) {
|
||||||
LOG.info("Restoring cluster - starting initial backup master: "
|
LOG.info("Restoring cluster - starting initial backup master: {}",
|
||||||
+ backup.getAddress());
|
backup.getAddress());
|
||||||
startMaster(backup.getHostname(), backup.getPort());
|
startMaster(backup.getHostname(), backup.getPort());
|
||||||
}
|
}
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
@ -401,7 +415,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
for (ServerName sn:toStart) {
|
for (ServerName sn:toStart) {
|
||||||
try {
|
try {
|
||||||
if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
|
if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
|
||||||
LOG.info("Restoring cluster - starting initial backup master: " + sn.getAddress());
|
LOG.info("Restoring cluster - starting initial backup master: {}", sn.getAddress());
|
||||||
startMaster(sn.getHostname(), sn.getPort());
|
startMaster(sn.getHostname(), sn.getPort());
|
||||||
}
|
}
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
@ -412,7 +426,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
for (ServerName sn:toKill) {
|
for (ServerName sn:toKill) {
|
||||||
try {
|
try {
|
||||||
if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
|
if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
|
||||||
LOG.info("Restoring cluster - stopping backup master: " + sn.getAddress());
|
LOG.info("Restoring cluster - stopping backup master: {}", sn.getAddress());
|
||||||
stopMaster(sn);
|
stopMaster(sn);
|
||||||
}
|
}
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
@ -421,8 +435,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!deferred.isEmpty()) {
|
if (!deferred.isEmpty()) {
|
||||||
LOG.warn("Restoring cluster - restoring region servers reported "
|
LOG.warn("Restoring cluster - restoring region servers reported {} errors:",
|
||||||
+ deferred.size() + " errors:");
|
deferred.size());
|
||||||
for (int i=0; i<deferred.size() && i < 3; i++) {
|
for (int i=0; i<deferred.size() && i < 3; i++) {
|
||||||
LOG.warn(Objects.toString(deferred.get(i)));
|
LOG.warn(Objects.toString(deferred.get(i)));
|
||||||
}
|
}
|
||||||
@ -464,7 +478,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
try {
|
try {
|
||||||
if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, sn.getHostname(),
|
if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, sn.getHostname(),
|
||||||
sn.getPort()) && master.getPort() != sn.getPort()) {
|
sn.getPort()) && master.getPort() != sn.getPort()) {
|
||||||
LOG.info("Restoring cluster - starting initial region server: " + sn.getAddress());
|
LOG.info("Restoring cluster - starting initial region server: {}", sn.getAddress());
|
||||||
startRegionServer(sn.getHostname(), sn.getPort());
|
startRegionServer(sn.getHostname(), sn.getPort());
|
||||||
}
|
}
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
@ -476,7 +490,7 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
try {
|
try {
|
||||||
if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, sn.getHostname(),
|
if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, sn.getHostname(),
|
||||||
sn.getPort()) && master.getPort() != sn.getPort()) {
|
sn.getPort()) && master.getPort() != sn.getPort()) {
|
||||||
LOG.info("Restoring cluster - stopping initial region server: " + sn.getAddress());
|
LOG.info("Restoring cluster - stopping initial region server: {}", sn.getAddress());
|
||||||
stopRegionServer(sn);
|
stopRegionServer(sn);
|
||||||
}
|
}
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
@ -484,8 +498,8 @@ public class DistributedHBaseCluster extends HBaseCluster {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!deferred.isEmpty()) {
|
if (!deferred.isEmpty()) {
|
||||||
LOG.warn("Restoring cluster - restoring region servers reported "
|
LOG.warn("Restoring cluster - restoring region servers reported {} errors:",
|
||||||
+ deferred.size() + " errors:");
|
deferred.size());
|
||||||
for (int i=0; i<deferred.size() && i < 3; i++) {
|
for (int i=0; i<deferred.size() && i < 3; i++) {
|
||||||
LOG.warn(Objects.toString(deferred.get(i)));
|
LOG.warn(Objects.toString(deferred.get(i)));
|
||||||
}
|
}
|
||||||
|
@ -35,6 +35,7 @@ import org.apache.hadoop.hbase.HBaseCluster;
|
|||||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||||
import org.apache.hadoop.hbase.HRegionInfo;
|
import org.apache.hadoop.hbase.HRegionInfo;
|
||||||
import org.apache.hadoop.hbase.IntegrationTestingUtility;
|
import org.apache.hadoop.hbase.IntegrationTestingUtility;
|
||||||
|
import org.apache.hadoop.hbase.MiniHBaseCluster;
|
||||||
import org.apache.hadoop.hbase.ServerMetrics;
|
import org.apache.hadoop.hbase.ServerMetrics;
|
||||||
import org.apache.hadoop.hbase.ServerName;
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
import org.apache.hadoop.hbase.TableName;
|
import org.apache.hadoop.hbase.TableName;
|
||||||
@ -150,78 +151,106 @@ public class Action {
|
|||||||
}
|
}
|
||||||
|
|
||||||
protected void killMaster(ServerName server) throws IOException {
|
protected void killMaster(ServerName server) throws IOException {
|
||||||
LOG.info("Killing master " + server);
|
LOG.info("Killing master {}", server);
|
||||||
cluster.killMaster(server);
|
cluster.killMaster(server);
|
||||||
cluster.waitForMasterToStop(server, killMasterTimeout);
|
cluster.waitForMasterToStop(server, killMasterTimeout);
|
||||||
LOG.info("Killed master " + server);
|
LOG.info("Killed master " + server);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void startMaster(ServerName server) throws IOException {
|
protected void startMaster(ServerName server) throws IOException {
|
||||||
LOG.info("Starting master " + server.getHostname());
|
LOG.info("Starting master {}", server.getHostname());
|
||||||
cluster.startMaster(server.getHostname(), server.getPort());
|
cluster.startMaster(server.getHostname(), server.getPort());
|
||||||
cluster.waitForActiveAndReadyMaster(startMasterTimeout);
|
cluster.waitForActiveAndReadyMaster(startMasterTimeout);
|
||||||
LOG.info("Started master " + server.getHostname());
|
LOG.info("Started master " + server.getHostname());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void stopRs(ServerName server) throws IOException {
|
||||||
|
LOG.info("Stopping regionserver {}", server);
|
||||||
|
cluster.stopRegionServer(server);
|
||||||
|
cluster.waitForRegionServerToStop(server, killRsTimeout);
|
||||||
|
LOG.info("Stoppiong regionserver {}. Reported num of rs:{}", server,
|
||||||
|
cluster.getClusterMetrics().getLiveServerMetrics().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void suspendRs(ServerName server) throws IOException {
|
||||||
|
LOG.info("Suspending regionserver {}", server);
|
||||||
|
cluster.suspendRegionServer(server);
|
||||||
|
if(!(cluster instanceof MiniHBaseCluster)){
|
||||||
|
cluster.waitForRegionServerToStop(server, killRsTimeout);
|
||||||
|
}
|
||||||
|
LOG.info("Suspending regionserver {}. Reported num of rs:{}", server,
|
||||||
|
cluster.getClusterMetrics().getLiveServerMetrics().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void resumeRs(ServerName server) throws IOException {
|
||||||
|
LOG.info("Resuming regionserver {}", server);
|
||||||
|
cluster.resumeRegionServer(server);
|
||||||
|
if(!(cluster instanceof MiniHBaseCluster)){
|
||||||
|
cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
|
||||||
|
}
|
||||||
|
LOG.info("Resuming regionserver {}. Reported num of rs:{}", server,
|
||||||
|
cluster.getClusterMetrics().getLiveServerMetrics().size());
|
||||||
|
}
|
||||||
|
|
||||||
protected void killRs(ServerName server) throws IOException {
|
protected void killRs(ServerName server) throws IOException {
|
||||||
LOG.info("Killing regionserver " + server);
|
LOG.info("Killing regionserver {}", server);
|
||||||
cluster.killRegionServer(server);
|
cluster.killRegionServer(server);
|
||||||
cluster.waitForRegionServerToStop(server, killRsTimeout);
|
cluster.waitForRegionServerToStop(server, killRsTimeout);
|
||||||
LOG.info("Killed regionserver " + server + ". Reported num of rs:"
|
LOG.info("Killed regionserver {}. Reported num of rs:{}", server,
|
||||||
+ cluster.getClusterMetrics().getLiveServerMetrics().size());
|
cluster.getClusterMetrics().getLiveServerMetrics().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void startRs(ServerName server) throws IOException {
|
protected void startRs(ServerName server) throws IOException {
|
||||||
LOG.info("Starting regionserver " + server.getAddress());
|
LOG.info("Starting regionserver {}", server.getAddress());
|
||||||
cluster.startRegionServer(server.getHostname(), server.getPort());
|
cluster.startRegionServer(server.getHostname(), server.getPort());
|
||||||
cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
|
cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
|
||||||
LOG.info("Started regionserver " + server.getAddress() + ". Reported num of rs:"
|
LOG.info("Started regionserver {}. Reported num of rs:{}", server.getAddress(),
|
||||||
+ cluster.getClusterMetrics().getLiveServerMetrics().size());
|
cluster.getClusterMetrics().getLiveServerMetrics().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void killZKNode(ServerName server) throws IOException {
|
protected void killZKNode(ServerName server) throws IOException {
|
||||||
LOG.info("Killing zookeeper node " + server);
|
LOG.info("Killing zookeeper node {}", server);
|
||||||
cluster.killZkNode(server);
|
cluster.killZkNode(server);
|
||||||
cluster.waitForZkNodeToStop(server, killZkNodeTimeout);
|
cluster.waitForZkNodeToStop(server, killZkNodeTimeout);
|
||||||
LOG.info("Killed zookeeper node " + server + ". Reported num of rs:"
|
LOG.info("Killed zookeeper node {}. Reported num of rs:{}", server,
|
||||||
+ cluster.getClusterMetrics().getLiveServerMetrics().size());
|
cluster.getClusterMetrics().getLiveServerMetrics().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void startZKNode(ServerName server) throws IOException {
|
protected void startZKNode(ServerName server) throws IOException {
|
||||||
LOG.info("Starting zookeeper node " + server.getHostname());
|
LOG.info("Starting zookeeper node {}", server.getHostname());
|
||||||
cluster.startZkNode(server.getHostname(), server.getPort());
|
cluster.startZkNode(server.getHostname(), server.getPort());
|
||||||
cluster.waitForZkNodeToStart(server, startZkNodeTimeout);
|
cluster.waitForZkNodeToStart(server, startZkNodeTimeout);
|
||||||
LOG.info("Started zookeeper node " + server);
|
LOG.info("Started zookeeper node {}", server);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void killDataNode(ServerName server) throws IOException {
|
protected void killDataNode(ServerName server) throws IOException {
|
||||||
LOG.info("Killing datanode " + server);
|
LOG.info("Killing datanode {}", server);
|
||||||
cluster.killDataNode(server);
|
cluster.killDataNode(server);
|
||||||
cluster.waitForDataNodeToStop(server, killDataNodeTimeout);
|
cluster.waitForDataNodeToStop(server, killDataNodeTimeout);
|
||||||
LOG.info("Killed datanode " + server + ". Reported num of rs:"
|
LOG.info("Killed datanode {}. Reported num of rs:{}", server,
|
||||||
+ cluster.getClusterMetrics().getLiveServerMetrics().size());
|
cluster.getClusterMetrics().getLiveServerMetrics().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void startDataNode(ServerName server) throws IOException {
|
protected void startDataNode(ServerName server) throws IOException {
|
||||||
LOG.info("Starting datanode " + server.getHostname());
|
LOG.info("Starting datanode {}", server.getHostname());
|
||||||
cluster.startDataNode(server);
|
cluster.startDataNode(server);
|
||||||
cluster.waitForDataNodeToStart(server, startDataNodeTimeout);
|
cluster.waitForDataNodeToStart(server, startDataNodeTimeout);
|
||||||
LOG.info("Started datanode " + server);
|
LOG.info("Started datanode {}", server);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void killNameNode(ServerName server) throws IOException {
|
protected void killNameNode(ServerName server) throws IOException {
|
||||||
LOG.info("Killing namenode :-" + server.getHostname());
|
LOG.info("Killing namenode :-{}", server.getHostname());
|
||||||
cluster.killNameNode(server);
|
cluster.killNameNode(server);
|
||||||
cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
|
cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
|
||||||
LOG.info("Killed namenode:" + server + ". Reported num of rs:"
|
LOG.info("Killed namenode:{}. Reported num of rs:{}", server,
|
||||||
+ cluster.getClusterMetrics().getLiveServerMetrics().size());
|
cluster.getClusterMetrics().getLiveServerMetrics().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void startNameNode(ServerName server) throws IOException {
|
protected void startNameNode(ServerName server) throws IOException {
|
||||||
LOG.info("Starting Namenode :-" + server.getHostname());
|
LOG.info("Starting Namenode :-{}", server.getHostname());
|
||||||
cluster.startNameNode(server);
|
cluster.startNameNode(server);
|
||||||
cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
|
cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
|
||||||
LOG.info("Started namenode:" + server);
|
LOG.info("Started namenode:{}", server);
|
||||||
}
|
}
|
||||||
protected void unbalanceRegions(ClusterMetrics clusterStatus,
|
protected void unbalanceRegions(ClusterMetrics clusterStatus,
|
||||||
List<ServerName> fromServers, List<ServerName> toServers,
|
List<ServerName> fromServers, List<ServerName> toServers,
|
||||||
@ -234,7 +263,7 @@ public class Action {
|
|||||||
// Ugh.
|
// Ugh.
|
||||||
List<byte[]> regions = new LinkedList<>(serverLoad.getRegionMetrics().keySet());
|
List<byte[]> regions = new LinkedList<>(serverLoad.getRegionMetrics().keySet());
|
||||||
int victimRegionCount = (int)Math.ceil(fractionOfRegions * regions.size());
|
int victimRegionCount = (int)Math.ceil(fractionOfRegions * regions.size());
|
||||||
LOG.debug("Removing " + victimRegionCount + " regions from " + sn);
|
LOG.debug("Removing {} regions from {}", victimRegionCount, sn);
|
||||||
for (int i = 0; i < victimRegionCount; ++i) {
|
for (int i = 0; i < victimRegionCount; ++i) {
|
||||||
int victimIx = RandomUtils.nextInt(0, regions.size());
|
int victimIx = RandomUtils.nextInt(0, regions.size());
|
||||||
String regionId = HRegionInfo.encodeRegionName(regions.remove(victimIx));
|
String regionId = HRegionInfo.encodeRegionName(regions.remove(victimIx));
|
||||||
@ -242,8 +271,8 @@ public class Action {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG.info("Moving " + victimRegions.size() + " regions from " + fromServers.size()
|
LOG.info("Moving {} regions from {} servers to {} different servers", victimRegions.size(),
|
||||||
+ " servers to " + toServers.size() + " different servers");
|
fromServers.size(), toServers.size());
|
||||||
Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin();
|
Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin();
|
||||||
for (byte[] victimRegion : victimRegions) {
|
for (byte[] victimRegion : victimRegions) {
|
||||||
// Don't keep moving regions if we're
|
// Don't keep moving regions if we're
|
||||||
@ -269,6 +298,15 @@ public class Action {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void setBalancer(boolean onOrOff, boolean synchronous) throws Exception {
|
||||||
|
Admin admin = this.context.getHBaseIntegrationTestingUtility().getAdmin();
|
||||||
|
try {
|
||||||
|
admin.balancerSwitch(onOrOff, synchronous);
|
||||||
|
} catch (Exception e) {
|
||||||
|
LOG.warn("Got exception while switching balance ", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public Configuration getConf() {
|
public Configuration getConf() {
|
||||||
return cluster.getConf();
|
return cluster.getConf();
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,73 @@
|
|||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.hbase.chaos.actions;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.commons.lang3.RandomUtils;
|
||||||
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
|
import org.apache.hadoop.hbase.util.RegionMover;
|
||||||
|
import org.apache.hadoop.util.Shell;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gracefully restarts every regionserver in a rolling fashion. At each step, it unloads,
|
||||||
|
* restarts the loads every rs server sleeping randomly (0-sleepTime) in between servers.
|
||||||
|
*/
|
||||||
|
public class GracefulRollingRestartRsAction extends RestartActionBaseAction {
|
||||||
|
private static final Logger LOG = LoggerFactory.getLogger(GracefulRollingRestartRsAction.class);
|
||||||
|
|
||||||
|
public GracefulRollingRestartRsAction(long sleepTime) {
|
||||||
|
super(sleepTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void perform() throws Exception {
|
||||||
|
LOG.info("Performing action: Rolling restarting non-master region servers");
|
||||||
|
List<ServerName> selectedServers = selectServers();
|
||||||
|
|
||||||
|
LOG.info("Disabling balancer to make unloading possible");
|
||||||
|
setBalancer(false, true);
|
||||||
|
|
||||||
|
for (ServerName server : selectedServers) {
|
||||||
|
String rsName = server.getAddress().toString();
|
||||||
|
try (RegionMover rm =
|
||||||
|
new RegionMover.RegionMoverBuilder(rsName, getConf()).ack(true).build()) {
|
||||||
|
LOG.info("Unloading {}", server);
|
||||||
|
rm.unload();
|
||||||
|
LOG.info("Restarting {}", server);
|
||||||
|
gracefulRestartRs(server, sleepTime);
|
||||||
|
LOG.info("Loading {}", server);
|
||||||
|
rm.load();
|
||||||
|
} catch (Shell.ExitCodeException e) {
|
||||||
|
LOG.info("Problem restarting but presume successful; code={}", e.getExitCode(), e);
|
||||||
|
}
|
||||||
|
sleep(RandomUtils.nextInt(0, (int)sleepTime));
|
||||||
|
}
|
||||||
|
LOG.info("Enabling balancer");
|
||||||
|
setBalancer(true, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<ServerName> selectServers() throws IOException {
|
||||||
|
return Arrays.asList(getCurrentServers());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -45,19 +45,42 @@ public class RestartActionBaseAction extends Action {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG.info("Killing master: {}", server);
|
||||||
killMaster(server);
|
killMaster(server);
|
||||||
sleep(sleepTime);
|
sleep(sleepTime);
|
||||||
|
LOG.info("Starting master: {}", server);
|
||||||
startMaster(server);
|
startMaster(server);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stop and then restart the region server instead of killing it.
|
||||||
|
* @param server hostname to restart the regionserver on
|
||||||
|
* @param sleepTime number of milliseconds between stop and restart
|
||||||
|
* @throws IOException if something goes wrong
|
||||||
|
*/
|
||||||
|
void gracefulRestartRs(ServerName server, long sleepTime) throws IOException {
|
||||||
|
sleepTime = Math.max(sleepTime, 1000);
|
||||||
|
// Don't try the stop if we're stopping already
|
||||||
|
if (context.isStopping()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
LOG.info("Stopping region server: {}", server);
|
||||||
|
stopRs(server);
|
||||||
|
sleep(sleepTime);
|
||||||
|
LOG.info("Starting region server: {}", server);
|
||||||
|
startRs(server);
|
||||||
|
}
|
||||||
|
|
||||||
void restartRs(ServerName server, long sleepTime) throws IOException {
|
void restartRs(ServerName server, long sleepTime) throws IOException {
|
||||||
sleepTime = Math.max(sleepTime, 1000);
|
sleepTime = Math.max(sleepTime, 1000);
|
||||||
// Don't try the kill if we're stopping
|
// Don't try the kill if we're stopping
|
||||||
if (context.isStopping()) {
|
if (context.isStopping()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
LOG.info("Killing region server: {}", server);
|
||||||
killRs(server);
|
killRs(server);
|
||||||
sleep(sleepTime);
|
sleep(sleepTime);
|
||||||
|
LOG.info("Starting region server: {}", server);
|
||||||
startRs(server);
|
startRs(server);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -67,8 +90,10 @@ public class RestartActionBaseAction extends Action {
|
|||||||
if (context.isStopping()) {
|
if (context.isStopping()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
LOG.info("Killing zookeeper node: {}", server);
|
||||||
killZKNode(server);
|
killZKNode(server);
|
||||||
sleep(sleepTime);
|
sleep(sleepTime);
|
||||||
|
LOG.info("Starting zookeeper node: {}", server);
|
||||||
startZKNode(server);
|
startZKNode(server);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -78,8 +103,10 @@ public class RestartActionBaseAction extends Action {
|
|||||||
if (context.isStopping()) {
|
if (context.isStopping()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
LOG.info("Killing data node: {}", server);
|
||||||
killDataNode(server);
|
killDataNode(server);
|
||||||
sleep(sleepTime);
|
sleep(sleepTime);
|
||||||
|
LOG.info("Starting data node: {}", server);
|
||||||
startDataNode(server);
|
startDataNode(server);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -89,8 +116,10 @@ public class RestartActionBaseAction extends Action {
|
|||||||
if (context.isStopping()) {
|
if (context.isStopping()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
LOG.info("Killing name node: {}", server);
|
||||||
killNameNode(server);
|
killNameNode(server);
|
||||||
sleep(sleepTime);
|
sleep(sleepTime);
|
||||||
|
LOG.info("Starting name node: {}", server);
|
||||||
startNameNode(server);
|
startNameNode(server);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,117 @@
|
|||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hadoop.hbase.chaos.actions;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Queue;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.RandomUtils;
|
||||||
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
|
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||||
|
import org.apache.hadoop.hbase.util.Threads;
|
||||||
|
import org.apache.hadoop.util.Shell;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Suspend then resume a ratio of the regionservers in a rolling fashion. At each step, either
|
||||||
|
* suspend a server, or resume one, sleeping (sleepTime) in between steps. The parameter
|
||||||
|
* maxSuspendedServers limits the maximum number of servers that can be down at the same time
|
||||||
|
* during rolling restarts.
|
||||||
|
*/
|
||||||
|
public class RollingBatchSuspendResumeRsAction extends Action {
|
||||||
|
private static final Logger LOG =
|
||||||
|
LoggerFactory.getLogger(RollingBatchSuspendResumeRsAction.class);
|
||||||
|
private float ratio;
|
||||||
|
private long sleepTime;
|
||||||
|
private int maxSuspendedServers; // number of maximum suspended servers at any given time.
|
||||||
|
|
||||||
|
public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio) {
|
||||||
|
this(sleepTime, ratio, 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
public RollingBatchSuspendResumeRsAction(long sleepTime, float ratio, int maxSuspendedServers) {
|
||||||
|
this.ratio = ratio;
|
||||||
|
this.sleepTime = sleepTime;
|
||||||
|
this.maxSuspendedServers = maxSuspendedServers;
|
||||||
|
}
|
||||||
|
|
||||||
|
enum SuspendOrResume {
|
||||||
|
SUSPEND, RESUME
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void perform() throws Exception {
|
||||||
|
LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
|
||||||
|
(int) (ratio * 100)));
|
||||||
|
List<ServerName> selectedServers = selectServers();
|
||||||
|
|
||||||
|
Queue<ServerName> serversToBeSuspended = new LinkedList<>(selectedServers);
|
||||||
|
Queue<ServerName> suspendedServers = new LinkedList<>();
|
||||||
|
|
||||||
|
// loop while there are servers to be suspended or suspended servers to be resumed
|
||||||
|
while ((!serversToBeSuspended.isEmpty() || !suspendedServers.isEmpty()) && !context
|
||||||
|
.isStopping()) {
|
||||||
|
SuspendOrResume action;
|
||||||
|
|
||||||
|
if (serversToBeSuspended.isEmpty()) { // no more servers to suspend
|
||||||
|
action = SuspendOrResume.RESUME;
|
||||||
|
} else if (suspendedServers.isEmpty()) {
|
||||||
|
action = SuspendOrResume.SUSPEND; // no more servers to resume
|
||||||
|
} else if (suspendedServers.size() >= maxSuspendedServers) {
|
||||||
|
// we have too many suspended servers. Don't suspend any more
|
||||||
|
action = SuspendOrResume.RESUME;
|
||||||
|
} else {
|
||||||
|
// do a coin toss
|
||||||
|
action = RandomUtils.nextBoolean() ? SuspendOrResume.SUSPEND : SuspendOrResume.RESUME;
|
||||||
|
}
|
||||||
|
|
||||||
|
ServerName server;
|
||||||
|
switch (action) {
|
||||||
|
case SUSPEND:
|
||||||
|
server = serversToBeSuspended.remove();
|
||||||
|
try {
|
||||||
|
suspendRs(server);
|
||||||
|
} catch (Shell.ExitCodeException e) {
|
||||||
|
LOG.warn("Problem suspending but presume successful; code={}", e.getExitCode(), e);
|
||||||
|
}
|
||||||
|
suspendedServers.add(server);
|
||||||
|
break;
|
||||||
|
case RESUME:
|
||||||
|
server = suspendedServers.remove();
|
||||||
|
try {
|
||||||
|
resumeRs(server);
|
||||||
|
} catch (Shell.ExitCodeException e) {
|
||||||
|
LOG.info("Problem resuming, will retry; code={}", e.getExitCode(), e);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG.info("Sleeping for:{}", sleepTime);
|
||||||
|
Threads.sleep(sleepTime);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<ServerName> selectServers() throws IOException {
|
||||||
|
return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -45,6 +45,9 @@ public interface MonkeyConstants {
|
|||||||
String UNBALANCE_WAIT_AFTER_BALANCE_MS = "unbalance.action.wait.after.period";
|
String UNBALANCE_WAIT_AFTER_BALANCE_MS = "unbalance.action.wait.after.period";
|
||||||
String UNBALANCE_KILL_META_RS = "unbalance.action.kill.meta.rs";
|
String UNBALANCE_KILL_META_RS = "unbalance.action.kill.meta.rs";
|
||||||
String DECREASE_HFILE_SIZE_SLEEP_TIME = "decrease.hfile.size.sleep.time";
|
String DECREASE_HFILE_SIZE_SLEEP_TIME = "decrease.hfile.size.sleep.time";
|
||||||
|
String GRACEFUL_RESTART_RS_SLEEP_TIME = "graceful.restart.rs.sleep.time";
|
||||||
|
String ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = "rolling.batch.suspend.rs.sleep.time";
|
||||||
|
String ROLLING_BATCH_SUSPEND_RS_RATIO = "rolling.batch.suspend.rs.ratio";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A Set of prefixes which encompasses all of the configuration properties for the ChaosMonky.
|
* A Set of prefixes which encompasses all of the configuration properties for the ChaosMonky.
|
||||||
@ -75,4 +78,7 @@ public interface MonkeyConstants {
|
|||||||
long DEFAULT_UNBALANCE_WAIT_AFTER_BALANCE_MS = 5 * 1000;
|
long DEFAULT_UNBALANCE_WAIT_AFTER_BALANCE_MS = 5 * 1000;
|
||||||
boolean DEFAULT_UNBALANCE_KILL_META_RS = true;
|
boolean DEFAULT_UNBALANCE_KILL_META_RS = true;
|
||||||
long DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME = 30 * 1000;
|
long DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME = 30 * 1000;
|
||||||
|
long DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME = 5000;
|
||||||
|
long DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = 30 * 1000;
|
||||||
|
float DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO = 1.0f;
|
||||||
}
|
}
|
||||||
|
@ -21,11 +21,13 @@ package org.apache.hadoop.hbase.chaos.factories;
|
|||||||
import org.apache.hadoop.hbase.chaos.actions.Action;
|
import org.apache.hadoop.hbase.chaos.actions.Action;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
|
import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
|
||||||
|
import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
|
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
|
||||||
|
import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
|
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
|
||||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||||
import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
|
import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
|
||||||
@ -38,8 +40,13 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
|
|||||||
*/
|
*/
|
||||||
public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
|
public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
|
||||||
|
|
||||||
|
private long gracefulRollingRestartTSSLeepTime;
|
||||||
|
private long rollingBatchSuspendRSSleepTime;
|
||||||
|
private float rollingBatchSuspendtRSRatio;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ChaosMonkey build() {
|
public ChaosMonkey build() {
|
||||||
|
loadProperties();
|
||||||
|
|
||||||
// Destructive actions to mess things around. Cannot run batch restart.
|
// Destructive actions to mess things around. Cannot run batch restart.
|
||||||
Action[] actions1 = new Action[]{
|
Action[] actions1 = new Action[]{
|
||||||
@ -48,7 +55,10 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
|
|||||||
new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to be dead.
|
new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), // only allow 2 servers to be dead.
|
||||||
new ForceBalancerAction(),
|
new ForceBalancerAction(),
|
||||||
new RestartRandomDataNodeAction(60000),
|
new RestartRandomDataNodeAction(60000),
|
||||||
new RestartRandomZKNodeAction(60000)
|
new RestartRandomZKNodeAction(60000),
|
||||||
|
new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
|
||||||
|
new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
|
||||||
|
rollingBatchSuspendtRSRatio)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Action to log more info for debugging
|
// Action to log more info for debugging
|
||||||
@ -62,4 +72,16 @@ public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {
|
|||||||
new PeriodicRandomActionPolicy(60 * 1000, actions1)),
|
new PeriodicRandomActionPolicy(60 * 1000, actions1)),
|
||||||
new PeriodicRandomActionPolicy(60 * 1000, actions2));
|
new PeriodicRandomActionPolicy(60 * 1000, actions2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void loadProperties() {
|
||||||
|
gracefulRollingRestartTSSLeepTime = Long.parseLong(this.properties.getProperty(
|
||||||
|
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
|
||||||
|
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
|
||||||
|
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
|
||||||
|
MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
|
||||||
|
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
|
||||||
|
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
|
||||||
|
MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
|
||||||
|
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -21,9 +21,11 @@ package org.apache.hadoop.hbase.chaos.factories;
|
|||||||
import org.apache.hadoop.hbase.chaos.actions.Action;
|
import org.apache.hadoop.hbase.chaos.actions.Action;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
|
import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
|
||||||
|
import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
|
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsExceptMetaAction;
|
||||||
|
import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
|
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
|
||||||
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
|
||||||
import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
|
import org.apache.hadoop.hbase.chaos.policies.CompositeSequentialPolicy;
|
||||||
@ -36,15 +38,23 @@ import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
|
|||||||
*/
|
*/
|
||||||
public class ServerKillingMonkeyFactory extends MonkeyFactory {
|
public class ServerKillingMonkeyFactory extends MonkeyFactory {
|
||||||
|
|
||||||
|
private long gracefulRollingRestartTSSLeepTime;
|
||||||
|
private long rollingBatchSuspendRSSleepTime;
|
||||||
|
private float rollingBatchSuspendtRSRatio;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ChaosMonkey build() {
|
public ChaosMonkey build() {
|
||||||
|
loadProperties();
|
||||||
|
|
||||||
// Destructive actions to mess things around. Cannot run batch restart
|
// Destructive actions to mess things around. Cannot run batch restart
|
||||||
Action[] actions1 = new Action[] {
|
Action[] actions1 = new Action[] {
|
||||||
new RestartRandomRsExceptMetaAction(60000),
|
new RestartRandomRsExceptMetaAction(60000),
|
||||||
new RestartActiveMasterAction(5000),
|
new RestartActiveMasterAction(5000),
|
||||||
new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), //only allow 2 servers to be dead
|
new RollingBatchRestartRsExceptMetaAction(5000, 1.0f, 2), //only allow 2 servers to be dead
|
||||||
new ForceBalancerAction()
|
new ForceBalancerAction(),
|
||||||
|
new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
|
||||||
|
new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
|
||||||
|
rollingBatchSuspendtRSRatio)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Action to log more info for debugging
|
// Action to log more info for debugging
|
||||||
@ -58,4 +68,16 @@ public class ServerKillingMonkeyFactory extends MonkeyFactory {
|
|||||||
new PeriodicRandomActionPolicy(60 * 1000, actions1)),
|
new PeriodicRandomActionPolicy(60 * 1000, actions1)),
|
||||||
new PeriodicRandomActionPolicy(60 * 1000, actions2));
|
new PeriodicRandomActionPolicy(60 * 1000, actions2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void loadProperties() {
|
||||||
|
gracefulRollingRestartTSSLeepTime = Long.parseLong(this.properties.getProperty(
|
||||||
|
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
|
||||||
|
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
|
||||||
|
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
|
||||||
|
MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
|
||||||
|
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
|
||||||
|
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
|
||||||
|
MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
|
||||||
|
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction;
|
|||||||
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.FlushTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.FlushTableAction;
|
||||||
|
import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
|
||||||
@ -39,6 +40,7 @@ import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
|
|||||||
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction;
|
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction;
|
||||||
|
import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.SnapshotTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.SnapshotTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction;
|
||||||
@ -66,6 +68,9 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
|
|||||||
private float compactTableRatio;
|
private float compactTableRatio;
|
||||||
private float compactRandomRegionRatio;
|
private float compactRandomRegionRatio;
|
||||||
private long decreaseHFileSizeSleepTime;
|
private long decreaseHFileSizeSleepTime;
|
||||||
|
private long gracefulRollingRestartTSSLeepTime;
|
||||||
|
private long rollingBatchSuspendRSSleepTime;
|
||||||
|
private float rollingBatchSuspendtRSRatio;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ChaosMonkey build() {
|
public ChaosMonkey build() {
|
||||||
@ -110,6 +115,9 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
|
|||||||
new RestartRsHoldingMetaAction(restartRsHoldingMetaSleepTime),
|
new RestartRsHoldingMetaAction(restartRsHoldingMetaSleepTime),
|
||||||
new DecreaseMaxHFileSizeAction(decreaseHFileSizeSleepTime, tableName),
|
new DecreaseMaxHFileSizeAction(decreaseHFileSizeSleepTime, tableName),
|
||||||
new SplitAllRegionOfTableAction(tableName),
|
new SplitAllRegionOfTableAction(tableName),
|
||||||
|
new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
|
||||||
|
new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
|
||||||
|
rollingBatchSuspendtRSRatio)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Action to log more info for debugging
|
// Action to log more info for debugging
|
||||||
@ -179,5 +187,14 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
|
|||||||
decreaseHFileSizeSleepTime = Long.parseLong(this.properties.getProperty(
|
decreaseHFileSizeSleepTime = Long.parseLong(this.properties.getProperty(
|
||||||
MonkeyConstants.DECREASE_HFILE_SIZE_SLEEP_TIME,
|
MonkeyConstants.DECREASE_HFILE_SIZE_SLEEP_TIME,
|
||||||
MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME + ""));
|
MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME + ""));
|
||||||
|
gracefulRollingRestartTSSLeepTime = Long.parseLong(this.properties.getProperty(
|
||||||
|
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
|
||||||
|
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
|
||||||
|
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
|
||||||
|
MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
|
||||||
|
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
|
||||||
|
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
|
||||||
|
MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
|
||||||
|
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.chaos.actions.DecreaseMaxHFileSizeAction;
|
|||||||
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.FlushTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.FlushTableAction;
|
||||||
|
import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
|
||||||
@ -34,6 +35,7 @@ import org.apache.hadoop.hbase.chaos.actions.RemoveColumnAction;
|
|||||||
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction;
|
import org.apache.hadoop.hbase.chaos.actions.RestartRsHoldingMetaAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction;
|
import org.apache.hadoop.hbase.chaos.actions.RollingBatchRestartRsAction;
|
||||||
|
import org.apache.hadoop.hbase.chaos.actions.RollingBatchSuspendResumeRsAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.SplitAllRegionOfTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction;
|
import org.apache.hadoop.hbase.chaos.actions.SplitRandomRegionOfTableAction;
|
||||||
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
|
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
|
||||||
@ -43,8 +45,15 @@ import org.apache.hadoop.hbase.chaos.policies.DoActionsOncePolicy;
|
|||||||
import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
|
import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;
|
||||||
|
|
||||||
public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
|
public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
|
||||||
|
|
||||||
|
private long gracefulRollingRestartTSSLeepTime;
|
||||||
|
private long rollingBatchSuspendRSSleepTime;
|
||||||
|
private float rollingBatchSuspendtRSRatio;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ChaosMonkey build() {
|
public ChaosMonkey build() {
|
||||||
|
loadProperties();
|
||||||
|
|
||||||
// Actions that could slow down region movement.
|
// Actions that could slow down region movement.
|
||||||
// These could also get regions stuck if there are issues.
|
// These could also get regions stuck if there are issues.
|
||||||
Action[] actions1 = new Action[]{
|
Action[] actions1 = new Action[]{
|
||||||
@ -72,6 +81,9 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
|
|||||||
new SplitAllRegionOfTableAction(tableName),
|
new SplitAllRegionOfTableAction(tableName),
|
||||||
new DecreaseMaxHFileSizeAction(MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME,
|
new DecreaseMaxHFileSizeAction(MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME,
|
||||||
tableName),
|
tableName),
|
||||||
|
new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
|
||||||
|
new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
|
||||||
|
rollingBatchSuspendtRSRatio)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Action to log more info for debugging
|
// Action to log more info for debugging
|
||||||
@ -87,4 +99,16 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
|
|||||||
new PeriodicRandomActionPolicy(90 * 1000, actions3)
|
new PeriodicRandomActionPolicy(90 * 1000, actions3)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void loadProperties() {
|
||||||
|
gracefulRollingRestartTSSLeepTime = Long.parseLong(this.properties.getProperty(
|
||||||
|
MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
|
||||||
|
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
|
||||||
|
rollingBatchSuspendRSSleepTime = Long.parseLong(this.properties.getProperty(
|
||||||
|
MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
|
||||||
|
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
|
||||||
|
rollingBatchSuspendtRSRatio = Float.parseFloat(this.properties.getProperty(
|
||||||
|
MonkeyConstants.ROLLING_BATCH_RESTART_RS_RATIO,
|
||||||
|
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_RATIO + ""));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,8 +18,9 @@
|
|||||||
|
|
||||||
package org.apache.hadoop.hbase.chaos.monkies;
|
package org.apache.hadoop.hbase.chaos.monkies;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.lang3.RandomUtils;
|
import org.apache.commons.lang3.RandomUtils;
|
||||||
@ -90,18 +91,13 @@ public class PolicyBasedChaosMonkey extends ChaosMonkey {
|
|||||||
|
|
||||||
/** Selects and returns ceil(ratio * items.length) random items from the given array */
|
/** Selects and returns ceil(ratio * items.length) random items from the given array */
|
||||||
public static <T> List<T> selectRandomItems(T[] items, float ratio) {
|
public static <T> List<T> selectRandomItems(T[] items, float ratio) {
|
||||||
int remaining = (int)Math.ceil(items.length * ratio);
|
int selectedNumber = (int)Math.ceil(items.length * ratio);
|
||||||
|
|
||||||
List<T> selectedItems = new ArrayList<>(remaining);
|
List<T> originalItems = Arrays.asList(items);
|
||||||
|
Collections.shuffle(originalItems);
|
||||||
|
|
||||||
for (int i=0; i<items.length && remaining > 0; i++) {
|
int startIndex = RandomUtils.nextInt(0, items.length - selectedNumber);
|
||||||
if (RandomUtils.nextFloat() < ((float)remaining/(items.length-i))) {
|
return originalItems.subList(startIndex, startIndex + selectedNumber);
|
||||||
selectedItems.add(items[i]);
|
|
||||||
remaining--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return selectedItems;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Policy[] policies;
|
private Policy[] policies;
|
||||||
|
@ -148,6 +148,20 @@ public abstract class HBaseCluster implements Closeable, Configurable {
|
|||||||
public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
|
public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
|
||||||
throws IOException;
|
throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Suspend the region server
|
||||||
|
* @param serverName the hostname to suspend the regionserver on
|
||||||
|
* @throws IOException if something goes wrong
|
||||||
|
*/
|
||||||
|
public abstract void suspendRegionServer(ServerName serverName) throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resume the region server
|
||||||
|
* @param serverName the hostname to resume the regionserver on
|
||||||
|
* @throws IOException if something goes wrong
|
||||||
|
*/
|
||||||
|
public abstract void resumeRegionServer(ServerName serverName) throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
|
* Starts a new zookeeper node on the given hostname or if this is a mini/local cluster,
|
||||||
* silently logs warning message.
|
* silently logs warning message.
|
||||||
|
@ -291,6 +291,16 @@ public class MiniHBaseCluster extends HBaseCluster {
|
|||||||
stopRegionServer(getRegionServerIndex(serverName));
|
stopRegionServer(getRegionServerIndex(serverName));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void suspendRegionServer(ServerName serverName) throws IOException {
|
||||||
|
suspendRegionServer(getRegionServerIndex(serverName));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void resumeRegionServer(ServerName serverName) throws IOException {
|
||||||
|
resumeRegionServer(getRegionServerIndex(serverName));
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
|
public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
|
||||||
//ignore timeout for now
|
//ignore timeout for now
|
||||||
@ -489,6 +499,32 @@ public class MiniHBaseCluster extends HBaseCluster {
|
|||||||
return server;
|
return server;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Suspend the specified region server
|
||||||
|
* @param serverNumber Used as index into a list.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public JVMClusterUtil.RegionServerThread suspendRegionServer(int serverNumber) {
|
||||||
|
JVMClusterUtil.RegionServerThread server =
|
||||||
|
hbaseCluster.getRegionServers().get(serverNumber);
|
||||||
|
LOG.info("Suspending {}", server.toString());
|
||||||
|
server.suspend();
|
||||||
|
return server;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resume the specified region server
|
||||||
|
* @param serverNumber Used as index into a list.
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public JVMClusterUtil.RegionServerThread resumeRegionServer(int serverNumber) {
|
||||||
|
JVMClusterUtil.RegionServerThread server =
|
||||||
|
hbaseCluster.getRegionServers().get(serverNumber);
|
||||||
|
LOG.info("Resuming {}", server.toString());
|
||||||
|
server.resume();
|
||||||
|
return server;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wait for the specified region server to stop. Removes this thread from list
|
* Wait for the specified region server to stop. Removes this thread from list
|
||||||
* of running threads.
|
* of running threads.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user