diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java index 1e00659e4e7..7a2016e056d 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java @@ -159,7 +159,7 @@ public class Action { LOG.info("Stopping regionserver " + server); cluster.stopRegionServer(server); cluster.waitForRegionServerToStop(server, killRsTimeout); - LOG.info(String.format("Stoppiong regionserver %s. Reported num of rs: %s", server, + LOG.info(String.format("Stopping regionserver %s. Reported num of rs: %s", server, cluster.getClusterStatus().getLiveServersLoad().size())); } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java index 347340799c1..7db9f5ac9a5 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java @@ -20,8 +20,10 @@ package org.apache.hadoop.hbase.chaos.actions; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.LinkedList; import java.util.List; +import java.util.Objects; import java.util.Queue; import org.apache.commons.lang.math.RandomUtils; @@ -61,7 +63,7 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction { List selectedServers = selectServers(); Queue serversToBeKilled = new LinkedList(selectedServers); - Queue deadServers = new LinkedList(); + LinkedList deadServers = new LinkedList(); // loop while there are servers to be killed or dead servers to be restarted while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) { @@ -94,13 +96,17 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction { deadServers.add(server); break; case START: + server = Objects.requireNonNull(deadServers.peek()); try { - server = deadServers.remove(); startRs(server); + // only remove the server from the known dead list if `startRs` succeeds. + deadServers.remove(server); } catch (org.apache.hadoop.util.Shell.ExitCodeException e) { // The start may fail but better to just keep going though we may lose server. - // - LOG.info("Problem starting, will retry; code=" + e.getExitCode(), e); + // Shuffle the dead list to avoid getting stuck on a single stubborn host. + Collections.shuffle(deadServers); + LOG.info(String.format( + "Problem starting %s, will retry; code=%s", server, e.getExitCode(), e)); } break; }