diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java index a1f84dca5e0..8f46a26f47d 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java @@ -190,7 +190,7 @@ public abstract class Action { getLogger().info("Stopping regionserver {}", server); cluster.stopRegionServer(server); cluster.waitForRegionServerToStop(server, killRsTimeout); - getLogger().info("Stoppiong regionserver {}. Reported num of rs:{}", server, + getLogger().info("Stopping regionserver {}. Reported num of rs:{}", server, cluster.getClusterMetrics().getLiveServerMetrics().size()); } diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java index bd136bb7360..c25a6b31f4b 100644 --- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java +++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java @@ -20,8 +20,10 @@ package org.apache.hadoop.hbase.chaos.actions; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.LinkedList; import java.util.List; +import java.util.Objects; import java.util.Queue; import org.apache.commons.lang3.RandomUtils; import org.apache.hadoop.hbase.ServerName; @@ -70,7 +72,7 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction { List selectedServers = selectServers(); Queue serversToBeKilled = new LinkedList<>(selectedServers); - Queue deadServers = new LinkedList<>(); + LinkedList deadServers = new LinkedList<>(); // loop while there are servers to be killed or dead servers to be restarted while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) { @@ -103,13 +105,17 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction { deadServers.add(server); break; case START: + server = Objects.requireNonNull(deadServers.peek()); try { - server = deadServers.remove(); startRs(server); + // only remove the server from the known dead list if `startRs` succeeds. + deadServers.remove(server); } catch (org.apache.hadoop.util.Shell.ExitCodeException e) { // The start may fail but better to just keep going though we may lose server. - // - getLogger().info("Problem starting, will retry; code={}", e.getExitCode(), e); + // Shuffle the dead list to avoid getting stuck on a single stubborn host. + Collections.shuffle(deadServers); + getLogger().info( + "Problem starting {}, will retry; code={}", server, e.getExitCode(), e); } break; }