From 0dae377f53db6388a933fd7a1ebe231cf0d997d7 Mon Sep 17 00:00:00 2001
From: Nick Dimiduk
Date: Mon, 18 May 2020 12:08:52 -0700
Subject: [PATCH] HBASE-24360 RollingBatchRestartRsAction loses track of dead servers

`RollingBatchRestartRsAction` doesn't handle failure cases when tracking
its list of dead servers. The original author believed that a failure to
restart would result in a retry. However, by removing the dead server
from the failed list, that state is lost, and retry never occurs. Because
this action doesn't ever look back to the current state of the cluster,
relying only on its local state for the current action invocation, it
never realizes the abandoned server is still dead. Instead, be more
careful to only remove the dead server from the list when the `startRs`
invocation claims to have been successful.

Signed-off-by: stack
---
 .../apache/hadoop/hbase/chaos/actions/Action.java  |  2 +-
 .../chaos/actions/RollingBatchRestartRsAction.java | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index a1f84dca5e0..8f46a26f47d 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -190,7 +190,7 @@ public abstract class Action {
     getLogger().info("Stopping regionserver {}", server);
     cluster.stopRegionServer(server);
     cluster.waitForRegionServerToStop(server, killRsTimeout);
-    getLogger().info("Stoppiong regionserver {}. Reported num of rs:{}", server,
+    getLogger().info("Stopping regionserver {}. Reported num of rs:{}", server,
       cluster.getClusterMetrics().getLiveServerMetrics().size());
   }
 
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
index bd136bb7360..c25a6b31f4b 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RollingBatchRestartRsAction.java
@@ -20,8 +20,10 @@ package org.apache.hadoop.hbase.chaos.actions;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Objects;
 import java.util.Queue;
 import org.apache.commons.lang3.RandomUtils;
 import org.apache.hadoop.hbase.ServerName;
@@ -70,7 +72,7 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
     List<ServerName> selectedServers = selectServers();
 
     Queue<ServerName> serversToBeKilled = new LinkedList<>(selectedServers);
-    Queue<ServerName> deadServers = new LinkedList<>();
+    LinkedList<ServerName> deadServers = new LinkedList<>();
 
     // loop while there are servers to be killed or dead servers to be restarted
     while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
@@ -103,13 +105,17 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
         deadServers.add(server);
         break;
       case START:
+        server = Objects.requireNonNull(deadServers.peek());
         try {
-          server = deadServers.remove();
           startRs(server);
+          // only remove the server from the known dead list if `startRs` succeeds.
+          deadServers.remove(server);
         } catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
           // The start may fail but better to just keep going though we may lose server.
-          //
-          getLogger().info("Problem starting, will retry; code={}", e.getExitCode(), e);
+          // Shuffle the dead list to avoid getting stuck on a single stubborn host.
+          Collections.shuffle(deadServers);
+          getLogger().info(
+            "Problem starting {}, will retry; code={}", server, e.getExitCode(), e);
         }
         break;
       }
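For readers skimming the diff, here is a minimal, self-contained sketch of the
retry-until-started pattern the new START branch follows (the class, method, and
host names below are illustrative only, not part of the patch): peek at the head
of the dead list, drop the entry only once the start succeeds, and shuffle on
failure so one stubborn host cannot starve the others.

import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.Objects;

/** Illustrative sketch of the retry pattern; not the actual chaos-monkey action. */
public class RetryStartSketch {

  // Simulated start: each host fails on its first attempt, then comes up.
  private static final Map<String, Integer> attempts = new HashMap<>();

  static boolean tryStart(String host) {
    return attempts.merge(host, 1, Integer::sum) > 1;
  }

  public static void main(String[] args) {
    LinkedList<String> deadServers = new LinkedList<>();
    Collections.addAll(deadServers, "rs-1", "rs-2", "rs-3");

    while (!deadServers.isEmpty()) {
      // Peek instead of remove, so a failed start keeps the host on the dead list.
      String server = Objects.requireNonNull(deadServers.peek());
      if (tryStart(server)) {
        // Forget about the server only once the start actually succeeded.
        deadServers.remove(server);
        System.out.println("started " + server);
      } else {
        // Shuffle so a single stubborn host cannot block the others from retrying.
        Collections.shuffle(deadServers);
        System.out.println("failed to start " + server + ", will retry");
      }
    }
  }
}

Note that changing deadServers from Queue to LinkedList in the patch is what makes
the Collections.shuffle call possible, since shuffle operates on a List.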