HBASE-24360 RollingBatchRestartRsAction loses track of dead servers
`RollingBatchRestartRsAction` doesn't handle failure cases when
tracking its list of dead servers. The original author believed that a
failure to restart would result in a retry. However, because the dead
server is removed from the dead list before the restart is attempted,
that state is lost and the retry never occurs. Because this action
doesn't ever look back to the current
state of the cluster, relying only on its local state for the current
action invocation, it never realizes the abandoned server is still
dead. Instead, be more careful to only remove the dead server from the
list when the `startRs` invocation claims to have been successful.
Signed-off-by: stack <stack@apache.org>
(cherry picked from commit 0dae377f53)
This commit is contained in:
parent
186373bea4
commit
0224dccdb8
|
@ -159,7 +159,7 @@ public class Action {
|
|||
LOG.info("Stopping regionserver " + server);
|
||||
cluster.stopRegionServer(server);
|
||||
cluster.waitForRegionServerToStop(server, killRsTimeout);
|
||||
LOG.info(String.format("Stoppiong regionserver %s. Reported num of rs: %s", server,
|
||||
LOG.info(String.format("Stopping regionserver %s. Reported num of rs: %s", server,
|
||||
cluster.getClusterStatus().getLiveServersLoad().size()));
|
||||
}
|
||||
|
||||
|
|
|
@ -20,8 +20,10 @@ package org.apache.hadoop.hbase.chaos.actions;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Queue;
|
||||
|
||||
import org.apache.commons.lang.math.RandomUtils;
|
||||
|
@ -61,7 +63,7 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
|
|||
List<ServerName> selectedServers = selectServers();
|
||||
|
||||
Queue<ServerName> serversToBeKilled = new LinkedList<ServerName>(selectedServers);
|
||||
Queue<ServerName> deadServers = new LinkedList<ServerName>();
|
||||
LinkedList<ServerName> deadServers = new LinkedList<ServerName>();
|
||||
|
||||
// loop while there are servers to be killed or dead servers to be restarted
|
||||
while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
|
||||
|
@ -94,13 +96,17 @@ public class RollingBatchRestartRsAction extends BatchRestartRsAction {
|
|||
deadServers.add(server);
|
||||
break;
|
||||
case START:
|
||||
server = Objects.requireNonNull(deadServers.peek());
|
||||
try {
|
||||
server = deadServers.remove();
|
||||
startRs(server);
|
||||
// only remove the server from the known dead list if `startRs` succeeds.
|
||||
deadServers.remove(server);
|
||||
} catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
|
||||
// The start may fail but better to just keep going though we may lose server.
|
||||
//
|
||||
LOG.info("Problem starting, will retry; code=" + e.getExitCode(), e);
|
||||
// Shuffle the dead list to avoid getting stuck on a single stubborn host.
|
||||
Collections.shuffle(deadServers);
|
||||
LOG.info(String.format(
|
||||
"Problem starting %s, will retry; code=%s", server, e.getExitCode(), e));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue