Fix TransportMasterNodeAction not Retrying NodeClosedException (#51325) (#51437)

Added node closed exception to the retryable remote exceptions as it's possible to run into this exception instead of a connect exception when the master node is just shutting down but still responding to requests.
This commit is contained in:
Armin Braun 2020-01-24 20:33:13 +01:00 committed by GitHub
parent ef0a933541
commit af1ff52e70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 22 additions and 2 deletions

View File

@ -46,6 +46,7 @@ import org.elasticsearch.node.NodeClosedException;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.ConnectTransportException;
import org.elasticsearch.transport.RemoteTransportException;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportService;
@ -180,7 +181,8 @@ public abstract class TransportMasterNodeAction<Request extends MasterNodeReques
@Override
public void handleException(final TransportException exp) {
Throwable cause = exp.unwrapCause();
if (cause instanceof ConnectTransportException) {
if (cause instanceof ConnectTransportException ||
(exp instanceof RemoteTransportException && cause instanceof NodeClosedException)) {
// we want to retry here a bit to see if a new master is elected
logger.debug("connection exception while trying to forward request with action name [{}] to " +
"master node [{}], scheduling a retry. Error: [{}]",

View File

@ -44,6 +44,7 @@ import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.MasterNotDiscoveredException;
import org.elasticsearch.node.NodeClosedException;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.test.ESTestCase;
@ -391,7 +392,8 @@ public class TransportMasterNodeActionTests extends ESTestCase {
assertThat(capturedRequest.action, equalTo("internal:testAction"));
if (rejoinSameMaster) {
transport.handleRemoteError(capturedRequest.requestId, new ConnectTransportException(masterNode, "Fake error"));
transport.handleRemoteError(capturedRequest.requestId,
randomBoolean() ? new ConnectTransportException(masterNode, "Fake error") : new NodeClosedException(masterNode));
assertFalse(listener.isDone());
if (randomBoolean()) {
// simulate master node removal

View File

@ -28,7 +28,10 @@ import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.InternalTestCluster;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
@ -294,4 +297,17 @@ public class ClusterHealthIT extends ESIntegTestCase {
assertFalse(healthResponseFuture.get().isTimedOut());
}
public void testHealthOnMasterFailover() throws Exception {
final String node = internalCluster().startDataOnlyNode();
final List<ActionFuture<ClusterHealthResponse>> responseFutures = new ArrayList<>();
// Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled
// without exceptions
for (int i = 0; i < 20; ++i) {
responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).execute());
internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK);
}
for (ActionFuture<ClusterHealthResponse> responseFuture : responseFutures) {
assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN);
}
}
}