Fix TransportMasterNodeAction not Retrying NodeClosedException (#51325) (#51437)

Added node closed exception to the retryable remote exceptions as it's possible to run into this exception instead of a connect exception when the master node is just shutting down but still responding to requests.
2025-03-24 17:09:48 +00:00 · 2020-01-24 20:33:13 +01:00 · 2020-01-24 20:33:13 +01:00 · af1ff52e70
commit af1ff52e70
parent ef0a933541
3 changed files with 22 additions and 2 deletions
--- a/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java
+++ b/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java
@ -46,6 +46,7 @@ import org.elasticsearch.node.NodeClosedException;
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.ConnectTransportException;
+import org.elasticsearch.transport.RemoteTransportException;
 import org.elasticsearch.transport.TransportException;
 import org.elasticsearch.transport.TransportService;

@ -180,7 +181,8 @@ public abstract class TransportMasterNodeAction<Request extends MasterNodeReques
                                @Override
                                public void handleException(final TransportException exp) {
                                    Throwable cause = exp.unwrapCause();
-                                    if (cause instanceof ConnectTransportException) {
+                                    if (cause instanceof ConnectTransportException ||
+                                        (exp instanceof RemoteTransportException && cause instanceof NodeClosedException)) {
                                        // we want to retry here a bit to see if a new master is elected
                                        logger.debug("connection exception while trying to forward request with action name [{}] to " +
                                                "master node [{}], scheduling a retry. Error: [{}]",
--- a/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java
+++ b/server/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java
@ -44,6 +44,7 @@ import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.discovery.MasterNotDiscoveredException;
+import org.elasticsearch.node.NodeClosedException;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.test.ESTestCase;
@ -391,7 +392,8 @@ public class TransportMasterNodeActionTests extends ESTestCase {
        assertThat(capturedRequest.action, equalTo("internal:testAction"));

        if (rejoinSameMaster) {
-            transport.handleRemoteError(capturedRequest.requestId, new ConnectTransportException(masterNode, "Fake error"));
+            transport.handleRemoteError(capturedRequest.requestId,
+                randomBoolean() ? new ConnectTransportException(masterNode, "Fake error") : new NodeClosedException(masterNode));
            assertFalse(listener.isDone());
            if (randomBoolean()) {
                // simulate master node removal
--- a/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java
+++ b/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java
@ -28,7 +28,10 @@ import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.test.InternalTestCluster;

+import java.util.ArrayList;
+import java.util.List;
 import java.util.concurrent.atomic.AtomicBoolean;

 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
@ -294,4 +297,17 @@ public class ClusterHealthIT extends ESIntegTestCase {
        assertFalse(healthResponseFuture.get().isTimedOut());
    }

+    public void testHealthOnMasterFailover() throws Exception {
+        final String node = internalCluster().startDataOnlyNode();
+        final List<ActionFuture<ClusterHealthResponse>> responseFutures = new ArrayList<>();
+        // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled
+        // without exceptions
+        for (int i = 0; i < 20; ++i) {
+            responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).execute());
+            internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK);
+        }
+        for (ActionFuture<ClusterHealthResponse> responseFuture : responseFutures) {
+            assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN);
+        }
+    }
 }