From 9a77f41e554e11478e028dc00a00b498f45e230f Mon Sep 17 00:00:00 2001 From: Henning Andersen <33268011+henningandersen@users.noreply.github.com> Date: Sat, 19 Sep 2020 10:02:05 +0200 Subject: [PATCH] Fix cluster health when closing (#61709) When master shuts down its cluster service, a waiting health request would fail rather than fail over to a new master. --- .../elasticsearch/cluster/ClusterHealthIT.java | 18 +++++++++++++++++- .../health/TransportClusterHealthAction.java | 3 ++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java index 7488e24b881..db343b0bfbf 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/ClusterHealthIT.java @@ -21,6 +21,7 @@ package org.elasticsearch.cluster; import org.elasticsearch.action.ActionFuture; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; +import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest; import org.elasticsearch.action.support.IndicesOptions; import org.elasticsearch.action.support.PlainActionFuture; import org.elasticsearch.cluster.health.ClusterHealthStatus; @@ -309,14 +310,29 @@ public class ClusterHealthIT extends ESIntegTestCase { public void testHealthOnMasterFailover() throws Exception { final String node = internalCluster().startDataOnlyNode(); + boolean withIndex = randomBoolean(); + if (withIndex) { + // Create index with many shards to provoke the health request to wait (for green) while master is being shut down. + // Notice that this is set to 0 after the test completed starting a number of health requests and master restarts. 
+ // This ensures that the cluster is yellow when the health request is made, making the health request wait on the observer, + // triggering a call to observer.onClusterServiceClose when master is shutdown. + createIndex("test", Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, randomIntBetween(0, 10)).build()); + } final List<ActionFuture<ClusterHealthResponse>> responseFutures = new ArrayList<>(); // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled // without exceptions for (int i = 0; i < 20; ++i) { responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID) - .setWaitForGreenStatus().execute()); + .setWaitForGreenStatus().setMasterNodeTimeout(TimeValue.timeValueMinutes(1)).execute()); internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK); } + if (withIndex) { + assertAcked( + client().admin().indices() + .updateSettings(new UpdateSettingsRequest("test") + .settings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0))).get() + ); + } for (ActionFuture<ClusterHealthResponse> responseFuture : responseFutures) { assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN); } diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java index 999a9b2122f..0827b57cc46 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/health/TransportClusterHealthAction.java @@ -45,6 +45,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.CollectionUtils; import org.elasticsearch.index.IndexNotFoundException; +import org.elasticsearch.node.NodeClosedException; import 
org.elasticsearch.tasks.Task; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.TransportService; @@ -211,7 +212,7 @@ public class TransportClusterHealthAction extends TransportMasterNodeReadAction< @Override public void onClusterServiceClose() { - listener.onFailure(new IllegalStateException("ClusterService was close during health call")); + listener.onFailure(new NodeClosedException(clusterService.localNode())); } @Override