From 250973af1dce7cd96433895ae98dadd8f4eaa89f Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 22 May 2019 14:36:17 +0200 Subject: [PATCH] Fix testCannotJoinIfMasterLostDataFolder Relates to #41047 --- .../discovery/ClusterDisruptionIT.java | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java b/server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java index 5e86bd1bcbb..13911f56400 100644 --- a/server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java +++ b/server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java @@ -31,6 +31,7 @@ import org.elasticsearch.client.Client; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.action.shard.ShardStateAction; import org.elasticsearch.cluster.coordination.ClusterBootstrapService; +import org.elasticsearch.cluster.coordination.LagDetector; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.routing.Murmur3HashFunction; import org.elasticsearch.cluster.routing.ShardRouting; @@ -388,7 +389,6 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase { } } - @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/41047") public void testCannotJoinIfMasterLostDataFolder() throws Exception { String masterNode = internalCluster().startMasterOnlyNode(); String dataNode = internalCluster().startDataOnlyNode(); @@ -401,7 +401,18 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase { @Override public Settings onNodeStopped(String nodeName) { - return Settings.builder().put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName).build(); + return Settings.builder() + .put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName) + /* + * the data node might join while the master is not yet fully established as master and 
bypasses the join + * validation that is done before adding the node to the cluster. Only the join validation when handling the publish + * request takes place, but at this point the cluster state has been successfully committed, and will subsequently be + * exposed to the applier. The health check below therefore sees the cluster state with the 2 nodes and thinks all is + * good, even though the data node never accepted this state. What's worse is that it takes 90 seconds for the data + * node to be kicked out of the cluster (lag detection). We speed this up here. + */ + .put(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.getKey(), "10s") + .build(); } @Override @@ -410,9 +421,11 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase { } }); - assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut()); - assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get() - .isTimedOut()); + assertBusy(() -> { + assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut()); + assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get() + .isTimedOut()); + }, 30, TimeUnit.SECONDS); internalCluster().stopRandomNode(InternalTestCluster.nameFilter(dataNode)); // otherwise we will fail during clean-up }