From 250973af1dce7cd96433895ae98dadd8f4eaa89f Mon Sep 17 00:00:00 2001
From: Yannick Welsch <yannick@welsch.lu>
Date: Wed, 22 May 2019 14:36:17 +0200
Subject: [PATCH] Fix testCannotJoinIfMasterLostDataFolder

Relates to #41047
---
 .../discovery/ClusterDisruptionIT.java        | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java b/server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java
index 5e86bd1bcbb..13911f56400 100644
--- a/server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java
+++ b/server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java
@@ -31,6 +31,7 @@ import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.action.shard.ShardStateAction;
 import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
+import org.elasticsearch.cluster.coordination.LagDetector;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.routing.Murmur3HashFunction;
 import org.elasticsearch.cluster.routing.ShardRouting;
@@ -388,7 +389,6 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
         }
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/41047")
     public void testCannotJoinIfMasterLostDataFolder() throws Exception {
         String masterNode = internalCluster().startMasterOnlyNode();
         String dataNode = internalCluster().startDataOnlyNode();
@@ -401,7 +401,18 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
 
             @Override
             public Settings onNodeStopped(String nodeName) {
-                return Settings.builder().put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName).build();
+                return Settings.builder()
+                    .put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName)
+                    /*
+                     * The data node might join while the master is not yet fully established as master, thereby bypassing the join
+                     * validation that is done before adding the node to the cluster. Only the join validation when handling the publish
+                     * request takes place, but at this point the cluster state has already been successfully committed and will
+                     * subsequently be exposed to the applier. The health check below therefore sees the cluster state with the 2 nodes
+                     * and thinks all is good, even though the data node never accepted this state. What's worse, it takes 90 seconds for
+                     * the data node to be kicked out of the cluster (lag detection). We shorten the lag timeout here to speed this up.
+                     */
+                    .put(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.getKey(), "10s")
+                    .build();
             }
 
             @Override
@@ -410,9 +421,11 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
             }
         });
 
-        assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
-        assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
-            .isTimedOut());
+        assertBusy(() -> {
+            assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
+            assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
+                .isTimedOut());
+        }, 30, TimeUnit.SECONDS);
         internalCluster().stopRandomNode(InternalTestCluster.nameFilter(dataNode)); // otherwise we will fail during clean-up
     }