Fix testCannotJoinIfMasterLostDataFolder

Relates to #41047
This commit is contained in:
Yannick Welsch 2019-05-22 14:36:17 +02:00
parent a79cd77e5c
commit 250973af1d
1 changed file with 18 additions and 5 deletions

View File

@ -31,6 +31,7 @@ import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.action.shard.ShardStateAction;
import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
import org.elasticsearch.cluster.coordination.LagDetector;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.routing.Murmur3HashFunction;
import org.elasticsearch.cluster.routing.ShardRouting;
@ -388,7 +389,6 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
}
}
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/41047")
public void testCannotJoinIfMasterLostDataFolder() throws Exception {
String masterNode = internalCluster().startMasterOnlyNode();
String dataNode = internalCluster().startDataOnlyNode();
@ -401,7 +401,18 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
@Override
public Settings onNodeStopped(String nodeName) {
return Settings.builder().put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName).build();
return Settings.builder()
.put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName)
/*
* the data node might join while the master is still not fully established as master just yet and bypass the join
* validation that is done before adding the node to the cluster. Only the join validation when handling the publish
* request takes place, but at this point the cluster state has been successfully committed, and will subsequently be
* exposed to the applier. The health check below therefore sees the cluster state with the 2 nodes and thinks all is
* good, even though the data node never accepted this state. What's worse is that it takes 90 seconds for the data
* node to be kicked out of the cluster (lag detection). We speed this up here.
*/
.put(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.getKey(), "10s")
.build();
}
@Override
@ -410,9 +421,11 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
}
});
assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
.isTimedOut());
assertBusy(() -> {
assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
.isTimedOut());
}, 30, TimeUnit.SECONDS);
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(dataNode)); // otherwise we will fail during clean-up
}