[ML] Increase timeout in MlDistributedFailureIT (#58997) (#59013)

Doubles the timeout on the ensureStableClusterOnAllNodes method to 60s to account for v slow ci
2020-07-06 11:30:41 +01:00 · 2020-07-06 11:30:41 +01:00 · 0fc12194bf
parent 62eabdac6e
commit 0fc12194bf
1 changed files with 14 additions and 16 deletions
--- a/x-pack/plugin/ml/src/internalClusterTest/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java
+++ b/x-pack/plugin/ml/src/internalClusterTest/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java
@ -82,13 +82,13 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {

    public void testFailOver() throws Exception {
        internalCluster().ensureAtLeastNumDataNodes(3);
-        ensureStableClusterOnAllNodes(3);
+        ensureStableCluster();
        run("fail-over-job", () -> {
            GetJobsStatsAction.Request  request = new GetJobsStatsAction.Request("fail-over-job");
            GetJobsStatsAction.Response response = client().execute(GetJobsStatsAction.INSTANCE, request).actionGet();
            DiscoveryNode discoveryNode = response.getResponse().results().get(0).getNode();
            internalCluster().stopRandomNode(settings -> discoveryNode.getName().equals(settings.get("node.name")));
-            ensureStableClusterOnAllNodes(2);
+            ensureStableCluster();
        });
    }

@ -100,7 +100,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
        String mlAndDataNode = internalCluster().startNode(
            onlyRoles(Collections.unmodifiableSet(new HashSet<>(Arrays.asList(DiscoveryNodeRole.DATA_ROLE, MachineLearning.ML_ROLE))))
        );
-        ensureStableClusterOnAllNodes(2);
+        ensureStableCluster();
        run("lose-dedicated-master-node-job", () -> {
            logger.info("Stopping dedicated master node");
            Settings masterDataPathSettings = internalCluster().dataPathSettings(internalCluster().getMasterName());
@ -115,18 +115,18 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
                    .put(masterDataPathSettings)
                    .put(masterOnlyNode())
                    .build());
-            ensureStableClusterOnAllNodes(2);
+            ensureStableCluster();
        });
    }

    public void testFullClusterRestart() throws Exception {
        internalCluster().ensureAtLeastNumDataNodes(3);
-        ensureStableClusterOnAllNodes(3);
+        ensureStableCluster();
        run("full-cluster-restart-job", () -> {
            logger.info("Restarting all nodes");
            internalCluster().fullRestart();
            logger.info("Restarted all nodes");
-            ensureStableClusterOnAllNodes(3);
+            ensureStableCluster();
        });
    }

@ -140,7 +140,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
        internalCluster().startNode(onlyRoles(
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(DiscoveryNodeRole.DATA_ROLE, MachineLearning.ML_ROLE)))
        ));
-        ensureStableClusterOnAllNodes(2);
+        ensureStableCluster();

        // index some datafeed data
        client().admin().indices().prepareCreate("data")
@ -219,7 +219,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
        }
        logger.info("Starting dedicated ml node...");
        internalCluster().startNode(onlyRole(MachineLearning.ML_ROLE));
-        ensureStableClusterOnAllNodes(4);
+        ensureStableCluster();

        // index some datafeed data
        client().admin().indices().prepareCreate("data")
@ -328,7 +328,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
        internalCluster().startNode(onlyRoles(
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(DiscoveryNodeRole.DATA_ROLE, MachineLearning.ML_ROLE)))
        ));
-        ensureStableClusterOnAllNodes(2);
+        ensureStableCluster();

        // index some datafeed data
        client().admin().indices().prepareCreate("data")
@ -374,7 +374,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
    public void testJobRelocationIsMemoryAware() throws Exception {

        internalCluster().ensureAtLeastNumDataNodes(1);
-        ensureStableClusterOnAllNodes(1);
+        ensureStableCluster();

        // Open 4 small jobs.  Since there is only 1 node in the cluster they'll have to go on that node.

@ -387,7 +387,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
        // same node because we don't rebalance jobs that are happily running.

        internalCluster().ensureAtLeastNumDataNodes(3);
-        ensureStableClusterOnAllNodes(3);
+        ensureStableCluster();

        // Wait for the cluster to be green - this means the indices have been replicated.

@ -400,7 +400,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
        // Stop the current master node - this should be the one with the 4 small jobs on.

        internalCluster().stopCurrentMasterNode();
-        ensureStableClusterOnAllNodes(2);
+        ensureStableCluster();

        // If memory requirements are used to reallocate the 4 small jobs (as we expect) then they should
        // all reallocate to the same node, that being the one that doesn't have the big job on.  If job counts
@ -552,9 +552,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
        }, 30, TimeUnit.SECONDS);
    }

-    private void ensureStableClusterOnAllNodes(int nodeCount) {
-        for (String nodeName : internalCluster().getNodeNames()) {
-            ensureStableCluster(nodeCount, nodeName);
-        }
+    private void ensureStableCluster() {
+        ensureStableCluster(internalCluster().getNodeNames().length, TimeValue.timeValueSeconds(60));
    }
 }