[ML] Increase timeout in MlDistributedFailureIT (#58997) (#59013)

Doubles the timeout on the ensureStableClusterOnAllNodes method to 60s
to account for v slow ci
This commit is contained in:
David Kyle 2020-07-06 11:30:41 +01:00 committed by GitHub
parent 62eabdac6e
commit 0fc12194bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 14 additions and 16 deletions

View File

@ -82,13 +82,13 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
public void testFailOver() throws Exception {
internalCluster().ensureAtLeastNumDataNodes(3);
ensureStableClusterOnAllNodes(3);
ensureStableCluster();
run("fail-over-job", () -> {
GetJobsStatsAction.Request request = new GetJobsStatsAction.Request("fail-over-job");
GetJobsStatsAction.Response response = client().execute(GetJobsStatsAction.INSTANCE, request).actionGet();
DiscoveryNode discoveryNode = response.getResponse().results().get(0).getNode();
internalCluster().stopRandomNode(settings -> discoveryNode.getName().equals(settings.get("node.name")));
ensureStableClusterOnAllNodes(2);
ensureStableCluster();
});
}
@ -100,7 +100,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
String mlAndDataNode = internalCluster().startNode(
onlyRoles(Collections.unmodifiableSet(new HashSet<>(Arrays.asList(DiscoveryNodeRole.DATA_ROLE, MachineLearning.ML_ROLE))))
);
ensureStableClusterOnAllNodes(2);
ensureStableCluster();
run("lose-dedicated-master-node-job", () -> {
logger.info("Stopping dedicated master node");
Settings masterDataPathSettings = internalCluster().dataPathSettings(internalCluster().getMasterName());
@ -115,18 +115,18 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
.put(masterDataPathSettings)
.put(masterOnlyNode())
.build());
ensureStableClusterOnAllNodes(2);
ensureStableCluster();
});
}
public void testFullClusterRestart() throws Exception {
internalCluster().ensureAtLeastNumDataNodes(3);
ensureStableClusterOnAllNodes(3);
ensureStableCluster();
run("full-cluster-restart-job", () -> {
logger.info("Restarting all nodes");
internalCluster().fullRestart();
logger.info("Restarted all nodes");
ensureStableClusterOnAllNodes(3);
ensureStableCluster();
});
}
@ -140,7 +140,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
internalCluster().startNode(onlyRoles(
Collections.unmodifiableSet(new HashSet<>(Arrays.asList(DiscoveryNodeRole.DATA_ROLE, MachineLearning.ML_ROLE)))
));
ensureStableClusterOnAllNodes(2);
ensureStableCluster();
// index some datafeed data
client().admin().indices().prepareCreate("data")
@ -219,7 +219,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
}
logger.info("Starting dedicated ml node...");
internalCluster().startNode(onlyRole(MachineLearning.ML_ROLE));
ensureStableClusterOnAllNodes(4);
ensureStableCluster();
// index some datafeed data
client().admin().indices().prepareCreate("data")
@ -328,7 +328,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
internalCluster().startNode(onlyRoles(
Collections.unmodifiableSet(new HashSet<>(Arrays.asList(DiscoveryNodeRole.DATA_ROLE, MachineLearning.ML_ROLE)))
));
ensureStableClusterOnAllNodes(2);
ensureStableCluster();
// index some datafeed data
client().admin().indices().prepareCreate("data")
@ -374,7 +374,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
public void testJobRelocationIsMemoryAware() throws Exception {
internalCluster().ensureAtLeastNumDataNodes(1);
ensureStableClusterOnAllNodes(1);
ensureStableCluster();
// Open 4 small jobs. Since there is only 1 node in the cluster they'll have to go on that node.
@ -387,7 +387,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
// same node because we don't rebalance jobs that are happily running.
internalCluster().ensureAtLeastNumDataNodes(3);
ensureStableClusterOnAllNodes(3);
ensureStableCluster();
// Wait for the cluster to be green - this means the indices have been replicated.
@ -400,7 +400,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
// Stop the current master node - this should be the one with the 4 small jobs on.
internalCluster().stopCurrentMasterNode();
ensureStableClusterOnAllNodes(2);
ensureStableCluster();
// If memory requirements are used to reallocate the 4 small jobs (as we expect) then they should
// all reallocate to the same node, that being the one that doesn't have the big job on. If job counts
@ -552,9 +552,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
}, 30, TimeUnit.SECONDS);
}
private void ensureStableClusterOnAllNodes(int nodeCount) {
for (String nodeName : internalCluster().getNodeNames()) {
ensureStableCluster(nodeCount, nodeName);
}
private void ensureStableCluster() {
ensureStableCluster(internalCluster().getNodeNames().length, TimeValue.timeValueSeconds(60));
}
}