From 9c49868bc5053673fe08b995bf4d1a22a3fbf0da Mon Sep 17 00:00:00 2001
From: David Roberts <dave.roberts@elastic.co>
Date: Tue, 18 Feb 2020 11:16:54 +0000
Subject: [PATCH] [TEST] Use busy asserts in ML distributed failure test
 (#52461)

When a job or datafeed state is changed using a mechanism
that doesn't wait within the production code for the desired
state to be reached, the test code needs to loop until the
cluster state has been updated.

Closes #52451
---
 .../integration/MlDistributedFailureIT.java   | 32 +++++++++++--------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java
index 7733aab6cff..1af23a0c498 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java
@@ -231,10 +231,12 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
         PostDataAction.Response postDataResponse = client().execute(PostDataAction.INSTANCE, postDataRequest).actionGet();
         assertEquals(1L, postDataResponse.getDataCounts().getInputRecordCount());
 
-        // Confirm the job state is now failed
-        jobStatsRequest = new GetJobsStatsAction.Request(jobId);
-        jobStatsResponse = client().execute(GetJobsStatsAction.INSTANCE, jobStatsRequest).actionGet();
-        assertEquals(JobState.FAILED, jobStatsResponse.getResponse().results().get(0).getState());
+        // Confirm the job state is now failed - this may take a while to update in cluster state
+        assertBusy(() -> {
+            GetJobsStatsAction.Request jobStatsRequest2 = new GetJobsStatsAction.Request(jobId);
+            GetJobsStatsAction.Response jobStatsResponse2 = client().execute(GetJobsStatsAction.INSTANCE, jobStatsRequest2).actionGet();
+            assertEquals(JobState.FAILED, jobStatsResponse2.getResponse().results().get(0).getState());
+        });
 
         // It's impossible to reliably get the datafeed into a stopping state at the point when the ML node is removed from the cluster
         // using externally accessible actions.  The only way this situation could occur in reality is through extremely unfortunate
@@ -248,11 +250,13 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
             client().execute(UpdatePersistentTaskStatusAction.INSTANCE, updatePersistentTaskStatusRequest).actionGet();
         assertNotNull(updatePersistentTaskStatusResponse.getTask());
 
-        // Confirm the datafeed state is now stopping
-        GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(datafeedId);
-        GetDatafeedsStatsAction.Response datafeedStatsResponse =
-            client().execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest).actionGet();
-        assertEquals(DatafeedState.STOPPING, datafeedStatsResponse.getResponse().results().get(0).getDatafeedState());
+        // Confirm the datafeed state is now stopping - this may take a while to update in cluster state
+        assertBusy(() -> {
+            GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(datafeedId);
+            GetDatafeedsStatsAction.Response datafeedStatsResponse =
+                client().execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest).actionGet();
+            assertEquals(DatafeedState.STOPPING, datafeedStatsResponse.getResponse().results().get(0).getDatafeedState());
+        });
 
         // Stop the node running the failed job/stopping datafeed
         ensureGreen(); // replicas must be assigned, otherwise we could lose a whole index
@@ -265,10 +269,12 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
         StopDatafeedAction.Response stopDatafeedResponse = client().execute(StopDatafeedAction.INSTANCE, stopDatafeedRequest).actionGet();
         assertTrue(stopDatafeedResponse.isStopped());
 
-        // Confirm the datafeed state is now stopped
-        datafeedStatsRequest = new GetDatafeedsStatsAction.Request(datafeedId);
-        datafeedStatsResponse = client().execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest).actionGet();
-        assertEquals(DatafeedState.STOPPED, datafeedStatsResponse.getResponse().results().get(0).getDatafeedState());
+        // Confirm the datafeed state is now stopped - shouldn't need a busy check here as
+        // the stop endpoint shouldn't return until its effects are externally visible
+        GetDatafeedsStatsAction.Request datafeedStatsRequest2 = new GetDatafeedsStatsAction.Request(datafeedId);
+        GetDatafeedsStatsAction.Response datafeedStatsResponse2 =
+            client().execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest2).actionGet();
+        assertEquals(DatafeedState.STOPPED, datafeedStatsResponse2.getResponse().results().get(0).getDatafeedState());
 
         // We should be allowed to force stop the unassigned failed job
         CloseJobAction.Request closeJobRequest = new CloseJobAction.Request(jobId);