From 9c49868bc5053673fe08b995bf4d1a22a3fbf0da Mon Sep 17 00:00:00 2001 From: David Roberts Date: Tue, 18 Feb 2020 11:16:54 +0000 Subject: [PATCH] [TEST] Use busy asserts in ML distributed failure test (#52461) When changing a job state using a mechanism that doesn't wait for the desired state to be reached within the production code, the test code needs to loop until the cluster state has been updated. Closes #52451 --- .../integration/MlDistributedFailureIT.java | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java index 7733aab6cff..1af23a0c498 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java @@ -231,10 +231,12 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase { PostDataAction.Response postDataResponse = client().execute(PostDataAction.INSTANCE, postDataRequest).actionGet(); assertEquals(1L, postDataResponse.getDataCounts().getInputRecordCount()); - // Confirm the job state is now failed - jobStatsRequest = new GetJobsStatsAction.Request(jobId); - jobStatsResponse = client().execute(GetJobsStatsAction.INSTANCE, jobStatsRequest).actionGet(); - assertEquals(JobState.FAILED, jobStatsResponse.getResponse().results().get(0).getState()); + // Confirm the job state is now failed - this may take a while to update in cluster state + assertBusy(() -> { + GetJobsStatsAction.Request jobStatsRequest2 = new GetJobsStatsAction.Request(jobId); + GetJobsStatsAction.Response jobStatsResponse2 = client().execute(GetJobsStatsAction.INSTANCE, jobStatsRequest2).actionGet(); + assertEquals(JobState.FAILED, jobStatsResponse2.getResponse().results().get(0).getState()); + }); // 
It's impossible to reliably get the datafeed into a stopping state at the point when the ML node is removed from the cluster // using externally accessible actions. The only way this situation could occur in reality is through extremely unfortunate @@ -248,11 +250,13 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase { client().execute(UpdatePersistentTaskStatusAction.INSTANCE, updatePersistentTaskStatusRequest).actionGet(); assertNotNull(updatePersistentTaskStatusResponse.getTask()); - // Confirm the datafeed state is now stopping - GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(datafeedId); - GetDatafeedsStatsAction.Response datafeedStatsResponse = - client().execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest).actionGet(); - assertEquals(DatafeedState.STOPPING, datafeedStatsResponse.getResponse().results().get(0).getDatafeedState()); + // Confirm the datafeed state is now stopping - this may take a while to update in cluster state + assertBusy(() -> { + GetDatafeedsStatsAction.Request datafeedStatsRequest = new GetDatafeedsStatsAction.Request(datafeedId); + GetDatafeedsStatsAction.Response datafeedStatsResponse = + client().execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest).actionGet(); + assertEquals(DatafeedState.STOPPING, datafeedStatsResponse.getResponse().results().get(0).getDatafeedState()); + }); // Stop the node running the failed job/stopping datafeed ensureGreen(); // replicas must be assigned, otherwise we could lose a whole index @@ -265,10 +269,12 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase { StopDatafeedAction.Response stopDatafeedResponse = client().execute(StopDatafeedAction.INSTANCE, stopDatafeedRequest).actionGet(); assertTrue(stopDatafeedResponse.isStopped()); - // Confirm the datafeed state is now stopped - datafeedStatsRequest = new GetDatafeedsStatsAction.Request(datafeedId); - datafeedStatsResponse = 
client().execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest).actionGet(); - assertEquals(DatafeedState.STOPPED, datafeedStatsResponse.getResponse().results().get(0).getDatafeedState()); + // Confirm the datafeed state is now stopped - shouldn't need a busy check here as + // the stop endpoint shouldn't return until its effects are externally visible + GetDatafeedsStatsAction.Request datafeedStatsRequest2 = new GetDatafeedsStatsAction.Request(datafeedId); + GetDatafeedsStatsAction.Response datafeedStatsResponse2 = + client().execute(GetDatafeedsStatsAction.INSTANCE, datafeedStatsRequest2).actionGet(); + assertEquals(DatafeedState.STOPPED, datafeedStatsResponse2.getResponse().results().get(0).getDatafeedState()); // We should be allowed to force stop the unassigned failed job CloseJobAction.Request closeJobRequest = new CloseJobAction.Request(jobId);