Fix DiskThresholdDeciderIT.testHighWatermarkNotExceeded (#63112) (#63385)

The first refreshDiskUsage() refreshes the ClusterInfo update which in turn 
calls listeners like DiskThresholdMonitor. This one triggers a reroute as 
expected and sets an internal checkInProgress flag before submitting 
a cluster state update to relocate shards (the internal flag is toggled 
again once the cluster state update is processed).

In the test I suspect that the second refreshDiskUsage() may complete 
before DiskThresholdMonitor's internal flag is set back to its initial state, 
resulting in the second ClusterInfo update being ignored and a message 
like "[node_t0] skipping monitor as a check is already in progress" being 
logged. Adding another wait for languid events to be processed 
before executing the second refreshDiskUsage() should help here.

Closes #62326
This commit is contained in:
Tanguy Leroux 2020-10-07 11:27:25 +02:00 committed by GitHub
parent d45f7de3fb
commit 581490d83c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 15 additions and 4 deletions

View File

@ -136,11 +136,11 @@ public class DiskThresholdDeciderIT extends ESIntegTestCase {
return Collections.singletonList(InternalSettingsPlugin.class); return Collections.singletonList(InternalSettingsPlugin.class);
} }
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/62326")
public void testHighWatermarkNotExceeded() throws Exception { public void testHighWatermarkNotExceeded() throws Exception {
internalCluster().startMasterOnlyNode(); internalCluster().startMasterOnlyNode();
internalCluster().startDataOnlyNode(); internalCluster().startDataOnlyNode();
final String dataNodeName = internalCluster().startDataOnlyNode(); final String dataNodeName = internalCluster().startDataOnlyNode();
ensureStableCluster(3);
final InternalClusterInfoService clusterInfoService final InternalClusterInfoService clusterInfoService
= (InternalClusterInfoService) internalCluster().getMasterNodeInstance(ClusterInfoService.class); = (InternalClusterInfoService) internalCluster().getMasterNodeInstance(ClusterInfoService.class);
@ -276,6 +276,13 @@ public class DiskThresholdDeciderIT extends ESIntegTestCase {
} }
private void refreshDiskUsage() { private void refreshDiskUsage() {
assertFalse(client().admin().cluster().prepareHealth()
.setWaitForEvents(Priority.LANGUID)
.setWaitForNoRelocatingShards(true)
.setWaitForNoInitializingShards(true)
.get()
.isTimedOut());
final ClusterInfoService clusterInfoService = internalCluster().getMasterNodeInstance(ClusterInfoService.class); final ClusterInfoService clusterInfoService = internalCluster().getMasterNodeInstance(ClusterInfoService.class);
((InternalClusterInfoService) clusterInfoService).refresh(); ((InternalClusterInfoService) clusterInfoService).refresh();
// if the nodes were all under the low watermark already (but unbalanced) then a change in the disk usage doesn't trigger a reroute // if the nodes were all under the low watermark already (but unbalanced) then a change in the disk usage doesn't trigger a reroute
@ -284,9 +291,13 @@ public class DiskThresholdDeciderIT extends ESIntegTestCase {
.allMatch(cur -> cur.value.getFreeBytes() > WATERMARK_BYTES)) { .allMatch(cur -> cur.value.getFreeBytes() > WATERMARK_BYTES)) {
assertAcked(client().admin().cluster().prepareReroute()); assertAcked(client().admin().cluster().prepareReroute());
} }
assertFalse(client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
.setWaitForNoRelocatingShards(true) assertFalse(client().admin().cluster().prepareHealth()
.setWaitForNoInitializingShards(true).get().isTimedOut()); .setWaitForEvents(Priority.LANGUID)
.setWaitForNoRelocatingShards(true)
.setWaitForNoInitializingShards(true)
.get()
.isTimedOut());
} }
private static class TestFileStore extends FilterFileStore { private static class TestFileStore extends FilterFileStore {