Fix DiskThresholdDeciderIT.testHighWatermarkNotExceeded (#63112) (#63385)

The first refreshDiskUsage() refreshes the ClusterInfo update which in turn 
calls listeners like DiskThresholdMonitor. This one triggers a reroute as 
expected and sets an internal checkInProgress flag before submitting 
a cluster state update to relocate shards (the internal flag is toggled 
again once the cluster state update is processed).

In the test I suspect that the second refreshDiskUsage() may complete 
before DiskThresholdMonitor's internal flag is set back to its initial state, 
resulting in the second ClusterInfo update being ignored and a message 
like "[node_t0] skipping monitor as a check is already in progress" being 
logged. Adding another wait for languid events to be processed 
before executing the second refreshDiskUsage() should help here.

Closes #62326
This commit is contained in:
Tanguy Leroux 2020-10-07 11:27:25 +02:00 committed by GitHub
parent d45f7de3fb
commit 581490d83c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 15 additions and 4 deletions

View File

@ -136,11 +136,11 @@ public class DiskThresholdDeciderIT extends ESIntegTestCase {
return Collections.singletonList(InternalSettingsPlugin.class); return Collections.singletonList(InternalSettingsPlugin.class);
} }
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/62326")
public void testHighWatermarkNotExceeded() throws Exception { public void testHighWatermarkNotExceeded() throws Exception {
internalCluster().startMasterOnlyNode(); internalCluster().startMasterOnlyNode();
internalCluster().startDataOnlyNode(); internalCluster().startDataOnlyNode();
final String dataNodeName = internalCluster().startDataOnlyNode(); final String dataNodeName = internalCluster().startDataOnlyNode();
ensureStableCluster(3);
final InternalClusterInfoService clusterInfoService final InternalClusterInfoService clusterInfoService
= (InternalClusterInfoService) internalCluster().getMasterNodeInstance(ClusterInfoService.class); = (InternalClusterInfoService) internalCluster().getMasterNodeInstance(ClusterInfoService.class);
@ -276,6 +276,13 @@ public class DiskThresholdDeciderIT extends ESIntegTestCase {
} }
private void refreshDiskUsage() { private void refreshDiskUsage() {
assertFalse(client().admin().cluster().prepareHealth()
.setWaitForEvents(Priority.LANGUID)
.setWaitForNoRelocatingShards(true)
.setWaitForNoInitializingShards(true)
.get()
.isTimedOut());
final ClusterInfoService clusterInfoService = internalCluster().getMasterNodeInstance(ClusterInfoService.class); final ClusterInfoService clusterInfoService = internalCluster().getMasterNodeInstance(ClusterInfoService.class);
((InternalClusterInfoService) clusterInfoService).refresh(); ((InternalClusterInfoService) clusterInfoService).refresh();
// if the nodes were all under the low watermark already (but unbalanced) then a change in the disk usage doesn't trigger a reroute // if the nodes were all under the low watermark already (but unbalanced) then a change in the disk usage doesn't trigger a reroute
@ -284,9 +291,13 @@ public class DiskThresholdDeciderIT extends ESIntegTestCase {
.allMatch(cur -> cur.value.getFreeBytes() > WATERMARK_BYTES)) { .allMatch(cur -> cur.value.getFreeBytes() > WATERMARK_BYTES)) {
assertAcked(client().admin().cluster().prepareReroute()); assertAcked(client().admin().cluster().prepareReroute());
} }
assertFalse(client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
.setWaitForNoRelocatingShards(true) assertFalse(client().admin().cluster().prepareHealth()
.setWaitForNoInitializingShards(true).get().isTimedOut()); .setWaitForEvents(Priority.LANGUID)
.setWaitForNoRelocatingShards(true)
.setWaitForNoInitializingShards(true)
.get()
.isTimedOut());
} }
private static class TestFileStore extends FilterFileStore { private static class TestFileStore extends FilterFileStore {