Fix ClusterInfoServiceIT timeouts (#36758)
The test testClusterInfoServiceInformationClearOnError relies on timing behavior. It sets InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING to 1s and assumes that the stats request completes within that timeframe (an assumption that our ever-so-slow CI seems to violate at times). Unfortunately, the logging in InternalClusterInfoService was implemented incorrectly, so the log messages that would show the requests timing out are missing. The issue can be reproduced locally by reducing the timeout to a lower value. Closes #36554
This commit is contained in:
parent
18691daebe
commit
8f141b8a41
|
@ -345,17 +345,19 @@ public class InternalClusterInfoService implements ClusterInfoService, LocalNode
|
|||
});
|
||||
|
||||
try {
|
||||
nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
|
||||
if (nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
|
||||
logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt(); // restore interrupt status
|
||||
logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
|
||||
}
|
||||
|
||||
try {
|
||||
indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
|
||||
if (indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
|
||||
logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt(); // restore interrupt status
|
||||
logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
|
||||
}
|
||||
ClusterInfo clusterInfo = getClusterInfo();
|
||||
try {
|
||||
|
|
|
@ -111,7 +111,6 @@ public class ClusterInfoServiceIT extends ESIntegTestCase {
|
|||
.put(super.nodeSettings(nodeOrdinal))
|
||||
// manual collection or upon cluster forming.
|
||||
.put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), 2)
|
||||
.put(InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING.getKey(), "1s")
|
||||
.build();
|
||||
}
|
||||
|
||||
|
@ -120,6 +119,11 @@ public class ClusterInfoServiceIT extends ESIntegTestCase {
|
|||
return Arrays.asList(TestPlugin.class, MockTransportService.TestPlugin.class);
|
||||
}
|
||||
|
||||
private void setClusterInfoTimeout(String timeValue) {
|
||||
assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder()
|
||||
.put(InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING.getKey(), timeValue).build()));
|
||||
}
|
||||
|
||||
public void testClusterInfoServiceCollectsInformation() throws Exception {
|
||||
internalCluster().startNodes(2);
|
||||
assertAcked(prepareCreate("test").setSettings(Settings.builder()
|
||||
|
@ -204,6 +208,7 @@ public class ClusterInfoServiceIT extends ESIntegTestCase {
|
|||
});
|
||||
}
|
||||
|
||||
setClusterInfoTimeout("1s");
|
||||
// timeouts shouldn't clear the info
|
||||
timeout.set(true);
|
||||
info = infoService.refresh();
|
||||
|
@ -237,6 +242,7 @@ public class ClusterInfoServiceIT extends ESIntegTestCase {
|
|||
|
||||
// check we recover
|
||||
blockingActionFilter.blockActions();
|
||||
setClusterInfoTimeout("15s");
|
||||
info = infoService.refresh();
|
||||
assertNotNull("info should not be null", info);
|
||||
assertThat(info.getNodeLeastAvailableDiskUsages().size(), equalTo(2));
|
||||
|
|
Loading…
Reference in New Issue