Fix ClusterInfoServiceIT timeouts (#36758)

The test testClusterInfoServiceInformationClearOnError relies on timing behavior. It sets
InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING to 1s and relies on the
fact that the stats request completes within that timeframe (which our ever-so-slow CI seems to
violate at times). Unfortunately, the timeout logging was implemented incorrectly in InternalClusterInfoService
(the warning was only emitted when the waiting thread was interrupted, not when the wait actually timed out),
so the log messages that would show the requests timing out are missing from the failed test runs.
The issue can be reproduced locally by lowering the timeout further.

Closes #36554
This commit is contained in:
Yannick Welsch 2018-12-19 13:59:58 +01:00 committed by GitHub
parent 18691daebe
commit 8f141b8a41
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 13 additions and 5 deletions

View File

@ -345,17 +345,19 @@ public class InternalClusterInfoService implements ClusterInfoService, LocalNode
}); });
try { try {
nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS); if (nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
}
} catch (InterruptedException e) { } catch (InterruptedException e) {
Thread.currentThread().interrupt(); // restore interrupt status Thread.currentThread().interrupt(); // restore interrupt status
logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
} }
try { try {
indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS); if (indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
}
} catch (InterruptedException e) { } catch (InterruptedException e) {
Thread.currentThread().interrupt(); // restore interrupt status Thread.currentThread().interrupt(); // restore interrupt status
logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
} }
ClusterInfo clusterInfo = getClusterInfo(); ClusterInfo clusterInfo = getClusterInfo();
try { try {

View File

@ -111,7 +111,6 @@ public class ClusterInfoServiceIT extends ESIntegTestCase {
.put(super.nodeSettings(nodeOrdinal)) .put(super.nodeSettings(nodeOrdinal))
// manual collection or upon cluster forming. // manual collection or upon cluster forming.
.put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), 2) .put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), 2)
.put(InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING.getKey(), "1s")
.build(); .build();
} }
@ -120,6 +119,11 @@ public class ClusterInfoServiceIT extends ESIntegTestCase {
return Arrays.asList(TestPlugin.class, MockTransportService.TestPlugin.class); return Arrays.asList(TestPlugin.class, MockTransportService.TestPlugin.class);
} }
/**
 * Applies a transient cluster settings update that sets
 * {@code InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING} to the given value
 * and asserts that the update request was acknowledged.
 *
 * @param timeValue the timeout to apply, as a time-value string (callers in this test pass e.g. "1s" or "15s")
 */
private void setClusterInfoTimeout(String timeValue) {
    // Build the single-entry settings object first so the update request below reads in one line.
    final Settings timeoutSettings = Settings.builder()
        .put(InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING.getKey(), timeValue)
        .build();
    assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(timeoutSettings));
}
public void testClusterInfoServiceCollectsInformation() throws Exception { public void testClusterInfoServiceCollectsInformation() throws Exception {
internalCluster().startNodes(2); internalCluster().startNodes(2);
assertAcked(prepareCreate("test").setSettings(Settings.builder() assertAcked(prepareCreate("test").setSettings(Settings.builder()
@ -204,6 +208,7 @@ public class ClusterInfoServiceIT extends ESIntegTestCase {
}); });
} }
setClusterInfoTimeout("1s");
// timeouts shouldn't clear the info // timeouts shouldn't clear the info
timeout.set(true); timeout.set(true);
info = infoService.refresh(); info = infoService.refresh();
@ -237,6 +242,7 @@ public class ClusterInfoServiceIT extends ESIntegTestCase {
// check we recover // check we recover
blockingActionFilter.blockActions(); blockingActionFilter.blockActions();
setClusterInfoTimeout("15s");
info = infoService.refresh(); info = infoService.refresh();
assertNotNull("info should not be null", info); assertNotNull("info should not be null", info);
assertThat(info.getNodeLeastAvailableDiskUsages().size(), equalTo(2)); assertThat(info.getNodeLeastAvailableDiskUsages().size(), equalTo(2));