From 8f141b8a41554f07217e1eaa6c0028d21c833ae4 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 19 Dec 2018 13:59:58 +0100 Subject: [PATCH] Fix ClusterInfoServiceIT timeouts (#36758) The test testClusterInfoServiceInformationClearOnError relies on timing behavior. It sets InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING to 1s and relies on the fact that the stats request completes within that timeframe (which our ever-so-slow CI seems to violate at times). Unfortunately the logging has been misimplemented in InternalClusterInfoService, so the corresponding log messages showing that the requests have timed out are missing for this. The issue can be locally reproduced by reducing the timeout to something lower. Closes #36554 --- .../cluster/InternalClusterInfoService.java | 10 ++++++---- .../elasticsearch/cluster/ClusterInfoServiceIT.java | 8 +++++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java b/server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java index 559b554b007..e8261ca9f09 100644 --- a/server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java +++ b/server/src/main/java/org/elasticsearch/cluster/InternalClusterInfoService.java @@ -345,17 +345,19 @@ public class InternalClusterInfoService implements ClusterInfoService, LocalNode }); try { - nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS); + if (nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) { + logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout); + } } catch (InterruptedException e) { Thread.currentThread().interrupt(); // restore interrupt status - logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout); } try { - indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS); + if (indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) { + logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout); + } } catch (InterruptedException e) { Thread.currentThread().interrupt(); // restore interrupt status - logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout); } ClusterInfo clusterInfo = getClusterInfo(); try { diff --git a/server/src/test/java/org/elasticsearch/cluster/ClusterInfoServiceIT.java b/server/src/test/java/org/elasticsearch/cluster/ClusterInfoServiceIT.java index ae1caa787d4..99f89548524 100644 --- a/server/src/test/java/org/elasticsearch/cluster/ClusterInfoServiceIT.java +++ b/server/src/test/java/org/elasticsearch/cluster/ClusterInfoServiceIT.java @@ -111,7 +111,6 @@ public class ClusterInfoServiceIT extends ESIntegTestCase { .put(super.nodeSettings(nodeOrdinal)) // manual collection or upon cluster forming. .put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), 2) - .put(InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING.getKey(), "1s") .build(); } @@ -120,6 +119,11 @@ public class ClusterInfoServiceIT extends ESIntegTestCase { return Arrays.asList(TestPlugin.class, MockTransportService.TestPlugin.class); } + private void setClusterInfoTimeout(String timeValue) { + assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder() + .put(InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING.getKey(), timeValue).build())); + } + public void testClusterInfoServiceCollectsInformation() throws Exception { internalCluster().startNodes(2); assertAcked(prepareCreate("test").setSettings(Settings.builder() @@ -204,6 +208,7 @@ public class ClusterInfoServiceIT extends ESIntegTestCase { }); } + setClusterInfoTimeout("1s"); // timeouts shouldn't clear the info timeout.set(true); info = infoService.refresh(); @@ -237,6 +242,7 @@ public class ClusterInfoServiceIT extends ESIntegTestCase { // check we recover blockingActionFilter.blockActions(); + setClusterInfoTimeout("15s"); info = infoService.refresh(); assertNotNull("info should not be null", info); assertThat(info.getNodeLeastAvailableDiskUsages().size(), equalTo(2));