Add extra logging for investigation into #52000 (#52472)

It looks like #52000 is caused by a slowdown in cluster state application (maybe due to #50907), but I would like to understand the details to ensure that nothing else is going on here before simply increasing the timeout. This commit enables some relevant `DEBUG` loggers and also captures stack traces from all threads rather than just the three hottest ones.
2025-03-24 17:09:48 +00:00 · 2020-02-18 13:01:56 +00:00 · 2020-02-18 13:01:56 +00:00 · 3d57a78deb
commit 3d57a78deb
parent 84de601551
2 changed files with 5 additions and 2 deletions
--- a/server/src/test/java/org/elasticsearch/cluster/allocation/ClusterRerouteIT.java
+++ b/server/src/test/java/org/elasticsearch/cluster/allocation/ClusterRerouteIT.java
@@ -57,6 +57,7 @@ import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
 import org.elasticsearch.test.ESIntegTestCase.Scope;
 import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.MockLogAppender;
+import org.elasticsearch.test.junit.annotations.TestLogging;

 import java.nio.file.Path;
 import java.util.Arrays;
@@ -187,6 +188,8 @@ public class ClusterRerouteIT extends ESIntegTestCase {
        rerouteWithAllocateLocalGateway(commonSettings);
    }

+    @TestLogging(reason = "https://github.com/elastic/elasticsearch/issues/52000",
+        value = "org.elasticsearch.gateway.PersistedClusterStateService:DEBUG,org.elasticsearch.cluster.service.MasterService:DEBUG")
    public void testDelayWithALargeAmountOfShards() throws Exception {
        Settings commonSettings = Settings.builder()
                .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(), 1)
--- a/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java
@@ -919,8 +919,8 @@ public abstract class ESIntegTestCase extends ESTestCase {

        ClusterHealthResponse actionGet = client().admin().cluster().health(healthRequest).actionGet();
        if (actionGet.isTimedOut()) {
-            final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setIgnoreIdleThreads(false).get().getNodes()
-                .stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n"));
+            final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setThreads(99999).setIgnoreIdleThreads(false)
+                .get().getNodes().stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n"));
            logger.info("{} timed out, cluster state:\n{}\npending tasks:\n{}\nhot threads:\n{}\n",
                method,
                client().admin().cluster().prepareState().get().getState(),