Add extra logging for investigation into #52000 (#52472)

It looks like #52000 is caused by a slowdown in cluster state application
(maybe due to #50907), but I would like to understand the details to ensure that
nothing else is going on here before simply increasing the timeout.
This commit enables some relevant `DEBUG` loggers and also captures stack
traces from all threads rather than just the three hottest ones.
This commit is contained in:
David Turner 2020-02-18 13:01:56 +00:00
parent 84de601551
commit 3d57a78deb
2 changed files with 5 additions and 2 deletions

View File

@@ -57,6 +57,7 @@ import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
import org.elasticsearch.test.ESIntegTestCase.Scope;
import org.elasticsearch.test.InternalTestCluster;
import org.elasticsearch.test.MockLogAppender;
import org.elasticsearch.test.junit.annotations.TestLogging;
import java.nio.file.Path;
import java.util.Arrays;
@@ -187,6 +188,8 @@ public class ClusterRerouteIT extends ESIntegTestCase {
rerouteWithAllocateLocalGateway(commonSettings);
}
@TestLogging(reason = "https://github.com/elastic/elasticsearch/issues/52000",
value = "org.elasticsearch.gateway.PersistedClusterStateService:DEBUG,org.elasticsearch.cluster.service.MasterService:DEBUG")
public void testDelayWithALargeAmountOfShards() throws Exception {
Settings commonSettings = Settings.builder()
.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(), 1)

View File

@@ -919,8 +919,8 @@ public abstract class ESIntegTestCase extends ESTestCase {
ClusterHealthResponse actionGet = client().admin().cluster().health(healthRequest).actionGet();
if (actionGet.isTimedOut()) {
final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setIgnoreIdleThreads(false).get().getNodes()
.stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n"));
final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setThreads(99999).setIgnoreIdleThreads(false)
.get().getNodes().stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n"));
logger.info("{} timed out, cluster state:\n{}\npending tasks:\n{}\nhot threads:\n{}\n",
method,
client().admin().cluster().prepareState().get().getState(),