Add extra logging for investigation into #52000 (#52472)

It looks like #52000 is caused by a slowdown in cluster state application
(maybe due to #50907) but I would like to understand the details to ensure that
there's nothing else going on here too before simply increasing the timeout.
This commit enables some relevant `DEBUG` loggers and also captures stack
traces from all threads rather than just the three hottest ones.
This commit is contained in:
David Turner 2020-02-18 13:01:56 +00:00
parent 84de601551
commit 3d57a78deb
2 changed files with 5 additions and 2 deletions

View File

@ -57,6 +57,7 @@ import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
import org.elasticsearch.test.ESIntegTestCase.Scope; import org.elasticsearch.test.ESIntegTestCase.Scope;
import org.elasticsearch.test.InternalTestCluster; import org.elasticsearch.test.InternalTestCluster;
import org.elasticsearch.test.MockLogAppender; import org.elasticsearch.test.MockLogAppender;
import org.elasticsearch.test.junit.annotations.TestLogging;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Arrays; import java.util.Arrays;
@ -187,6 +188,8 @@ public class ClusterRerouteIT extends ESIntegTestCase {
rerouteWithAllocateLocalGateway(commonSettings); rerouteWithAllocateLocalGateway(commonSettings);
} }
@TestLogging(reason = "https://github.com/elastic/elasticsearch/issues/52000",
value = "org.elasticsearch.gateway.PersistedClusterStateService:DEBUG,org.elasticsearch.cluster.service.MasterService:DEBUG")
public void testDelayWithALargeAmountOfShards() throws Exception { public void testDelayWithALargeAmountOfShards() throws Exception {
Settings commonSettings = Settings.builder() Settings commonSettings = Settings.builder()
.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(), 1) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(), 1)

View File

@ -919,8 +919,8 @@ public abstract class ESIntegTestCase extends ESTestCase {
ClusterHealthResponse actionGet = client().admin().cluster().health(healthRequest).actionGet(); ClusterHealthResponse actionGet = client().admin().cluster().health(healthRequest).actionGet();
if (actionGet.isTimedOut()) { if (actionGet.isTimedOut()) {
final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setIgnoreIdleThreads(false).get().getNodes() final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setThreads(99999).setIgnoreIdleThreads(false)
.stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n")); .get().getNodes().stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n"));
logger.info("{} timed out, cluster state:\n{}\npending tasks:\n{}\nhot threads:\n{}\n", logger.info("{} timed out, cluster state:\n{}\npending tasks:\n{}\nhot threads:\n{}\n",
method, method,
client().admin().cluster().prepareState().get().getState(), client().admin().cluster().prepareState().get().getState(),