Increase timeout for relocation tests (#46554)

There's nothing wrong in the logs from these failures. I think 30
seconds might not be enough to relocate shards with many documents, as CI
is quite slow. This change increases the timeout to 60 seconds for these
relocation tests. It also dumps the hot threads if the wait times out.

Closes #46526
Closes #46439
Nhat Nguyen 2019-09-11 10:34:14 -04:00
parent 5b1f6669ff
commit 5465c8d095
3 changed files with 23 additions and 8 deletions
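
Condensed from the diff below, a minimal sketch of the pattern this change applies: wait up to 60 seconds for relocations to settle, and dump the cluster state and every node's hot threads before failing if the wait times out. The class and method names are hypothetical; `client()` and `logger` come from the `ESIntegTestCase` test harness.

```java
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.node.hotthreads.NodeHotThreads;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.test.ESIntegTestCase;

import java.util.stream.Collectors;

public class RelocationTimeoutExampleIT extends ESIntegTestCase {

    // Wait up to 60 seconds for all relocations to finish; on timeout, log the
    // cluster state and the hot threads of every node before failing the test,
    // so a slow CI run leaves enough evidence to diagnose where time was spent.
    private void waitForNoRelocationsOrFail() {
        ClusterHealthResponse health = client().admin().cluster().prepareHealth()
            .setTimeout(TimeValue.timeValueSeconds(60))
            .setWaitForEvents(Priority.LANGUID)
            .setWaitForNoRelocatingShards(true)
            .execute().actionGet();
        if (health.isTimedOut()) {
            final String hotThreads = client().admin().cluster().prepareNodesHotThreads()
                .setIgnoreIdleThreads(false).get().getNodes().stream()
                .map(NodeHotThreads::getHotThreads)
                .collect(Collectors.joining("\n"));
            logger.info("timed out waiting for relocation\ncluster state {}\nhot threads {}",
                client().admin().cluster().prepareState().get().getState(), hotThreads);
            throw new AssertionError("timed out waiting for relocation");
        }
    }
}
```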

IndexPrimaryRelocationIT.java

@@ -21,6 +21,7 @@ package org.elasticsearch.indices.recovery;
import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.node.hotthreads.NodeHotThreads;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.cluster.ClusterState;
@@ -28,14 +29,15 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.hamcrest.ElasticsearchAssertions;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import static org.hamcrest.Matchers.equalTo;
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST)
public class IndexPrimaryRelocationIT extends ESIntegTestCase {
@@ -54,7 +56,7 @@ public class IndexPrimaryRelocationIT extends ESIntegTestCase {
Thread indexingThread = new Thread() {
@Override
public void run() {
while (finished.get() == false) {
while (finished.get() == false && numAutoGenDocs.get() < 10_000) {
IndexResponse indexResponse = client().prepareIndex("test", "type", "id").setSource("field", "value").get();
assertEquals(DocWriteResponse.Result.CREATED, indexResponse.getResult());
DeleteResponse deleteResponse = client().prepareDelete("test", "type", "id").get();
@@ -80,8 +82,18 @@ public class IndexPrimaryRelocationIT extends ESIntegTestCase {
.add(new MoveAllocationCommand("test", 0, relocationSource.getId(), relocationTarget.getId()))
.execute().actionGet();
ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
.setTimeout(TimeValue.timeValueSeconds(60))
.setWaitForEvents(Priority.LANGUID).setWaitForNoRelocatingShards(true).execute().actionGet();
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
if (clusterHealthResponse.isTimedOut()) {
final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setIgnoreIdleThreads(false).get().getNodes()
.stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n"));
final ClusterState clusterState = client().admin().cluster().prepareState().get().getState();
logger.info("timed out for waiting for relocation iteration [{}] \ncluster state {} \nhot threads {}",
i, clusterState, hotThreads);
finished.set(true);
indexingThread.join();
throw new AssertionError("timed out waiting for relocation iteration [" + i + "] ");
}
logger.info("--> [iteration {}] relocation complete", i);
relocationSource = relocationTarget;
// indexing process aborted early, no need for more relocations as test has already failed

RelocationIT.java

@@ -78,7 +78,6 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
@@ -446,7 +445,7 @@ public class RelocationIT extends ESIntegTestCase {
}
}
public void testIndexAndRelocateConcurrently() throws ExecutionException, InterruptedException {
public void testIndexAndRelocateConcurrently() throws Exception {
int halfNodes = randomIntBetween(1, 3);
Settings[] nodeSettings = Stream.concat(
Stream.generate(() -> Settings.builder().put("node.attr.color", "blue").build()).limit(halfNodes),
@@ -494,7 +493,7 @@
numDocs *= 2;
logger.info(" --> waiting for relocation to complete");
ensureGreen("test"); // move all shards to the new nodes (it waits on relocation)
ensureGreen(TimeValue.timeValueSeconds(60), "test"); // move all shards to the new nodes (it waits on relocation)
final int numIters = randomIntBetween(10, 20);
for (int i = 0; i < numIters; i++) {

ESIntegTestCase.java

@@ -33,6 +33,7 @@ import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.node.hotthreads.NodeHotThreads;
import org.elasticsearch.action.admin.cluster.node.info.NodeInfo;
import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
@@ -918,10 +919,13 @@ public abstract class ESIntegTestCase extends ESTestCase {
ClusterHealthResponse actionGet = client().admin().cluster().health(healthRequest).actionGet();
if (actionGet.isTimedOut()) {
logger.info("{} timed out, cluster state:\n{}\n{}",
final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setIgnoreIdleThreads(false).get().getNodes()
.stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n"));
logger.info("{} timed out, cluster state:\n{}\npending tasks:\n{}\nhot threads:\n{}\n",
method,
client().admin().cluster().prepareState().get().getState(),
client().admin().cluster().preparePendingClusterTasks().get());
client().admin().cluster().preparePendingClusterTasks().get(),
hotThreads);
fail("timed out waiting for " + color + " state");
}
assertThat("Expected at least " + clusterHealthStatus + " but got " + actionGet.getStatus(),