Increase timeout for relocation tests (#46554)

There's nothing wrong in the logs from these failures. I think 30
seconds might not be enough to relocate shards with many documents, as CI
is quite slow. This change increases the timeout to 60 seconds for these
relocation tests. It also dumps the hot threads when a timeout occurs.

Closes #46526
Closes #46439
This commit is contained in:
Nhat Nguyen 2019-09-11 10:34:14 -04:00
parent 5b1f6669ff
commit 5465c8d095
3 changed files with 23 additions and 8 deletions

View File

@ -21,6 +21,7 @@ package org.elasticsearch.indices.recovery;
import org.elasticsearch.action.DocWriteResponse; import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.node.hotthreads.NodeHotThreads;
import org.elasticsearch.action.delete.DeleteResponse; import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.index.IndexResponse; import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterState;
@ -28,14 +29,15 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand; import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand;
import org.elasticsearch.common.Priority; import org.elasticsearch.common.Priority;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.hamcrest.ElasticsearchAssertions; import org.elasticsearch.test.hamcrest.ElasticsearchAssertions;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import static org.hamcrest.Matchers.equalTo;
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST) @ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST)
public class IndexPrimaryRelocationIT extends ESIntegTestCase { public class IndexPrimaryRelocationIT extends ESIntegTestCase {
@ -54,7 +56,7 @@ public class IndexPrimaryRelocationIT extends ESIntegTestCase {
Thread indexingThread = new Thread() { Thread indexingThread = new Thread() {
@Override @Override
public void run() { public void run() {
while (finished.get() == false) { while (finished.get() == false && numAutoGenDocs.get() < 10_000) {
IndexResponse indexResponse = client().prepareIndex("test", "type", "id").setSource("field", "value").get(); IndexResponse indexResponse = client().prepareIndex("test", "type", "id").setSource("field", "value").get();
assertEquals(DocWriteResponse.Result.CREATED, indexResponse.getResult()); assertEquals(DocWriteResponse.Result.CREATED, indexResponse.getResult());
DeleteResponse deleteResponse = client().prepareDelete("test", "type", "id").get(); DeleteResponse deleteResponse = client().prepareDelete("test", "type", "id").get();
@ -80,8 +82,18 @@ public class IndexPrimaryRelocationIT extends ESIntegTestCase {
.add(new MoveAllocationCommand("test", 0, relocationSource.getId(), relocationTarget.getId())) .add(new MoveAllocationCommand("test", 0, relocationSource.getId(), relocationTarget.getId()))
.execute().actionGet(); .execute().actionGet();
ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth() ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
.setTimeout(TimeValue.timeValueSeconds(60))
.setWaitForEvents(Priority.LANGUID).setWaitForNoRelocatingShards(true).execute().actionGet(); .setWaitForEvents(Priority.LANGUID).setWaitForNoRelocatingShards(true).execute().actionGet();
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false)); if (clusterHealthResponse.isTimedOut()) {
final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setIgnoreIdleThreads(false).get().getNodes()
.stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n"));
final ClusterState clusterState = client().admin().cluster().prepareState().get().getState();
logger.info("timed out for waiting for relocation iteration [{}] \ncluster state {} \nhot threads {}",
i, clusterState, hotThreads);
finished.set(true);
indexingThread.join();
throw new AssertionError("timed out waiting for relocation iteration [" + i + "] ");
}
logger.info("--> [iteration {}] relocation complete", i); logger.info("--> [iteration {}] relocation complete", i);
relocationSource = relocationTarget; relocationSource = relocationTarget;
// indexing process aborted early, no need for more relocations as test has already failed // indexing process aborted early, no need for more relocations as test has already failed

View File

@ -78,7 +78,6 @@ import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.concurrent.CountDownLatch; import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Semaphore; import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.stream.Stream; import java.util.stream.Stream;
@ -446,7 +445,7 @@ public class RelocationIT extends ESIntegTestCase {
} }
} }
public void testIndexAndRelocateConcurrently() throws ExecutionException, InterruptedException { public void testIndexAndRelocateConcurrently() throws Exception {
int halfNodes = randomIntBetween(1, 3); int halfNodes = randomIntBetween(1, 3);
Settings[] nodeSettings = Stream.concat( Settings[] nodeSettings = Stream.concat(
Stream.generate(() -> Settings.builder().put("node.attr.color", "blue").build()).limit(halfNodes), Stream.generate(() -> Settings.builder().put("node.attr.color", "blue").build()).limit(halfNodes),
@ -494,7 +493,7 @@ public class RelocationIT extends ESIntegTestCase {
numDocs *= 2; numDocs *= 2;
logger.info(" --> waiting for relocation to complete"); logger.info(" --> waiting for relocation to complete");
ensureGreen("test"); // move all shards to the new nodes (it waits on relocation) ensureGreen(TimeValue.timeValueSeconds(60), "test"); // move all shards to the new nodes (it waits on relocation)
final int numIters = randomIntBetween(10, 20); final int numIters = randomIntBetween(10, 20);
for (int i = 0; i < numIters; i++) { for (int i = 0; i < numIters; i++) {

View File

@ -33,6 +33,7 @@ import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.DocWriteResponse; import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest; import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.node.hotthreads.NodeHotThreads;
import org.elasticsearch.action.admin.cluster.node.info.NodeInfo; import org.elasticsearch.action.admin.cluster.node.info.NodeInfo;
import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse; import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
@ -918,10 +919,13 @@ public abstract class ESIntegTestCase extends ESTestCase {
ClusterHealthResponse actionGet = client().admin().cluster().health(healthRequest).actionGet(); ClusterHealthResponse actionGet = client().admin().cluster().health(healthRequest).actionGet();
if (actionGet.isTimedOut()) { if (actionGet.isTimedOut()) {
logger.info("{} timed out, cluster state:\n{}\n{}", final String hotThreads = client().admin().cluster().prepareNodesHotThreads().setIgnoreIdleThreads(false).get().getNodes()
.stream().map(NodeHotThreads::getHotThreads).collect(Collectors.joining("\n"));
logger.info("{} timed out, cluster state:\n{}\npending tasks:\n{}\nhot threads:\n{}\n",
method, method,
client().admin().cluster().prepareState().get().getState(), client().admin().cluster().prepareState().get().getState(),
client().admin().cluster().preparePendingClusterTasks().get()); client().admin().cluster().preparePendingClusterTasks().get(),
hotThreads);
fail("timed out waiting for " + color + " state"); fail("timed out waiting for " + color + " state");
} }
assertThat("Expected at least " + clusterHealthStatus + " but got " + actionGet.getStatus(), assertThat("Expected at least " + clusterHealthStatus + " but got " + actionGet.getStatus(),