From 3143b5ea470a33f6e6243522fa49fd6d3411915a Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Thu, 13 Aug 2020 16:26:56 +0200 Subject: [PATCH] Stabilize testSnapshotDeleteRelocatingPrimaryIndex (#61088) (#61096) Use transport blocking to make relocation take forever instead of relying on the relocation to take long enough to clash with the snapshot. Closes #61069 --- .../DedicatedClusterSnapshotRestoreIT.java | 50 ++++++++++++++++++- .../SharedClusterSnapshotRestoreIT.java | 32 ------------ 2 files changed, 49 insertions(+), 33 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java index 7b30a51c7c2..54104c8fc7e 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java @@ -69,6 +69,7 @@ import org.elasticsearch.env.Environment; import org.elasticsearch.index.seqno.RetentionLeaseActions; import org.elasticsearch.index.seqno.RetentionLeases; import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.indices.recovery.PeerRecoveryTargetService; import org.elasticsearch.indices.recovery.RecoveryState; import org.elasticsearch.node.Node; import org.elasticsearch.plugins.Plugin; @@ -88,6 +89,7 @@ import org.elasticsearch.test.TestCustomMetadata; import org.elasticsearch.test.disruption.BusyMasterServiceDisruption; import org.elasticsearch.test.disruption.ServiceDisruptionScheme; import org.elasticsearch.test.rest.FakeRestRequest; +import org.elasticsearch.test.transport.MockTransportService; import org.elasticsearch.transport.TransportMessageListener; import org.elasticsearch.transport.TransportRequest; import org.elasticsearch.transport.TransportRequestOptions; @@ -176,7 +178,8 @@ public class DedicatedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTest @Override protected Collection> nodePlugins() { - return Arrays.asList(MockRepository.Plugin.class, TestCustomMetadataPlugin.class, BrokenSettingPlugin.class); + return Arrays.asList(MockRepository.Plugin.class, TestCustomMetadataPlugin.class, BrokenSettingPlugin.class, + MockTransportService.TestPlugin.class); } public static class BrokenSettingPlugin extends Plugin { @@ -1310,6 +1313,51 @@ public class DedicatedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTest assertThat(snapshot3.state(), is(SnapshotState.SUCCESS)); } + public void testSnapshotDeleteRelocatingPrimaryIndex() throws Exception { + internalCluster().startMasterOnlyNode(); + final List dataNodes = internalCluster().startDataOnlyNodes(2); + final String repoName = "test-repo"; + createRepository(repoName, "fs"); + + // Create index on two nodes and make sure each node has a primary by setting no replicas + final String indexName = "test-idx"; + assertAcked(prepareCreate(indexName, 2, indexSettingsNoReplicas(between(2, 10)))); + ensureGreen(indexName); + indexRandomDocs(indexName, 100); + + // Drop all file chunk requests so that below relocation takes forever and we're guaranteed to run the snapshot in parallel to it + for (String nodeName : dataNodes) { + ((MockTransportService) internalCluster().getInstance(TransportService.class, nodeName)).addSendBehavior( + (connection, requestId, action, request, options) -> { + if (PeerRecoveryTargetService.Actions.FILE_CHUNK.equals(action)) { + return; + } + connection.sendRequest(requestId, action, request, options); + }); + } + + logger.info("--> start relocations"); + allowNodes(indexName, 1); + + logger.info("--> wait for relocations to start"); + + assertBusy(() -> assertThat( + client().admin().cluster().prepareHealth(indexName).execute().actionGet().getRelocatingShards(), greaterThan(0)), + 1L, TimeUnit.MINUTES); + + logger.info("--> snapshot"); + client().admin().cluster().prepareCreateSnapshot(repoName, "test-snap") + .setWaitForCompletion(false).setPartial(true).setIndices(indexName).get(); + + assertAcked(client().admin().indices().prepareDelete(indexName)); + + logger.info("--> wait for snapshot to complete"); + SnapshotInfo snapshotInfo = waitForCompletion(repoName, "test-snap", TimeValue.timeValueSeconds(600)); + assertThat(snapshotInfo.state(), equalTo(SnapshotState.PARTIAL)); + assertThat(snapshotInfo.shardFailures().size(), greaterThan(0)); + logger.info("--> done"); + } + private long calculateTotalFilesSize(List files) { return files.stream().mapToLong(f -> { try { diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java index 3c513165234..9554f7dc210 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java @@ -1937,38 +1937,6 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas logger.info("--> done"); } - public void testSnapshotDeleteRelocatingPrimaryIndex() throws Exception { - final String repoName = "test-repo"; - createRepository(repoName, "fs"); - - // Create index on two nodes and make sure each node has a primary by setting no replicas - final String indexName = "test-idx"; - assertAcked(prepareCreate(indexName, 2, indexSettingsNoReplicas(between(2, 10)))); - ensureGreen(indexName); - indexRandomDocs(indexName, 100); - - logger.info("--> start relocations"); - allowNodes(indexName, 1); - - logger.info("--> wait for relocations to start"); - - assertBusy(() -> assertThat( - client().admin().cluster().prepareHealth(indexName).execute().actionGet().getRelocatingShards(), greaterThan(0)), - 1L, TimeUnit.MINUTES); - - logger.info("--> snapshot"); - client().admin().cluster().prepareCreateSnapshot(repoName, "test-snap") - .setWaitForCompletion(false).setPartial(true).setIndices(indexName).get(); - - assertAcked(client().admin().indices().prepareDelete(indexName)); - - logger.info("--> wait for snapshot to complete"); - SnapshotInfo snapshotInfo = waitForCompletion(repoName, "test-snap", TimeValue.timeValueSeconds(600)); - assertThat(snapshotInfo.state(), equalTo(SnapshotState.PARTIAL)); - assertThat(snapshotInfo.shardFailures().size(), greaterThan(0)); - logger.info("--> done"); - } - public void testSnapshotMoreThanOnce() throws InterruptedException { Client client = client();