Check restores in progress before deleting a snapshot (#19853)

Currently, when attempting to delete a snapshot, we check if a snapshot is in progress before proceeding with the delete. However, we do not check if a restore is taking place before deleting. This can lead to concurrency issues where a restore is in progress but the snapshotted files for the restore are being deleted underneath. This commit first checks if a restore is in progress and if so, it prevents the deletion of a snapshot with an exception. Note that this is not a complete solution because it is still possible that a restore of the same snapshot is started after the deletion commenced but before the deletion finished. But there is a much smaller window for this to occur and this commit is a quick way to check for the common case.
2016-08-09 15:07:09 -05:00 · 2016-08-09 15:07:09 -05:00 · 601602b364
parent 16d93e5a53
commit 601602b364
3 changed files with 78 additions and 1 deletions
--- a/core/src/main/java/org/elasticsearch/snapshots/ConcurrentSnapshotExecutionException.java
+++ b/core/src/main/java/org/elasticsearch/snapshots/ConcurrentSnapshotExecutionException.java
@ -25,7 +25,7 @@ import org.elasticsearch.rest.RestStatus;
 import java.io.IOException;

 /**
- * Thrown when a user tries to start multiple snapshots at the same time
+ * Thrown when a user tries to multiple conflicting snapshot/restore operations at the same time.
 */
 public class ConcurrentSnapshotExecutionException extends SnapshotException {

--- a/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
+++ b/core/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
@ -30,6 +30,7 @@ import org.elasticsearch.cluster.ClusterChangedEvent;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ClusterStateListener;
 import org.elasticsearch.cluster.ClusterStateUpdateTask;
+import org.elasticsearch.cluster.RestoreInProgress;
 import org.elasticsearch.cluster.SnapshotsInProgress;
 import org.elasticsearch.cluster.SnapshotsInProgress.ShardSnapshotStatus;
 import org.elasticsearch.cluster.SnapshotsInProgress.State;
@ -991,6 +992,15 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus

            @Override
            public ClusterState execute(ClusterState currentState) throws Exception {
+                RestoreInProgress restoreInProgress = currentState.custom(RestoreInProgress.TYPE);
+                if (restoreInProgress != null) {
+                    // don't allow snapshot deletions while a restore is taking place,
+                    // otherwise we could end up deleting a snapshot that is being restored
+                    // and the files the restore depends on would all be gone
+                    if (restoreInProgress.entries().isEmpty() == false) {
+                        throw new ConcurrentSnapshotExecutionException(snapshot, "cannot delete snapshot during a restore");
+                    }
+                }
                SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
                if (snapshots == null) {
                    // No snapshots running - we can continue
--- a/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java
+++ b/core/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java
@ -1991,6 +1991,73 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
        assertThat(restoreSnapshotResponse.getRestoreInfo().failedShards(), equalTo(0));
    }

+    public void testDeleteSnapshotWhileRestoringFails() throws Exception {
+        Client client = client();
+
+        logger.info("-->  creating repository");
+        final String repoName = "test-repo";
+        assertAcked(client.admin().cluster().preparePutRepository(repoName)
+                        .setType("mock")
+                        .setSettings(Settings.builder().put("location", randomRepoPath())));
+
+        logger.info("--> creating index");
+        final String indexName = "test-idx";
+        assertAcked(prepareCreate(indexName).setWaitForActiveShards(ActiveShardCount.ALL));
+
+        logger.info("--> indexing some data");
+        for (int i = 0; i < 100; i++) {
+            index(indexName, "doc", Integer.toString(i), "foo", "bar" + i);
+        }
+        refresh();
+        assertThat(client.prepareSearch(indexName).setSize(0).get().getHits().totalHits(), equalTo(100L));
+
+        logger.info("--> take snapshots");
+        final String snapshotName = "test-snap";
+        assertThat(client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
+                       .setIndices(indexName).setWaitForCompletion(true).get().getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
+        final String snapshotName2 = "test-snap-2";
+        assertThat(client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName2)
+                       .setIndices(indexName).setWaitForCompletion(true).get().getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
+
+        logger.info("--> delete index before restoring");
+        assertAcked(client.admin().indices().prepareDelete(indexName).get());
+
+        logger.info("--> execution will be blocked on all data nodes");
+        blockAllDataNodes(repoName);
+
+        final ListenableActionFuture<RestoreSnapshotResponse> restoreFut;
+        try {
+            logger.info("--> start restore");
+            restoreFut = client.admin().cluster().prepareRestoreSnapshot(repoName, snapshotName)
+                             .setWaitForCompletion(true)
+                             .execute();
+
+            logger.info("--> waiting for block to kick in");
+            waitForBlockOnAnyDataNode(repoName, TimeValue.timeValueMinutes(1));
+
+            logger.info("--> try deleting the snapshot while the restore is in progress (should throw an error)");
+            ConcurrentSnapshotExecutionException e = expectThrows(ConcurrentSnapshotExecutionException.class, () ->
+                client().admin().cluster().prepareDeleteSnapshot(repoName, snapshotName).get());
+            assertEquals(repoName, e.getRepositoryName());
+            assertEquals(snapshotName, e.getSnapshotName());
+            assertThat(e.getMessage(), containsString("cannot delete snapshot during a restore"));
+
+            logger.info("-- try deleting another snapshot while the restore is in progress (should throw an error)");
+            e = expectThrows(ConcurrentSnapshotExecutionException.class, () ->
+                client().admin().cluster().prepareDeleteSnapshot(repoName, snapshotName2).get());
+            assertEquals(repoName, e.getRepositoryName());
+            assertEquals(snapshotName2, e.getSnapshotName());
+            assertThat(e.getMessage(), containsString("cannot delete snapshot during a restore"));
+        } finally {
+            // unblock even if the try block fails otherwise we will get bogus failures when we delete all indices in test teardown.
+            logger.info("--> unblocking all data nodes");
+            unblockAllDataNodes(repoName);
+        }
+
+        logger.info("--> wait for restore to finish");
+        restoreFut.get();
+    }
+
    public void testDeleteOrphanSnapshot() throws Exception {
        Client client = client();