Check restores in progress before deleting a snapshot (#19853)

Currently, when attempting to delete a snapshot, we check
if a snapshot is in progress before proceeding with the
delete. However, we do not check if a restore is taking
place before deleting. This can lead to concurrency issues
where a restore is in progress but the snapshotted files
for the restore are being deleted underneath.

This commit first checks if a restore is in progress and
if so, it prevents the deletion of a snapshot with an
exception.

Note that this is not a complete solution because it is
still possible that a restore of the same snapshot is
started after the deletion commenced but before the
deletion finished. But there is a much smaller window
for this to occur and this commit is a quick way to
check for the common case.
This commit is contained in:
Ali Beyad 2016-08-09 15:07:09 -05:00 committed by GitHub
parent 16d93e5a53
commit 601602b364
3 changed files with 78 additions and 1 deletions

View File

@ -25,7 +25,7 @@ import org.elasticsearch.rest.RestStatus;
import java.io.IOException;
/**
* Thrown when a user tries to start multiple snapshots at the same time
* Thrown when a user tries to multiple conflicting snapshot/restore operations at the same time.
*/
public class ConcurrentSnapshotExecutionException extends SnapshotException {

View File

@ -30,6 +30,7 @@ import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateListener;
import org.elasticsearch.cluster.ClusterStateUpdateTask;
import org.elasticsearch.cluster.RestoreInProgress;
import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.SnapshotsInProgress.ShardSnapshotStatus;
import org.elasticsearch.cluster.SnapshotsInProgress.State;
@ -991,6 +992,15 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
@Override
public ClusterState execute(ClusterState currentState) throws Exception {
RestoreInProgress restoreInProgress = currentState.custom(RestoreInProgress.TYPE);
if (restoreInProgress != null) {
// don't allow snapshot deletions while a restore is taking place,
// otherwise we could end up deleting a snapshot that is being restored
// and the files the restore depends on would all be gone
if (restoreInProgress.entries().isEmpty() == false) {
throw new ConcurrentSnapshotExecutionException(snapshot, "cannot delete snapshot during a restore");
}
}
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
if (snapshots == null) {
// No snapshots running - we can continue

View File

@ -1991,6 +1991,73 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
assertThat(restoreSnapshotResponse.getRestoreInfo().failedShards(), equalTo(0));
}
public void testDeleteSnapshotWhileRestoringFails() throws Exception {
Client client = client();
logger.info("--> creating repository");
final String repoName = "test-repo";
assertAcked(client.admin().cluster().preparePutRepository(repoName)
.setType("mock")
.setSettings(Settings.builder().put("location", randomRepoPath())));
logger.info("--> creating index");
final String indexName = "test-idx";
assertAcked(prepareCreate(indexName).setWaitForActiveShards(ActiveShardCount.ALL));
logger.info("--> indexing some data");
for (int i = 0; i < 100; i++) {
index(indexName, "doc", Integer.toString(i), "foo", "bar" + i);
}
refresh();
assertThat(client.prepareSearch(indexName).setSize(0).get().getHits().totalHits(), equalTo(100L));
logger.info("--> take snapshots");
final String snapshotName = "test-snap";
assertThat(client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
.setIndices(indexName).setWaitForCompletion(true).get().getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
final String snapshotName2 = "test-snap-2";
assertThat(client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName2)
.setIndices(indexName).setWaitForCompletion(true).get().getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
logger.info("--> delete index before restoring");
assertAcked(client.admin().indices().prepareDelete(indexName).get());
logger.info("--> execution will be blocked on all data nodes");
blockAllDataNodes(repoName);
final ListenableActionFuture<RestoreSnapshotResponse> restoreFut;
try {
logger.info("--> start restore");
restoreFut = client.admin().cluster().prepareRestoreSnapshot(repoName, snapshotName)
.setWaitForCompletion(true)
.execute();
logger.info("--> waiting for block to kick in");
waitForBlockOnAnyDataNode(repoName, TimeValue.timeValueMinutes(1));
logger.info("--> try deleting the snapshot while the restore is in progress (should throw an error)");
ConcurrentSnapshotExecutionException e = expectThrows(ConcurrentSnapshotExecutionException.class, () ->
client().admin().cluster().prepareDeleteSnapshot(repoName, snapshotName).get());
assertEquals(repoName, e.getRepositoryName());
assertEquals(snapshotName, e.getSnapshotName());
assertThat(e.getMessage(), containsString("cannot delete snapshot during a restore"));
logger.info("-- try deleting another snapshot while the restore is in progress (should throw an error)");
e = expectThrows(ConcurrentSnapshotExecutionException.class, () ->
client().admin().cluster().prepareDeleteSnapshot(repoName, snapshotName2).get());
assertEquals(repoName, e.getRepositoryName());
assertEquals(snapshotName2, e.getSnapshotName());
assertThat(e.getMessage(), containsString("cannot delete snapshot during a restore"));
} finally {
// unblock even if the try block fails otherwise we will get bogus failures when we delete all indices in test teardown.
logger.info("--> unblocking all data nodes");
unblockAllDataNodes(repoName);
}
logger.info("--> wait for restore to finish");
restoreFut.get();
}
public void testDeleteOrphanSnapshot() throws Exception {
Client client = client();