Check restores in progress before deleting a snapshot (#19853)
Currently, when attempting to delete a snapshot, we check if a snapshot is in progress before proceeding with the delete. However, we do not check if a restore is taking place before deleting. This can lead to concurrency issues where a restore is in progress but the snapshotted files for the restore are being deleted underneath. This commit first checks if a restore is in progress and if so, it prevents the deletion of a snapshot with an exception. Note that this is not a complete solution because it is still possible that a restore of the same snapshot is started after the deletion commenced but before the deletion finished. But there is a much smaller window for this to occur and this commit is a quick way to check for the common case.
This commit is contained in:
parent
16d93e5a53
commit
601602b364
|
@ -25,7 +25,7 @@ import org.elasticsearch.rest.RestStatus;
|
|||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Thrown when a user tries to start multiple snapshots at the same time
|
||||
* Thrown when a user tries to multiple conflicting snapshot/restore operations at the same time.
|
||||
*/
|
||||
public class ConcurrentSnapshotExecutionException extends SnapshotException {
|
||||
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.elasticsearch.cluster.ClusterChangedEvent;
|
|||
import org.elasticsearch.cluster.ClusterState;
|
||||
import org.elasticsearch.cluster.ClusterStateListener;
|
||||
import org.elasticsearch.cluster.ClusterStateUpdateTask;
|
||||
import org.elasticsearch.cluster.RestoreInProgress;
|
||||
import org.elasticsearch.cluster.SnapshotsInProgress;
|
||||
import org.elasticsearch.cluster.SnapshotsInProgress.ShardSnapshotStatus;
|
||||
import org.elasticsearch.cluster.SnapshotsInProgress.State;
|
||||
|
@ -991,6 +992,15 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||
|
||||
@Override
|
||||
public ClusterState execute(ClusterState currentState) throws Exception {
|
||||
RestoreInProgress restoreInProgress = currentState.custom(RestoreInProgress.TYPE);
|
||||
if (restoreInProgress != null) {
|
||||
// don't allow snapshot deletions while a restore is taking place,
|
||||
// otherwise we could end up deleting a snapshot that is being restored
|
||||
// and the files the restore depends on would all be gone
|
||||
if (restoreInProgress.entries().isEmpty() == false) {
|
||||
throw new ConcurrentSnapshotExecutionException(snapshot, "cannot delete snapshot during a restore");
|
||||
}
|
||||
}
|
||||
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
|
||||
if (snapshots == null) {
|
||||
// No snapshots running - we can continue
|
||||
|
|
|
@ -1991,6 +1991,73 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
|
|||
assertThat(restoreSnapshotResponse.getRestoreInfo().failedShards(), equalTo(0));
|
||||
}
|
||||
|
||||
public void testDeleteSnapshotWhileRestoringFails() throws Exception {
|
||||
Client client = client();
|
||||
|
||||
logger.info("--> creating repository");
|
||||
final String repoName = "test-repo";
|
||||
assertAcked(client.admin().cluster().preparePutRepository(repoName)
|
||||
.setType("mock")
|
||||
.setSettings(Settings.builder().put("location", randomRepoPath())));
|
||||
|
||||
logger.info("--> creating index");
|
||||
final String indexName = "test-idx";
|
||||
assertAcked(prepareCreate(indexName).setWaitForActiveShards(ActiveShardCount.ALL));
|
||||
|
||||
logger.info("--> indexing some data");
|
||||
for (int i = 0; i < 100; i++) {
|
||||
index(indexName, "doc", Integer.toString(i), "foo", "bar" + i);
|
||||
}
|
||||
refresh();
|
||||
assertThat(client.prepareSearch(indexName).setSize(0).get().getHits().totalHits(), equalTo(100L));
|
||||
|
||||
logger.info("--> take snapshots");
|
||||
final String snapshotName = "test-snap";
|
||||
assertThat(client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
|
||||
.setIndices(indexName).setWaitForCompletion(true).get().getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
|
||||
final String snapshotName2 = "test-snap-2";
|
||||
assertThat(client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName2)
|
||||
.setIndices(indexName).setWaitForCompletion(true).get().getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
|
||||
|
||||
logger.info("--> delete index before restoring");
|
||||
assertAcked(client.admin().indices().prepareDelete(indexName).get());
|
||||
|
||||
logger.info("--> execution will be blocked on all data nodes");
|
||||
blockAllDataNodes(repoName);
|
||||
|
||||
final ListenableActionFuture<RestoreSnapshotResponse> restoreFut;
|
||||
try {
|
||||
logger.info("--> start restore");
|
||||
restoreFut = client.admin().cluster().prepareRestoreSnapshot(repoName, snapshotName)
|
||||
.setWaitForCompletion(true)
|
||||
.execute();
|
||||
|
||||
logger.info("--> waiting for block to kick in");
|
||||
waitForBlockOnAnyDataNode(repoName, TimeValue.timeValueMinutes(1));
|
||||
|
||||
logger.info("--> try deleting the snapshot while the restore is in progress (should throw an error)");
|
||||
ConcurrentSnapshotExecutionException e = expectThrows(ConcurrentSnapshotExecutionException.class, () ->
|
||||
client().admin().cluster().prepareDeleteSnapshot(repoName, snapshotName).get());
|
||||
assertEquals(repoName, e.getRepositoryName());
|
||||
assertEquals(snapshotName, e.getSnapshotName());
|
||||
assertThat(e.getMessage(), containsString("cannot delete snapshot during a restore"));
|
||||
|
||||
logger.info("-- try deleting another snapshot while the restore is in progress (should throw an error)");
|
||||
e = expectThrows(ConcurrentSnapshotExecutionException.class, () ->
|
||||
client().admin().cluster().prepareDeleteSnapshot(repoName, snapshotName2).get());
|
||||
assertEquals(repoName, e.getRepositoryName());
|
||||
assertEquals(snapshotName2, e.getSnapshotName());
|
||||
assertThat(e.getMessage(), containsString("cannot delete snapshot during a restore"));
|
||||
} finally {
|
||||
// unblock even if the try block fails otherwise we will get bogus failures when we delete all indices in test teardown.
|
||||
logger.info("--> unblocking all data nodes");
|
||||
unblockAllDataNodes(repoName);
|
||||
}
|
||||
|
||||
logger.info("--> wait for restore to finish");
|
||||
restoreFut.get();
|
||||
}
|
||||
|
||||
public void testDeleteOrphanSnapshot() throws Exception {
|
||||
Client client = client();
|
||||
|
||||
|
|
Loading…
Reference in New Issue