Check restores in progress before deleting a snapshot (#19853)
Currently, when attempting to delete a snapshot, we check if a snapshot is in progress before proceeding with the delete. However, we do not check if a restore is taking place before deleting. This can lead to concurrency issues where a restore is in progress but the snapshotted files for the restore are being deleted underneath it. This commit first checks if a restore is in progress and, if so, prevents the snapshot deletion by throwing an exception. Note that this is not a complete solution because it is still possible that a restore of the same snapshot is started after the deletion commenced but before the deletion finished. But there is a much smaller window for this to occur and this commit is a quick way to check for the common case.
This commit is contained in:
parent
16d93e5a53
commit
601602b364
|
@ -25,7 +25,7 @@ import org.elasticsearch.rest.RestStatus;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Thrown when a user tries to start multiple snapshots at the same time
|
* Thrown when a user tries to run multiple conflicting snapshot/restore operations at the same time.
|
||||||
*/
|
*/
|
||||||
public class ConcurrentSnapshotExecutionException extends SnapshotException {
|
public class ConcurrentSnapshotExecutionException extends SnapshotException {
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,7 @@ import org.elasticsearch.cluster.ClusterChangedEvent;
|
||||||
import org.elasticsearch.cluster.ClusterState;
|
import org.elasticsearch.cluster.ClusterState;
|
||||||
import org.elasticsearch.cluster.ClusterStateListener;
|
import org.elasticsearch.cluster.ClusterStateListener;
|
||||||
import org.elasticsearch.cluster.ClusterStateUpdateTask;
|
import org.elasticsearch.cluster.ClusterStateUpdateTask;
|
||||||
|
import org.elasticsearch.cluster.RestoreInProgress;
|
||||||
import org.elasticsearch.cluster.SnapshotsInProgress;
|
import org.elasticsearch.cluster.SnapshotsInProgress;
|
||||||
import org.elasticsearch.cluster.SnapshotsInProgress.ShardSnapshotStatus;
|
import org.elasticsearch.cluster.SnapshotsInProgress.ShardSnapshotStatus;
|
||||||
import org.elasticsearch.cluster.SnapshotsInProgress.State;
|
import org.elasticsearch.cluster.SnapshotsInProgress.State;
|
||||||
|
@ -991,6 +992,15 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ClusterState execute(ClusterState currentState) throws Exception {
|
public ClusterState execute(ClusterState currentState) throws Exception {
|
||||||
|
RestoreInProgress restoreInProgress = currentState.custom(RestoreInProgress.TYPE);
|
||||||
|
if (restoreInProgress != null) {
|
||||||
|
// don't allow snapshot deletions while a restore is taking place,
|
||||||
|
// otherwise we could end up deleting a snapshot that is being restored
|
||||||
|
// and the files the restore depends on would all be gone
|
||||||
|
if (restoreInProgress.entries().isEmpty() == false) {
|
||||||
|
throw new ConcurrentSnapshotExecutionException(snapshot, "cannot delete snapshot during a restore");
|
||||||
|
}
|
||||||
|
}
|
||||||
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
|
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
|
||||||
if (snapshots == null) {
|
if (snapshots == null) {
|
||||||
// No snapshots running - we can continue
|
// No snapshots running - we can continue
|
||||||
|
|
|
@ -1991,6 +1991,73 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
|
||||||
assertThat(restoreSnapshotResponse.getRestoreInfo().failedShards(), equalTo(0));
|
assertThat(restoreSnapshotResponse.getRestoreInfo().failedShards(), equalTo(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testDeleteSnapshotWhileRestoringFails() throws Exception {
|
||||||
|
Client client = client();
|
||||||
|
|
||||||
|
logger.info("--> creating repository");
|
||||||
|
final String repoName = "test-repo";
|
||||||
|
assertAcked(client.admin().cluster().preparePutRepository(repoName)
|
||||||
|
.setType("mock")
|
||||||
|
.setSettings(Settings.builder().put("location", randomRepoPath())));
|
||||||
|
|
||||||
|
logger.info("--> creating index");
|
||||||
|
final String indexName = "test-idx";
|
||||||
|
assertAcked(prepareCreate(indexName).setWaitForActiveShards(ActiveShardCount.ALL));
|
||||||
|
|
||||||
|
logger.info("--> indexing some data");
|
||||||
|
for (int i = 0; i < 100; i++) {
|
||||||
|
index(indexName, "doc", Integer.toString(i), "foo", "bar" + i);
|
||||||
|
}
|
||||||
|
refresh();
|
||||||
|
assertThat(client.prepareSearch(indexName).setSize(0).get().getHits().totalHits(), equalTo(100L));
|
||||||
|
|
||||||
|
logger.info("--> take snapshots");
|
||||||
|
final String snapshotName = "test-snap";
|
||||||
|
assertThat(client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
|
||||||
|
.setIndices(indexName).setWaitForCompletion(true).get().getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
|
||||||
|
final String snapshotName2 = "test-snap-2";
|
||||||
|
assertThat(client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName2)
|
||||||
|
.setIndices(indexName).setWaitForCompletion(true).get().getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
|
||||||
|
|
||||||
|
logger.info("--> delete index before restoring");
|
||||||
|
assertAcked(client.admin().indices().prepareDelete(indexName).get());
|
||||||
|
|
||||||
|
logger.info("--> execution will be blocked on all data nodes");
|
||||||
|
blockAllDataNodes(repoName);
|
||||||
|
|
||||||
|
final ListenableActionFuture<RestoreSnapshotResponse> restoreFut;
|
||||||
|
try {
|
||||||
|
logger.info("--> start restore");
|
||||||
|
restoreFut = client.admin().cluster().prepareRestoreSnapshot(repoName, snapshotName)
|
||||||
|
.setWaitForCompletion(true)
|
||||||
|
.execute();
|
||||||
|
|
||||||
|
logger.info("--> waiting for block to kick in");
|
||||||
|
waitForBlockOnAnyDataNode(repoName, TimeValue.timeValueMinutes(1));
|
||||||
|
|
||||||
|
logger.info("--> try deleting the snapshot while the restore is in progress (should throw an error)");
|
||||||
|
ConcurrentSnapshotExecutionException e = expectThrows(ConcurrentSnapshotExecutionException.class, () ->
|
||||||
|
client().admin().cluster().prepareDeleteSnapshot(repoName, snapshotName).get());
|
||||||
|
assertEquals(repoName, e.getRepositoryName());
|
||||||
|
assertEquals(snapshotName, e.getSnapshotName());
|
||||||
|
assertThat(e.getMessage(), containsString("cannot delete snapshot during a restore"));
|
||||||
|
|
||||||
|
logger.info("-- try deleting another snapshot while the restore is in progress (should throw an error)");
|
||||||
|
e = expectThrows(ConcurrentSnapshotExecutionException.class, () ->
|
||||||
|
client().admin().cluster().prepareDeleteSnapshot(repoName, snapshotName2).get());
|
||||||
|
assertEquals(repoName, e.getRepositoryName());
|
||||||
|
assertEquals(snapshotName2, e.getSnapshotName());
|
||||||
|
assertThat(e.getMessage(), containsString("cannot delete snapshot during a restore"));
|
||||||
|
} finally {
|
||||||
|
// unblock even if the try block fails otherwise we will get bogus failures when we delete all indices in test teardown.
|
||||||
|
logger.info("--> unblocking all data nodes");
|
||||||
|
unblockAllDataNodes(repoName);
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("--> wait for restore to finish");
|
||||||
|
restoreFut.get();
|
||||||
|
}
|
||||||
|
|
||||||
public void testDeleteOrphanSnapshot() throws Exception {
|
public void testDeleteOrphanSnapshot() throws Exception {
|
||||||
Client client = client();
|
Client client = client();
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue