From 8429b4ace88811d53685ed4532d1017d75029128 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Wed, 29 Jul 2020 15:54:18 +0200 Subject: [PATCH] Fix Queued Snapshot Deletes After Finalization Failure (#60285) (#60379) This fixes the behavior of the snapshot state machine in the following edge case: 1. Snapshot is running 2. Delete/abort for the snapshot is started 3. Snapshot fails to finalize We were not removing the failed snapshot id from the list of snapshots to delete in the delete. This lead to an error in the repository, which throws if we try to delete a non-existing snapshot. This commmit updates the deletions in progress by removing the failed snapshot id. The fact that this could lead to snapshot delete entries without any snapshot ids is not optimized on purpose because it allows for another attempt at writing clean `RepositoryData` and will run basic cleanup on the repository (root level blobs and stale indices) and thus bring the repository back into a clean state after a failed finalization. Closes #60274 --- .../snapshots/ConcurrentSnapshotsIT.java | 16 ++++++ .../snapshots/SnapshotsService.java | 55 +++++++++++++------ 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java index 048e0fb865c..8c8f2b1f9a1 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java @@ -1172,6 +1172,22 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase { assertThat(repositoryData.shardGenerations(), is(ShardGenerations.EMPTY)); } + public void testQueuedDeleteAfterFinalizationFailure() throws Exception { + final String masterNode = internalCluster().startMasterOnlyNode(); + final String repoName = "test-repo"; + createRepository(repoName, "mock"); + blockMasterFromFinalizingSnapshotOnIndexFile(repoName); + final String snapshotName = "snap-1"; + final ActionFuture snapshotFuture = startFullSnapshot(repoName, snapshotName); + waitForBlock(masterNode, repoName, TimeValue.timeValueSeconds(30L)); + final ActionFuture deleteFuture = startDelete(repoName, snapshotName); + awaitNDeletionsInProgress(1); + unblockNode(repoName, masterNode); + assertAcked(deleteFuture.get()); + final SnapshotException sne = expectThrows(SnapshotException.class, snapshotFuture::actionGet); + assertThat(sne.getCause().getMessage(), containsString("exception after block")); + } + private static String startDataNodeWithLargeSnapshotPool() { return internalCluster().startDataOnlyNode(LARGE_SNAPSHOT_POOL_SETTINGS); } diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index 8313109ce86..347eaa1c6ca 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -1437,7 +1437,11 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus @Override public ClusterState execute(ClusterState currentState) { - return stateWithoutSnapshot(currentState, snapshot); + final ClusterState updatedState = stateWithoutSnapshot(currentState, snapshot); + // now check if there are any delete operations that refer to the just failed snapshot and remove the snapshot from them + return updateWithSnapshots(updatedState, null, deletionsWithoutSnapshots( + updatedState.custom(SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY), + Collections.singletonList(snapshot.getSnapshotId()), snapshot.getRepository())); } @Override @@ -1473,6 +1477,36 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus }); } + /** + * Remove the given {@link SnapshotId}s for the given {@code repository} from an instance of {@link SnapshotDeletionsInProgress}. + * If no deletion contained any of the snapshot ids to remove then return {@code null}. + * + * @param deletions snapshot deletions to update + * @param snapshotIds snapshot ids to remove + * @param repository repository that the snapshot ids belong to + * @return updated {@link SnapshotDeletionsInProgress} or {@code null} if unchanged + */ + @Nullable + private static SnapshotDeletionsInProgress deletionsWithoutSnapshots(SnapshotDeletionsInProgress deletions, + Collection snapshotIds, String repository) { + boolean changed = false; + List updatedEntries = new ArrayList<>(deletions.getEntries().size()); + for (SnapshotDeletionsInProgress.Entry entry : deletions.getEntries()) { + if (entry.repository().equals(repository)) { + final List updatedSnapshotIds = new ArrayList<>(entry.getSnapshots()); + if (updatedSnapshotIds.removeAll(snapshotIds)) { + changed = true; + updatedEntries.add(entry.withSnapshots(updatedSnapshotIds)); + } else { + updatedEntries.add(entry); + } + } else { + updatedEntries.add(entry); + } + } + return changed ? SnapshotDeletionsInProgress.of(updatedEntries) : null; + } + private void failSnapshotCompletionListeners(Snapshot snapshot, Exception e) { endingSnapshots.remove(snapshot); failListenersIgnoringException(snapshotCompletionListeners.remove(snapshot), e); @@ -1968,22 +2002,9 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus clusterStateUpdateTask = new RemoveSnapshotDeletionAndContinueTask(deleteEntry, repositoryData) { @Override protected SnapshotDeletionsInProgress filterDeletions(SnapshotDeletionsInProgress deletions) { - boolean changed = false; - List updatedEntries = new ArrayList<>(deletions.getEntries().size()); - for (SnapshotDeletionsInProgress.Entry entry : deletions.getEntries()) { - if (entry.repository().equals(deleteEntry.repository())) { - final List updatedSnapshotIds = new ArrayList<>(entry.getSnapshots()); - if (updatedSnapshotIds.removeAll(deleteEntry.getSnapshots())) { - changed = true; - updatedEntries.add(entry.withSnapshots(updatedSnapshotIds)); - } else { - updatedEntries.add(entry); - } - } else { - updatedEntries.add(entry); - } - } - return changed ? SnapshotDeletionsInProgress.of(updatedEntries) : deletions; + final SnapshotDeletionsInProgress updatedDeletions = + deletionsWithoutSnapshots(deletions, deleteEntry.getSnapshots(), deleteEntry.repository()); + return updatedDeletions == null ? deletions : updatedDeletions; } @Override