Fix Queued Snapshot Deletes After Finalization Failure (#60285) (#60379)

This fixes the behavior of the snapshot state machine in the following edge case:

1. Snapshot is running
2. Delete/abort for the snapshot is started
3. Snapshot fails to finalize

We were not removing the failed snapshot id from the list of snapshots to delete in the delete.
This lead to an error in the repository, which throws if we try to delete a non-existing snapshot.
This commmit updates the deletions in progress by removing the failed snapshot id.
The fact that this could lead to snapshot delete entries without any snapshot ids is not optimized
on purpose because it allows for another attempt at writing clean `RepositoryData` and will run basic
cleanup on the repository (root level blobs and stale indices) and thus bring the repository back into
a clean state after a failed finalization.

Closes #60274
This commit is contained in:
Armin Braun 2020-07-29 15:54:18 +02:00 committed by GitHub
parent 381cec2ba9
commit 8429b4ace8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 54 additions and 17 deletions

View File

@ -1172,6 +1172,22 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
assertThat(repositoryData.shardGenerations(), is(ShardGenerations.EMPTY));
}
public void testQueuedDeleteAfterFinalizationFailure() throws Exception {
final String masterNode = internalCluster().startMasterOnlyNode();
final String repoName = "test-repo";
createRepository(repoName, "mock");
blockMasterFromFinalizingSnapshotOnIndexFile(repoName);
final String snapshotName = "snap-1";
final ActionFuture<CreateSnapshotResponse> snapshotFuture = startFullSnapshot(repoName, snapshotName);
waitForBlock(masterNode, repoName, TimeValue.timeValueSeconds(30L));
final ActionFuture<AcknowledgedResponse> deleteFuture = startDelete(repoName, snapshotName);
awaitNDeletionsInProgress(1);
unblockNode(repoName, masterNode);
assertAcked(deleteFuture.get());
final SnapshotException sne = expectThrows(SnapshotException.class, snapshotFuture::actionGet);
assertThat(sne.getCause().getMessage(), containsString("exception after block"));
}
private static String startDataNodeWithLargeSnapshotPool() {
return internalCluster().startDataOnlyNode(LARGE_SNAPSHOT_POOL_SETTINGS);
}

View File

@ -1437,7 +1437,11 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
@Override
public ClusterState execute(ClusterState currentState) {
return stateWithoutSnapshot(currentState, snapshot);
final ClusterState updatedState = stateWithoutSnapshot(currentState, snapshot);
// now check if there are any delete operations that refer to the just failed snapshot and remove the snapshot from them
return updateWithSnapshots(updatedState, null, deletionsWithoutSnapshots(
updatedState.custom(SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY),
Collections.singletonList(snapshot.getSnapshotId()), snapshot.getRepository()));
}
@Override
@ -1473,6 +1477,36 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
});
}
/**
* Remove the given {@link SnapshotId}s for the given {@code repository} from an instance of {@link SnapshotDeletionsInProgress}.
* If no deletion contained any of the snapshot ids to remove then return {@code null}.
*
* @param deletions snapshot deletions to update
* @param snapshotIds snapshot ids to remove
* @param repository repository that the snapshot ids belong to
* @return updated {@link SnapshotDeletionsInProgress} or {@code null} if unchanged
*/
@Nullable
private static SnapshotDeletionsInProgress deletionsWithoutSnapshots(SnapshotDeletionsInProgress deletions,
Collection<SnapshotId> snapshotIds, String repository) {
boolean changed = false;
List<SnapshotDeletionsInProgress.Entry> updatedEntries = new ArrayList<>(deletions.getEntries().size());
for (SnapshotDeletionsInProgress.Entry entry : deletions.getEntries()) {
if (entry.repository().equals(repository)) {
final List<SnapshotId> updatedSnapshotIds = new ArrayList<>(entry.getSnapshots());
if (updatedSnapshotIds.removeAll(snapshotIds)) {
changed = true;
updatedEntries.add(entry.withSnapshots(updatedSnapshotIds));
} else {
updatedEntries.add(entry);
}
} else {
updatedEntries.add(entry);
}
}
return changed ? SnapshotDeletionsInProgress.of(updatedEntries) : null;
}
private void failSnapshotCompletionListeners(Snapshot snapshot, Exception e) {
endingSnapshots.remove(snapshot);
failListenersIgnoringException(snapshotCompletionListeners.remove(snapshot), e);
@ -1968,22 +2002,9 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
clusterStateUpdateTask = new RemoveSnapshotDeletionAndContinueTask(deleteEntry, repositoryData) {
@Override
protected SnapshotDeletionsInProgress filterDeletions(SnapshotDeletionsInProgress deletions) {
boolean changed = false;
List<SnapshotDeletionsInProgress.Entry> updatedEntries = new ArrayList<>(deletions.getEntries().size());
for (SnapshotDeletionsInProgress.Entry entry : deletions.getEntries()) {
if (entry.repository().equals(deleteEntry.repository())) {
final List<SnapshotId> updatedSnapshotIds = new ArrayList<>(entry.getSnapshots());
if (updatedSnapshotIds.removeAll(deleteEntry.getSnapshots())) {
changed = true;
updatedEntries.add(entry.withSnapshots(updatedSnapshotIds));
} else {
updatedEntries.add(entry);
}
} else {
updatedEntries.add(entry);
}
}
return changed ? SnapshotDeletionsInProgress.of(updatedEntries) : deletions;
final SnapshotDeletionsInProgress updatedDeletions =
deletionsWithoutSnapshots(deletions, deleteEntry.getSnapshots(), deleteEntry.repository());
return updatedDeletions == null ? deletions : updatedDeletions;
}
@Override