Improve Snapshot Abort Behavior (#54256) (#54410)

This commit improves the behavior of aborting snapshots and by that fixes
some extremely rare test failures.

Improvements:
1. When aborting a snapshot while it is in the `INIT` stage we do not need
to ever delete anything from the repository because nothing is written to the
repo during INIT any more (in the past running deletes for these snapshots made
sense because we were writing `snap-` and `meta-` blobs during the `INIT` step).
2. Do not try to finalize snapshots that never moved past `INIT`. Same reason as
with the first step. If we never moved past `INIT` no data was written to the repo
so no need to now write a useless entry for the aborted snapshot to `index-N`.
This is especially true, since the reason the snapshot was aborted during `INIT` was
a delete call so the useless empty snapshot just added to `index-N` would be removed
by the subsequent delete that is still waiting anyway.
3. if after aborting a snapshot we wait for it to finish we should not try deleting it
if it failed. If the snapshot failed it means it did not become part of the most recent
`RepositoryData` so a delete for it will needlessly fail with a confusing message about
that snapshot being missing or concurrent repository modification. I moved to throw the snapshot missing exception here because that seems the most user friendly. This allows the user to simply ignore `404` returns from the delete API when using it to make sure a snapshot is aborted+deleted.

Marking this as a non-issue since it doesn't have any negative repercussions other than confusing exceptions on some snapshot aborts.

Closes #52843
This commit is contained in:
Armin Braun 2020-03-30 15:08:18 +02:00 committed by GitHub
parent 21f362a2a8
commit 9392fca36a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 18 deletions

View File

@ -903,7 +903,8 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
final Map<String, Object> userMetadata,
Version repositoryMetaVersion,
final ActionListener<SnapshotInfo> listener) {
assert repositoryStateId > RepositoryData.UNKNOWN_REPO_GEN :
"Must finalize based on a valid repository generation but received [" + repositoryStateId + "]";
final Collection<IndexId> indices = shardGenerations.indices();
// Once we are done writing the updated index-N blob we remove the now unreferenced index-${uuid} blobs in each shard
// directory if all nodes are at least at version SnapshotsService#SHARD_GEN_IN_REPO_DATA_VERSION

View File

@ -106,9 +106,8 @@ import static org.elasticsearch.cluster.SnapshotsInProgress.completed;
* <ul>
* <li>On the master node the {@link #createSnapshot(CreateSnapshotRequest, ActionListener)} is called and makes sure that
* no snapshot is currently running and registers the new snapshot in cluster state</li>
* <li>When cluster state is updated
* the {@link #beginSnapshot} method kicks in and initializes
* the snapshot in the repository and then populates list of shards that needs to be snapshotted in cluster state</li>
* <li>When the cluster state is updated the {@link #beginSnapshot} method kicks in and populates the list of shards that need to be
* snapshotted in cluster state</li>
* <li>Each data node is watching for these shards and when new shards scheduled for snapshotting appear in the cluster state, data nodes
* start processing them through {@link SnapshotShardsService#startNewSnapshots} method</li>
* <li>Once shard snapshot is created data node updates state of the shard in the cluster state using
@ -1057,10 +1056,16 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
if (endingSnapshots.add(entry.snapshot()) == false) {
return;
}
final Snapshot snapshot = entry.snapshot();
if (entry.repositoryStateId() == RepositoryData.UNKNOWN_REPO_GEN) {
logger.debug("[{}] was aborted before starting", snapshot);
removeSnapshotFromClusterState(entry.snapshot(), null,
new SnapshotException(snapshot, "Aborted on initialization"));
return;
}
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(new AbstractRunnable() {
@Override
protected void doRun() {
final Snapshot snapshot = entry.snapshot();
final Repository repository = repositoriesService.repository(snapshot.getRepository());
final String failure = entry.failure();
logger.trace("[{}] finalizing snapshot in repository, state: [{}], failure[{}]", snapshot, entry.state(), failure);
@ -1256,12 +1261,15 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
*/
private void deleteSnapshot(final Snapshot snapshot, final ActionListener<Void> listener, final long repositoryStateId,
final boolean immediatePriority) {
logger.info("deleting snapshot [{}]", snapshot);
Priority priority = immediatePriority ? Priority.IMMEDIATE : Priority.NORMAL;
logger.info("deleting snapshot [{}] assuming repository generation [{}] and with priority [{}]",
snapshot, repositoryStateId, priority);
clusterService.submitStateUpdateTask("delete snapshot", new ClusterStateUpdateTask(priority) {
boolean waitForSnapshot = false;
boolean abortedDuringInit = false;
@Override
public ClusterState execute(ClusterState currentState) {
SnapshotDeletionsInProgress deletionsInProgress = currentState.custom(SnapshotDeletionsInProgress.TYPE);
@ -1321,6 +1329,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
shards = snapshotEntry.shards();
assert shards.isEmpty();
failure = "Snapshot was aborted during initialization";
abortedDuringInit = true;
} else if (state == State.STARTED) {
// snapshot is started - mark every non completed shard as aborted
final ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shardsBuilder = ImmutableOpenMap.builder();
@ -1385,19 +1394,21 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
);
},
e -> {
logger.warn("deleted snapshot failed - deleting files", e);
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(() -> {
try {
deleteSnapshot(snapshot.getRepository(), snapshot.getSnapshotId().getName(), listener, true);
} catch (SnapshotMissingException smex) {
logger.info(() -> new ParameterizedMessage(
"Tried deleting in-progress snapshot [{}], but it could not be found after failing to abort.",
smex.getSnapshotName()), e);
listener.onFailure(new SnapshotException(snapshot,
"Tried deleting in-progress snapshot [" + smex.getSnapshotName() + "], but it " +
"could not be found after failing to abort.", smex));
if (abortedDuringInit) {
logger.debug(() -> new ParameterizedMessage("Snapshot [{}] was aborted during INIT", snapshot), e);
listener.onResponse(null);
} else {
if (ExceptionsHelper.unwrap(e, NotMasterException.class, FailedToCommitClusterStateException.class)
!= null) {
logger.warn("master failover before deleted snapshot could complete", e);
// Just pass the exception to the transport handler as is so it is retried on the new master
listener.onFailure(e);
} else {
logger.warn("deleted snapshot failed", e);
listener.onFailure(
new SnapshotMissingException(snapshot.getRepository(), snapshot.getSnapshotId(), e));
}
});
}
}
));
} else {