From c06c9fb96669a90e57635fd21df93b4011e4ccc3 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Wed, 22 Jul 2020 10:09:55 +0200 Subject: [PATCH] Fix BwC Snapshot INIT Path (#60006) There were two subtle bugs here from backporting #56911 to 7.x. 1. We passed `null` for the `shards` map which isn't nullable any longer when creating `SnapshotsInProgress.Entry`, fixed by just passing an empty map like the `null` handling did in the past. 2. The removal of a failed `INIT` state snapshot from the cluster state tried removing it from the finalization loop (the set of repository names that are currently finalizing). This will trip an assertion since the snapshot failed before its repository was put into the set. I made the logic ignore the set in case we remove a failed `INIT` state snapshot to restore the old logic to exactly as it was before the concurrent snapshots backport to be on the safe side here. Also, added tests that explicitly call the old code paths because as can be seen from initially missing this, the BwC tests will only run in the configuration new version master, old version nodes ever so often and having a deterministic test for the old state machine seems the safest bet here. Closes #59986 --- .../DedicatedClusterSnapshotRestoreIT.java | 30 +++++++++++++++++++ .../snapshots/SnapshotsService.java | 24 +++++++-------- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java index f76098ca705..df7dfd37b7c 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java @@ -24,6 +24,7 @@ import com.carrotsearch.hppc.IntSet; import org.elasticsearch.Version; import org.elasticsearch.action.ActionFuture; +import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotRequest; import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse; import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse; import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse; @@ -1275,6 +1276,35 @@ public class DedicatedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTest assertThat(createSnapshotResponse.getSnapshotInfo().state(), is(SnapshotState.PARTIAL)); } + /** + * Tests for the legacy snapshot path that is normally executed if the cluster contains any nodes older than + * {@link SnapshotsService#NO_REPO_INITIALIZE_VERSION}. + * Makes sure that blocking as well as non-blocking snapshot create paths execute cleanly as well as that error handling works out + * correctly by testing a snapshot name collision. + */ + public void testCreateSnapshotLegacyPath() throws Exception { + final String masterNode = internalCluster().startMasterOnlyNode(); + internalCluster().startDataOnlyNode(); + final String repoName = "test-repo"; + createRepository(repoName, "fs"); + createIndex("some-index"); + + final SnapshotsService snapshotsService = internalCluster().getMasterNodeInstance(SnapshotsService.class); + final Snapshot snapshot1 = + PlainActionFuture.get(f -> snapshotsService.createSnapshotLegacy(new CreateSnapshotRequest(repoName, "snap-1"), f)); + awaitNoMoreRunningOperations(masterNode); + + final InvalidSnapshotNameException sne = expectThrows(InvalidSnapshotNameException.class, + () -> PlainActionFuture.get( + f -> snapshotsService.executeSnapshotLegacy( + new CreateSnapshotRequest(repoName, snapshot1.getSnapshotId().getName()), f))); + + assertThat(sne.getMessage(), containsString("snapshot with the same name already exists")); + final SnapshotInfo snapshot2 = + PlainActionFuture.get(f -> snapshotsService.executeSnapshotLegacy(new CreateSnapshotRequest(repoName, "snap-2"), f)); + assertThat(snapshot2.state(), is(SnapshotState.SUCCESS)); + } + private long calculateTotalFilesSize(List files) { return files.stream().mapToLong(f -> { try { diff --git a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java index 0de7b007f20..a30fb8b70b4 100644 --- a/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java @@ -276,7 +276,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus dataStreams, threadPool.absoluteTimeInMillis(), RepositoryData.UNKNOWN_REPO_GEN, - null, + ImmutableOpenMap.of(), userMeta, Version.CURRENT ); initializingSnapshots.add(newEntry.snapshot()); @@ -627,7 +627,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus public void onFailure(String source, Exception e) { logger.warn(() -> new ParameterizedMessage("[{}] failed to create snapshot", snapshot.snapshot().getSnapshotId()), e); - removeFailedSnapshotFromClusterState(snapshot.snapshot(), e, repositoryData, + removeFailedSnapshotFromClusterState(snapshot.snapshot(), e, null, new CleanupAfterErrorListener(userCreateSnapshotListener, e)); } @@ -1422,12 +1422,17 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus * used when the snapshot fails for some reason. During normal operation the snapshot repository will remove the * {@link SnapshotsInProgress.Entry} from the cluster state once it's done finalizing the snapshot. * - * @param snapshot snapshot that failed - * @param failure exception that failed the snapshot + * @param snapshot snapshot that failed + * @param failure exception that failed the snapshot + * @param repositoryData repository data or {@code null} when cleaning up a BwC snapshot that never fully initialized + * @param listener listener to invoke when done with, only passed by the BwC path that has {@code repositoryData} set to + * {@code null} */ private void removeFailedSnapshotFromClusterState(Snapshot snapshot, Exception failure, @Nullable RepositoryData repositoryData, @Nullable CleanupAfterErrorListener listener) { assert failure != null : "Failure must be supplied"; + assert (listener == null || repositoryData == null) && (listener == null && repositoryData == null) == false : + "Either repository data or a listener but not both must be null but saw [" + listener + "] and [" + repositoryData + "]"; clusterService.submitStateUpdateTask("remove snapshot metadata", new ClusterStateUpdateTask() { @Override @@ -1459,15 +1464,10 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus @Override public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { failSnapshotCompletionListeners(snapshot, failure); - if (listener != null) { - listener.onFailure(null); - } - // BwC path, null RepositoryData is only set if a snapshot following the pre-7.5 state machine failed to load initial - // RepositoryData - if (repositoryData == null) { - leaveRepoLoop(snapshot.getRepository()); - } else { + if (listener == null) { runNextQueuedOperation(repositoryData, snapshot.getRepository(), true); + } else { + listener.onFailure(null); } } });