Fix BwC Snapshot INIT Path (#60006)

There were two subtle bugs here from backporting #56911 to 7.x.

1. We passed `null` for the `shards` map which isn't nullable any longer
when creating `SnapshotsInProgress.Entry`, fixed by just passing an empty map
like the `null` handling did in the past.
2. The removal of a failed `INIT` state snapshot from the cluster state tried
removing it from the finalization loop (the set of repository names that are
currently finalizing). This will trip an assertion since the snapshot failed
before its repository was put into the set. I made the logic ignore the set
in case we remove a failed `INIT` state snapshot to restore the old logic to
exactly as it was before the concurrent snapshots backport to be on the safe
side here.

Also, added tests that explicitly call the old code paths because as can be seen
from initially missing this, the BwC tests will only run in the configuration new
version master, old version nodes ever so often and having a deterministic test
for the old state machine seems the safest bet here.

Closes #59986
This commit is contained in:
Armin Braun 2020-07-22 10:09:55 +02:00 committed by GitHub
parent b210af8389
commit c06c9fb966
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 42 additions and 12 deletions

View File

@ -24,6 +24,7 @@ import com.carrotsearch.hppc.IntSet;
import org.elasticsearch.Version; import org.elasticsearch.Version;
import org.elasticsearch.action.ActionFuture; import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotRequest;
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse; import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse;
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse; import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse; import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
@ -1275,6 +1276,35 @@ public class DedicatedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTest
assertThat(createSnapshotResponse.getSnapshotInfo().state(), is(SnapshotState.PARTIAL)); assertThat(createSnapshotResponse.getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
} }
/**
* Tests for the legacy snapshot path that is normally executed if the cluster contains any nodes older than
* {@link SnapshotsService#NO_REPO_INITIALIZE_VERSION}.
* Makes sure that blocking as well as non-blocking snapshot create paths execute cleanly as well as that error handling works out
* correctly by testing a snapshot name collision.
*/
public void testCreateSnapshotLegacyPath() throws Exception {
final String masterNode = internalCluster().startMasterOnlyNode();
internalCluster().startDataOnlyNode();
final String repoName = "test-repo";
createRepository(repoName, "fs");
createIndex("some-index");
final SnapshotsService snapshotsService = internalCluster().getMasterNodeInstance(SnapshotsService.class);
final Snapshot snapshot1 =
PlainActionFuture.get(f -> snapshotsService.createSnapshotLegacy(new CreateSnapshotRequest(repoName, "snap-1"), f));
awaitNoMoreRunningOperations(masterNode);
final InvalidSnapshotNameException sne = expectThrows(InvalidSnapshotNameException.class,
() -> PlainActionFuture.<SnapshotInfo, Exception>get(
f -> snapshotsService.executeSnapshotLegacy(
new CreateSnapshotRequest(repoName, snapshot1.getSnapshotId().getName()), f)));
assertThat(sne.getMessage(), containsString("snapshot with the same name already exists"));
final SnapshotInfo snapshot2 =
PlainActionFuture.get(f -> snapshotsService.executeSnapshotLegacy(new CreateSnapshotRequest(repoName, "snap-2"), f));
assertThat(snapshot2.state(), is(SnapshotState.SUCCESS));
}
private long calculateTotalFilesSize(List<Path> files) { private long calculateTotalFilesSize(List<Path> files) {
return files.stream().mapToLong(f -> { return files.stream().mapToLong(f -> {
try { try {

View File

@ -276,7 +276,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
dataStreams, dataStreams,
threadPool.absoluteTimeInMillis(), threadPool.absoluteTimeInMillis(),
RepositoryData.UNKNOWN_REPO_GEN, RepositoryData.UNKNOWN_REPO_GEN,
null, ImmutableOpenMap.of(),
userMeta, Version.CURRENT userMeta, Version.CURRENT
); );
initializingSnapshots.add(newEntry.snapshot()); initializingSnapshots.add(newEntry.snapshot());
@ -627,7 +627,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
public void onFailure(String source, Exception e) { public void onFailure(String source, Exception e) {
logger.warn(() -> new ParameterizedMessage("[{}] failed to create snapshot", logger.warn(() -> new ParameterizedMessage("[{}] failed to create snapshot",
snapshot.snapshot().getSnapshotId()), e); snapshot.snapshot().getSnapshotId()), e);
removeFailedSnapshotFromClusterState(snapshot.snapshot(), e, repositoryData, removeFailedSnapshotFromClusterState(snapshot.snapshot(), e, null,
new CleanupAfterErrorListener(userCreateSnapshotListener, e)); new CleanupAfterErrorListener(userCreateSnapshotListener, e));
} }
@ -1424,10 +1424,15 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
* *
* @param snapshot snapshot that failed * @param snapshot snapshot that failed
* @param failure exception that failed the snapshot * @param failure exception that failed the snapshot
* @param repositoryData repository data or {@code null} when cleaning up a BwC snapshot that never fully initialized
* @param listener listener to invoke when done with, only passed by the BwC path that has {@code repositoryData} set to
* {@code null}
*/ */
private void removeFailedSnapshotFromClusterState(Snapshot snapshot, Exception failure, @Nullable RepositoryData repositoryData, private void removeFailedSnapshotFromClusterState(Snapshot snapshot, Exception failure, @Nullable RepositoryData repositoryData,
@Nullable CleanupAfterErrorListener listener) { @Nullable CleanupAfterErrorListener listener) {
assert failure != null : "Failure must be supplied"; assert failure != null : "Failure must be supplied";
assert (listener == null || repositoryData == null) && (listener == null && repositoryData == null) == false :
"Either repository data or a listener but not both must be null but saw [" + listener + "] and [" + repositoryData + "]";
clusterService.submitStateUpdateTask("remove snapshot metadata", new ClusterStateUpdateTask() { clusterService.submitStateUpdateTask("remove snapshot metadata", new ClusterStateUpdateTask() {
@Override @Override
@ -1459,15 +1464,10 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
@Override @Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
failSnapshotCompletionListeners(snapshot, failure); failSnapshotCompletionListeners(snapshot, failure);
if (listener != null) { if (listener == null) {
listener.onFailure(null);
}
// BwC path, null RepositoryData is only set if a snapshot following the pre-7.5 state machine failed to load initial
// RepositoryData
if (repositoryData == null) {
leaveRepoLoop(snapshot.getRepository());
} else {
runNextQueuedOperation(repositoryData, snapshot.getRepository(), true); runNextQueuedOperation(repositoryData, snapshot.getRepository(), true);
} else {
listener.onFailure(null);
} }
} }
}); });