Fix BwC Snapshot INIT Path (#60006)
There were two subtle bugs here from backporting #56911 to 7.x. 1. We passed `null` for the `shards` map which isn't nullable any longer when creating `SnapshotsInProgress.Entry`, fixed by just passing an empty map like the `null` handling did in the past. 2. The removal of a failed `INIT` state snapshot from the cluster state tried removing it from the finalization loop (the set of repository names that are currently finalizing). This will trip an assertion since the snapshot failed before its repository was put into the set. I made the logic ignore the set in case we remove a failed `INIT` state snapshot to restore the old logic to exactly as it was before the concurrent snapshots backport to be on the safe side here. Also, added tests that explicitly call the old code paths because as can be seen from initially missing this, the BwC tests will only run in the configuration new version master, old version nodes ever so often and having a deterministic test for the old state machine seems the safest bet here. Closes #59986
This commit is contained in:
parent
b210af8389
commit
c06c9fb966
|
@ -24,6 +24,7 @@ import com.carrotsearch.hppc.IntSet;
|
||||||
|
|
||||||
import org.elasticsearch.Version;
|
import org.elasticsearch.Version;
|
||||||
import org.elasticsearch.action.ActionFuture;
|
import org.elasticsearch.action.ActionFuture;
|
||||||
|
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotRequest;
|
||||||
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse;
|
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse;
|
||||||
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
|
import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
|
||||||
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
|
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
|
||||||
|
@ -1275,6 +1276,35 @@ public class DedicatedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTest
|
||||||
assertThat(createSnapshotResponse.getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
|
assertThat(createSnapshotResponse.getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for the legacy snapshot path that is normally executed if the cluster contains any nodes older than
|
||||||
|
* {@link SnapshotsService#NO_REPO_INITIALIZE_VERSION}.
|
||||||
|
* Makes sure that blocking as well as non-blocking snapshot create paths execute cleanly as well as that error handling works out
|
||||||
|
* correctly by testing a snapshot name collision.
|
||||||
|
*/
|
||||||
|
public void testCreateSnapshotLegacyPath() throws Exception {
|
||||||
|
final String masterNode = internalCluster().startMasterOnlyNode();
|
||||||
|
internalCluster().startDataOnlyNode();
|
||||||
|
final String repoName = "test-repo";
|
||||||
|
createRepository(repoName, "fs");
|
||||||
|
createIndex("some-index");
|
||||||
|
|
||||||
|
final SnapshotsService snapshotsService = internalCluster().getMasterNodeInstance(SnapshotsService.class);
|
||||||
|
final Snapshot snapshot1 =
|
||||||
|
PlainActionFuture.get(f -> snapshotsService.createSnapshotLegacy(new CreateSnapshotRequest(repoName, "snap-1"), f));
|
||||||
|
awaitNoMoreRunningOperations(masterNode);
|
||||||
|
|
||||||
|
final InvalidSnapshotNameException sne = expectThrows(InvalidSnapshotNameException.class,
|
||||||
|
() -> PlainActionFuture.<SnapshotInfo, Exception>get(
|
||||||
|
f -> snapshotsService.executeSnapshotLegacy(
|
||||||
|
new CreateSnapshotRequest(repoName, snapshot1.getSnapshotId().getName()), f)));
|
||||||
|
|
||||||
|
assertThat(sne.getMessage(), containsString("snapshot with the same name already exists"));
|
||||||
|
final SnapshotInfo snapshot2 =
|
||||||
|
PlainActionFuture.get(f -> snapshotsService.executeSnapshotLegacy(new CreateSnapshotRequest(repoName, "snap-2"), f));
|
||||||
|
assertThat(snapshot2.state(), is(SnapshotState.SUCCESS));
|
||||||
|
}
|
||||||
|
|
||||||
private long calculateTotalFilesSize(List<Path> files) {
|
private long calculateTotalFilesSize(List<Path> files) {
|
||||||
return files.stream().mapToLong(f -> {
|
return files.stream().mapToLong(f -> {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -276,7 +276,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
||||||
dataStreams,
|
dataStreams,
|
||||||
threadPool.absoluteTimeInMillis(),
|
threadPool.absoluteTimeInMillis(),
|
||||||
RepositoryData.UNKNOWN_REPO_GEN,
|
RepositoryData.UNKNOWN_REPO_GEN,
|
||||||
null,
|
ImmutableOpenMap.of(),
|
||||||
userMeta, Version.CURRENT
|
userMeta, Version.CURRENT
|
||||||
);
|
);
|
||||||
initializingSnapshots.add(newEntry.snapshot());
|
initializingSnapshots.add(newEntry.snapshot());
|
||||||
|
@ -627,7 +627,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
||||||
public void onFailure(String source, Exception e) {
|
public void onFailure(String source, Exception e) {
|
||||||
logger.warn(() -> new ParameterizedMessage("[{}] failed to create snapshot",
|
logger.warn(() -> new ParameterizedMessage("[{}] failed to create snapshot",
|
||||||
snapshot.snapshot().getSnapshotId()), e);
|
snapshot.snapshot().getSnapshotId()), e);
|
||||||
removeFailedSnapshotFromClusterState(snapshot.snapshot(), e, repositoryData,
|
removeFailedSnapshotFromClusterState(snapshot.snapshot(), e, null,
|
||||||
new CleanupAfterErrorListener(userCreateSnapshotListener, e));
|
new CleanupAfterErrorListener(userCreateSnapshotListener, e));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1424,10 +1424,15 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
||||||
*
|
*
|
||||||
* @param snapshot snapshot that failed
|
* @param snapshot snapshot that failed
|
||||||
* @param failure exception that failed the snapshot
|
* @param failure exception that failed the snapshot
|
||||||
|
* @param repositoryData repository data or {@code null} when cleaning up a BwC snapshot that never fully initialized
|
||||||
|
* @param listener listener to invoke when done with, only passed by the BwC path that has {@code repositoryData} set to
|
||||||
|
* {@code null}
|
||||||
*/
|
*/
|
||||||
private void removeFailedSnapshotFromClusterState(Snapshot snapshot, Exception failure, @Nullable RepositoryData repositoryData,
|
private void removeFailedSnapshotFromClusterState(Snapshot snapshot, Exception failure, @Nullable RepositoryData repositoryData,
|
||||||
@Nullable CleanupAfterErrorListener listener) {
|
@Nullable CleanupAfterErrorListener listener) {
|
||||||
assert failure != null : "Failure must be supplied";
|
assert failure != null : "Failure must be supplied";
|
||||||
|
assert (listener == null || repositoryData == null) && (listener == null && repositoryData == null) == false :
|
||||||
|
"Either repository data or a listener but not both must be null but saw [" + listener + "] and [" + repositoryData + "]";
|
||||||
clusterService.submitStateUpdateTask("remove snapshot metadata", new ClusterStateUpdateTask() {
|
clusterService.submitStateUpdateTask("remove snapshot metadata", new ClusterStateUpdateTask() {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -1459,15 +1464,10 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
||||||
@Override
|
@Override
|
||||||
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
|
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
|
||||||
failSnapshotCompletionListeners(snapshot, failure);
|
failSnapshotCompletionListeners(snapshot, failure);
|
||||||
if (listener != null) {
|
if (listener == null) {
|
||||||
listener.onFailure(null);
|
|
||||||
}
|
|
||||||
// BwC path, null RepositoryData is only set if a snapshot following the pre-7.5 state machine failed to load initial
|
|
||||||
// RepositoryData
|
|
||||||
if (repositoryData == null) {
|
|
||||||
leaveRepoLoop(snapshot.getRepository());
|
|
||||||
} else {
|
|
||||||
runNextQueuedOperation(repositoryData, snapshot.getRepository(), true);
|
runNextQueuedOperation(repositoryData, snapshot.getRepository(), true);
|
||||||
|
} else {
|
||||||
|
listener.onFailure(null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
Loading…
Reference in New Issue