mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-09 22:45:04 +00:00
Fix Concurrent Snapshot Ending And Stabilize Snapshot Finalization (#38368)
* The problem in #38226 is that in some corner cases multiple calls to `endSnapshot` were made concurrently, leading to non-deterministic behavior (`beginSnapshot` was triggering a repository finalization while one that was triggered by a `deleteSnapshot` was already in progress) * Fixed by: * Making all `endSnapshot` calls originate from the cluster state being in a "completed" state (apart from on short-circuit on initializing an empty snapshot). This forced putting the failure string into `SnapshotsInProgress.Entry`. * Adding deduplication logic to `endSnapshot` * Also: * Streamlined the init behavior to work the same way (keep state on the `SnapshotsService` to decide which snapshot entries are stale) * closes #38226
This commit is contained in:
parent
d862453d68
commit
2f6afd290e
@ -24,6 +24,7 @@ import com.carrotsearch.hppc.cursors.ObjectCursor;
|
|||||||
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
|
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
|
||||||
import org.elasticsearch.Version;
|
import org.elasticsearch.Version;
|
||||||
import org.elasticsearch.cluster.ClusterState.Custom;
|
import org.elasticsearch.cluster.ClusterState.Custom;
|
||||||
|
import org.elasticsearch.common.Nullable;
|
||||||
import org.elasticsearch.common.collect.ImmutableOpenMap;
|
import org.elasticsearch.common.collect.ImmutableOpenMap;
|
||||||
import org.elasticsearch.common.io.stream.StreamInput;
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||||
@ -87,9 +88,11 @@ public class SnapshotsInProgress extends AbstractNamedDiffable<Custom> implement
|
|||||||
private final ImmutableOpenMap<String, List<ShardId>> waitingIndices;
|
private final ImmutableOpenMap<String, List<ShardId>> waitingIndices;
|
||||||
private final long startTime;
|
private final long startTime;
|
||||||
private final long repositoryStateId;
|
private final long repositoryStateId;
|
||||||
|
@Nullable private final String failure;
|
||||||
|
|
||||||
public Entry(Snapshot snapshot, boolean includeGlobalState, boolean partial, State state, List<IndexId> indices,
|
public Entry(Snapshot snapshot, boolean includeGlobalState, boolean partial, State state, List<IndexId> indices,
|
||||||
long startTime, long repositoryStateId, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
|
long startTime, long repositoryStateId, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards,
|
||||||
|
String failure) {
|
||||||
this.state = state;
|
this.state = state;
|
||||||
this.snapshot = snapshot;
|
this.snapshot = snapshot;
|
||||||
this.includeGlobalState = includeGlobalState;
|
this.includeGlobalState = includeGlobalState;
|
||||||
@ -104,15 +107,26 @@ public class SnapshotsInProgress extends AbstractNamedDiffable<Custom> implement
|
|||||||
this.waitingIndices = findWaitingIndices(shards);
|
this.waitingIndices = findWaitingIndices(shards);
|
||||||
}
|
}
|
||||||
this.repositoryStateId = repositoryStateId;
|
this.repositoryStateId = repositoryStateId;
|
||||||
|
this.failure = failure;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Entry(Snapshot snapshot, boolean includeGlobalState, boolean partial, State state, List<IndexId> indices,
|
||||||
|
long startTime, long repositoryStateId, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
|
||||||
|
this(snapshot, includeGlobalState, partial, state, indices, startTime, repositoryStateId, shards, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Entry(Entry entry, State state, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
|
public Entry(Entry entry, State state, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
|
||||||
this(entry.snapshot, entry.includeGlobalState, entry.partial, state, entry.indices, entry.startTime,
|
this(entry.snapshot, entry.includeGlobalState, entry.partial, state, entry.indices, entry.startTime,
|
||||||
entry.repositoryStateId, shards);
|
entry.repositoryStateId, shards, entry.failure);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Entry(Entry entry, State state, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards, String failure) {
|
||||||
|
this(entry.snapshot, entry.includeGlobalState, entry.partial, state, entry.indices, entry.startTime,
|
||||||
|
entry.repositoryStateId, shards, failure);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Entry(Entry entry, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
|
public Entry(Entry entry, ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards) {
|
||||||
this(entry, entry.state, shards);
|
this(entry, entry.state, shards, entry.failure);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Snapshot snapshot() {
|
public Snapshot snapshot() {
|
||||||
@ -151,6 +165,10 @@ public class SnapshotsInProgress extends AbstractNamedDiffable<Custom> implement
|
|||||||
return repositoryStateId;
|
return repositoryStateId;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String failure() {
|
||||||
|
return failure;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
if (this == o) return true;
|
if (this == o) return true;
|
||||||
@ -427,6 +445,12 @@ public class SnapshotsInProgress extends AbstractNamedDiffable<Custom> implement
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
long repositoryStateId = in.readLong();
|
long repositoryStateId = in.readLong();
|
||||||
|
final String failure;
|
||||||
|
if (in.getVersion().onOrAfter(Version.V_7_0_0)) {
|
||||||
|
failure = in.readOptionalString();
|
||||||
|
} else {
|
||||||
|
failure = null;
|
||||||
|
}
|
||||||
entries[i] = new Entry(snapshot,
|
entries[i] = new Entry(snapshot,
|
||||||
includeGlobalState,
|
includeGlobalState,
|
||||||
partial,
|
partial,
|
||||||
@ -434,7 +458,8 @@ public class SnapshotsInProgress extends AbstractNamedDiffable<Custom> implement
|
|||||||
Collections.unmodifiableList(indexBuilder),
|
Collections.unmodifiableList(indexBuilder),
|
||||||
startTime,
|
startTime,
|
||||||
repositoryStateId,
|
repositoryStateId,
|
||||||
builder.build());
|
builder.build(),
|
||||||
|
failure);
|
||||||
}
|
}
|
||||||
this.entries = Arrays.asList(entries);
|
this.entries = Arrays.asList(entries);
|
||||||
}
|
}
|
||||||
@ -463,6 +488,9 @@ public class SnapshotsInProgress extends AbstractNamedDiffable<Custom> implement
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
out.writeLong(entry.repositoryStateId);
|
out.writeLong(entry.repositoryStateId);
|
||||||
|
if (out.getVersion().onOrAfter(Version.V_7_0_0)) {
|
||||||
|
out.writeOptionalString(entry.failure);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -591,8 +591,6 @@ public class SnapshotShardsService extends AbstractLifecycleComponent implements
|
|||||||
// TODO: Add PARTIAL_SUCCESS status?
|
// TODO: Add PARTIAL_SUCCESS status?
|
||||||
SnapshotsInProgress.Entry updatedEntry = new SnapshotsInProgress.Entry(entry, State.SUCCESS, shards.build());
|
SnapshotsInProgress.Entry updatedEntry = new SnapshotsInProgress.Entry(entry, State.SUCCESS, shards.build());
|
||||||
entries.add(updatedEntry);
|
entries.add(updatedEntry);
|
||||||
// Finalize snapshot in the repository
|
|
||||||
snapshotsService.endSnapshot(updatedEntry);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
entries.add(entry);
|
entries.add(entry);
|
||||||
|
@ -83,7 +83,9 @@ import java.util.Set;
|
|||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.CopyOnWriteArrayList;
|
import java.util.concurrent.CopyOnWriteArrayList;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import static java.util.Collections.unmodifiableList;
|
||||||
import static java.util.Collections.unmodifiableMap;
|
import static java.util.Collections.unmodifiableMap;
|
||||||
import static org.elasticsearch.cluster.SnapshotsInProgress.completed;
|
import static org.elasticsearch.cluster.SnapshotsInProgress.completed;
|
||||||
|
|
||||||
@ -121,6 +123,12 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
|
|
||||||
private final Map<Snapshot, List<ActionListener<SnapshotInfo>>> snapshotCompletionListeners = new ConcurrentHashMap<>();
|
private final Map<Snapshot, List<ActionListener<SnapshotInfo>>> snapshotCompletionListeners = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
|
// Set of snapshots that are currently being initialized by this node
|
||||||
|
private final Set<Snapshot> initializingSnapshots = Collections.synchronizedSet(new HashSet<>());
|
||||||
|
|
||||||
|
// Set of snapshots that are currently being ended by this node
|
||||||
|
private final Set<Snapshot> endingSnapshots = Collections.synchronizedSet(new HashSet<>());
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public SnapshotsService(Settings settings, ClusterService clusterService, IndexNameExpressionResolver indexNameExpressionResolver,
|
public SnapshotsService(Settings settings, ClusterService clusterService, IndexNameExpressionResolver indexNameExpressionResolver,
|
||||||
RepositoriesService repositoriesService, ThreadPool threadPool) {
|
RepositoriesService repositoriesService, ThreadPool threadPool) {
|
||||||
@ -207,7 +215,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
}
|
}
|
||||||
final ArrayList<SnapshotInfo> snapshotList = new ArrayList<>(snapshotSet);
|
final ArrayList<SnapshotInfo> snapshotList = new ArrayList<>(snapshotSet);
|
||||||
CollectionUtil.timSort(snapshotList);
|
CollectionUtil.timSort(snapshotList);
|
||||||
return Collections.unmodifiableList(snapshotList);
|
return unmodifiableList(snapshotList);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -223,7 +231,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
snapshotList.add(inProgressSnapshot(entry));
|
snapshotList.add(inProgressSnapshot(entry));
|
||||||
}
|
}
|
||||||
CollectionUtil.timSort(snapshotList);
|
CollectionUtil.timSort(snapshotList);
|
||||||
return Collections.unmodifiableList(snapshotList);
|
return unmodifiableList(snapshotList);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -280,6 +288,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
System.currentTimeMillis(),
|
System.currentTimeMillis(),
|
||||||
repositoryData.getGenId(),
|
repositoryData.getGenId(),
|
||||||
null);
|
null);
|
||||||
|
initializingSnapshots.add(newSnapshot.snapshot());
|
||||||
snapshots = new SnapshotsInProgress(newSnapshot);
|
snapshots = new SnapshotsInProgress(newSnapshot);
|
||||||
} else {
|
} else {
|
||||||
throw new ConcurrentSnapshotExecutionException(repositoryName, snapshotName, " a snapshot is already running");
|
throw new ConcurrentSnapshotExecutionException(repositoryName, snapshotName, " a snapshot is already running");
|
||||||
@ -290,6 +299,9 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
@Override
|
@Override
|
||||||
public void onFailure(String source, Exception e) {
|
public void onFailure(String source, Exception e) {
|
||||||
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to create snapshot", repositoryName, snapshotName), e);
|
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to create snapshot", repositoryName, snapshotName), e);
|
||||||
|
if (newSnapshot != null) {
|
||||||
|
initializingSnapshots.remove(newSnapshot.snapshot());
|
||||||
|
}
|
||||||
newSnapshot = null;
|
newSnapshot = null;
|
||||||
listener.onFailure(e);
|
listener.onFailure(e);
|
||||||
}
|
}
|
||||||
@ -297,7 +309,21 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
@Override
|
@Override
|
||||||
public void clusterStateProcessed(String source, ClusterState oldState, final ClusterState newState) {
|
public void clusterStateProcessed(String source, ClusterState oldState, final ClusterState newState) {
|
||||||
if (newSnapshot != null) {
|
if (newSnapshot != null) {
|
||||||
beginSnapshot(newState, newSnapshot, request.partial(), listener);
|
final Snapshot current = newSnapshot.snapshot();
|
||||||
|
assert initializingSnapshots.contains(current);
|
||||||
|
beginSnapshot(newState, newSnapshot, request.partial(), new ActionListener<Snapshot>() {
|
||||||
|
@Override
|
||||||
|
public void onResponse(final Snapshot snapshot) {
|
||||||
|
initializingSnapshots.remove(snapshot);
|
||||||
|
listener.onResponse(snapshot);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onFailure(final Exception e) {
|
||||||
|
initializingSnapshots.remove(current);
|
||||||
|
listener.onFailure(e);
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -370,6 +396,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void doRun() {
|
protected void doRun() {
|
||||||
|
assert initializingSnapshots.contains(snapshot.snapshot());
|
||||||
Repository repository = repositoriesService.repository(snapshot.snapshot().getRepository());
|
Repository repository = repositoriesService.repository(snapshot.snapshot().getRepository());
|
||||||
|
|
||||||
MetaData metaData = clusterState.metaData();
|
MetaData metaData = clusterState.metaData();
|
||||||
@ -394,9 +421,6 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
}
|
}
|
||||||
clusterService.submitStateUpdateTask("update_snapshot [" + snapshot.snapshot() + "]", new ClusterStateUpdateTask() {
|
clusterService.submitStateUpdateTask("update_snapshot [" + snapshot.snapshot() + "]", new ClusterStateUpdateTask() {
|
||||||
|
|
||||||
SnapshotsInProgress.Entry endSnapshot;
|
|
||||||
String failure;
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ClusterState execute(ClusterState currentState) {
|
public ClusterState execute(ClusterState currentState) {
|
||||||
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
|
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
|
||||||
@ -407,19 +431,18 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.state() != State.ABORTED) {
|
if (entry.state() == State.ABORTED) {
|
||||||
// Replace the snapshot that was just intialized
|
entries.add(entry);
|
||||||
ImmutableOpenMap<ShardId, SnapshotsInProgress.ShardSnapshotStatus> shards =
|
} else {
|
||||||
shards(currentState, entry.indices());
|
// Replace the snapshot that was just initialized
|
||||||
|
ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards =
|
||||||
|
shards(currentState, entry.indices());
|
||||||
if (!partial) {
|
if (!partial) {
|
||||||
Tuple<Set<String>, Set<String>> indicesWithMissingShards = indicesWithMissingShards(shards,
|
Tuple<Set<String>, Set<String>> indicesWithMissingShards = indicesWithMissingShards(shards,
|
||||||
currentState.metaData());
|
currentState.metaData());
|
||||||
Set<String> missing = indicesWithMissingShards.v1();
|
Set<String> missing = indicesWithMissingShards.v1();
|
||||||
Set<String> closed = indicesWithMissingShards.v2();
|
Set<String> closed = indicesWithMissingShards.v2();
|
||||||
if (missing.isEmpty() == false || closed.isEmpty() == false) {
|
if (missing.isEmpty() == false || closed.isEmpty() == false) {
|
||||||
endSnapshot = new SnapshotsInProgress.Entry(entry, State.FAILED, shards);
|
|
||||||
entries.add(endSnapshot);
|
|
||||||
|
|
||||||
final StringBuilder failureMessage = new StringBuilder();
|
final StringBuilder failureMessage = new StringBuilder();
|
||||||
if (missing.isEmpty() == false) {
|
if (missing.isEmpty() == false) {
|
||||||
failureMessage.append("Indices don't have primary shards ");
|
failureMessage.append("Indices don't have primary shards ");
|
||||||
@ -432,24 +455,15 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
failureMessage.append("Indices are closed ");
|
failureMessage.append("Indices are closed ");
|
||||||
failureMessage.append(closed);
|
failureMessage.append(closed);
|
||||||
}
|
}
|
||||||
failure = failureMessage.toString();
|
entries.add(new SnapshotsInProgress.Entry(entry, State.FAILED, shards, failureMessage.toString()));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
SnapshotsInProgress.Entry updatedSnapshot = new SnapshotsInProgress.Entry(entry, State.STARTED, shards);
|
entries.add(new SnapshotsInProgress.Entry(entry, State.STARTED, shards));
|
||||||
entries.add(updatedSnapshot);
|
|
||||||
if (completed(shards.values())) {
|
|
||||||
endSnapshot = updatedSnapshot;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
assert entry.state() == State.ABORTED : "expecting snapshot to be aborted during initialization";
|
|
||||||
failure = "snapshot was aborted during initialization";
|
|
||||||
endSnapshot = entry;
|
|
||||||
entries.add(endSnapshot);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ClusterState.builder(currentState)
|
return ClusterState.builder(currentState)
|
||||||
.putCustom(SnapshotsInProgress.TYPE, new SnapshotsInProgress(Collections.unmodifiableList(entries)))
|
.putCustom(SnapshotsInProgress.TYPE, new SnapshotsInProgress(unmodifiableList(entries)))
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -477,14 +491,6 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
// completion listener in this method. For the snapshot completion to work properly, the snapshot
|
// completion listener in this method. For the snapshot completion to work properly, the snapshot
|
||||||
// should still exist when listener is registered.
|
// should still exist when listener is registered.
|
||||||
userCreateSnapshotListener.onResponse(snapshot.snapshot());
|
userCreateSnapshotListener.onResponse(snapshot.snapshot());
|
||||||
|
|
||||||
// Now that snapshot completion listener is registered we can end the snapshot if needed
|
|
||||||
// We should end snapshot only if 1) we didn't accept it for processing (which happens when there
|
|
||||||
// is nothing to do) and 2) there was a snapshot in metadata that we should end. Otherwise we should
|
|
||||||
// go ahead and continue working on this snapshot rather then end here.
|
|
||||||
if (endSnapshot != null) {
|
|
||||||
endSnapshot(endSnapshot, failure);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -552,7 +558,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private SnapshotInfo inProgressSnapshot(SnapshotsInProgress.Entry entry) {
|
private static SnapshotInfo inProgressSnapshot(SnapshotsInProgress.Entry entry) {
|
||||||
return new SnapshotInfo(entry.snapshot().getSnapshotId(),
|
return new SnapshotInfo(entry.snapshot().getSnapshotId(),
|
||||||
entry.indices().stream().map(IndexId::getName).collect(Collectors.toList()),
|
entry.indices().stream().map(IndexId::getName).collect(Collectors.toList()),
|
||||||
entry.startTime(), entry.includeGlobalState());
|
entry.startTime(), entry.includeGlobalState());
|
||||||
@ -610,7 +616,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
builder.add(entry);
|
builder.add(entry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return Collections.unmodifiableList(builder);
|
return unmodifiableList(builder);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -666,7 +672,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
return unmodifiableMap(shardStatus);
|
return unmodifiableMap(shardStatus);
|
||||||
}
|
}
|
||||||
|
|
||||||
private SnapshotShardFailure findShardFailure(List<SnapshotShardFailure> shardFailures, ShardId shardId) {
|
private static SnapshotShardFailure findShardFailure(List<SnapshotShardFailure> shardFailures, ShardId shardId) {
|
||||||
for (SnapshotShardFailure shardFailure : shardFailures) {
|
for (SnapshotShardFailure shardFailure : shardFailures) {
|
||||||
if (shardId.getIndexName().equals(shardFailure.index()) && shardId.getId() == shardFailure.shardId()) {
|
if (shardId.getIndexName().equals(shardFailure.index()) && shardId.getId() == shardFailure.shardId()) {
|
||||||
return shardFailure;
|
return shardFailure;
|
||||||
@ -680,14 +686,27 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
try {
|
try {
|
||||||
if (event.localNodeMaster()) {
|
if (event.localNodeMaster()) {
|
||||||
// We don't remove old master when master flips anymore. So, we need to check for change in master
|
// We don't remove old master when master flips anymore. So, we need to check for change in master
|
||||||
if (event.nodesRemoved() || event.previousState().nodes().isLocalNodeElectedMaster() == false) {
|
final SnapshotsInProgress snapshotsInProgress = event.state().custom(SnapshotsInProgress.TYPE);
|
||||||
processSnapshotsOnRemovedNodes(event);
|
if (snapshotsInProgress != null) {
|
||||||
|
if (removedNodesCleanupNeeded(snapshotsInProgress, event.nodesDelta().removedNodes())) {
|
||||||
|
processSnapshotsOnRemovedNodes();
|
||||||
|
}
|
||||||
|
if (event.routingTableChanged() && waitingShardsStartedOrUnassigned(snapshotsInProgress, event)) {
|
||||||
|
processStartedShards();
|
||||||
|
}
|
||||||
|
// Cleanup all snapshots that have no more work left:
|
||||||
|
// 1. Completed snapshots
|
||||||
|
// 2. Snapshots in state INIT that the previous master failed to start
|
||||||
|
// 3. Snapshots in any other state that have all their shard tasks completed
|
||||||
|
snapshotsInProgress.entries().stream().filter(
|
||||||
|
entry -> entry.state().completed()
|
||||||
|
|| entry.state() == State.INIT && initializingSnapshots.contains(entry.snapshot()) == false
|
||||||
|
|| entry.state() != State.INIT && completed(entry.shards().values())
|
||||||
|
).forEach(this::endSnapshot);
|
||||||
}
|
}
|
||||||
if (event.routingTableChanged()) {
|
if (event.previousState().nodes().isLocalNodeElectedMaster() == false) {
|
||||||
processStartedShards(event);
|
finalizeSnapshotDeletionFromPreviousMaster(event);
|
||||||
}
|
}
|
||||||
removeFinishedSnapshotFromClusterState(event);
|
|
||||||
finalizeSnapshotDeletionFromPreviousMaster(event);
|
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.warn("Failed to update snapshot state ", e);
|
logger.warn("Failed to update snapshot state ", e);
|
||||||
@ -706,166 +725,134 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
* snapshot was deleted and a call to GET snapshots would reveal that the snapshot no longer exists.
|
* snapshot was deleted and a call to GET snapshots would reveal that the snapshot no longer exists.
|
||||||
*/
|
*/
|
||||||
private void finalizeSnapshotDeletionFromPreviousMaster(ClusterChangedEvent event) {
|
private void finalizeSnapshotDeletionFromPreviousMaster(ClusterChangedEvent event) {
|
||||||
if (event.localNodeMaster() && event.previousState().nodes().isLocalNodeElectedMaster() == false) {
|
SnapshotDeletionsInProgress deletionsInProgress = event.state().custom(SnapshotDeletionsInProgress.TYPE);
|
||||||
SnapshotDeletionsInProgress deletionsInProgress = event.state().custom(SnapshotDeletionsInProgress.TYPE);
|
if (deletionsInProgress != null && deletionsInProgress.hasDeletionsInProgress()) {
|
||||||
if (deletionsInProgress != null && deletionsInProgress.hasDeletionsInProgress()) {
|
assert deletionsInProgress.getEntries().size() == 1 : "only one in-progress deletion allowed per cluster";
|
||||||
assert deletionsInProgress.getEntries().size() == 1 : "only one in-progress deletion allowed per cluster";
|
SnapshotDeletionsInProgress.Entry entry = deletionsInProgress.getEntries().get(0);
|
||||||
SnapshotDeletionsInProgress.Entry entry = deletionsInProgress.getEntries().get(0);
|
deleteSnapshotFromRepository(entry.getSnapshot(), null, entry.getRepositoryStateId());
|
||||||
deleteSnapshotFromRepository(entry.getSnapshot(), null, entry.getRepositoryStateId());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Removes a finished snapshot from the cluster state. This can happen if the previous
|
|
||||||
* master node processed a cluster state update that marked the snapshot as finished,
|
|
||||||
* but the previous master node died before removing the snapshot in progress from the
|
|
||||||
* cluster state. It is then the responsibility of the new master node to end the
|
|
||||||
* snapshot and remove it from the cluster state.
|
|
||||||
*/
|
|
||||||
private void removeFinishedSnapshotFromClusterState(ClusterChangedEvent event) {
|
|
||||||
if (event.localNodeMaster() && !event.previousState().nodes().isLocalNodeElectedMaster()) {
|
|
||||||
SnapshotsInProgress snapshotsInProgress = event.state().custom(SnapshotsInProgress.TYPE);
|
|
||||||
if (snapshotsInProgress != null && !snapshotsInProgress.entries().isEmpty()) {
|
|
||||||
for (SnapshotsInProgress.Entry entry : snapshotsInProgress.entries()) {
|
|
||||||
if (entry.state().completed()) {
|
|
||||||
endSnapshot(entry);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleans up shard snapshots that were running on removed nodes
|
* Cleans up shard snapshots that were running on removed nodes
|
||||||
*
|
|
||||||
* @param event cluster changed event
|
|
||||||
*/
|
*/
|
||||||
private void processSnapshotsOnRemovedNodes(ClusterChangedEvent event) {
|
private void processSnapshotsOnRemovedNodes() {
|
||||||
if (removedNodesCleanupNeeded(event)) {
|
clusterService.submitStateUpdateTask("update snapshot state after node removal", new ClusterStateUpdateTask() {
|
||||||
// Check if we just became the master
|
@Override
|
||||||
final boolean newMaster = !event.previousState().nodes().isLocalNodeElectedMaster();
|
public ClusterState execute(ClusterState currentState) {
|
||||||
clusterService.submitStateUpdateTask("update snapshot state after node removal", new ClusterStateUpdateTask() {
|
DiscoveryNodes nodes = currentState.nodes();
|
||||||
@Override
|
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
|
||||||
public ClusterState execute(ClusterState currentState) throws Exception {
|
if (snapshots == null) {
|
||||||
DiscoveryNodes nodes = currentState.nodes();
|
return currentState;
|
||||||
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
|
}
|
||||||
if (snapshots == null) {
|
boolean changed = false;
|
||||||
return currentState;
|
ArrayList<SnapshotsInProgress.Entry> entries = new ArrayList<>();
|
||||||
|
for (final SnapshotsInProgress.Entry snapshot : snapshots.entries()) {
|
||||||
|
SnapshotsInProgress.Entry updatedSnapshot = snapshot;
|
||||||
|
if (snapshot.state() == State.STARTED || snapshot.state() == State.ABORTED) {
|
||||||
|
ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shards = ImmutableOpenMap.builder();
|
||||||
|
boolean snapshotChanged = false;
|
||||||
|
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardEntry : snapshot.shards()) {
|
||||||
|
ShardSnapshotStatus shardStatus = shardEntry.value;
|
||||||
|
if (!shardStatus.state().completed() && shardStatus.nodeId() != null) {
|
||||||
|
if (nodes.nodeExists(shardStatus.nodeId())) {
|
||||||
|
shards.put(shardEntry.key, shardEntry.value);
|
||||||
|
} else {
|
||||||
|
// TODO: Restart snapshot on another node?
|
||||||
|
snapshotChanged = true;
|
||||||
|
logger.warn("failing snapshot of shard [{}] on closed node [{}]",
|
||||||
|
shardEntry.key, shardStatus.nodeId());
|
||||||
|
shards.put(shardEntry.key,
|
||||||
|
new ShardSnapshotStatus(shardStatus.nodeId(), State.FAILED, "node shutdown"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (snapshotChanged) {
|
||||||
|
changed = true;
|
||||||
|
ImmutableOpenMap<ShardId, ShardSnapshotStatus> shardsMap = shards.build();
|
||||||
|
if (!snapshot.state().completed() && completed(shardsMap.values())) {
|
||||||
|
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, State.SUCCESS, shardsMap);
|
||||||
|
} else {
|
||||||
|
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, snapshot.state(), shardsMap);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
entries.add(updatedSnapshot);
|
||||||
|
} else if (snapshot.state() == State.INIT && initializingSnapshots.contains(snapshot.snapshot()) == false) {
|
||||||
|
changed = true;
|
||||||
|
// Mark the snapshot as aborted as it failed to start from the previous master
|
||||||
|
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, State.ABORTED, snapshot.shards());
|
||||||
|
entries.add(updatedSnapshot);
|
||||||
|
|
||||||
|
// Clean up the snapshot that failed to start from the old master
|
||||||
|
deleteSnapshot(snapshot.snapshot(), new ActionListener<Void>() {
|
||||||
|
@Override
|
||||||
|
public void onResponse(Void aVoid) {
|
||||||
|
logger.debug("cleaned up abandoned snapshot {} in INIT state", snapshot.snapshot());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onFailure(Exception e) {
|
||||||
|
logger.warn("failed to clean up abandoned snapshot {} in INIT state", snapshot.snapshot());
|
||||||
|
}
|
||||||
|
}, updatedSnapshot.getRepositoryStateId(), false);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (changed) {
|
||||||
|
return ClusterState.builder(currentState)
|
||||||
|
.putCustom(SnapshotsInProgress.TYPE, new SnapshotsInProgress(unmodifiableList(entries))).build();
|
||||||
|
}
|
||||||
|
return currentState;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onFailure(String source, Exception e) {
|
||||||
|
logger.warn("failed to update snapshot state after node removal");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processStartedShards() {
|
||||||
|
clusterService.submitStateUpdateTask("update snapshot state after shards started", new ClusterStateUpdateTask() {
|
||||||
|
@Override
|
||||||
|
public ClusterState execute(ClusterState currentState) {
|
||||||
|
RoutingTable routingTable = currentState.routingTable();
|
||||||
|
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
|
||||||
|
if (snapshots != null) {
|
||||||
boolean changed = false;
|
boolean changed = false;
|
||||||
ArrayList<SnapshotsInProgress.Entry> entries = new ArrayList<>();
|
ArrayList<SnapshotsInProgress.Entry> entries = new ArrayList<>();
|
||||||
for (final SnapshotsInProgress.Entry snapshot : snapshots.entries()) {
|
for (final SnapshotsInProgress.Entry snapshot : snapshots.entries()) {
|
||||||
SnapshotsInProgress.Entry updatedSnapshot = snapshot;
|
SnapshotsInProgress.Entry updatedSnapshot = snapshot;
|
||||||
boolean snapshotChanged = false;
|
if (snapshot.state() == State.STARTED) {
|
||||||
if (snapshot.state() == State.STARTED || snapshot.state() == State.ABORTED) {
|
ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards = processWaitingShards(snapshot.shards(),
|
||||||
ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shards = ImmutableOpenMap.builder();
|
routingTable);
|
||||||
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardEntry : snapshot.shards()) {
|
if (shards != null) {
|
||||||
ShardSnapshotStatus shardStatus = shardEntry.value;
|
|
||||||
if (!shardStatus.state().completed() && shardStatus.nodeId() != null) {
|
|
||||||
if (nodes.nodeExists(shardStatus.nodeId())) {
|
|
||||||
shards.put(shardEntry.key, shardEntry.value);
|
|
||||||
} else {
|
|
||||||
// TODO: Restart snapshot on another node?
|
|
||||||
snapshotChanged = true;
|
|
||||||
logger.warn("failing snapshot of shard [{}] on closed node [{}]",
|
|
||||||
shardEntry.key, shardStatus.nodeId());
|
|
||||||
shards.put(shardEntry.key,
|
|
||||||
new ShardSnapshotStatus(shardStatus.nodeId(), State.FAILED, "node shutdown"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (snapshotChanged) {
|
|
||||||
changed = true;
|
changed = true;
|
||||||
ImmutableOpenMap<ShardId, ShardSnapshotStatus> shardsMap = shards.build();
|
if (!snapshot.state().completed() && completed(shards.values())) {
|
||||||
if (!snapshot.state().completed() && completed(shardsMap.values())) {
|
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, State.SUCCESS, shards);
|
||||||
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, State.SUCCESS, shardsMap);
|
|
||||||
endSnapshot(updatedSnapshot);
|
|
||||||
} else {
|
} else {
|
||||||
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, snapshot.state(), shardsMap);
|
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, shards);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
entries.add(updatedSnapshot);
|
entries.add(updatedSnapshot);
|
||||||
} else if (snapshot.state() == State.INIT && newMaster) {
|
|
||||||
changed = true;
|
|
||||||
// Mark the snapshot as aborted as it failed to start from the previous master
|
|
||||||
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, State.ABORTED, snapshot.shards());
|
|
||||||
entries.add(updatedSnapshot);
|
|
||||||
|
|
||||||
// Clean up the snapshot that failed to start from the old master
|
|
||||||
deleteSnapshot(snapshot.snapshot(), new ActionListener<Void>() {
|
|
||||||
@Override
|
|
||||||
public void onResponse(Void aVoid) {
|
|
||||||
logger.debug("cleaned up abandoned snapshot {} in INIT state", snapshot.snapshot());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void onFailure(Exception e) {
|
|
||||||
logger.warn("failed to clean up abandoned snapshot {} in INIT state", snapshot.snapshot());
|
|
||||||
}
|
|
||||||
}, updatedSnapshot.getRepositoryStateId(), false);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (changed) {
|
if (changed) {
|
||||||
snapshots = new SnapshotsInProgress(entries.toArray(new SnapshotsInProgress.Entry[entries.size()]));
|
return ClusterState.builder(currentState)
|
||||||
return ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, snapshots).build();
|
.putCustom(SnapshotsInProgress.TYPE, new SnapshotsInProgress(unmodifiableList(entries))).build();
|
||||||
}
|
}
|
||||||
return currentState;
|
|
||||||
}
|
}
|
||||||
|
return currentState;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onFailure(String source, Exception e) {
|
public void onFailure(String source, Exception e) {
|
||||||
logger.warn("failed to update snapshot state after node removal");
|
logger.warn(() ->
|
||||||
}
|
new ParameterizedMessage("failed to update snapshot state after shards started from [{}] ", source), e);
|
||||||
});
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private void processStartedShards(ClusterChangedEvent event) {
|
private static ImmutableOpenMap<ShardId, ShardSnapshotStatus> processWaitingShards(
|
||||||
if (waitingShardsStartedOrUnassigned(event)) {
|
|
||||||
clusterService.submitStateUpdateTask("update snapshot state after shards started", new ClusterStateUpdateTask() {
|
|
||||||
@Override
|
|
||||||
public ClusterState execute(ClusterState currentState) throws Exception {
|
|
||||||
RoutingTable routingTable = currentState.routingTable();
|
|
||||||
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
|
|
||||||
if (snapshots != null) {
|
|
||||||
boolean changed = false;
|
|
||||||
ArrayList<SnapshotsInProgress.Entry> entries = new ArrayList<>();
|
|
||||||
for (final SnapshotsInProgress.Entry snapshot : snapshots.entries()) {
|
|
||||||
SnapshotsInProgress.Entry updatedSnapshot = snapshot;
|
|
||||||
if (snapshot.state() == State.STARTED) {
|
|
||||||
ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards = processWaitingShards(snapshot.shards(),
|
|
||||||
routingTable);
|
|
||||||
if (shards != null) {
|
|
||||||
changed = true;
|
|
||||||
if (!snapshot.state().completed() && completed(shards.values())) {
|
|
||||||
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, State.SUCCESS, shards);
|
|
||||||
endSnapshot(updatedSnapshot);
|
|
||||||
} else {
|
|
||||||
updatedSnapshot = new SnapshotsInProgress.Entry(snapshot, shards);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
entries.add(updatedSnapshot);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (changed) {
|
|
||||||
snapshots = new SnapshotsInProgress(entries.toArray(new SnapshotsInProgress.Entry[entries.size()]));
|
|
||||||
return ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, snapshots).build();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return currentState;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void onFailure(String source, Exception e) {
|
|
||||||
logger.warn(() ->
|
|
||||||
new ParameterizedMessage("failed to update snapshot state after shards started from [{}] ", source), e);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private ImmutableOpenMap<ShardId, ShardSnapshotStatus> processWaitingShards(
|
|
||||||
ImmutableOpenMap<ShardId, ShardSnapshotStatus> snapshotShards, RoutingTable routingTable) {
|
ImmutableOpenMap<ShardId, ShardSnapshotStatus> snapshotShards, RoutingTable routingTable) {
|
||||||
boolean snapshotChanged = false;
|
boolean snapshotChanged = false;
|
||||||
ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shards = ImmutableOpenMap.builder();
|
ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shards = ImmutableOpenMap.builder();
|
||||||
@ -905,19 +892,16 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean waitingShardsStartedOrUnassigned(ClusterChangedEvent event) {
|
private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snapshotsInProgress, ClusterChangedEvent event) {
|
||||||
SnapshotsInProgress curr = event.state().custom(SnapshotsInProgress.TYPE);
|
for (SnapshotsInProgress.Entry entry : snapshotsInProgress.entries()) {
|
||||||
if (curr != null) {
|
if (entry.state() == State.STARTED) {
|
||||||
for (SnapshotsInProgress.Entry entry : curr.entries()) {
|
for (ObjectCursor<String> index : entry.waitingIndices().keys()) {
|
||||||
if (entry.state() == State.STARTED && !entry.waitingIndices().isEmpty()) {
|
if (event.indexRoutingTableChanged(index.value)) {
|
||||||
for (ObjectCursor<String> index : entry.waitingIndices().keys()) {
|
IndexRoutingTable indexShardRoutingTable = event.state().getRoutingTable().index(index.value);
|
||||||
if (event.indexRoutingTableChanged(index.value)) {
|
for (ShardId shardId : entry.waitingIndices().get(index.value)) {
|
||||||
IndexRoutingTable indexShardRoutingTable = event.state().getRoutingTable().index(index.value);
|
ShardRouting shardRouting = indexShardRoutingTable.shard(shardId.id()).primaryShard();
|
||||||
for (ShardId shardId : entry.waitingIndices().get(index.value)) {
|
if (shardRouting != null && (shardRouting.started() || shardRouting.unassigned())) {
|
||||||
ShardRouting shardRouting = indexShardRoutingTable.shard(shardId.id()).primaryShard();
|
return true;
|
||||||
if (shardRouting != null && (shardRouting.started() || shardRouting.unassigned())) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -927,28 +911,12 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean removedNodesCleanupNeeded(ClusterChangedEvent event) {
|
private static boolean removedNodesCleanupNeeded(SnapshotsInProgress snapshotsInProgress, List<DiscoveryNode> removedNodes) {
|
||||||
SnapshotsInProgress snapshotsInProgress = event.state().custom(SnapshotsInProgress.TYPE);
|
// If at least one shard was running on a removed node - we need to fail it
|
||||||
if (snapshotsInProgress == null) {
|
return removedNodes.isEmpty() == false && snapshotsInProgress.entries().stream().flatMap(snapshot ->
|
||||||
return false;
|
StreamSupport.stream(((Iterable<ShardSnapshotStatus>) () -> snapshot.shards().valuesIt()).spliterator(), false)
|
||||||
}
|
.filter(s -> s.state().completed() == false).map(ShardSnapshotStatus::nodeId))
|
||||||
// Check if we just became the master
|
.anyMatch(removedNodes.stream().map(DiscoveryNode::getId).collect(Collectors.toSet())::contains);
|
||||||
boolean newMaster = !event.previousState().nodes().isLocalNodeElectedMaster();
|
|
||||||
for (SnapshotsInProgress.Entry snapshot : snapshotsInProgress.entries()) {
|
|
||||||
if (newMaster && (snapshot.state() == State.SUCCESS || snapshot.state() == State.INIT)) {
|
|
||||||
// We just replaced old master and snapshots in intermediate states needs to be cleaned
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
for (DiscoveryNode node : event.nodesDelta().removedNodes()) {
|
|
||||||
for (ObjectCursor<ShardSnapshotStatus> shardStatus : snapshot.shards().values()) {
|
|
||||||
if (!shardStatus.value.state().completed() && node.getId().equals(shardStatus.value.nodeId())) {
|
|
||||||
// At least one shard was running on the removed node - we need to fail it
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -981,25 +949,16 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
*
|
*
|
||||||
* @param entry snapshot
|
* @param entry snapshot
|
||||||
*/
|
*/
|
||||||
void endSnapshot(final SnapshotsInProgress.Entry entry) {
|
private void endSnapshot(final SnapshotsInProgress.Entry entry) {
|
||||||
endSnapshot(entry, null);
|
if (endingSnapshots.add(entry.snapshot()) == false) {
|
||||||
}
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Finalizes the shard in repository and then removes it from cluster state
|
|
||||||
* <p>
|
|
||||||
* This is non-blocking method that runs on a thread from SNAPSHOT thread pool
|
|
||||||
*
|
|
||||||
* @param entry snapshot
|
|
||||||
* @param failure failure reason or null if snapshot was successful
|
|
||||||
*/
|
|
||||||
private void endSnapshot(final SnapshotsInProgress.Entry entry, final String failure) {
|
|
||||||
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(new AbstractRunnable() {
|
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(new AbstractRunnable() {
|
||||||
@Override
|
@Override
|
||||||
protected void doRun() {
|
protected void doRun() {
|
||||||
final Snapshot snapshot = entry.snapshot();
|
final Snapshot snapshot = entry.snapshot();
|
||||||
final Repository repository = repositoriesService.repository(snapshot.getRepository());
|
final Repository repository = repositoriesService.repository(snapshot.getRepository());
|
||||||
|
final String failure = entry.failure();
|
||||||
logger.trace("[{}] finalizing snapshot in repository, state: [{}], failure[{}]", snapshot, entry.state(), failure);
|
logger.trace("[{}] finalizing snapshot in repository, state: [{}], failure[{}]", snapshot, entry.state(), failure);
|
||||||
ArrayList<SnapshotShardFailure> shardFailures = new ArrayList<>();
|
ArrayList<SnapshotShardFailure> shardFailures = new ArrayList<>();
|
||||||
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardStatus : entry.shards()) {
|
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardStatus : entry.shards()) {
|
||||||
@ -1015,7 +974,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
entry.startTime(),
|
entry.startTime(),
|
||||||
failure,
|
failure,
|
||||||
entry.shards().size(),
|
entry.shards().size(),
|
||||||
Collections.unmodifiableList(shardFailures),
|
unmodifiableList(shardFailures),
|
||||||
entry.getRepositoryStateId(),
|
entry.getRepositoryStateId(),
|
||||||
entry.includeGlobalState());
|
entry.includeGlobalState());
|
||||||
removeSnapshotFromClusterState(snapshot, snapshotInfo, null);
|
removeSnapshotFromClusterState(snapshot, snapshotInfo, null);
|
||||||
@ -1033,7 +992,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes record of running snapshot from cluster state
|
* Removes record of running snapshot from cluster state
|
||||||
* @param snapshot snapshot
|
* @param snapshot snapshot
|
||||||
* @param snapshotInfo snapshot info if snapshot was successful
|
* @param snapshotInfo snapshot info if snapshot was successful
|
||||||
* @param e exception if snapshot failed
|
* @param e exception if snapshot failed
|
||||||
*/
|
*/
|
||||||
@ -1043,11 +1002,11 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes record of running snapshot from cluster state and notifies the listener when this action is complete
|
* Removes record of running snapshot from cluster state and notifies the listener when this action is complete
|
||||||
* @param snapshot snapshot
|
* @param snapshot snapshot
|
||||||
* @param failure exception if snapshot failed
|
* @param failure exception if snapshot failed
|
||||||
* @param listener listener to notify when snapshot information is removed from the cluster state
|
* @param listener listener to notify when snapshot information is removed from the cluster state
|
||||||
*/
|
*/
|
||||||
private void removeSnapshotFromClusterState(final Snapshot snapshot, final SnapshotInfo snapshotInfo, final Exception failure,
|
private void removeSnapshotFromClusterState(final Snapshot snapshot, @Nullable SnapshotInfo snapshotInfo, final Exception failure,
|
||||||
@Nullable CleanupAfterErrorListener listener) {
|
@Nullable CleanupAfterErrorListener listener) {
|
||||||
clusterService.submitStateUpdateTask("remove snapshot metadata", new ClusterStateUpdateTask() {
|
clusterService.submitStateUpdateTask("remove snapshot metadata", new ClusterStateUpdateTask() {
|
||||||
|
|
||||||
@ -1065,8 +1024,8 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (changed) {
|
if (changed) {
|
||||||
snapshots = new SnapshotsInProgress(entries.toArray(new SnapshotsInProgress.Entry[entries.size()]));
|
return ClusterState.builder(currentState)
|
||||||
return ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, snapshots).build();
|
.putCustom(SnapshotsInProgress.TYPE, new SnapshotsInProgress(unmodifiableList(entries))).build();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return currentState;
|
return currentState;
|
||||||
@ -1075,6 +1034,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
@Override
|
@Override
|
||||||
public void onFailure(String source, Exception e) {
|
public void onFailure(String source, Exception e) {
|
||||||
logger.warn(() -> new ParameterizedMessage("[{}] failed to remove snapshot metadata", snapshot), e);
|
logger.warn(() -> new ParameterizedMessage("[{}] failed to remove snapshot metadata", snapshot), e);
|
||||||
|
endingSnapshots.remove(snapshot);
|
||||||
if (listener != null) {
|
if (listener != null) {
|
||||||
listener.onFailure(e);
|
listener.onFailure(e);
|
||||||
}
|
}
|
||||||
@ -1082,6 +1042,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void onNoLongerMaster(String source) {
|
public void onNoLongerMaster(String source) {
|
||||||
|
endingSnapshots.remove(snapshot);
|
||||||
if (listener != null) {
|
if (listener != null) {
|
||||||
listener.onNoLongerMaster();
|
listener.onNoLongerMaster();
|
||||||
}
|
}
|
||||||
@ -1101,6 +1062,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
logger.warn("Failed to notify listeners", e);
|
logger.warn("Failed to notify listeners", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
endingSnapshots.remove(snapshot);
|
||||||
if (listener != null) {
|
if (listener != null) {
|
||||||
listener.onResponse(snapshotInfo);
|
listener.onResponse(snapshotInfo);
|
||||||
}
|
}
|
||||||
@ -1207,13 +1169,12 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
final ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards;
|
final ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards;
|
||||||
|
|
||||||
final State state = snapshotEntry.state();
|
final State state = snapshotEntry.state();
|
||||||
|
final String failure;
|
||||||
if (state == State.INIT) {
|
if (state == State.INIT) {
|
||||||
// snapshot is still initializing, mark it as aborted
|
// snapshot is still initializing, mark it as aborted
|
||||||
shards = snapshotEntry.shards();
|
shards = snapshotEntry.shards();
|
||||||
assert shards.isEmpty();
|
assert shards.isEmpty();
|
||||||
// No shards in this snapshot, we delete it right away since the SnapshotShardsService
|
failure = "Snapshot was aborted during initialization";
|
||||||
// has no work to do.
|
|
||||||
endSnapshot(snapshotEntry);
|
|
||||||
} else if (state == State.STARTED) {
|
} else if (state == State.STARTED) {
|
||||||
// snapshot is started - mark every non completed shard as aborted
|
// snapshot is started - mark every non completed shard as aborted
|
||||||
final ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shardsBuilder = ImmutableOpenMap.builder();
|
final ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shardsBuilder = ImmutableOpenMap.builder();
|
||||||
@ -1225,7 +1186,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
shardsBuilder.put(shardEntry.key, status);
|
shardsBuilder.put(shardEntry.key, status);
|
||||||
}
|
}
|
||||||
shards = shardsBuilder.build();
|
shards = shardsBuilder.build();
|
||||||
|
failure = "Snapshot was aborted by deletion";
|
||||||
} else {
|
} else {
|
||||||
boolean hasUncompletedShards = false;
|
boolean hasUncompletedShards = false;
|
||||||
// Cleanup in case a node gone missing and snapshot wasn't updated for some reason
|
// Cleanup in case a node gone missing and snapshot wasn't updated for some reason
|
||||||
@ -1246,10 +1207,10 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
// where we force to finish the snapshot
|
// where we force to finish the snapshot
|
||||||
logger.debug("trying to delete completed snapshot with no finalizing shards - can delete immediately");
|
logger.debug("trying to delete completed snapshot with no finalizing shards - can delete immediately");
|
||||||
shards = snapshotEntry.shards();
|
shards = snapshotEntry.shards();
|
||||||
endSnapshot(snapshotEntry);
|
|
||||||
}
|
}
|
||||||
|
failure = snapshotEntry.failure();
|
||||||
}
|
}
|
||||||
SnapshotsInProgress.Entry newSnapshot = new SnapshotsInProgress.Entry(snapshotEntry, State.ABORTED, shards);
|
SnapshotsInProgress.Entry newSnapshot = new SnapshotsInProgress.Entry(snapshotEntry, State.ABORTED, shards, failure);
|
||||||
clusterStateBuilder.putCustom(SnapshotsInProgress.TYPE, new SnapshotsInProgress(newSnapshot));
|
clusterStateBuilder.putCustom(SnapshotsInProgress.TYPE, new SnapshotsInProgress(newSnapshot));
|
||||||
}
|
}
|
||||||
return clusterStateBuilder.build();
|
return clusterStateBuilder.build();
|
||||||
@ -1400,7 +1361,8 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
|
|||||||
* @param indices list of indices to be snapshotted
|
* @param indices list of indices to be snapshotted
|
||||||
* @return list of shard to be included into current snapshot
|
* @return list of shard to be included into current snapshot
|
||||||
*/
|
*/
|
||||||
private ImmutableOpenMap<ShardId, SnapshotsInProgress.ShardSnapshotStatus> shards(ClusterState clusterState, List<IndexId> indices) {
|
private static ImmutableOpenMap<ShardId, SnapshotsInProgress.ShardSnapshotStatus> shards(ClusterState clusterState,
|
||||||
|
List<IndexId> indices) {
|
||||||
ImmutableOpenMap.Builder<ShardId, SnapshotsInProgress.ShardSnapshotStatus> builder = ImmutableOpenMap.builder();
|
ImmutableOpenMap.Builder<ShardId, SnapshotsInProgress.ShardSnapshotStatus> builder = ImmutableOpenMap.builder();
|
||||||
MetaData metaData = clusterState.metaData();
|
MetaData metaData = clusterState.metaData();
|
||||||
for (IndexId index : indices) {
|
for (IndexId index : indices) {
|
||||||
|
@ -29,7 +29,6 @@ import org.elasticsearch.cluster.ClusterState;
|
|||||||
import org.elasticsearch.cluster.ClusterStateListener;
|
import org.elasticsearch.cluster.ClusterStateListener;
|
||||||
import org.elasticsearch.cluster.SnapshotDeletionsInProgress;
|
import org.elasticsearch.cluster.SnapshotDeletionsInProgress;
|
||||||
import org.elasticsearch.cluster.SnapshotsInProgress;
|
import org.elasticsearch.cluster.SnapshotsInProgress;
|
||||||
import org.elasticsearch.cluster.coordination.FailedToCommitClusterStateException;
|
|
||||||
import org.elasticsearch.cluster.service.ClusterService;
|
import org.elasticsearch.cluster.service.ClusterService;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.common.unit.ByteSizeUnit;
|
import org.elasticsearch.common.unit.ByteSizeUnit;
|
||||||
@ -52,7 +51,6 @@ import java.util.concurrent.TimeUnit;
|
|||||||
import org.elasticsearch.test.transport.MockTransportService;
|
import org.elasticsearch.test.transport.MockTransportService;
|
||||||
|
|
||||||
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
|
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
|
||||||
import static org.hamcrest.Matchers.instanceOf;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests snapshot operations during disruptions.
|
* Tests snapshot operations during disruptions.
|
||||||
@ -156,9 +154,6 @@ public class SnapshotDisruptionIT extends ESIntegTestCase {
|
|||||||
logger.info("--> got exception from race in master operation retries");
|
logger.info("--> got exception from race in master operation retries");
|
||||||
} else {
|
} else {
|
||||||
logger.info("--> got exception from hanged master", ex);
|
logger.info("--> got exception from hanged master", ex);
|
||||||
assertThat(cause, instanceOf(MasterNotDiscoveredException.class));
|
|
||||||
cause = cause.getCause();
|
|
||||||
assertThat(cause, instanceOf(FailedToCommitClusterStateException.class));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -988,7 +988,6 @@ public class DedicatedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTest
|
|||||||
* can be restored when the node the shrunken index was created on is no longer part of
|
* can be restored when the node the shrunken index was created on is no longer part of
|
||||||
* the cluster.
|
* the cluster.
|
||||||
*/
|
*/
|
||||||
@AwaitsFix(bugUrl="https://github.com/elastic/elasticsearch/issues/38226")
|
|
||||||
public void testRestoreShrinkIndex() throws Exception {
|
public void testRestoreShrinkIndex() throws Exception {
|
||||||
logger.info("--> starting a master node and a data node");
|
logger.info("--> starting a master node and a data node");
|
||||||
internalCluster().startMasterOnlyNode();
|
internalCluster().startMasterOnlyNode();
|
||||||
|
@ -3637,7 +3637,6 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
|
|||||||
}
|
}
|
||||||
|
|
||||||
@TestLogging("org.elasticsearch.snapshots:TRACE")
|
@TestLogging("org.elasticsearch.snapshots:TRACE")
|
||||||
@AwaitsFix(bugUrl="https://github.com/elastic/elasticsearch/issues/38226")
|
|
||||||
public void testAbortedSnapshotDuringInitDoesNotStart() throws Exception {
|
public void testAbortedSnapshotDuringInitDoesNotStart() throws Exception {
|
||||||
final Client client = client();
|
final Client client = client();
|
||||||
|
|
||||||
@ -3684,14 +3683,9 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
|
|||||||
|
|
||||||
// The deletion must set the snapshot in the ABORTED state
|
// The deletion must set the snapshot in the ABORTED state
|
||||||
assertBusy(() -> {
|
assertBusy(() -> {
|
||||||
try {
|
SnapshotsStatusResponse status =
|
||||||
SnapshotsStatusResponse status =
|
client.admin().cluster().prepareSnapshotStatus("repository").setSnapshots("snap").get();
|
||||||
client.admin().cluster().prepareSnapshotStatus("repository").setSnapshots("snap").get();
|
assertThat(status.getSnapshots().iterator().next().getState(), equalTo(State.ABORTED));
|
||||||
assertThat(status.getSnapshots().iterator().next().getState(), equalTo(State.ABORTED));
|
|
||||||
} catch (Exception e) {
|
|
||||||
// Force assertBusy to retry on every exception
|
|
||||||
throw new AssertionError(e);
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// Now unblock the repository
|
// Now unblock the repository
|
||||||
|
Loading…
x
Reference in New Issue
Block a user