We had a bug here where we put a `null` value into the shard assignment mapping when reassigning work after a snapshot delete had gone through. This only affects partial snapshots, but it essentially deadlocks the snapshot process. Closes #61762
commit 3fd25bfa87 (parent 50119bb08f)
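Before the diff itself, here is a minimal, self-contained sketch of the failure mode and the fix, using invented names and a plain HashMap in place of the ImmutableOpenMap builder that SnapshotsService actually uses; it is an illustration of the bug class, not the real code:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

class ShardReassignmentSketch {

    enum Status { QUEUED, ASSIGNED, MISSING }

    // Before the fix: the looked-up assignment is stored unconditionally.
    // If the shard's index was concurrently deleted there is no new
    // assignment, so null ends up in the map and the shard never reaches a
    // terminal state -- everything queued behind it waits forever.
    static void reassignBuggy(Map<String, Status> updated, Map<String, Status> newAssignments, List<String> canBeUpdated) {
        for (String shardId : canBeUpdated) {
            updated.put(shardId, newAssignments.get(shardId)); // may store null
        }
    }

    // After the fix: a missing assignment is translated into the terminal
    // MISSING status instead of null, so the snapshot can finalize as PARTIAL.
    static void reassignFixed(Map<String, Status> updated, Map<String, Status> newAssignments, List<String> canBeUpdated) {
        for (String shardId : canBeUpdated) {
            Status next = newAssignments.get(shardId);
            updated.put(shardId, next == null ? Status.MISSING : next);
        }
    }

    public static void main(String[] args) {
        Map<String, Status> assignments = new HashMap<>();
        assignments.put("index-one[0]", Status.ASSIGNED); // index-two[0] has no new assignment
        Map<String, Status> updated = new HashMap<>();
        reassignFixed(updated, assignments, List.of("index-one[0]", "index-two[0]"));
        System.out.println(updated); // index-two[0] -> MISSING instead of null
    }
}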
@@ -819,6 +819,28 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
         assertAcked(deleteFuture.get());
     }
 
+    public void testMultiplePartialSnapshotsQueuedAfterDelete() throws Exception {
+        final String masterNode = internalCluster().startMasterOnlyNode();
+        internalCluster().startDataOnlyNode();
+        final String repoName = "test-repo";
+        createRepository(repoName, "mock");
+        createIndexWithContent("index-one");
+        createIndexWithContent("index-two");
+        createNSnapshots(repoName, randomIntBetween(1, 5));
+
+        final ActionFuture<AcknowledgedResponse> deleteFuture = startAndBlockOnDeleteSnapshot(repoName, "*");
+        final ActionFuture<CreateSnapshotResponse> snapshotThree = startFullSnapshot(repoName, "snapshot-three", true);
+        final ActionFuture<CreateSnapshotResponse> snapshotFour = startFullSnapshot(repoName, "snapshot-four", true);
+        awaitNSnapshotsInProgress(2);
+
+        assertAcked(client().admin().indices().prepareDelete("index-two"));
+        unblockNode(repoName, masterNode);
+
+        assertThat(snapshotThree.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
+        assertThat(snapshotFour.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
+        assertAcked(deleteFuture.get());
+    }
+
     public void testQueuedSnapshotsWaitingForShardReady() throws Exception {
         internalCluster().startMasterOnlyNode();
         internalCluster().startDataOnlyNodes(2);
@@ -1238,8 +1260,13 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
     }
 
     private ActionFuture<CreateSnapshotResponse> startFullSnapshot(String repoName, String snapshotName) {
+        return startFullSnapshot(repoName, snapshotName, false);
+    }
+
+    private ActionFuture<CreateSnapshotResponse> startFullSnapshot(String repoName, String snapshotName, boolean partial) {
         logger.info("--> creating full snapshot [{}] to repo [{}]", snapshotName, repoName);
-        return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true).execute();
+        return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true)
+            .setPartial(partial).execute();
     }
 
     private void awaitClusterState(Predicate<ClusterState> statePredicate) throws Exception {
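The test-helper change above is mechanical: the two-argument startFullSnapshot now delegates to a new three-argument overload so existing call sites keep compiling, and the overload threads the partial flag through to setPartial(...) on the request builder. The production-code changes follow.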
@@ -401,6 +401,13 @@ public class SnapshotsInProgress extends AbstractNamedDiffable<Custom> implements Custom {
         public static final ShardSnapshotStatus UNASSIGNED_QUEUED =
             new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.QUEUED, null);
 
+        /**
+         * Shard snapshot status for shards that could not be snapshotted because their index was deleted before the shard snapshot
+         * started.
+         */
+        public static final ShardSnapshotStatus MISSING =
+            new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.MISSING, "missing index", null);
+
         private final ShardState state;
 
         @Nullable
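The fix hinges on MISSING being a terminal shard state. As a rough sketch of the assumed semantics (simplified; the real ShardState enum lives inside SnapshotsInProgress and carries both a completed and a failed flag per state), a shard marked MISSING no longer counts as in-progress work, so it cannot hold up snapshot finalization:

// Simplified sketch of the assumed ShardState semantics, not the real enum.
enum ShardStateSketch {
    INIT(false, false),
    QUEUED(false, false),
    SUCCESS(true, false),
    FAILED(true, true),
    MISSING(true, true); // terminal: completed, and counted as failed

    private final boolean completed;
    private final boolean failed;

    ShardStateSketch(boolean completed, boolean failed) {
        this.completed = completed;
        this.failed = failed;
    }

    boolean completed() {
        return completed;
    }

    boolean failed() {
        return failed;
    }
}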
@@ -2192,9 +2192,17 @@ public class SnapshotsService extends AbstractLifecycleComponent implements ClusterStateApplier {
                 final ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> updatedAssignmentsBuilder =
                     ImmutableOpenMap.builder(entry.shards());
                 for (ShardId shardId : canBeUpdated) {
-                    final boolean added = reassignedShardIds.add(shardId);
-                    assert added;
-                    updatedAssignmentsBuilder.put(shardId, shardAssignments.get(shardId));
+                    final ShardSnapshotStatus updated = shardAssignments.get(shardId);
+                    if (updated == null) {
+                        // We don't have a new assignment for this shard because its index was concurrently deleted
+                        assert currentState.routingTable().hasIndex(shardId.getIndex()) == false :
+                            "Missing assignment for [" + shardId + "]";
+                        updatedAssignmentsBuilder.put(shardId, ShardSnapshotStatus.MISSING);
+                    } else {
+                        final boolean added = reassignedShardIds.add(shardId);
+                        assert added;
+                        updatedAssignmentsBuilder.put(shardId, updated);
+                    }
                 }
                 snapshotEntries.add(entry.withShards(updatedAssignmentsBuilder.build()));
                 changed = true;
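Two details of the core fix are worth noting. First, the shard entry is kept and marked MISSING rather than dropped: removing it would silently change the set of shards belonging to the snapshot, while a terminal MISSING status lets the snapshot finalize as PARTIAL (as the new test asserts). Second, the assert documents the invariant that a missing assignment can only occur when the index is gone from the routing table; like any Java assertion it only runs with -ea enabled (as in the test suite), so production code simply degrades to the MISSING status either way.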
@@ -2284,8 +2292,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements ClusterStateApplier {
             IndexMetadata indexMetadata = metadata.index(indexName);
             if (indexMetadata == null) {
                 // The index was deleted before we managed to start the snapshot - mark it as missing.
-                builder.put(new ShardId(indexName, IndexMetadata.INDEX_UUID_NA_VALUE, 0),
-                    new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.MISSING, "missing index", null));
+                builder.put(new ShardId(indexName, IndexMetadata.INDEX_UUID_NA_VALUE, 0), ShardSnapshotStatus.MISSING);
             } else {
                 final IndexRoutingTable indexRoutingTable = routingTable.index(indexName);
                 for (int i = 0; i < indexMetadata.getNumberOfShards(); i++) {
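This last hunk is a pure cleanup enabled by the new constant: the existing index-deleted-before-start path now reuses ShardSnapshotStatus.MISSING instead of constructing an identical instance inline.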