Fix Concurrent Snapshot Create+Delete + Delete Index (#61770) (#61773)

We had a bug here were we put a `null` value into the shard
assignment mapping when reassigning work after a snapshot delete
had gone through. This only affects partial snaphots but essentially
dead-locks the snapshot process.

Closes #61762
This commit is contained in:
Armin Braun 2020-09-01 13:20:25 +02:00 committed by GitHub
parent 50119bb08f
commit 3fd25bfa87
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 47 additions and 6 deletions

View File

@ -819,6 +819,28 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
assertAcked(deleteFuture.get());
}
public void testMultiplePartialSnapshotsQueuedAfterDelete() throws Exception {
final String masterNode = internalCluster().startMasterOnlyNode();
internalCluster().startDataOnlyNode();
final String repoName = "test-repo";
createRepository(repoName, "mock");
createIndexWithContent("index-one");
createIndexWithContent("index-two");
createNSnapshots(repoName, randomIntBetween(1, 5));
final ActionFuture<AcknowledgedResponse> deleteFuture = startAndBlockOnDeleteSnapshot(repoName, "*");
final ActionFuture<CreateSnapshotResponse> snapshotThree = startFullSnapshot(repoName, "snapshot-three", true);
final ActionFuture<CreateSnapshotResponse> snapshotFour = startFullSnapshot(repoName, "snapshot-four", true);
awaitNSnapshotsInProgress(2);
assertAcked(client().admin().indices().prepareDelete("index-two"));
unblockNode(repoName, masterNode);
assertThat(snapshotThree.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
assertThat(snapshotFour.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
assertAcked(deleteFuture.get());
}
public void testQueuedSnapshotsWaitingForShardReady() throws Exception {
internalCluster().startMasterOnlyNode();
internalCluster().startDataOnlyNodes(2);
@ -1238,8 +1260,13 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
}
private ActionFuture<CreateSnapshotResponse> startFullSnapshot(String repoName, String snapshotName) {
return startFullSnapshot(repoName, snapshotName, false);
}
private ActionFuture<CreateSnapshotResponse> startFullSnapshot(String repoName, String snapshotName, boolean partial) {
logger.info("--> creating full snapshot [{}] to repo [{}]", snapshotName, repoName);
return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true).execute();
return client().admin().cluster().prepareCreateSnapshot(repoName, snapshotName).setWaitForCompletion(true)
.setPartial(partial).execute();
}
private void awaitClusterState(Predicate<ClusterState> statePredicate) throws Exception {

View File

@ -401,6 +401,13 @@ public class SnapshotsInProgress extends AbstractNamedDiffable<Custom> implement
public static final ShardSnapshotStatus UNASSIGNED_QUEUED =
new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.QUEUED, null);
/**
* Shard snapshot status for shards that could not be snapshotted because their index was deleted from before the shard snapshot
* started.
*/
public static final ShardSnapshotStatus MISSING =
new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.MISSING, "missing index", null);
private final ShardState state;
@Nullable

View File

@ -2192,9 +2192,17 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
final ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> updatedAssignmentsBuilder =
ImmutableOpenMap.builder(entry.shards());
for (ShardId shardId : canBeUpdated) {
final boolean added = reassignedShardIds.add(shardId);
assert added;
updatedAssignmentsBuilder.put(shardId, shardAssignments.get(shardId));
final ShardSnapshotStatus updated = shardAssignments.get(shardId);
if (updated == null) {
// We don't have a new assignment for this shard because its index was concurrently deleted
assert currentState.routingTable().hasIndex(shardId.getIndex()) == false :
"Missing assignment for [" + shardId + "]";
updatedAssignmentsBuilder.put(shardId, ShardSnapshotStatus.MISSING);
} else {
final boolean added = reassignedShardIds.add(shardId);
assert added;
updatedAssignmentsBuilder.put(shardId, updated);
}
}
snapshotEntries.add(entry.withShards(updatedAssignmentsBuilder.build()));
changed = true;
@ -2284,8 +2292,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
IndexMetadata indexMetadata = metadata.index(indexName);
if (indexMetadata == null) {
// The index was deleted before we managed to start the snapshot - mark it as missing.
builder.put(new ShardId(indexName, IndexMetadata.INDEX_UUID_NA_VALUE, 0),
new SnapshotsInProgress.ShardSnapshotStatus(null, ShardState.MISSING, "missing index", null));
builder.put(new ShardId(indexName, IndexMetadata.INDEX_UUID_NA_VALUE, 0), ShardSnapshotStatus.MISSING);
} else {
final IndexRoutingTable indexRoutingTable = routingTable.index(indexName);
for (int i = 0; i < indexMetadata.getNumberOfShards(); i++) {