Fixes snapshot status on failed snapshots (#23833)

If a snapshot is taken on multiple indices, where some of them are "good"
indices that don't contain any corruption or failures and others are
"bad" indices that contain missing or corrupted shards, and the snapshot
request is set to partial=false (meaning don't take a snapshot if there
are any failures), then the good indices will not be snapshotted either.
Previously, getting the status of such a snapshot would throw a 500
error, because the snap-*.dat blob for the shards in the good indices
could not be found.

This commit fixes the problem by reporting shards of good indices as
failed due to a failed snapshot, instead of throwing the
NoSuchFileException.

Closes #23716
This commit is contained in:
Ali Beyad 2017-04-06 20:54:21 -04:00 committed by GitHub
parent 1ad5398af7
commit 480cfe3fe0
2 changed files with 92 additions and 3 deletions

View File

@ -550,7 +550,8 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
/**
* Returns status of shards of currently finished snapshots
* <p>
* This method is executed on master node and it's complementary to the {@link SnapshotShardsService#currentSnapshotShards(Snapshot)} because it
* This method is executed on master node and it's complementary to the
* {@link SnapshotShardsService#currentSnapshotShards(Snapshot)} because it
* returns similar information but for already finished snapshots.
* </p>
*
@ -578,8 +579,25 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
shardSnapshotStatus.failure(shardFailure.reason());
shardStatus.put(shardId, shardSnapshotStatus);
} else {
IndexShardSnapshotStatus shardSnapshotStatus =
repository.getShardSnapshotStatus(snapshotInfo.snapshotId(), snapshotInfo.version(), indexId, shardId);
final IndexShardSnapshotStatus shardSnapshotStatus;
if (snapshotInfo.state() == SnapshotState.FAILED) {
// If the snapshot failed, but the shard's snapshot does
// not have an exception, it means that partial snapshots
// were disabled and in this case, the shard snapshot will
// *not* have any metadata, so attempting to read the shard
// snapshot status will throw an exception. Instead, we create
// a status for the shard to indicate that the shard snapshot
// could not be taken due to partial being set to false.
shardSnapshotStatus = new IndexShardSnapshotStatus();
shardSnapshotStatus.updateStage(IndexShardSnapshotStatus.Stage.FAILURE);
shardSnapshotStatus.failure("skipped");
} else {
shardSnapshotStatus = repository.getShardSnapshotStatus(
snapshotInfo.snapshotId(),
snapshotInfo.version(),
indexId,
shardId);
}
shardStatus.put(shardId, shardSnapshotStatus);
}
}

View File

@ -54,6 +54,7 @@ import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MappingMetaData;
import org.elasticsearch.cluster.metadata.MetaDataIndexStateService;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.Strings;
@ -2737,4 +2738,74 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
assertEquals(SnapshotState.SUCCESS, getSnapshotsResponse.getSnapshots().get(0).state());
}
/**
 * Verifies that requesting the status of a failed snapshot returns a FAILED
 * state (with per-shard failure reasons) rather than an error, both when the
 * snapshot covers only an unallocatable index and when it covers a healthy
 * index alongside an unallocatable one.
 * See https://github.com/elastic/elasticsearch/issues/23716
 */
public void testSnapshotStatusOnFailedIndex() throws Exception {
    logger.info("--> creating repository");
    final Path repoLocation = randomRepoPath();
    final Client client = client();
    assertAcked(
        client.admin().cluster().preparePutRepository("test-repo")
            .setType("fs")
            .setVerify(false)
            .setSettings(Settings.builder().put("location", repoLocation)));

    logger.info("--> creating good index");
    assertAcked(
        prepareCreate("test-idx-good").setSettings(
            Settings.builder()
                .put(SETTING_NUMBER_OF_SHARDS, 1)
                .put(SETTING_NUMBER_OF_REPLICAS, 0)));
    ensureGreen();
    final int docCount = randomIntBetween(1, 5);
    for (int docId = 0; docId < docCount; docId++) {
        index("test-idx-good", "doc", String.valueOf(docId), "foo", "bar" + docId);
    }
    refresh();

    logger.info("--> creating bad index");
    // Allocation is disabled for this index so that its primary can never be
    // assigned; this stands in for a "bad" index whose shards cannot be snapshotted.
    // We must not wait for active shards here, since none will ever become active.
    assertAcked(
        prepareCreate("test-idx-bad")
            .setWaitForActiveShards(ActiveShardCount.NONE)
            .setSettings(
                Settings.builder()
                    .put(SETTING_NUMBER_OF_SHARDS, 1)
                    .put(SETTING_NUMBER_OF_REPLICAS, 0)
                    .put(EnableAllocationDecider.INDEX_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), "none")));

    logger.info("--> snapshot bad index and get status");
    client.admin().cluster().prepareCreateSnapshot("test-repo", "test-snap1")
        .setWaitForCompletion(true)
        .setIndices("test-idx-bad")
        .get();
    final SnapshotsStatusResponse badOnlyResponse =
        client.admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap1").get();
    assertEquals(1, badOnlyResponse.getSnapshots().size());
    assertEquals(State.FAILED, badOnlyResponse.getSnapshots().get(0).getState());

    logger.info("--> snapshot both good and bad index and get status");
    client.admin().cluster().prepareCreateSnapshot("test-repo", "test-snap2")
        .setWaitForCompletion(true)
        .setIndices("test-idx-good", "test-idx-bad")
        .get();
    final SnapshotsStatusResponse mixedResponse =
        client.admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap2").get();
    assertEquals(1, mixedResponse.getSnapshots().size());

    // The status request must come back with a FAILED snapshot state instead
    // of surfacing a 500 status code.
    final SnapshotStatus mixedStatus = mixedResponse.getSnapshots().get(0);
    assertEquals(State.FAILED, mixedStatus.getState());
    for (SnapshotIndexShardStatus shard : mixedStatus.getShards()) {
        assertEquals(SnapshotIndexShardStage.FAILURE, shard.getStage());
        // Shards of the good index are reported as skipped; shards of the bad
        // index carry the allocation failure reason.
        final String expectedFailure = shard.getIndex().equals("test-idx-good")
            ? "skipped"
            : "primary shard is not allocated";
        assertEquals(expectedFailure, shard.getFailure());
    }
}
}