Fix Broken Index Shard Snapshot File Preventing Snapshot Creation (#41310) (#41473)

* The problem here was that if we ran into a corrupted index-N file, instead of generating a new index-(N+1) file, we set the newest index generation to -1 and thus tried to create `index-0`
   * If `index-0` is corrupt, this prevents us from ever creating a new snapshot using the broken shard, because we are unable to create `index-0` since it already exists
   * Fixed by still using the index generation for naming the next index file, even if it was a broken index file
* Added test that makes sure restoring as well as snapshotting on top of the broken shard index file work as expected
* closes #41304
This commit is contained in:
Armin Braun 2019-04-24 18:39:17 +02:00 committed by GitHub
parent 37ebefb553
commit 1db9166ea0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 120 additions and 17 deletions

View File

@ -37,10 +37,10 @@ import java.util.Map;
import static java.util.Collections.unmodifiableMap;
/**
* Contains information about all snapshot for the given shard in repository
* Contains information about all snapshots for the given shard in repository
* <p>
* This class is used to find files that were already snapshotted and clear out files that are no longer referenced by any
* snapshots
* snapshots.
*/
public class BlobStoreIndexShardSnapshots implements Iterable<SnapshotFiles>, ToXContentFragment {

View File

@ -121,27 +121,28 @@ import static org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSna
* <pre>
* {@code
* STORE_ROOT
* |- index-N - list of all snapshot ids and the indices belonging to each snapshot, N is the generation of the file
* |- index-N - JSON serialized {@link RepositoryData} containing a list of all snapshot ids and the indices belonging to
* | each snapshot, N is the generation of the file
* |- index.latest - contains the numeric value of the latest generation of the index file (i.e. N from above)
* |- incompatible-snapshots - list of all snapshot ids that are no longer compatible with the current version of the cluster
* |- snap-20131010 - JSON serialized Snapshot for snapshot "20131010"
* |- meta-20131010.dat - JSON serialized MetaData for snapshot "20131010" (includes only global metadata)
* |- snap-20131011 - JSON serialized Snapshot for snapshot "20131011"
* |- meta-20131011.dat - JSON serialized MetaData for snapshot "20131011"
* |- snap-20131010.dat - SMILE serialized {@link SnapshotInfo} for snapshot "20131010"
* |- meta-20131010.dat - SMILE serialized {@link MetaData} for snapshot "20131010" (includes only global metadata)
* |- snap-20131011.dat - SMILE serialized {@link SnapshotInfo} for snapshot "20131011"
* |- meta-20131011.dat - SMILE serialized {@link MetaData} for snapshot "20131011"
* .....
* |- indices/ - data for all indices
* |- Ac1342-B_x/ - data for index "foo" which was assigned the unique id of Ac1342-B_x in the repository
* | |- meta-20131010.dat - JSON Serialized IndexMetaData for index "foo"
* | |- meta-20131010.dat - JSON Serialized {@link IndexMetaData} for index "foo"
* | |- 0/ - data for shard "0" of index "foo"
* | | |- __1 \
* | | |- __1 \ (files with numeric names were created by older ES versions)
* | | |- __2 |
* | | |- __3 |- files from different segments see snapshot-* for their mappings to real segment files
* | | |- __4 |
* | | |- __5 /
* | | |- __VPO5oDMVT5y4Akv8T_AO_A |- files from different segments see snap-* for their mappings to real segment files
* | | |- __1gbJy18wS_2kv1qI7FgKuQ |
* | | |- __R8JvZAHlSMyMXyZc2SS8Zg /
* | | .....
* | | |- snap-20131010.dat - JSON serialized BlobStoreIndexShardSnapshot for snapshot "20131010"
* | | |- snap-20131011.dat - JSON serialized BlobStoreIndexShardSnapshot for snapshot "20131011"
* | | |- list-123 - JSON serialized BlobStoreIndexShardSnapshot for snapshot "20131011"
* | | |- snap-20131010.dat - SMILE serialized {@link BlobStoreIndexShardSnapshot} for snapshot "20131010"
* | | |- snap-20131011.dat - SMILE serialized {@link BlobStoreIndexShardSnapshot} for snapshot "20131011"
* | | |- index-123 - SMILE serialized {@link BlobStoreIndexShardSnapshots} for the shard
* | |
* | |- 1/ - data for shard "1" of index "foo"
* | | |- __1
@ -1136,7 +1137,7 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
logger.warn(() -> new ParameterizedMessage("failed to read commit point [{}]", name), e);
}
}
return new Tuple<>(new BlobStoreIndexShardSnapshots(snapshots), -1);
return new Tuple<>(new BlobStoreIndexShardSnapshots(snapshots), latest);
}
}

View File

@ -2963,6 +2963,108 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
assertAcked(client().admin().cluster().prepareDeleteSnapshot("test-repo", snapshotInfo.snapshotId().getName()).get());
}
/**
 * Tests that a shard snapshot with a corrupted shard index file can still be used for restore and incremental snapshots.
 * <p>
 * The test takes a first snapshot of a single-shard index, truncates the shard-level {@code index-0} blob of that
 * index in the repository, and then verifies that:
 * <ul>
 *   <li>the first snapshot is still listed as successful,</li>
 *   <li>the first snapshot can still be restored with all of its documents,</li>
 *   <li>a second snapshot can be created on top of the broken shard index file, and</li>
 *   <li>the second snapshot can be restored with the documents of both indexing rounds.</li>
 * </ul>
 */
public void testSnapshotWithCorruptedShardIndexFile() throws Exception {
    final Client client = client();
    final Path repo = randomRepoPath();
    final String indexName = "test-idx";
    final int nDocs = randomIntBetween(1, 10);

    // Phase 1: create a single-shard index without replicas and fill it with documents.
    logger.info("--> creating index [{}] with [{}] documents in it", indexName, nDocs);
    assertAcked(prepareCreate(indexName).setSettings(Settings.builder()
        .put(SETTING_NUMBER_OF_SHARDS, 1).put(SETTING_NUMBER_OF_REPLICAS, 0)));

    final IndexRequestBuilder[] documents = new IndexRequestBuilder[nDocs];
    for (int j = 0; j < nDocs; j++) {
        documents[j] = client.prepareIndex(indexName, "_doc").setSource("foo", "bar");
    }
    indexRandom(true, documents);
    flushAndRefresh();

    // Phase 2: register a file-system repository and take the first snapshot.
    logger.info("--> creating repository");
    assertAcked(client().admin().cluster().preparePutRepository("test-repo")
        .setType("fs")
        .setSettings(Settings.builder()
            .put("location", repo)));

    final String snapshot1 = "test-snap-1";
    logger.info("--> creating snapshot [{}]", snapshot1);
    final SnapshotInfo snapshotInfo = client().admin().cluster().prepareCreateSnapshot("test-repo", snapshot1)
        .setWaitForCompletion(true)
        .get()
        .getSnapshotInfo();
    assertThat(snapshotInfo.failedShards(), equalTo(0));
    assertThat(snapshotInfo.successfulShards(), equalTo(snapshotInfo.totalShards()));
    assertThat(snapshotInfo.indices(), hasSize(1));

    // Phase 3: locate the shard-level index file of shard 0 in the repository and corrupt it by truncation.
    RepositoriesService service = internalCluster().getInstance(RepositoriesService.class, internalCluster().getMasterName());
    Repository repository = service.repository("test-repo");

    final RepositoryData repositoryData = getRepositoryData(repository);
    final Map<String, IndexId> indexIds = repositoryData.getIndices();
    assertThat(indexIds.size(), equalTo(1));

    final IndexId corruptedIndex = indexIds.get(indexName);
    final Path shardIndexFile = repo.resolve("indices")
        .resolve(corruptedIndex.getId()).resolve("0")
        .resolve("index-0");

    logger.info("--> truncating shard index file [{}]", shardIndexFile);
    try (SeekableByteChannel outChan = Files.newByteChannel(shardIndexFile, StandardOpenOption.WRITE)) {
        // Cut the file down to a small random length so that it can no longer be read as a shard index.
        outChan.truncate(randomInt(10));
    }

    // Phase 4: the snapshot listing must be unaffected by the broken shard index file.
    logger.info("--> verifying snapshot state for [{}]", snapshot1);
    List<SnapshotInfo> snapshotInfos = client().admin().cluster().prepareGetSnapshots("test-repo").get().getSnapshots();
    assertThat(snapshotInfos.size(), equalTo(1));
    assertThat(snapshotInfos.get(0).state(), equalTo(SnapshotState.SUCCESS));
    assertThat(snapshotInfos.get(0).snapshotId().getName(), equalTo(snapshot1));

    // Phase 5: restoring the first snapshot must still bring back all documents.
    logger.info("--> deleting index [{}]", indexName);
    assertAcked(client().admin().indices().prepareDelete(indexName));

    logger.info("--> restoring snapshot [{}]", snapshot1);
    client().admin().cluster().prepareRestoreSnapshot("test-repo", snapshot1)
        .setRestoreGlobalState(randomBoolean())
        .setWaitForCompletion(true)
        .get();
    ensureGreen();

    assertHitCount(client().prepareSearch(indexName).setSize(0).get(), nDocs);

    // Phase 6: index a second batch and snapshot again on top of the broken shard index file.
    logger.info("--> indexing [{}] more documents into [{}]", nDocs, indexName);
    for (int j = 0; j < nDocs; j++) {
        documents[j] = client.prepareIndex(indexName, "_doc").setSource("foo2", "bar2");
    }
    indexRandom(true, documents);

    final String snapshot2 = "test-snap-2";
    logger.info("--> creating snapshot [{}]", snapshot2);
    final SnapshotInfo snapshotInfo2 = client().admin().cluster().prepareCreateSnapshot("test-repo", snapshot2)
        .setWaitForCompletion(true)
        .get()
        .getSnapshotInfo();
    assertThat(snapshotInfo2.state(), equalTo(SnapshotState.SUCCESS));
    assertThat(snapshotInfo2.failedShards(), equalTo(0));
    // Compare against the second snapshot's own shard total, not the first snapshot's.
    assertThat(snapshotInfo2.successfulShards(), equalTo(snapshotInfo2.totalShards()));
    assertThat(snapshotInfo2.indices(), hasSize(1));

    // Phase 7: restoring the second snapshot must yield the documents of both indexing rounds.
    logger.info("--> deleting index [{}]", indexName);
    assertAcked(client().admin().indices().prepareDelete(indexName));

    logger.info("--> restoring snapshot [{}]", snapshot2);
    client().admin().cluster().prepareRestoreSnapshot("test-repo", snapshot2)
        .setRestoreGlobalState(randomBoolean())
        .setWaitForCompletion(true)
        .get();
    ensureGreen();

    assertHitCount(client().prepareSearch(indexName).setSize(0).get(), 2 * nDocs);
}
public void testCannotCreateSnapshotsWithSameName() throws Exception {
final String repositoryName = "test-repo";
final String snapshotName = "test-snap";