mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-03-30 11:58:36 +00:00
The loading of `RepositoryData` is not an atomic operation. It uses a list + get combination of calls. This lead to accidentally returning an empty repository data for generations >=0 which can never not exist unless the repository is corrupted. In the test #48122 (and other SLM tests) there was a low chance of running into this concurrent modification scenario and the repository actually moving two index generations between listing out the index-N and loading the latest version of it. Since we only keep two index-N around at a time this lead to unexpectedly absent snapshots in status APIs. Fixing the behavior to be more resilient is non-trivial but in the works. For now I think we should simply throw in this scenario. This will also help prevent corruption in the unlikely event but possible of running into this issue in a snapshot create or delete operation on master failover on a repository like S3 which doesn't have the "no overwrites" protection on writing a new index-N. Fixes #48122
This commit is contained in:
parent
a22f6fbe3c
commit
3c20541823
server/src/main/java/org/elasticsearch/repositories/blobstore
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm
@ -907,9 +907,6 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
|
||||
public RepositoryData getRepositoryData() {
|
||||
try {
|
||||
return getRepositoryData(latestIndexBlobId());
|
||||
} catch (NoSuchFileException ex) {
|
||||
// repository doesn't have an index blob, its a new blank repo
|
||||
return RepositoryData.EMPTY;
|
||||
} catch (IOException ioe) {
|
||||
throw new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe);
|
||||
}
|
||||
@ -922,17 +919,12 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
|
||||
try {
|
||||
final String snapshotsIndexBlobName = INDEX_FILE_PREFIX + Long.toString(indexGen);
|
||||
|
||||
RepositoryData repositoryData;
|
||||
// EMPTY is safe here because RepositoryData#fromXContent calls namedObject
|
||||
try (InputStream blob = blobContainer().readBlob(snapshotsIndexBlobName);
|
||||
XContentParser parser = XContentType.JSON.xContent().createParser(NamedXContentRegistry.EMPTY,
|
||||
LoggingDeprecationHandler.INSTANCE, blob)) {
|
||||
repositoryData = RepositoryData.snapshotsFromXContent(parser, indexGen);
|
||||
return RepositoryData.snapshotsFromXContent(parser, indexGen);
|
||||
}
|
||||
return repositoryData;
|
||||
} catch (NoSuchFileException ex) {
|
||||
// repository doesn't have an index blob, its a new blank repo
|
||||
return RepositoryData.EMPTY;
|
||||
} catch (IOException ioe) {
|
||||
throw new RepositoryException(metadata.name(), "could not read repository data from index blob", ioe);
|
||||
}
|
||||
|
@ -390,18 +390,24 @@ public class SLMSnapshotBlockingIntegTests extends ESIntegTestCase {
|
||||
logger.info("--> waiting for {} snapshot [{}] to be deleted", expectedUnsuccessfulState, failedSnapshotName.get());
|
||||
assertBusy(() -> {
|
||||
try {
|
||||
try {
|
||||
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
|
||||
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
|
||||
assertThat(snapshotsStatusResponse.getSnapshots(), empty());
|
||||
} catch (SnapshotMissingException e) {
|
||||
// This is what we want to happen
|
||||
}
|
||||
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
|
||||
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
|
||||
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
|
||||
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
|
||||
assertThat(snapshotsStatusResponse.getSnapshots(), empty());
|
||||
} catch (SnapshotMissingException e) {
|
||||
// This is what we want to happen
|
||||
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
|
||||
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
|
||||
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
|
||||
} catch (RepositoryException re) {
|
||||
// Concurrent status calls and write operations may lead to failures in determining the current repository generation
|
||||
// TODO: Remove this hack once tracking the current repository generation has been made consistent
|
||||
throw new AssertionError(re);
|
||||
}
|
||||
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
|
||||
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
|
||||
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
|
||||
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
|
||||
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
|
||||
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user