Cleanup Concurrent RepositoryData Loading () ()

The loading of `RepositoryData` is not an atomic operation.
It uses a list + get combination of calls.
This lead to accidentally returning an empty repository data
for generations >=0 which can never not exist unless the repository
is corrupted.
In the test  (and other SLM tests) there was a low chance of
running into this concurrent modification scenario and the repository
actually moving two index generations between listing out the
index-N and loading the latest version of it. Since we only keep
two index-N around at a time this lead to unexpectedly absent
snapshots in status APIs.
Fixing the behavior to be more resilient is non-trivial but in the works.
For now I think we should simply throw in this scenario. This will also
help prevent corruption in the unlikely event but possible of running into this
issue in a snapshot create or delete operation on master failover on a
repository like S3 which doesn't have the "no overwrites" protection on
writing a new index-N.

Fixes 
This commit is contained in:
Armin Braun 2019-11-02 20:42:29 +01:00 committed by GitHub
parent a22f6fbe3c
commit 3c20541823
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 17 additions and 19 deletions
server/src/main/java/org/elasticsearch/repositories/blobstore
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm

@ -907,9 +907,6 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
public RepositoryData getRepositoryData() {
try {
return getRepositoryData(latestIndexBlobId());
} catch (NoSuchFileException ex) {
// repository doesn't have an index blob, its a new blank repo
return RepositoryData.EMPTY;
} catch (IOException ioe) {
throw new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe);
}
@ -922,17 +919,12 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
try {
final String snapshotsIndexBlobName = INDEX_FILE_PREFIX + Long.toString(indexGen);
RepositoryData repositoryData;
// EMPTY is safe here because RepositoryData#fromXContent calls namedObject
try (InputStream blob = blobContainer().readBlob(snapshotsIndexBlobName);
XContentParser parser = XContentType.JSON.xContent().createParser(NamedXContentRegistry.EMPTY,
LoggingDeprecationHandler.INSTANCE, blob)) {
repositoryData = RepositoryData.snapshotsFromXContent(parser, indexGen);
return RepositoryData.snapshotsFromXContent(parser, indexGen);
}
return repositoryData;
} catch (NoSuchFileException ex) {
// repository doesn't have an index blob, its a new blank repo
return RepositoryData.EMPTY;
} catch (IOException ioe) {
throw new RepositoryException(metadata.name(), "could not read repository data from index blob", ioe);
}

@ -390,18 +390,24 @@ public class SLMSnapshotBlockingIntegTests extends ESIntegTestCase {
logger.info("--> waiting for {} snapshot [{}] to be deleted", expectedUnsuccessfulState, failedSnapshotName.get());
assertBusy(() -> {
try {
try {
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
assertThat(snapshotsStatusResponse.getSnapshots(), empty());
} catch (SnapshotMissingException e) {
// This is what we want to happen
}
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
.prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
assertThat(snapshotsStatusResponse.getSnapshots(), empty());
} catch (SnapshotMissingException e) {
// This is what we want to happen
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
} catch (RepositoryException re) {
// Concurrent status calls and write operations may lead to failures in determining the current repository generation
// TODO: Remove this hack once tracking the current repository generation has been made consistent
throw new AssertionError(re);
}
logger.info("--> {} snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
expectedUnsuccessfulState, failedSnapshotName.get(), successfulSnapshotName.get());
GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
.prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
});
}
}