Fix Infinite Retry Loop in loading RepositoryData (#50987) (#51093)

* Fix Infinite Retry Loop in loading RepositoryData

We were running into an infinite loop when trying to load corrupted (or otherwise un-loadable)
repository data for a repo that uses best effort consistency (e.g. that was just freshly mounted
as done in the test) because we kepy resetting to `-1` on `IOException`, listing and finding the broken
generation `N` and then interpreted the subsequent reset to `-1` as a concurrent change to the repository.
This commit is contained in:
Armin Braun 2020-01-16 21:08:35 +01:00 committed by GitHub
parent f6c89b4599
commit e51b209dd3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 51 additions and 4 deletions

View File

@ -1065,6 +1065,9 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
return;
}
// Retry loading RepositoryData in a loop in case we run into concurrent modifications of the repository.
// Keep track of the most recent generation we failed to load so we can break out of the loop if we fail to load the same
// generation repeatedly.
long lastFailedGeneration = RepositoryData.UNKNOWN_REPO_GEN;
while (true) {
final long genToLoad;
if (bestEffortConsistency) {
@ -1074,7 +1077,9 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
try {
generation = latestIndexBlobId();
} catch (IOException ioe) {
throw new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe);
listener.onFailure(
new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe));
return;
}
genToLoad = latestKnownRepoGen.updateAndGet(known -> Math.max(known, generation));
if (genToLoad > generation) {
@ -1089,7 +1094,9 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
listener.onResponse(getRepositoryData(genToLoad));
return;
} catch (RepositoryException e) {
if (genToLoad != latestKnownRepoGen.get()) {
// If the generation to load changed concurrently and we didn't just try loading the same generation before we retry
if (genToLoad != latestKnownRepoGen.get() && genToLoad != lastFailedGeneration) {
lastFailedGeneration = genToLoad;
logger.warn("Failed to load repository data generation [" + genToLoad +
"] because a concurrent operation moved the current generation to [" + latestKnownRepoGen.get() + "]", e);
continue;
@ -1099,10 +1106,10 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
// of N so we mark this repository as corrupted.
markRepoCorrupted(genToLoad, e,
ActionListener.wrap(v -> listener.onFailure(corruptedStateException(e)), listener::onFailure));
return;
} else {
throw e;
listener.onFailure(e);
}
return;
}
}
}

View File

@ -303,6 +303,46 @@ public class CorruptedBlobStoreRepositoryIT extends AbstractSnapshotIntegTestCas
}
}
public void testMountCorruptedRepositoryData() throws Exception {
disableRepoConsistencyCheck("This test intentionally corrupts the repository contents");
Client client = client();
Path repo = randomRepoPath();
final String repoName = "test-repo";
logger.info("--> creating repository at {}", repo.toAbsolutePath());
assertAcked(client.admin().cluster().preparePutRepository(repoName)
.setType("fs").setSettings(Settings.builder()
.put("location", repo)
.put("compress", false)));
final String snapshot = "test-snap";
logger.info("--> creating snapshot");
CreateSnapshotResponse createSnapshotResponse = client.admin().cluster().prepareCreateSnapshot(repoName, snapshot)
.setWaitForCompletion(true).setIndices("test-idx-*").get();
assertThat(createSnapshotResponse.getSnapshotInfo().successfulShards(),
equalTo(createSnapshotResponse.getSnapshotInfo().totalShards()));
logger.info("--> corrupt index-N blob");
final Repository repository = internalCluster().getCurrentMasterNodeInstance(RepositoriesService.class).repository(repoName);
final RepositoryData repositoryData = getRepositoryData(repository);
Files.write(repo.resolve("index-" + repositoryData.getGenId()), randomByteArrayOfLength(randomIntBetween(1, 100)));
logger.info("--> verify loading repository data throws RepositoryException");
expectThrows(RepositoryException.class, () -> getRepositoryData(repository));
logger.info("--> mount repository path in a new repository");
final String otherRepoName = "other-repo";
assertAcked(client.admin().cluster().preparePutRepository(otherRepoName)
.setType("fs").setSettings(Settings.builder()
.put("location", repo)
.put("compress", false)));
final Repository otherRepo = internalCluster().getCurrentMasterNodeInstance(RepositoriesService.class).repository(otherRepoName);
logger.info("--> verify loading repository data from newly mounted repository throws RepositoryException");
expectThrows(RepositoryException.class, () -> getRepositoryData(otherRepo));
}
private void assertRepositoryBlocked(Client client, String repo, String existingSnapshot) {
logger.info("--> try to delete snapshot");
final RepositoryException repositoryException3 = expectThrows(RepositoryException.class,