From e51b209dd35f4153bdc9be9f4cd2c083b14e6c28 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Thu, 16 Jan 2020 21:08:35 +0100 Subject: [PATCH] Fix Infinite Retry Loop in loading RepositoryData (#50987) (#51093) * Fix Infinite Retry Loop in loading RepositoryData We were running into an infinite loop when trying to load corrupted (or otherwise un-loadable) repository data for a repo that uses best effort consistency (e.g. that was just freshly mounted as done in the test) because we kepy resetting to `-1` on `IOException`, listing and finding the broken generation `N` and then interpreted the subsequent reset to `-1` as a concurrent change to the repository. --- .../blobstore/BlobStoreRepository.java | 15 +++++-- .../CorruptedBlobStoreRepositoryIT.java | 40 +++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java index d64e8d63b23..fe53ba3208d 100644 --- a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java @@ -1065,6 +1065,9 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp return; } // Retry loading RepositoryData in a loop in case we run into concurrent modifications of the repository. + // Keep track of the most recent generation we failed to load so we can break out of the loop if we fail to load the same + // generation repeatedly. + long lastFailedGeneration = RepositoryData.UNKNOWN_REPO_GEN; while (true) { final long genToLoad; if (bestEffortConsistency) { @@ -1074,7 +1077,9 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp try { generation = latestIndexBlobId(); } catch (IOException ioe) { - throw new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe); + listener.onFailure( + new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe)); + return; } genToLoad = latestKnownRepoGen.updateAndGet(known -> Math.max(known, generation)); if (genToLoad > generation) { @@ -1089,7 +1094,9 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp listener.onResponse(getRepositoryData(genToLoad)); return; } catch (RepositoryException e) { - if (genToLoad != latestKnownRepoGen.get()) { + // If the generation to load changed concurrently and we didn't just try loading the same generation before we retry + if (genToLoad != latestKnownRepoGen.get() && genToLoad != lastFailedGeneration) { + lastFailedGeneration = genToLoad; logger.warn("Failed to load repository data generation [" + genToLoad + "] because a concurrent operation moved the current generation to [" + latestKnownRepoGen.get() + "]", e); continue; @@ -1099,10 +1106,10 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp // of N so we mark this repository as corrupted. markRepoCorrupted(genToLoad, e, ActionListener.wrap(v -> listener.onFailure(corruptedStateException(e)), listener::onFailure)); - return; } else { - throw e; + listener.onFailure(e); } + return; } } } diff --git a/server/src/test/java/org/elasticsearch/snapshots/CorruptedBlobStoreRepositoryIT.java b/server/src/test/java/org/elasticsearch/snapshots/CorruptedBlobStoreRepositoryIT.java index 6269d9bcdd8..56866029464 100644 --- a/server/src/test/java/org/elasticsearch/snapshots/CorruptedBlobStoreRepositoryIT.java +++ b/server/src/test/java/org/elasticsearch/snapshots/CorruptedBlobStoreRepositoryIT.java @@ -303,6 +303,46 @@ public class CorruptedBlobStoreRepositoryIT extends AbstractSnapshotIntegTestCas } } + public void testMountCorruptedRepositoryData() throws Exception { + disableRepoConsistencyCheck("This test intentionally corrupts the repository contents"); + Client client = client(); + + Path repo = randomRepoPath(); + final String repoName = "test-repo"; + logger.info("--> creating repository at {}", repo.toAbsolutePath()); + assertAcked(client.admin().cluster().preparePutRepository(repoName) + .setType("fs").setSettings(Settings.builder() + .put("location", repo) + .put("compress", false))); + + final String snapshot = "test-snap"; + + logger.info("--> creating snapshot"); + CreateSnapshotResponse createSnapshotResponse = client.admin().cluster().prepareCreateSnapshot(repoName, snapshot) + .setWaitForCompletion(true).setIndices("test-idx-*").get(); + assertThat(createSnapshotResponse.getSnapshotInfo().successfulShards(), + equalTo(createSnapshotResponse.getSnapshotInfo().totalShards())); + + logger.info("--> corrupt index-N blob"); + final Repository repository = internalCluster().getCurrentMasterNodeInstance(RepositoriesService.class).repository(repoName); + final RepositoryData repositoryData = getRepositoryData(repository); + Files.write(repo.resolve("index-" + repositoryData.getGenId()), randomByteArrayOfLength(randomIntBetween(1, 100))); + + logger.info("--> verify loading repository data throws RepositoryException"); + expectThrows(RepositoryException.class, () -> getRepositoryData(repository)); + + logger.info("--> mount repository path in a new repository"); + final String otherRepoName = "other-repo"; + assertAcked(client.admin().cluster().preparePutRepository(otherRepoName) + .setType("fs").setSettings(Settings.builder() + .put("location", repo) + .put("compress", false))); + final Repository otherRepo = internalCluster().getCurrentMasterNodeInstance(RepositoriesService.class).repository(otherRepoName); + + logger.info("--> verify loading repository data from newly mounted repository throws RepositoryException"); + expectThrows(RepositoryException.class, () -> getRepositoryData(otherRepo)); + } + private void assertRepositoryBlocked(Client client, String repo, String existingSnapshot) { logger.info("--> try to delete snapshot"); final RepositoryException repositoryException3 = expectThrows(RepositoryException.class,