Fixes retrieval of the latest snapshot index blob (#22700)
This commit ensures that the index.latest blob is examined first to determine the latest index-N blob id, before attempting to list all index-N blobs and picking the blob with the highest N. It also fixes MockRepository#move so that tests are able to handle non-atomic moves. This is done by adding a special setting to the MockRepository through which a test specifies whether it can handle non-atomic moves; if so, the MockRepository#move operation will be non-atomic, allowing tests to run against such repositories.
parent f7524fbdef
commit 3bf06d1440
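In rough terms, the new lookup order works like the sketch below: read the generation recorded in index.latest first, fall back to listing the index-N blobs, and treat a repository where neither works as empty. This is a simplified, self-contained restatement of the BlobStoreRepository change shown in the diff; the Callable parameters and the EMPTY_REPO_GEN constant are placeholders standing in for readSnapshotIndexLatestBlob, listBlobsToGetLatestIndexId, and RepositoryData.EMPTY_REPO_GEN.

    import java.io.IOException;
    import java.util.concurrent.Callable;

    // Simplified illustration only; names are placeholders, not Elasticsearch APIs.
    final class LatestIndexLookupSketch {

        static final long EMPTY_REPO_GEN = -1L; // stands in for RepositoryData.EMPTY_REPO_GEN

        static long latestIndexBlobId(Callable<Long> readIndexLatestBlob,
                                      Callable<Long> listIndexBlobs) throws Exception {
            try {
                // first, read the generation recorded in the index.latest blob
                return readIndexLatestBlob.call();
            } catch (IOException e) {
                // index.latest is missing: either an empty repository, or a crash hit the
                // small window between deleting and re-writing index.latest
                try {
                    // fall back to listing all index-N blobs and taking the highest N
                    return listIndexBlobs.call();
                } catch (UnsupportedOperationException uoe) {
                    // listing is not supported (e.g. read-only repositories): assume empty
                    return EMPTY_REPO_GEN;
                }
            }
        }
    }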
@@ -190,15 +190,15 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp

     private static final String INDEX_METADATA_CODEC = "index-metadata";

-    protected static final String SNAPSHOT_NAME_FORMAT = SNAPSHOT_PREFIX + "%s.dat";
+    private static final String SNAPSHOT_NAME_FORMAT = SNAPSHOT_PREFIX + "%s.dat";

-    protected static final String SNAPSHOT_INDEX_PREFIX = "index-";
+    private static final String SNAPSHOT_INDEX_PREFIX = "index-";

-    protected static final String SNAPSHOT_INDEX_NAME_FORMAT = SNAPSHOT_INDEX_PREFIX + "%s";
+    private static final String SNAPSHOT_INDEX_NAME_FORMAT = SNAPSHOT_INDEX_PREFIX + "%s";

-    protected static final String SNAPSHOT_INDEX_CODEC = "snapshots";
+    private static final String SNAPSHOT_INDEX_CODEC = "snapshots";

-    protected static final String DATA_BLOB_PREFIX = "__";
+    private static final String DATA_BLOB_PREFIX = "__";

     private final RateLimiter snapshotRateLimiter;
@@ -732,23 +732,22 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
     */
    long latestIndexBlobId() throws IOException {
        try {
-            // first, try listing the blobs and determining which index blob is the latest
-            return listBlobsToGetLatestIndexId();
-        } catch (UnsupportedOperationException e) {
-            // could not list the blobs because the repository does not support the operation,
-            // try reading from the index-latest file
-            try {
+            // first, try reading the latest index generation from the index.latest blob
            return readSnapshotIndexLatestBlob();
        } catch (IOException ioe) {
-            // we likely could not find the blob, this can happen in two scenarios:
+            // we could not find the index.latest blob, this can happen in two scenarios:
            // (1) its an empty repository
            // (2) when writing the index-latest blob, if the blob already exists,
            //     we first delete it, then atomically write the new blob. there is
            //     a small window in time when the blob is deleted and the new one
            //     written - if the node crashes during that time, we won't have an
            //     index-latest blob
-            // in a read-only repository, we can't know which of the two scenarios it is,
-            // but we will assume (1) because we can't do anything about (2) anyway
+            // lets try to list all index-N blobs to determine the last one, if listing the blobs
+            // is not a supported operation (which is the case for read-only repositories), then
+            // assume its an empty repository.
+            try {
+                return listBlobsToGetLatestIndexId();
+            } catch (UnsupportedOperationException uoe) {
                return RepositoryData.EMPTY_REPO_GEN;
            }
        }
@@ -2738,6 +2738,8 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
                Settings.builder()
                    .put("location", repoPath)
                    .put("random_control_io_exception_rate", randomIntBetween(5, 20) / 100f)
+                    // test that we can take a snapshot after a failed one, even if a partial index-N was written
+                    .put("atomic_move", false)
                    .put("random", randomAsciiOfLength(10))));

        logger.info("--> indexing some data");
@@ -42,7 +42,6 @@ import org.elasticsearch.common.blobstore.BlobContainer;
 import org.elasticsearch.common.blobstore.BlobMetaData;
 import org.elasticsearch.common.blobstore.BlobPath;
 import org.elasticsearch.common.blobstore.BlobStore;
-import org.elasticsearch.common.compress.NotXContentException;
 import org.elasticsearch.common.io.PathUtils;
 import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Setting.Property;
@@ -52,7 +51,6 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.plugins.RepositoryPlugin;
 import org.elasticsearch.repositories.Repository;
 import org.elasticsearch.repositories.IndexId;
-import org.elasticsearch.repositories.RepositoryData;
 import org.elasticsearch.repositories.fs.FsRepository;
 import org.elasticsearch.snapshots.SnapshotId;
@@ -102,6 +100,8 @@ public class MockRepository extends FsRepository {

    private volatile boolean blockOnDataFiles;

+    private volatile boolean atomicMove;
+
    private volatile boolean blocked = false;

    public MockRepository(RepositoryMetaData metadata, Environment environment,
@@ -116,6 +116,7 @@ public class MockRepository extends FsRepository {
        blockOnInitialization = metadata.settings().getAsBoolean("block_on_init", false);
        randomPrefix = metadata.settings().get("random", "default");
        waitAfterUnblock = metadata.settings().getAsLong("wait_after_unblock", 0L);
+        atomicMove = metadata.settings().getAsBoolean("atomic_move", true);
        logger.info("starting mock repository with random prefix {}", randomPrefix);
        mockBlobStore = new MockBlobStore(super.blobStore());
    }
@@ -156,52 +157,7 @@ public class MockRepository extends FsRepository {
        return mockBlobStore;
    }

-    public void unblock() {
-        unblockExecution();
-    }
-
-    public void blockOnDataFiles(boolean blocked) {
-        blockOnDataFiles = blocked;
-    }
-
-    @Override
-    public RepositoryData getRepositoryData() {
-        final int numIterations = 10;
-        int count = 0;
-        NotXContentException ex = null;
-        RepositoryData repositoryData = null;
-        while (count < numIterations) {
-            try {
-                repositoryData = super.getRepositoryData();
-            } catch (NotXContentException e) {
-                ex = e;
-            }
-            if (repositoryData != null) {
-                break;
-            }
-            count++;
-            try {
-                Thread.sleep(1000L);
-            } catch (InterruptedException e) {
-            }
-        }
-        if (ex != null) {
-            logger.info("--> [{}] repository failed to read x-content from index file, on iteration [{}] the repository data was [{}]",
-                metadata.name(), count, repositoryData);
-            throw ex;
-        }
-        return repositoryData;
-    }
-
-    public void blockOnControlFiles(boolean blocked) {
-        blockOnControlFiles = blocked;
-    }
-
-    public boolean blockOnDataFiles() {
-        return blockOnDataFiles;
-    }
-
-    public synchronized void unblockExecution() {
+    public synchronized void unblock() {
        blocked = false;
        // Clean blocking flags, so we wouldn't try to block again
        blockOnDataFiles = false;
@@ -210,6 +166,10 @@ public class MockRepository extends FsRepository {
        this.notifyAll();
    }

+    public void blockOnDataFiles(boolean blocked) {
+        blockOnDataFiles = blocked;
+    }
+
    public boolean blocked() {
        return blocked;
    }
@@ -300,9 +260,7 @@ public class MockRepository extends FsRepository {
                    }
                }
            }
-            }
-            // don't block on the index-N files, as getRepositoryData depends on it
-            else if (blobName.startsWith("index-") == false) {
+            } else {
                if (shouldFail(blobName, randomControlIOExceptionRate) && (incrementAndGetFailureCount() < maximumNumberOfFailures)) {
                    logger.info("throwing random IOException for file [{}] at path [{}]", blobName, path());
                    throw new IOException("Random IOException");
@@ -357,16 +315,16 @@ public class MockRepository extends FsRepository {

        @Override
        public void move(String sourceBlob, String targetBlob) throws IOException {
-            if (RandomizedContext.current().getRandom().nextBoolean()) {
+            if (atomicMove) {
+                // atomic move since this inherits from FsBlobContainer which provides atomic moves
+                maybeIOExceptionOrBlock(targetBlob);
+                super.move(sourceBlob, targetBlob);
+            } else {
                // simulate a non-atomic move, since many blob container implementations
                // will not have an atomic move, and we should be able to handle that
                maybeIOExceptionOrBlock(targetBlob);
                super.writeBlob(targetBlob, super.readBlob(sourceBlob), 0L);
                super.deleteBlob(sourceBlob);
-            } else {
-                // atomic move since this inherits from FsBlobContainer which provides atomic moves
-                maybeIOExceptionOrBlock(targetBlob);
-                super.move(sourceBlob, targetBlob);
            }
        }

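For reference, a test opts in to the simulated non-atomic moves through the new "atomic_move" setting, as in the SharedClusterSnapshotRestoreIT hunk above. A minimal sketch, assuming the usual snapshot integration-test helpers (client(), randomRepoPath(), assertAcked) are in scope and using a hypothetical "test-repo" name:

    // register a mock repository whose moves may be non-atomic
    assertAcked(client().admin().cluster().preparePutRepository("test-repo")
        .setType("mock")
        .setSettings(Settings.builder()
            .put("location", randomRepoPath())
            // allow MockRepository to perform non-atomic moves, so a failed snapshot
            // can leave a partially written index-N blob for the next snapshot to handle
            .put("atomic_move", false)));

Leaving the setting at its default of true keeps the mock repository's moves atomic.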