Allow shards on shared filesystems to be recovered on any node

Currently, when all copies of a shard are lost, we reach out to all other nodes to see whether they have a copy of the data. For a shared filesystem, though, we can assume that each node has a copy of the data available, so return a state version of at least 0 for each node. This feature is set using the dynamic index setting `index.shared_filesystem.recover_on_any_node`, which defaults to `false`. Fixes #10932
2015-05-04 10:22:11 -06:00 · 2015-05-04 10:22:11 -06:00 · 18ab718693
parent c0659ce4d4
commit 18ab718693
4 changed files with 62 additions and 2 deletions
--- a/src/main/java/org/elasticsearch/cluster/metadata/IndexMetaData.java
+++ b/src/main/java/org/elasticsearch/cluster/metadata/IndexMetaData.java
@ -165,6 +165,7 @@ public class IndexMetaData implements Diffable<IndexMetaData> {
    public static final String SETTING_LEGACY_ROUTING_HASH_FUNCTION = "index.legacy.routing.hash.type";
    public static final String SETTING_LEGACY_ROUTING_USE_TYPE = "index.legacy.routing.use_type";
    public static final String SETTING_DATA_PATH = "index.data_path";
+    public static final String SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE = "index.shared_filesystem.recover_on_any_node";
    public static final String INDEX_UUID_NA_VALUE = "_na_";

    // hard-coded hash function as of 2.0
--- a/src/main/java/org/elasticsearch/gateway/GatewayAllocator.java
+++ b/src/main/java/org/elasticsearch/gateway/GatewayAllocator.java
@ -372,6 +372,14 @@ public class GatewayAllocator extends AbstractComponent {
        return changed;
    }

+    /**
+     * Build a map of DiscoveryNodes to shard state number for the given shard.
+     * A state of -1 means the shard does not exist on the node, where any
+     * shard state >= 0 is the state version of the shard on that node's disk.
+     *
+     * A shard on shared storage will return at least shard state 0 for all
+     * nodes, indicating that the shard can be allocated to any node.
+     */
    private ObjectLongOpenHashMap<DiscoveryNode> buildShardStates(final DiscoveryNodes nodes, MutableShardRouting shard, IndexMetaData indexMetaData) {
        ObjectLongOpenHashMap<DiscoveryNode> shardStates = cachedShardsState.get(shard.shardId());
        ObjectOpenHashSet<String> nodeIds;
@ -405,10 +413,18 @@ public class GatewayAllocator extends AbstractComponent {
        logListActionFailures(shard, "state", response.failures());

        for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : response) {
+            long version = nodeShardState.version();
+            Settings idxSettings = indexMetaData.settings();
+            if (IndexMetaData.isOnSharedFilesystem(idxSettings) &&
+                    idxSettings.getAsBoolean(IndexMetaData.SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE, false)) {
+                // Shared filesystems use 0 as a minimum shard state, which
+                // means that the shard can be allocated to any node
+                version = Math.max(0, version);
+            }
            // -1 version means it does not exists, which is what the API returns, and what we expect to
            logger.trace("[{}] on node [{}] has version [{}] of shard",
-                    shard, nodeShardState.getNode(), nodeShardState.version());
-            shardStates.put(nodeShardState.getNode(), nodeShardState.version());
+                    shard, nodeShardState.getNode(), version);
+            shardStates.put(nodeShardState.getNode(), version);
        }
        return shardStates;
    }
--- a/src/main/java/org/elasticsearch/index/settings/IndexDynamicSettingsModule.java
+++ b/src/main/java/org/elasticsearch/index/settings/IndexDynamicSettingsModule.java
@ -71,6 +71,7 @@ public class IndexDynamicSettingsModule extends AbstractModule {
        indexDynamicSettings.addDynamicSetting(IndexMetaData.SETTING_BLOCKS_READ);
        indexDynamicSettings.addDynamicSetting(IndexMetaData.SETTING_BLOCKS_WRITE);
        indexDynamicSettings.addDynamicSetting(IndexMetaData.SETTING_BLOCKS_METADATA);
+        indexDynamicSettings.addDynamicSetting(IndexMetaData.SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE);
        indexDynamicSettings.addDynamicSetting(IndicesTTLService.INDEX_TTL_DISABLE_PURGE);
        indexDynamicSettings.addDynamicSetting(IndexShard.INDEX_REFRESH_INTERVAL, Validator.TIME);
        indexDynamicSettings.addDynamicSetting(GatewayAllocator.INDEX_RECOVERY_INITIAL_SHARDS);
--- a/src/test/java/org/elasticsearch/index/IndexWithShadowReplicasTests.java
+++ b/src/test/java/org/elasticsearch/index/IndexWithShadowReplicasTests.java
@ -448,4 +448,46 @@ public class IndexWithShadowReplicasTests extends ElasticsearchIntegrationTest {
        assertThat(hits[2].field("foo").getValue().toString(), equalTo("eggplant"));
        assertThat(hits[3].field("foo").getValue().toString(), equalTo("foo"));
    }
+
+    @Test
+    public void testIndexOnSharedFSRecoversToAnyNode() throws Exception {
+        Settings nodeSettings = ImmutableSettings.builder()
+                .put("node.add_id_to_custom_path", false)
+                .put("node.enable_custom_paths", true)
+                .build();
+
+        internalCluster().startNode(nodeSettings);
+        Path dataPath = createTempDir();
+        String IDX = "test";
+
+        Settings idxSettings = ImmutableSettings.builder()
+                .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 5)
+                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
+                .put(IndexMetaData.SETTING_DATA_PATH, dataPath.toAbsolutePath().toString())
+                .put(IndexMetaData.SETTING_SHARED_FILESYSTEM, true)
+                .put(IndexMetaData.SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE, true)
+                .build();
+
+        // only one node, so all primaries will end up on node1
+        prepareCreate(IDX).setSettings(idxSettings).addMapping("doc", "foo", "type=string,index=not_analyzed").get();
+        ensureGreen(IDX);
+
+        // Index some documents
+        client().prepareIndex(IDX, "doc", "1").setSource("foo", "foo").get();
+        client().prepareIndex(IDX, "doc", "2").setSource("foo", "bar").get();
+        client().prepareIndex(IDX, "doc", "3").setSource("foo", "baz").get();
+        client().prepareIndex(IDX, "doc", "4").setSource("foo", "eggplant").get();
+
+        // start a second node
+        internalCluster().startNode(nodeSettings);
+
+        // node1 is master, stop that one, since we only have primaries,
+        // usually this would mean data loss, but not on shared fs!
+        internalCluster().stopCurrentMasterNode();
+
+        ensureGreen(IDX);
+        refresh();
+        SearchResponse resp = client().prepareSearch(IDX).setQuery(matchAllQuery()).addFieldDataField("foo").addSort("foo", SortOrder.ASC).get();
+        assertHitCount(resp, 4);
+    }
 }