Local Gateway: Expose `gateway.local.initial_shards` to control when to recover an index, closes #352.

kimchy 2010-09-03 18:38:24 +03:00
parent 575f5b406b
commit b40f0bc5a4
2 changed files with 180 additions and 19 deletions
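For context, the new `gateway.local.initial_shards` setting controls how many copies of a shard must be found in the nodes' local gateway state before the index is recovered; the code below accepts "quorum" (the default), "full", "full-1", or an explicit number. A minimal sketch of configuring it on a node, reusing the settings builder the test below already uses (the snippet is illustrative and not part of the commit):

// Illustrative only (not part of this commit): configuring the new setting on a node.
// Accepted values, per LocalGatewayNodeAllocation below: "quorum" (the default),
// "full", "full-1", or an explicit number such as "2".
// Uses org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder(), as the test below does.
Settings nodeSettings = settingsBuilder()
        .put("gateway.type", "local")
        .put("gateway.local.initial_shards", "quorum")
        .build();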

LocalGatewayNodeAllocation.java

@@ -21,6 +21,7 @@ package org.elasticsearch.gateway.local;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.FailedNodeException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.*;
@@ -31,6 +32,9 @@ import org.elasticsearch.common.collect.Sets;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.trove.ExtTObjectIntHasMap;
import org.elasticsearch.common.trove.TObjectIntHashMap;
import org.elasticsearch.common.trove.TObjectIntIterator;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
@@ -65,6 +69,8 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
private final TimeValue listTimeout;
private final String initialShards;
@Inject public LocalGatewayNodeAllocation(Settings settings, IndicesService indicesService,
TransportNodesListGatewayStartedShards listGatewayStartedShards, TransportNodesListShardStoreMetaData listShardStoreMetaData) {
super(settings);
@@ -73,6 +79,7 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
this.listShardStoreMetaData = listShardStoreMetaData;
this.listTimeout = componentSettings.getAsTime("list_timeout", TimeValue.timeValueSeconds(30));
this.initialShards = componentSettings.get("initial_shards", "quorum");
}
@Override public void applyStartedShards(NodeAllocations nodeAllocations, RoutingNodes routingNodes, DiscoveryNodes nodes, List<? extends ShardRouting> startedShards) {
@@ -166,16 +173,21 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
// make a list of ShardId to Node, each one from the latest version
Map<ShardId, Tuple<DiscoveryNode, Long>> shards = Maps.newHashMap();
// and a list of the number of shard instances
TObjectIntHashMap<ShardId> shardsCounts = new ExtTObjectIntHasMap<ShardId>().defaultReturnValue(-1);
for (TransportNodesListGatewayStartedShards.NodeLocalGatewayStartedShards nodeState : nodesState) {
if (nodeState.state() == null) {
continue;
}
for (Map.Entry<ShardId, Long> entry : nodeState.state().shards().entrySet()) {
ShardId shardId = entry.getKey();
if (shardId.index().name().equals(indexRoutingTable.index())) {
shardsCounts.adjustOrPutValue(shardId, 1, 1);
Tuple<DiscoveryNode, Long> t = shards.get(shardId);
if (t == null || entry.getValue() > t.v2().longValue()) {
t = new Tuple<DiscoveryNode, Long>(nodeState.node(), entry.getValue());
shards.put(shardId, t);
}
}
}
@@ -183,14 +195,34 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
// check if we managed to allocate to all of them, if not, move all relevant shards to ignored
if (shards.size() < indexRoutingTable.shards().size()) {
moveIndexToIgnoreUnassigned(routingNodes, indexRoutingTable);
} else {
// check if the counts meets the minimum set
int requiredNumber = 1;
IndexMetaData indexMetaData = routingNodes.metaData().index(indexRoutingTable.index());
if ("quorum".equals(initialShards)) {
if (indexMetaData.numberOfReplicas() > 1) {
requiredNumber = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
}
} else if ("full".equals(initialShards)) {
requiredNumber = indexMetaData.numberOfReplicas() + 1;
} else if ("full-1".equals(initialShards)) {
if (indexMetaData.numberOfReplicas() > 1) {
requiredNumber = indexMetaData.numberOfReplicas();
}
} else {
requiredNumber = Integer.parseInt(initialShards);
}
boolean allocate = true;
for (TObjectIntIterator<ShardId> it = shardsCounts.iterator(); it.hasNext();) {
it.advance();
if (it.value() < requiredNumber) {
allocate = false;
}
}
if (allocate) {
changed = true;
// we found all nodes to allocate to, do the allocation
for (Iterator<MutableShardRouting> it = routingNodes.unassigned().iterator(); it.hasNext();) {
@@ -203,6 +235,9 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
it.remove();
}
}
} else {
moveIndexToIgnoreUnassigned(routingNodes, indexRoutingTable);
}
}
}
}
@@ -322,6 +357,16 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
return changed;
}
private void moveIndexToIgnoreUnassigned(RoutingNodes routingNodes, IndexRoutingTable indexRoutingTable) {
for (Iterator<MutableShardRouting> it = routingNodes.unassigned().iterator(); it.hasNext();) {
MutableShardRouting shardRouting = it.next();
if (shardRouting.index().equals(indexRoutingTable.index())) {
it.remove();
routingNodes.ignoredUnassigned().add(shardRouting);
}
}
}
private ConcurrentMap<DiscoveryNode, IndexStore.StoreFilesMetaData> buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) {
ConcurrentMap<DiscoveryNode, IndexStore.StoreFilesMetaData> shardStores = cachedStores.get(shard.shardId());
if (shardStores == null) {

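To make the threshold above concrete, here is the same required-copies arithmetic worked through for the index the test below creates (index.number_of_replicas = 2, i.e. three copies of every shard); this is an illustrative sketch, not code from the commit:

// Required copies per shard for number_of_replicas = 2 (3 copies total), by initial_shards value:
int replicas = 2;
int quorum = ((1 + replicas) / 2) + 1; // 2 -> "quorum" (only applied when replicas > 1)
int full = replicas + 1;               // 3 -> "full"
int fullMinusOne = replicas;           // 2 -> "full-1" (only applied when replicas > 1)
// With a single replica, "quorum" and "full-1" fall back to requiring just 1 copy,
// while "full" requires both copies to be present before recovery proceeds.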
QuorumLocalGatewayTests.java

@@ -0,0 +1,116 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.integration.gateway.local;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
import org.elasticsearch.gateway.Gateway;
import org.elasticsearch.node.Node;
import org.elasticsearch.node.internal.InternalNode;
import org.elasticsearch.test.integration.AbstractNodesTests;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.Test;
import static org.elasticsearch.client.Requests.*;
import static org.elasticsearch.common.settings.ImmutableSettings.*;
import static org.elasticsearch.common.xcontent.XContentFactory.*;
import static org.elasticsearch.index.query.xcontent.QueryBuilders.*;
import static org.hamcrest.MatcherAssert.*;
import static org.hamcrest.Matchers.*;
/**
* @author kimchy (shay.banon)
*/
public class QuorumLocalGatewayTests extends AbstractNodesTests {
@AfterMethod public void cleanAndCloseNodes() throws Exception {
for (int i = 0; i < 10; i++) {
if (node("node" + i) != null) {
node("node" + i).stop();
// since we store (by default) the index snapshot under the gateway, resetting it will reset the index data as well
((InternalNode) node("node" + i)).injector().getInstance(Gateway.class).reset();
}
}
closeAllNodes();
}
@Test public void testQuorumRecovery() throws Exception {
// clean three nodes
buildNode("node1", settingsBuilder().put("gateway.type", "local").build());
buildNode("node2", settingsBuilder().put("gateway.type", "local").build());
buildNode("node3", settingsBuilder().put("gateway.type", "local").build());
cleanAndCloseNodes();
Node node1 = startNode("node1", settingsBuilder().put("gateway.type", "local").put("index.number_of_shards", 2).put("index.number_of_replicas", 2).build());
Node node2 = startNode("node2", settingsBuilder().put("gateway.type", "local").put("index.number_of_shards", 2).put("index.number_of_replicas", 2).build());
Node node3 = startNode("node3", settingsBuilder().put("gateway.type", "local").put("index.number_of_shards", 2).put("index.number_of_replicas", 2).build());
node1.client().prepareIndex("test", "type1", "1").setSource(jsonBuilder().startObject().field("field", "value1").endObject()).execute().actionGet();
node1.client().admin().indices().prepareFlush().execute().actionGet();
node1.client().prepareIndex("test", "type1", "2").setSource(jsonBuilder().startObject().field("field", "value2").endObject()).execute().actionGet();
node1.client().admin().indices().prepareRefresh().execute().actionGet();
logger.info("--> running cluster_health (wait for the shards to startup)");
ClusterHealthResponse clusterHealth = client("node1").admin().cluster().health(clusterHealthRequest().waitForGreenStatus().waitForActiveShards(6)).actionGet();
logger.info("--> done cluster_health, status " + clusterHealth.status());
assertThat(clusterHealth.timedOut(), equalTo(false));
assertThat(clusterHealth.status(), equalTo(ClusterHealthStatus.GREEN));
for (int i = 0; i < 10; i++) {
assertThat(node1.client().prepareCount().setQuery(matchAllQuery()).execute().actionGet().count(), equalTo(2l));
}
logger.info("--> closing first node, and indexing more data to the second node");
closeNode("node1");
logger.info("--> running cluster_health (wait for the shards to startup)");
clusterHealth = client("node2").admin().cluster().health(clusterHealthRequest().waitForYellowStatus().waitForActiveShards(4)).actionGet();
logger.info("--> done cluster_health, status " + clusterHealth.status());
assertThat(clusterHealth.timedOut(), equalTo(false));
assertThat(clusterHealth.status(), equalTo(ClusterHealthStatus.YELLOW));
node2.client().prepareIndex("test", "type1", "3").setSource(jsonBuilder().startObject().field("field", "value3").endObject()).execute().actionGet();
node2.client().admin().indices().prepareRefresh().execute().actionGet();
for (int i = 0; i < 10; i++) {
assertThat(node2.client().prepareCount().setQuery(matchAllQuery()).execute().actionGet().count(), equalTo(3l));
}
logger.info("--> closing the second node and third node");
closeNode("node2");
closeNode("node3");
logger.info("--> starting the nodes back, verifying we got the latest version");
node1 = startNode("node1", settingsBuilder().put("gateway.type", "local").build());
node2 = startNode("node2", settingsBuilder().put("gateway.type", "local").build());
node2 = startNode("node3", settingsBuilder().put("gateway.type", "local").build());
logger.info("--> running cluster_health (wait for the shards to startup)");
clusterHealth = client("node1").admin().cluster().health(clusterHealthRequest().waitForGreenStatus().waitForActiveShards(6)).actionGet();
logger.info("--> done cluster_health, status " + clusterHealth.status());
assertThat(clusterHealth.timedOut(), equalTo(false));
assertThat(clusterHealth.status(), equalTo(ClusterHealthStatus.GREEN));
for (int i = 0; i < 10; i++) {
assertThat(node1.client().prepareCount().setQuery(matchAllQuery()).execute().actionGet().count(), equalTo(3l));
}
}
}