improve local shard allocation to utilize same-version shard copies within the same replication group
commit a07030ccf3
parent 1a0ee00fbb
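Instead of remembering a single node with the highest shard version, primary allocation now collects every node whose local copy is at the highest version into a set. Each candidate is tried in turn: a positive decision allocates immediately, while THROTTLE and NO decisions are recorded per node. The shard is deferred only when every candidate throttles; if no candidate throttles but all of them answer NO, the shard is force-allocated to one of them anyway, since this is our master data.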
@@ -27,7 +27,11 @@ import org.elasticsearch.cluster.routing.MutableShardRouting;
 import org.elasticsearch.cluster.routing.RoutingNode;
 import org.elasticsearch.cluster.routing.RoutingNodes;
 import org.elasticsearch.cluster.routing.ShardRouting;
-import org.elasticsearch.cluster.routing.allocation.*;
+import org.elasticsearch.cluster.routing.allocation.FailedRerouteAllocation;
+import org.elasticsearch.cluster.routing.allocation.NodeAllocation;
+import org.elasticsearch.cluster.routing.allocation.NodeAllocations;
+import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
+import org.elasticsearch.cluster.routing.allocation.StartedRerouteAllocation;
 import org.elasticsearch.common.collect.Maps;
 import org.elasticsearch.common.collect.Sets;
 import org.elasticsearch.common.inject.Inject;
@@ -110,8 +114,8 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
 
             int numberOfAllocationsFound = 0;
             long highestVersion = -1;
-            DiscoveryNode nodeWithHighestVersion = null;
-            for (TObjectLongIterator<DiscoveryNode> it = nodesState.iterator(); it.hasNext();) {
+            Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
+            for (TObjectLongIterator<DiscoveryNode> it = nodesState.iterator(); it.hasNext(); ) {
                 it.advance();
                 DiscoveryNode node = it.key();
                 long version = it.value();
@@ -122,12 +126,15 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
                 if (version != -1) {
                     numberOfAllocationsFound++;
                     if (highestVersion == -1) {
-                        nodeWithHighestVersion = node;
+                        nodesWithHighestVersion.add(node);
                         highestVersion = version;
                     } else {
                         if (version > highestVersion) {
-                            nodeWithHighestVersion = node;
+                            nodesWithHighestVersion.clear();
+                            nodesWithHighestVersion.add(node);
                             highestVersion = version;
+                        } else if (version == highestVersion) {
+                            nodesWithHighestVersion.add(node);
                         }
                     }
                 }
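The two hunks above swap the single nodeWithHighestVersion for a set that keeps every node tied at the maximum version. A minimal standalone sketch of that tie-collecting scan, assuming a plain java.util map in place of the Trove iterator (the N type parameter and method name are illustrative, not the allocator's own):

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

class HighestVersionNodes {

    // Collect every node whose shard copy is at the highest version seen.
    // A version of -1 means the node holds no copy of the shard.
    static <N> Set<N> nodesWithHighestVersion(Map<N, Long> versions) {
        long highestVersion = -1;
        Set<N> nodes = new HashSet<N>();
        for (Map.Entry<N, Long> entry : versions.entrySet()) {
            long version = entry.getValue();
            if (version == -1) {
                continue;
            }
            if (version > highestVersion) {
                highestVersion = version;
                nodes.clear(); // a strictly newer copy invalidates earlier ties
                nodes.add(entry.getKey());
            } else if (version == highestVersion) {
                nodes.add(entry.getKey()); // tie: same version, keep both
            }
        }
        return nodes;
    }
}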
@@ -161,24 +168,50 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
                     continue;
                 }
 
-                RoutingNode node = routingNodes.node(nodeWithHighestVersion.id());
-                // check if we need to throttle, NOTE, we don't check on NO since it does not apply
-                // since this is our master data!
-                if (nodeAllocations.canAllocate(shard, node, allocation) == NodeAllocation.Decision.THROTTLE) {
+                Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
+                Set<DiscoveryNode> noNodes = Sets.newHashSet();
+                for (DiscoveryNode discoNode : nodesWithHighestVersion) {
+                    RoutingNode node = routingNodes.node(discoNode.id());
+                    Decision decision = nodeAllocations.canAllocate(shard, node, allocation);
+                    if (decision == NodeAllocation.Decision.THROTTLE) {
+                        throttledNodes.add(discoNode);
+                    } else if (decision == Decision.NO) {
+                        noNodes.add(discoNode);
+                    } else {
+                        if (logger.isDebugEnabled()) {
+                            logger.debug("[{}][{}]: allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode);
+                        }
+                        // we found a match
+                        changed = true;
+                        node.add(shard);
+                        unassignedIterator.remove();
+
+                        // found a node, so no throttling, no "no", and break out of the loop
+                        throttledNodes.clear();
+                        noNodes.clear();
+                        break;
+                    }
+                }
+                if (throttledNodes.isEmpty()) {
+                    // if we have a node that we "can't" allocate to, force allocation, since this is our master data!
+                    if (!noNodes.isEmpty()) {
+                        DiscoveryNode discoNode = noNodes.iterator().next();
+                        RoutingNode node = routingNodes.node(discoNode.id());
+                        if (logger.isDebugEnabled()) {
+                            logger.debug("[{}][{}]: forcing allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode);
+                        }
+                        // we found a match
+                        changed = true;
+                        node.add(shard);
+                        unassignedIterator.remove();
+                    }
+                } else {
                     if (logger.isDebugEnabled()) {
-                        logger.debug("[{}][{}]: throttling allocation [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, nodeWithHighestVersion);
+                        logger.debug("[{}][{}]: throttling allocation [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, throttledNodes);
                     }
                     // we are throttling this, but we have enough to allocate to this node, ignore it for now
                     unassignedIterator.remove();
                     routingNodes.ignoredUnassigned().add(shard);
-                } else {
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("[{}][{}]: allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, nodeWithHighestVersion);
-                    }
-                    // we found a match
-                    changed = true;
-                    node.add(shard);
-                    unassignedIterator.remove();
                 }
             }
 
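The rewritten block above walks the same-version candidates and buckets each canAllocate decision before deciding what to do with the shard. A compact sketch of that control flow, with Decision and Allocator as hypothetical stand-ins for the engine's own types:

import java.util.HashSet;
import java.util.Set;

class PrimaryAllocationFlow {

    enum Decision { YES, THROTTLE, NO }

    interface Allocator {
        Decision canAllocate(String nodeId);

        void allocate(String nodeId);
    }

    // Try every same-version candidate: allocate on the first YES,
    // otherwise bucket the node as throttled or refused.
    static boolean allocatePrimary(Set<String> candidates, Allocator allocator) {
        Set<String> throttledNodes = new HashSet<String>();
        Set<String> noNodes = new HashSet<String>();
        for (String nodeId : candidates) {
            Decision decision = allocator.canAllocate(nodeId);
            if (decision == Decision.THROTTLE) {
                throttledNodes.add(nodeId);
            } else if (decision == Decision.NO) {
                noNodes.add(nodeId);
            } else {
                allocator.allocate(nodeId); // found a match
                return true;
            }
        }
        if (throttledNodes.isEmpty() && !noNodes.isEmpty()) {
            // every candidate said NO: force the allocation, since the
            // highest-version copy is the only master data we have
            allocator.allocate(noNodes.iterator().next());
            return true;
        }
        // some candidate throttled: leave the shard unassigned for now
        return false;
    }
}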
@@ -301,7 +334,7 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
             nodeIds = nodes.dataNodes().keySet();
         } else {
             // clean nodes that have failed
-            for (TObjectLongIterator<DiscoveryNode> it = shardStates.iterator(); it.hasNext();) {
+            for (TObjectLongIterator<DiscoveryNode> it = shardStates.iterator(); it.hasNext(); ) {
                 it.advance();
                 if (!nodes.nodeExists(it.key().id())) {
                     it.remove();
@@ -351,7 +384,7 @@ public class LocalGatewayNodeAllocation extends NodeAllocation {
         } else {
             nodesIds = Sets.newHashSet();
             // clean nodes that have failed
-            for (Iterator<DiscoveryNode> it = shardStores.keySet().iterator(); it.hasNext();) {
+            for (Iterator<DiscoveryNode> it = shardStores.keySet().iterator(); it.hasNext(); ) {
                 DiscoveryNode node = it.next();
                 if (!nodes.nodeExists(node.id())) {
                     it.remove();