Allow primaries that have never been allocated to be allocated if under the low watermark
Fixes #6196
This commit is contained in:
parent 80321d89d9
commit 7023caa1a1
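In short: canAllocate used to reject any shard on a node that had fallen below the low
watermark. After this change, a primary that has never been allocated anywhere is still
allowed onto such a node, provided the node remains above the high watermark. A minimal
sketch of the rule (illustrative names, not the real decider API; byte-valued watermarks
are treated as "minimum free space required", matching the comparisons in the diff):

    enum Decision { YES, NO }

    static Decision canAllocate(boolean isPrimary, boolean primaryHasBeenAllocated,
                                long freeBytes, long lowWatermarkFreeBytes, long highWatermarkFreeBytes) {
        if (freeBytes >= lowWatermarkFreeBytes) {
            return Decision.YES;   // enough free space: anything may allocate
        }
        // Below the low watermark: replicas and previously allocated primaries are rejected
        if (!isPrimary || primaryHasBeenAllocated) {
            return Decision.NO;
        }
        // A never-allocated primary is still allowed, but only above the high watermark
        return freeBytes > highWatermarkFreeBytes ? Decision.YES : Decision.NO;
    }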
@@ -131,14 +131,21 @@ public class DiskThresholdDecider extends AllocationDecider {
     }

     public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {

         // Always allow allocation if the decider is disabled
         if (!enabled) {
             return allocation.decision(Decision.YES, NAME, "disk threshold decider disabled");
         }

+        // Allow allocation regardless if only a single node is available
+        if (allocation.nodes().size() <= 1) {
+            if (logger.isTraceEnabled()) {
+                logger.trace("Only a single node is present, allowing allocation");
+            }
+            return allocation.decision(Decision.YES, NAME, "only a single node is present");
+        }
+
         // Fail open if there is no info available
         ClusterInfo clusterInfo = allocation.clusterInfo();
         if (clusterInfo == null) {
             if (logger.isTraceEnabled()) {
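(The single-node fast path above is also new: with one node there is nowhere else to
place or relocate shards, so the decider short-circuits to YES before any disk checks.)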
@@ -147,6 +154,7 @@ public class DiskThresholdDecider extends AllocationDecider {
             return allocation.decision(Decision.YES, NAME, "cluster info unavailable");
         }

         // Fail open if there are no disk usages available
         Map<String, DiskUsage> usages = clusterInfo.getNodeDiskUsages();
+        Map<String, Long> shardSizes = clusterInfo.getShardSizes();
         if (usages.isEmpty()) {
@@ -173,21 +181,72 @@ public class DiskThresholdDecider extends AllocationDecider {
         if (logger.isDebugEnabled()) {
             logger.debug("Node [{}] has {}% free disk", node.nodeId(), freeDiskPercentage);
         }

+        // a flag for whether the primary shard has been previously allocated
+        boolean primaryHasBeenAllocated = allocation.routingTable().index(shardRouting.index()).shard(shardRouting.id()).primaryAllocatedPostApi();
+
         // checks for exact byte comparisons
         if (freeBytes < freeBytesThresholdLow.bytes()) {
-            if (logger.isDebugEnabled()) {
-                logger.debug("Less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation",
-                        freeBytesThresholdLow, freeBytes, node.nodeId());
+            // If the shard is a replica, or is a primary that has been allocated before, check the low threshold
+            if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation",
+                            freeBytesThresholdLow, freeBytes, node.nodeId());
+                }
+                return allocation.decision(Decision.NO, NAME, "less than required [%s] free on node, free: [%s]",
+                        freeBytesThresholdLow, new ByteSizeValue(freeBytes));
+            } else if (freeBytes > freeBytesThresholdHigh.bytes()) {
+                // Allow the shard to be allocated because it is a primary that
+                // has never been allocated, and the node is under the high watermark
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Less than the required {} free bytes threshold ({} bytes free) on node {}, " +
+                            "but allowing allocation because primary has never been allocated",
+                            freeBytesThresholdLow, freeBytes, node.nodeId());
+                }
+                return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
+            } else {
+                // Even though the primary has never been allocated, the node is
+                // above the high watermark, so don't allow allocating the shard
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Less than the required {} free bytes threshold ({} bytes free) on node {}, " +
+                            "preventing allocation even though primary has never been allocated",
+                            freeBytesThresholdHigh, freeBytes, node.nodeId());
+                }
+                return allocation.decision(Decision.NO, NAME, "less than required [%s] free on node, free: [%s]",
+                        freeBytesThresholdHigh, new ByteSizeValue(freeBytes));
             }
-            return allocation.decision(Decision.NO, NAME, "less than required [%s] free on node, free: [%s]",
-                    freeBytesThresholdLow, new ByteSizeValue(freeBytes));
         }

         // checks for percentage comparisons
         if (freeDiskPercentage < freeDiskThresholdLow) {
-            if (logger.isDebugEnabled()) {
-                logger.debug("Less than the required {}% free disk threshold ({}% free) on node [{}], preventing allocation",
-                        freeDiskThresholdLow, freeDiskPercentage, node.nodeId());
+            // If the shard is a replica, or is a primary that has been allocated before, check the low threshold
+            if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Less than the required {}% free disk threshold ({}% free) on node [{}], preventing allocation",
+                            freeDiskThresholdLow, freeDiskPercentage, node.nodeId());
+                }
+                return allocation.decision(Decision.NO, NAME, "less than required [%s%%] free disk on node, free: [%s%%]",
+                        freeDiskThresholdLow, freeDiskPercentage);
+            } else if (freeDiskPercentage > freeDiskThresholdHigh) {
+                // Allow the shard to be allocated because it is a primary that
+                // has never been allocated, and the node is under the high watermark
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Less than the required {}% free disk threshold ({}% free) on node [{}], " +
+                            "but allowing allocation because primary has never been allocated",
+                            freeDiskThresholdLow, freeDiskPercentage, node.nodeId());
+                }
+                return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
+            } else {
+                // Even though the primary has never been allocated, the node is
+                // above the high watermark, so don't allow allocating the shard
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Less than the required {}% free disk threshold ({}% free) on node [{}], " +
+                            "preventing allocation even though primary has never been allocated",
+                            freeDiskThresholdHigh, freeDiskPercentage, node.nodeId());
+                }
+                return allocation.decision(Decision.NO, NAME, "less than required [%s%%] free disk on node, free: [%s%%]",
+                        freeDiskThresholdHigh, freeDiskPercentage);
             }
-            return allocation.decision(Decision.NO, NAME, "less than required [%s%%] free disk on node, free: [%s%%]",
-                    freeDiskThresholdLow, freeDiskThresholdLow);
         }

         // Secondly, check that allocating the shard to this node doesn't put it above the high watermark
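The remaining hunks update DiskThresholdDeciderTests to cover the new behavior: the
absolute-size test now starts with two nodes that are both below the low watermark (but
above the high one), asserts that the never-allocated primary is still placed, and then
walks the replicas through recovery as disk space is freed and nodes are added.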
@@ -34,6 +34,7 @@ import org.elasticsearch.cluster.routing.allocation.allocator.ShardsAllocators;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.ElasticsearchAllocationTestCase;
+import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.junit.Test;

 import java.util.Arrays;
@@ -237,17 +238,19 @@ public class DiskThresholdDeciderTests extends ElasticsearchAllocationTestCase {
     }

     @Test
+    @TestLogging("cluster.routing.allocation.decider:TRACE")
     public void diskThresholdWithAbsoluteSizesTest() {
         Settings diskSettings = settingsBuilder()
                 .put(DiskThresholdDecider.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED, true)
                 .put(DiskThresholdDecider.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK, "30b")
-                .put(DiskThresholdDecider.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK, "20b").build();
+                .put(DiskThresholdDecider.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK, "9b").build();

         Map<String, DiskUsage> usages = new HashMap<>();
         usages.put("node1", new DiskUsage("node1", 100, 10)); // 90% used
-        usages.put("node2", new DiskUsage("node2", 100, 35)); // 65% used
+        usages.put("node2", new DiskUsage("node2", 100, 10)); // 90% used
         usages.put("node3", new DiskUsage("node3", 100, 60)); // 40% used
         usages.put("node4", new DiskUsage("node4", 100, 80)); // 20% used
+        usages.put("node5", new DiskUsage("node5", 100, 85)); // 15% used

         Map<String, Long> shardSizes = new HashMap<>();
         shardSizes.put("[test][0][p]", 10L); // 10 bytes
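Note the watermark semantics this test relies on: when the watermarks are given as byte
values, they are minimum required free space, so the high watermark ("9b") is the smaller
number. A quick worked check against the fixtures above (plain arithmetic mirroring the
decider's comparisons):

    // node1/node2: 100 bytes total, 10 bytes free  -> 90% used
    // low watermark:  30 bytes free required; 10 < 30 -> replicas are rejected
    // high watermark:  9 bytes free required; 10 >  9 -> a never-allocated primary is allowed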
@@ -274,7 +277,7 @@ public class DiskThresholdDeciderTests extends ElasticsearchAllocationTestCase {
                 .build(), deciders, new ShardsAllocators(), cis);

         MetaData metaData = MetaData.builder()
-                .put(IndexMetaData.builder("test").numberOfShards(1).numberOfReplicas(1))
+                .put(IndexMetaData.builder("test").numberOfShards(1).numberOfReplicas(2))
                 .build();

         RoutingTable routingTable = RoutingTable.builder()
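In the next hunk the test simulates disk space being freed mid-test: it swaps in a stub
ClusterInfoService whose getClusterInfo() returns a hand-built ClusterInfo with the
updated usages, rebuilds the AllocationService against it, and reroutes so the replica
can initialize.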
@@ -283,35 +286,65 @@ public class DiskThresholdDeciderTests extends ElasticsearchAllocationTestCase {

         ClusterState clusterState = ClusterState.builder(org.elasticsearch.cluster.ClusterName.DEFAULT).metaData(metaData).routingTable(routingTable).build();

-        logger.info("--> adding two nodes");
+        logger.info("--> adding node1 and node2");
         clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder()
                 .put(newNode("node1"))
                 .put(newNode("node2"))
         ).build();

         routingTable = strategy.reroute(clusterState).routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         logShardStates(clusterState);

-        // Primary shard should be initializing, replica should not
+        // Primary should initialize, even though both nodes are over the limit
         assertThat(clusterState.routingNodes().shardsWithState(INITIALIZING).size(), equalTo(1));

+        String nodeWithPrimary, nodeWithoutPrimary;
+        if (clusterState.getRoutingNodes().node("node1").size() == 1) {
+            nodeWithPrimary = "node1";
+            nodeWithoutPrimary = "node2";
+        } else {
+            nodeWithPrimary = "node2";
+            nodeWithoutPrimary = "node1";
+        }
+
+        // Make the node without the primary habitable for replicas
+        usages.put(nodeWithoutPrimary, new DiskUsage(nodeWithoutPrimary, 100, 35)); // 65% used
+        final ClusterInfo clusterInfo2 = new ClusterInfo(ImmutableMap.copyOf(usages), ImmutableMap.copyOf(shardSizes));
+        cis = new ClusterInfoService() {
+            @Override
+            public ClusterInfo getClusterInfo() {
+                logger.info("--> calling fake getClusterInfo");
+                return clusterInfo2;
+            }
+        };
+        strategy = new AllocationService(settingsBuilder()
+                .put("cluster.routing.allocation.concurrent_recoveries", 10)
+                .put("cluster.routing.allocation.allow_rebalance", "always")
+                .put("cluster.routing.allocation.cluster_concurrent_rebalance", -1)
+                .build(), deciders, new ShardsAllocators(), cis);
+
+        routingTable = strategy.reroute(clusterState).routingTable();
+        clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
+        logShardStates(clusterState);
+
+        // Now the replica should be able to initialize
+        assertThat(clusterState.routingNodes().shardsWithState(INITIALIZING).size(), equalTo(2));

         logger.info("--> start the shards (primaries)");
         routingTable = strategy.applyStartedShards(clusterState, clusterState.routingNodes().shardsWithState(INITIALIZING)).routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

         logShardStates(clusterState);
-        // Assert that we're able to start the primary
-        assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(1));
-        // Assert that node1 didn't get any shards because its disk usage is too high
-        assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(0));
+        // Assert that we're able to start the primary and replica, since they were both initializing
+        assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(2));
+        // Assert that node1 got a single shard (the primary), even though its disk usage is too high
+        assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1));
+        // Assert that node2 got a single shard (a replica)
+        assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(1));

         logger.info("--> start the shards (replicas)");
         routingTable = strategy.applyStartedShards(clusterState, clusterState.routingNodes().shardsWithState(INITIALIZING)).routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

         logShardStates(clusterState);
-        // Assert that the replica couldn't be started since node1 doesn't have enough space
-        assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(1));
-        // Assert that one replica is still unassigned
-        //assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED).size(), equalTo(1));

         logger.info("--> adding node3");
@@ -323,7 +356,7 @@ public class DiskThresholdDeciderTests extends ElasticsearchAllocationTestCase {

         logShardStates(clusterState);
         // Assert that the replica is initialized now that node3 is available with enough space
-        assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(1));
+        assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(2));
         assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));

         logger.info("--> start the shards (replicas)");
@@ -331,9 +364,9 @@ public class DiskThresholdDeciderTests extends ElasticsearchAllocationTestCase {
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

         logShardStates(clusterState);
-        // Assert that the replica couldn't be started since node1 doesn't have enough space
-        assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(2));
-        assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(0));
+        // Assert that all replicas could be started
+        assertThat(clusterState.routingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(3));
+        assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1));
         assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(1));
         assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
@@ -363,8 +396,8 @@ public class DiskThresholdDeciderTests extends ElasticsearchAllocationTestCase {
         logShardStates(clusterState);

         // Shards remain started
-        assertThat(clusterState.routingNodes().shardsWithState(STARTED).size(), equalTo(2));
-        assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(0));
+        assertThat(clusterState.routingNodes().shardsWithState(STARTED).size(), equalTo(3));
+        assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1));
         assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(1));
         assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
@@ -394,8 +427,8 @@ public class DiskThresholdDeciderTests extends ElasticsearchAllocationTestCase {

         logShardStates(clusterState);
         // Shards remain started
-        assertThat(clusterState.routingNodes().shardsWithState(STARTED).size(), equalTo(2));
-        assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(0));
+        assertThat(clusterState.routingNodes().shardsWithState(STARTED).size(), equalTo(3));
+        assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1));
         // Shard hasn't been moved off of node2 yet because there's nowhere for it to go
         assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(1));
         assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
@@ -410,7 +443,9 @@ public class DiskThresholdDeciderTests extends ElasticsearchAllocationTestCase {

         logShardStates(clusterState);
         // Shards remain started
-        assertThat(clusterState.routingNodes().shardsWithState(STARTED).size(), equalTo(1));
+        assertThat(clusterState.routingNodes().shardsWithState(STARTED).size(), equalTo(2));
+        // One shard is relocating off of node1
+        assertThat(clusterState.routingNodes().shardsWithState(RELOCATING).size(), equalTo(1));
         assertThat(clusterState.routingNodes().shardsWithState(INITIALIZING).size(), equalTo(1));

         logger.info("--> apply INITIALIZING shards");
|
|||
clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
|
||||
|
||||
logShardStates(clusterState);
|
||||
// primary shard already has been relocated away
|
||||
assertThat(clusterState.getRoutingNodes().node(nodeWithPrimary).size(), equalTo(0));
|
||||
// node with increased space still has its shard
|
||||
assertThat(clusterState.getRoutingNodes().node(nodeWithoutPrimary).size(), equalTo(1));
|
||||
assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
|
||||
assertThat(clusterState.getRoutingNodes().node("node4").size(), equalTo(1));
|
||||
|
||||
logger.info("--> adding node5");
|
||||
|
||||
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
|
||||
.put(newNode("node5"))
|
||||
).build();
|
||||
routingTable = strategy.reroute(clusterState).routingTable();
|
||||
clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
|
||||
|
||||
logShardStates(clusterState);
|
||||
// Shards remain started on node3 and node4
|
||||
assertThat(clusterState.routingNodes().shardsWithState(STARTED).size(), equalTo(2));
|
||||
// One shard is relocating off of node2 now
|
||||
assertThat(clusterState.routingNodes().shardsWithState(RELOCATING).size(), equalTo(1));
|
||||
// Initializing on node5
|
||||
assertThat(clusterState.routingNodes().shardsWithState(INITIALIZING).size(), equalTo(1));
|
||||
|
||||
logger.info("--> apply INITIALIZING shards");
|
||||
routingTable = strategy.applyStartedShards(clusterState, clusterState.routingNodes().shardsWithState(INITIALIZING)).routingTable();
|
||||
clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
|
||||
|
||||
logger.info("--> final cluster state:");
|
||||
logShardStates(clusterState);
|
||||
// Node1 still has no shards because it has no space for them
|
||||
assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(0));
|
||||
// Node4 is available now, so the shard is moved off of node2
|
||||
// Node5 is available now, so the shard is moved off of node2
|
||||
assertThat(clusterState.getRoutingNodes().node("node2").size(), equalTo(0));
|
||||
assertThat(clusterState.getRoutingNodes().node("node3").size(), equalTo(1));
|
||||
assertThat(clusterState.getRoutingNodes().node("node4").size(), equalTo(1));
|
||||
assertThat(clusterState.getRoutingNodes().node("node5").size(), equalTo(1));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@@ -537,15 +603,21 @@ public class DiskThresholdDeciderTests extends ElasticsearchAllocationTestCase {
         ).build();
         routingTable = strategy.reroute(clusterState).routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();

+        // Shard can be allocated to node1, even though it only has 25% free,
+        // because it's a primary that's never been allocated before
+        assertThat(clusterState.routingNodes().shardsWithState(INITIALIZING).size(), equalTo(1));
+
+        logger.info("--> start the shards (primaries)");
+        routingTable = strategy.applyStartedShards(clusterState, clusterState.routingNodes().shardsWithState(INITIALIZING)).routingTable();
+        clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         logShardStates(clusterState);

-        // Shard can't be allocated to node1 (or node2) because the average usage is 75% > 70%
-        assertThat(clusterState.routingNodes().shardsWithState(INITIALIZING).size(), equalTo(0));
-        // No shards are started, node1 doesn't have enough disk usage
-        assertThat(clusterState.routingNodes().shardsWithState(STARTED).size(), equalTo(0));
+        // A single shard is started on node1, even though it normally would not
+        // be allowed, because it's a primary that hasn't been allocated, and node1
+        // is still below the high watermark (unlike node3)
+        assertThat(clusterState.routingNodes().shardsWithState(STARTED).size(), equalTo(1));
         assertThat(clusterState.getRoutingNodes().node("node1").size(), equalTo(1));
     }

     @Test