diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index fb64188b79b..ddb11710386 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -228,6 +228,9 @@ Bug Fixes * SOLR-8375: ReplicaAssigner rejects valid nodes (Kelvin Tan, noble) +* SOLR-8728: ReplicaAssigner throws NPE when a partial list of nodes are only participating in replica + placement. splitshard should preassign nodes using rules, if rules are present (noble, Shai Erera) + Optimizations ---------------------- * SOLR-7876: Speed up queries and operations that use many terms when timeAllowed has not been diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java index 401409ad942..c2b7c9065a5 100644 --- a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java +++ b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java @@ -1308,13 +1308,18 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler // TODO: change this to handle sharding a slice into > 2 sub-shards. + + Map nodeMap = identifyNodes(clusterState, + new ArrayList<>(clusterState.getLiveNodes()), + new ZkNodeProps(collection.getProperties()), + subSlices, repFactor - 1); + List> replicas = new ArrayList<>((repFactor - 1) * 2); - for (int i = 1; i <= subSlices.size(); i++) { - Collections.shuffle(nodeList, RANDOM); - String sliceName = subSlices.get(i - 1); - for (int j = 2; j <= repFactor; j++) { - String subShardNodeName = nodeList.get((repFactor * (i - 1) + (j - 2)) % nodeList.size()); - String shardName = collectionName + "_" + sliceName + "_replica" + (j); + + for (Map.Entry entry : nodeMap.entrySet()) { + String sliceName = entry.getKey().shard; + String subShardNodeName = entry.getValue(); + String shardName = collectionName + "_" + sliceName + "_replica" + (entry.getKey().index); log.info("Creating replica shard " + shardName + " as part of slice " + sliceName + " of collection " + collectionName + " on " + subShardNodeName); @@ -1349,7 +1354,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler replicas.add(propMap); } - } // we must set the slice state into recovery before actually creating the replica cores // this ensures that the logic inside Overseer to update sub-shard state to 'active' @@ -2078,8 +2082,8 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler ZkNodeProps message, List shardNames, int repFactor) throws IOException { - List maps = (List) message.get("rule"); - if (maps == null) { + List rulesMap = (List) message.get("rule"); + if (rulesMap == null) { int i = 0; Map result = new HashMap<>(); for (String aShard : shardNames) { @@ -2092,7 +2096,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler } List rules = new ArrayList<>(); - for (Object map : maps) rules.add(new Rule((Map) map)); + for (Object map : rulesMap) rules.add(new Rule((Map) map)); Map sharVsReplicaCount = new HashMap<>(); @@ -2159,8 +2163,10 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler ShardHandler shardHandler = shardHandlerFactory.getShardHandler(); // Kind of unnecessary, but it does put the logic of whether to override maxShardsPerNode in one place. - node = getNodesForNewReplicas(clusterState, collection, shard, 1, node, - overseer.getZkController().getCoreContainer()).get(0).nodeName; + if(node == null) { + node = getNodesForNewReplicas(clusterState, collection, shard, 1, node, + overseer.getZkController().getCoreContainer()).get(0).nodeName; + } log.info("Node not provided, Identified {} for creating new replica", node); if (!clusterState.liveNodesContain(node)) { diff --git a/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java b/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java index c9f774f7062..b0249fe2816 100644 --- a/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java +++ b/solr/core/src/java/org/apache/solr/cloud/rule/ReplicaAssigner.java @@ -55,7 +55,7 @@ public class ReplicaAssigner { Map shardVsReplicaCount; Map> nodeVsTags; Map> shardVsNodes; - List liveNodes; + List participatingLiveNodes; Set tagNames = new HashSet<>(); private Map nodeVsCores = new HashMap<>(); @@ -93,12 +93,12 @@ public class ReplicaAssigner { Map shardVsReplicaCount, List snitches, Map> shardVsNodes, - List liveNodes, + List participatingLiveNodes, CoreContainer cc, ClusterState clusterState) { this.rules = rules; for (Rule rule : rules) tagNames.add(rule.tag.name); this.shardVsReplicaCount = shardVsReplicaCount; - this.liveNodes = new ArrayList<>(liveNodes); + this.participatingLiveNodes = new ArrayList<>(participatingLiveNodes); this.nodeVsTags = getTagsForNodes(cc, snitches); this.shardVsNodes = getDeepCopy(shardVsNodes, 2); validateTags(nodeVsTags); @@ -209,7 +209,7 @@ public class ReplicaAssigner { Map result = new LinkedHashMap<>(); int startPosition = 0; Map> copyOfCurrentState = getDeepCopy(shardVsNodes, 2); - List sortedLiveNodes = new ArrayList<>(this.liveNodes); + List sortedLiveNodes = new ArrayList<>(this.participatingLiveNodes); Collections.sort(sortedLiveNodes, new Comparator() { @Override public int compare(String n1, String n2) { @@ -397,7 +397,7 @@ public class ReplicaAssigner { } - for (String node : liveNodes) { + for (String node : participatingLiveNodes) { //now use the Snitch to get the tags for (SnitchInfoImpl info : snitches.values()) { if (!info.myTags.isEmpty()) { @@ -419,7 +419,7 @@ public class ReplicaAssigner { String node = e.getKey(); if (context.exception != null) { failedNodes.put(node, context); - liveNodes.remove(node); + participatingLiveNodes.remove(node); log.warn("Not all tags were obtained from node " + node); context.exception = new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Not all tags were obtained from node " + node); @@ -436,7 +436,7 @@ public class ReplicaAssigner { } } - if (liveNodes.isEmpty()) { + if (participatingLiveNodes.isEmpty()) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Could not get all tags for any nodes"); } diff --git a/solr/core/src/java/org/apache/solr/cloud/rule/Rule.java b/solr/core/src/java/org/apache/solr/cloud/rule/Rule.java index d73a9bf9671..209b7559f7d 100644 --- a/solr/core/src/java/org/apache/solr/cloud/rule/Rule.java +++ b/solr/core/src/java/org/apache/solr/cloud/rule/Rule.java @@ -163,7 +163,9 @@ public class Rule { Map nodesInThisShard = shardVsNodeSet.get(shardCondition.val.equals(WILD_WILD_CARD) ? entry.getKey() : shardName); if (nodesInThisShard != null) { for (Map.Entry aNode : nodesInThisShard.entrySet()) { - Object obj = nodeVsTags.get(aNode.getKey()).get(tag.name); + Map tagValues = nodeVsTags.get(aNode.getKey()); + if(tagValues == null) continue; + Object obj = tagValues.get(tag.name); if (tagCondition.canMatch(obj, phase)) countMatchingThisTagValue += aNode.getValue(); } } diff --git a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java index bbcae33aa1e..22735abdb25 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ShardSplitTest.java @@ -24,7 +24,9 @@ import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.QueryRequest; +import org.apache.solr.client.solrj.response.CollectionAdminResponse; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.cloud.ClusterState; @@ -86,7 +88,32 @@ public class ShardSplitTest extends BasicDistributedZkTest { // and the new sub-shards don't have any. waitForRecoveriesToFinish(true); //waitForThingsToLevelOut(15); + } + @Test + public void testSplitShardWithRule() throws Exception { + waitForThingsToLevelOut(15); + + if (usually()) { + log.info("Using legacyCloud=false for cluster"); + CollectionsAPIDistributedZkTest.setClusterProp(cloudClient, "legacyCloud", "false"); + } + + log.info("Starting testSplitShardWithRule"); + String collectionName = "shardSplitWithRule"; + CollectionAdminRequest.Create createRequest = new CollectionAdminRequest.Create() + .setCollectionName(collectionName) + .setNumShards(1) + .setReplicationFactor(2) + .setRule("shard:*,replica:<2,node:*"); + CollectionAdminResponse response = createRequest.process(cloudClient); + assertEquals(0, response.getStatus()); + + CollectionAdminRequest.SplitShard splitShardRequest = new CollectionAdminRequest.SplitShard() + .setCollectionName(collectionName) + .setShardName("shard1"); + response = splitShardRequest.process(cloudClient); + assertEquals(String.valueOf(response.getErrorMessages()), 0, response.getStatus()); } private void incompleteOrOverlappingCustomRangeTest() throws Exception {