diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index b1adaa51a9c..2d0f4666d84 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -119,6 +119,8 @@ Bug Fixes

 * SOLR-12513: Reproducing TestCodecSupport.testMixedCompressionMode failure (Erick Erickson)

+* SOLR-11665: Improve error handling of shard splitting. Fix splitting of mixed replica types. (ab)
+
 Optimizations
 ----------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
index 1d0fa6901fd..b580bd31bd8 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
@@ -172,7 +172,7 @@ public class Assign {
     return returnShardId;
   }

-  private static String buildSolrCoreName(String collectionName, String shard, Replica.Type type, int replicaNum) {
+  public static String buildSolrCoreName(String collectionName, String shard, Replica.Type type, int replicaNum) {
     // TODO: Adding the suffix is great for debugging, but may be an issue if at some point we want to support a way to change replica type
     return String.format(Locale.ROOT, "%s_%s_replica_%s%s", collectionName, shard, type.name().substring(0,1).toLowerCase(Locale.ROOT), replicaNum);
   }
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
index 327f3651359..ccb111a848e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
@@ -26,11 +26,15 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;

 import org.apache.solr.client.solrj.cloud.DistributedQueue;
+import org.apache.solr.client.solrj.cloud.NodeStateProvider;
 import org.apache.solr.client.solrj.cloud.autoscaling.PolicyHelper;
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
+import org.apache.solr.client.solrj.cloud.autoscaling.ReplicaInfo;
+import org.apache.solr.client.solrj.cloud.autoscaling.Suggestion;
 import org.apache.solr.client.solrj.request.CoreAdminRequest;
 import org.apache.solr.cloud.Overseer;
 import org.apache.solr.cloud.overseer.OverseerAction;
@@ -45,6 +49,7 @@ import org.apache.solr.common.cloud.ReplicaPosition;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.cloud.rule.ImplicitSnitch;
 import org.apache.solr.common.params.CommonAdminParams;
 import org.apache.solr.common.params.CoreAdminParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
@@ -52,11 +57,13 @@ import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.handler.component.ShardHandler;
 import org.apache.solr.util.TestInjection;
+import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
+import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_TYPE;
 import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
 import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA;
 import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD;
@@ -101,8 +108,11 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
       parentShardLeader = zkStateReader.getLeaderRetry(collectionName, slice.get(), 10000);
     } catch (InterruptedException e) {
       Thread.currentThread().interrupt();
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Interrupted.");
     }

+    checkDiskSpace(collectionName, slice.get(), parentShardLeader);
+
     // let's record the ephemeralOwner of the parent leader node
     Stat leaderZnodeStat = zkStateReader.getZkClient().exists(ZkStateReader.LIVE_NODES_ZKNODE + "/" + parentShardLeader.getNodeName(), null, true);
     if (leaderZnodeStat == null) {
@@ -114,7 +124,36 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
     List<String> subSlices = new ArrayList<>();
     List<String> subShardNames = new ArrayList<>();

-    String rangesStr = fillRanges(ocmh.cloudManager, message, collection, parentSlice, subRanges, subSlices, subShardNames);
+    // reproduce the currently existing number of replicas per type
+    AtomicInteger numNrt = new AtomicInteger();
+    AtomicInteger numTlog = new AtomicInteger();
+    AtomicInteger numPull = new AtomicInteger();
+    parentSlice.getReplicas().forEach(r -> {
+      switch (r.getType()) {
+        case NRT:
+          numNrt.incrementAndGet();
+          break;
+        case TLOG:
+          numTlog.incrementAndGet();
+          break;
+        case PULL:
+          numPull.incrementAndGet();
+      }
+    });
+    int repFactor = numNrt.get() + numTlog.get() + numPull.get();
+
+    // type of the first subreplica will be the same as leader
+    boolean firstNrtReplica = parentShardLeader.getType() == Replica.Type.NRT;
+    // verify that we indeed have the right number of correct replica types
+    if ((firstNrtReplica && numNrt.get() < 1) || (!firstNrtReplica && numTlog.get() < 1)) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "aborting split - inconsistent replica types in collection " + collectionName +
+          ": nrt=" + numNrt.get() + ", tlog=" + numTlog.get() + ", pull=" + numPull.get() + ", shard leader type is " +
+          parentShardLeader.getType());
+    }
+
+    List<Map<String, Object>> replicas = new ArrayList<>((repFactor - 1) * 2);
+
+    String rangesStr = fillRanges(ocmh.cloudManager, message, collection, parentSlice, subRanges, subSlices, subShardNames, firstNrtReplica);

     try {
@@ -126,7 +165,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
         if (state == Slice.State.ACTIVE) {
           throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
               "Sub-shard: " + subSlice + " exists in active state. Aborting split shard.");
-        } else if (state == Slice.State.CONSTRUCTION || state == Slice.State.RECOVERY) {
+        } else {
           // delete the shards
           log.info("Sub-shard: {} already exists therefore requesting its deletion", subSlice);
           Map<String, Object> propMap = new HashMap<>();
@@ -188,6 +227,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
       propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower());
       propMap.put(COLLECTION_PROP, collectionName);
       propMap.put(SHARD_ID_PROP, subSlice);
+      propMap.put(REPLICA_TYPE, firstNrtReplica ? Replica.Type.NRT.toString() : Replica.Type.TLOG.toString());
       propMap.put("node", nodeName);
       propMap.put(CoreAdminParams.NAME, subShardName);
       propMap.put(CommonAdminParams.WAIT_FOR_FINAL_STATE, Boolean.toString(waitForFinalState));
@@ -268,21 +308,8 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
       log.info("Successfully applied buffered updates on : " + subShardNames);

       // Replica creation for the new Slices
+      // replica placement is controlled by the autoscaling policy framework

-      // look at the replication factor and see if it matches reality
-      // if it does not, find best nodes to create more cores
-
-      // TODO: Have replication factor decided in some other way instead of numShards for the parent
-
-      int repFactor = parentSlice.getReplicas().size();
-
-      // we need to look at every node and see how many cores it serves
-      // add our new cores to existing nodes serving the least number of cores
-      // but (for now) require that each core goes on a distinct node.
-
-      // TODO: add smarter options that look at the current number of cores per
-      // node?
-      // for now we just go random
       Set<String> nodes = clusterState.getLiveNodes();
       List<String> nodeList = new ArrayList<>(nodes.size());
       nodeList.addAll(nodes);
@@ -294,28 +321,39 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {

       // TODO: change this to handle sharding a slice into > 2 sub-shards.

+      // we have already created one subReplica for each subShard on the parent node.
+      // identify locations for the remaining replicas
+      if (firstNrtReplica) {
+        numNrt.decrementAndGet();
+      } else {
+        numTlog.decrementAndGet();
+      }
+
       List<ReplicaPosition> replicaPositions = Assign.identifyNodes(ocmh.cloudManager,
           clusterState,
           new ArrayList<>(clusterState.getLiveNodes()),
           collectionName,
           new ZkNodeProps(collection.getProperties()),
-          subSlices, repFactor - 1, 0, 0);
+          subSlices, numNrt.get(), numTlog.get(), numPull.get());
       sessionWrapper = PolicyHelper.getLastSessionWrapper(true);

-      List<Map<String, Object>> replicas = new ArrayList<>((repFactor - 1) * 2);
-
       for (ReplicaPosition replicaPosition : replicaPositions) {
         String sliceName = replicaPosition.shard;
         String subShardNodeName = replicaPosition.node;
-        String solrCoreName = collectionName + "_" + sliceName + "_replica" + (replicaPosition.index);
+        String solrCoreName = Assign.buildSolrCoreName(collectionName, sliceName, replicaPosition.type, replicaPosition.index);

         log.info("Creating replica shard " + solrCoreName + " as part of slice " + sliceName + " of collection "
             + collectionName + " on " + subShardNodeName);

+        // we first create all replicas in DOWN state without actually creating their cores in order to
+        // avoid a race condition where Overseer may prematurely activate the new sub-slices (and deactivate
+        // the parent slice) before all new replicas are added. This situation may lead to a loss of performance
+        // because the new shards will be activated with possibly many fewer replicas.
         ZkNodeProps props = new ZkNodeProps(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower(),
             ZkStateReader.COLLECTION_PROP, collectionName,
             ZkStateReader.SHARD_ID_PROP, sliceName,
             ZkStateReader.CORE_NAME_PROP, solrCoreName,
+            ZkStateReader.REPLICA_TYPE, replicaPosition.type.name(),
             ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(),
             ZkStateReader.BASE_URL_PROP, zkStateReader.getBaseUrlForNodeName(subShardNodeName),
             ZkStateReader.NODE_NAME_PROP, subShardNodeName,
@@ -326,6 +364,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
         propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower());
         propMap.put(COLLECTION_PROP, collectionName);
         propMap.put(SHARD_ID_PROP, sliceName);
+        propMap.put(REPLICA_TYPE, replicaPosition.type.name());
         propMap.put("node", subShardNodeName);
         propMap.put(CoreAdminParams.NAME, solrCoreName);
         // copy over property params:
@@ -409,6 +448,8 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
         ocmh.addReplica(clusterState, new ZkNodeProps(replica), results, null);
       }

+      assert TestInjection.injectSplitFailureAfterReplicaCreation();
+
       ocmh.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard replicas", asyncId, requestMap);

       log.info("Successfully created all replica shards for all sub-slices " + subSlices);
@@ -417,6 +458,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
       return true;
     } catch (SolrException e) {
+      cleanupAfterFailure(zkStateReader, collectionName, parentSlice.getName(), subSlices);
       throw e;
     } catch (Exception e) {
       log.error("Error executing split operation for collection: " + collectionName + " parent shard: " + slice, e);
@@ -426,6 +468,106 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
     }
   }

+  private void checkDiskSpace(String collection, String shard, Replica parentShardLeader) throws SolrException {
+    // check that enough disk space is available on the parent leader node
+    // otherwise the actual index splitting will always fail
+    NodeStateProvider nodeStateProvider = ocmh.cloudManager.getNodeStateProvider();
+    Map<String, Object> nodeValues = nodeStateProvider.getNodeValues(parentShardLeader.getNodeName(),
+        Collections.singletonList(ImplicitSnitch.DISK));
+    Map<String, Map<String, List<ReplicaInfo>>> infos = nodeStateProvider.getReplicaInfo(parentShardLeader.getNodeName(),
+        Collections.singletonList(Suggestion.ConditionType.CORE_IDX.metricsAttribute));
+    if (infos.get(collection) == null || infos.get(collection).get(shard) == null) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "missing replica information for parent shard leader");
+    }
+    // find the leader
+    List<ReplicaInfo> lst = infos.get(collection).get(shard);
+    Double indexSize = null;
+    for (ReplicaInfo info : lst) {
+      if (info.getCore().equals(parentShardLeader.getCoreName())) {
+        Number size = (Number)info.getVariable(Suggestion.ConditionType.CORE_IDX.metricsAttribute);
+        if (size == null) {
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "missing index size information for parent shard leader");
+        }
+        indexSize = (Double)Suggestion.ConditionType.CORE_IDX.convertVal(size);
+        break;
+      }
+    }
+    if (indexSize == null) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "missing replica information for parent shard leader");
+    }
+    Number freeSize = (Number)nodeValues.get(ImplicitSnitch.DISK);
+    if (freeSize == null) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "missing node disk space information for parent shard leader");
+    }
+    if (freeSize.doubleValue() < 2.0 * indexSize) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "not enough free disk space to perform index split on node " +
+          parentShardLeader.getNodeName() + ", required: " + (2 * indexSize) + ", available: " + freeSize);
+    }
+  }
+
+  private void cleanupAfterFailure(ZkStateReader zkStateReader, String collectionName, String parentShard, List<String> subSlices) {
+    log.debug("- cleanup after failed split of " + collectionName + "/" + parentShard);
+    // get the latest state
+    try {
+      zkStateReader.forceUpdateCollection(collectionName);
+    } catch (KeeperException | InterruptedException e) {
+      log.warn("Cleanup after failed split of " + collectionName + "/" + parentShard + ": (force update collection)", e);
+      return;
+    }
+    ClusterState clusterState = zkStateReader.getClusterState();
+    DocCollection coll = clusterState.getCollectionOrNull(collectionName);
+
+    if (coll == null) { // may have been deleted
+      return;
+    }
+
+    // set already created sub shards states to CONSTRUCTION - this prevents them
+    // from entering into RECOVERY or ACTIVE (SOLR-9455)
+    DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
+    Map<String, Object> propMap = new HashMap<>();
+    propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
+    propMap.put(ZkStateReader.COLLECTION_PROP, collectionName);
+    for (Slice s : coll.getSlices()) {
+      if (!subSlices.contains(s.getName())) {
+        continue;
+      }
+      propMap.put(s.getName(), Slice.State.CONSTRUCTION.toString());
+    }
+
+    // if parent is inactive activate it again
+    Slice parentSlice = coll.getSlice(parentShard);
+    if (parentSlice.getState() == Slice.State.INACTIVE) {
+      propMap.put(parentShard, Slice.State.ACTIVE.toString());
+    }
+
+    try {
+      ZkNodeProps m = new ZkNodeProps(propMap);
+      inQueue.offer(Utils.toJSON(m));
+    } catch (Exception e) {
+      // don't give up yet - just log the error, we may still be able to clean up
+      log.warn("Cleanup after failed split of " + collectionName + "/" + parentShard + ": (slice state changes)", e);
+    }
+
+    // delete existing subShards
+    for (String subSlice : subSlices) {
+      Slice s = coll.getSlice(subSlice);
+      if (s == null) {
+        continue;
+      }
+      log.info("Sub-shard: {} already exists therefore requesting its deletion", subSlice);
+      propMap = new HashMap<>();
+      propMap.put(Overseer.QUEUE_OPERATION, "deleteshard");
+      propMap.put(COLLECTION_PROP, collectionName);
+      propMap.put(SHARD_ID_PROP, subSlice);
+      ZkNodeProps m = new ZkNodeProps(propMap);
+      try {
+        ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList());
+      } catch (Exception e) {
+        log.warn("Cleanup after failed split of " + collectionName + "/" + parentShard + ": (deleting existing sub shard " + subSlice + ")", e);
+      }
+    }
+  }
+
   public static Slice getParentSlice(ClusterState clusterState, String collectionName, AtomicReference<String> slice, String splitKey) {
     DocCollection collection = clusterState.getCollection(collectionName);
     DocRouter router = collection.getRouter() != null ? collection.getRouter() : DocRouter.DEFAULT;
@@ -463,7 +605,8 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
   }

   public static String fillRanges(SolrCloudManager cloudManager, ZkNodeProps message, DocCollection collection, Slice parentSlice,
-                                  List<DocRouter.Range> subRanges, List<String> subSlices, List<String> subShardNames) {
+                                  List<DocRouter.Range> subRanges, List<String> subSlices, List<String> subShardNames,
+                                  boolean firstReplicaNrt) {
     String splitKey = message.getStr("split.key");
     DocRouter.Range range = parentSlice.getRange();
     if (range == null) {
@@ -532,7 +675,8 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
     for (int i = 0; i < subRanges.size(); i++) {
       String subSlice = parentSlice.getName() + "_" + i;
       subSlices.add(subSlice);
-      String subShardName = Assign.buildSolrCoreName(cloudManager.getDistribStateManager(), collection, subSlice, Replica.Type.NRT);
+      String subShardName = Assign.buildSolrCoreName(cloudManager.getDistribStateManager(), collection, subSlice,
+          firstReplicaNrt ? Replica.Type.NRT : Replica.Type.TLOG);
       subShardNames.add(subShardName);
     }
     return rangesStr;
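
The SplitShardCmd changes above boil down to simple per-type bookkeeping: count the parent shard's replicas by type, give the first replica of each sub-shard the parent leader's type (NRT or TLOG), and hand the remaining per-type counts to the placement call. A minimal standalone sketch of that accounting, with a local Type enum standing in for org.apache.solr.common.cloud.Replica.Type (class and method names here are illustrative, not Solr API):

    import java.util.Arrays;
    import java.util.EnumMap;
    import java.util.List;
    import java.util.Map;

    public class SubReplicaPlanSketch {

      // stands in for org.apache.solr.common.cloud.Replica.Type
      enum Type { NRT, TLOG, PULL }

      /** Counts the parent shard's replicas by type; the split recreates the same mix in each sub-shard. */
      static Map<Type, Integer> countByType(List<Type> parentReplicaTypes) {
        Map<Type, Integer> counts = new EnumMap<>(Type.class);
        for (Type t : Type.values()) {
          counts.put(t, 0);
        }
        for (Type t : parentReplicaTypes) {
          counts.merge(t, 1, Integer::sum);
        }
        return counts;
      }

      /**
       * The first replica of each sub-shard is created on the parent leader's node with the
       * leader's own type (NRT or TLOG); an inconsistent mix is rejected up front.
       */
      static Type subShardLeaderType(Type parentLeaderType, Map<Type, Integer> counts) {
        boolean firstNrt = parentLeaderType == Type.NRT;
        if ((firstNrt && counts.get(Type.NRT) < 1) || (!firstNrt && counts.get(Type.TLOG) < 1)) {
          throw new IllegalStateException("inconsistent replica types: " + counts
              + ", shard leader type is " + parentLeaderType);
        }
        return firstNrt ? Type.NRT : Type.TLOG;
      }

      /**
       * Replicas still to be placed per sub-shard once the leader placeholder exists:
       * the leader's type is reduced by one, everything else is kept as-is.
       */
      static Map<Type, Integer> remainingToPlace(Type subLeaderType, Map<Type, Integer> counts) {
        Map<Type, Integer> remaining = new EnumMap<>(counts);
        remaining.merge(subLeaderType, -1, Integer::sum);
        return remaining;
      }

      public static void main(String[] args) {
        // the mixed collection from the new test: 2 NRT + 2 TLOG + 2 PULL, NRT leader
        Map<Type, Integer> counts = countByType(Arrays.asList(
            Type.NRT, Type.NRT, Type.TLOG, Type.TLOG, Type.PULL, Type.PULL));
        Type subLeader = subShardLeaderType(Type.NRT, counts);
        System.out.println(subLeader);                            // NRT
        System.out.println(remainingToPlace(subLeader, counts));  // {NRT=1, TLOG=2, PULL=2}
      }
    }

The remaining counts correspond to the numNrt/numTlog/numPull values passed to Assign.identifyNodes after the leader type has been decremented.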
diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java
index 3a94fac52d0..34915fb6988 100644
--- a/solr/core/src/java/org/apache/solr/util/TestInjection.java
+++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java
@@ -133,6 +133,8 @@ public class TestInjection {

   public static String splitFailureBeforeReplicaCreation = null;

+  public static String splitFailureAfterReplicaCreation = null;
+
   public static String waitForReplicasInSync = "true:60";

   public static String failIndexFingerprintRequests = null;
@@ -156,6 +158,7 @@ public class TestInjection {
     updateRandomPause = null;
     randomDelayInCoreCreation = null;
     splitFailureBeforeReplicaCreation = null;
+    splitFailureAfterReplicaCreation = null;
     prepRecoveryOpPauseForever = null;
     countPrepRecoveryOpPauseForever = new AtomicInteger(0);
     waitForReplicasInSync = "true:60";
@@ -386,23 +389,30 @@ public class TestInjection {
     return true;
   }

-  public static boolean injectSplitFailureBeforeReplicaCreation() {
-    if (splitFailureBeforeReplicaCreation != null) {
+  private static boolean injectSplitFailure(String probability, String label) {
+    if (probability != null) {
       Random rand = random();
       if (null == rand) return true;
-      Pair<Boolean,Integer> pair = parseValue(splitFailureBeforeReplicaCreation);
+      Pair<Boolean,Integer> pair = parseValue(probability);
       boolean enabled = pair.first();
       int chanceIn100 = pair.second();
       if (enabled && rand.nextInt(100) >= (100 - chanceIn100)) {
-        log.info("Injecting failure in creating replica for sub-shard");
-        throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to create replica");
+        log.info("Injecting failure: " + label);
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Error: " + label);
       }
     }
-
     return true;
   }

+  public static boolean injectSplitFailureBeforeReplicaCreation() {
+    return injectSplitFailure(splitFailureBeforeReplicaCreation, "before creating replica for sub-shard");
+  }
+
+  public static boolean injectSplitFailureAfterReplicaCreation() {
+    return injectSplitFailure(splitFailureAfterReplicaCreation, "after creating replica for sub-shard");
+  }
+
   @SuppressForbidden(reason = "Need currentTimeMillis, because COMMIT_TIME_MSEC_KEY use currentTimeMillis as value")
   public static boolean waitForInSyncWithLeader(SolrCore core, ZkController zkController, String collection, String shardId) throws InterruptedException {
     if (waitForReplicasInSync == null) return true;
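
Both injection hooks are driven by the same "enabled:chanceIn100" strings as the existing TestInjection knobs (e.g. "true:60" fires roughly 60% of the time, "true:100" always fires). A self-contained sketch of that convention, with a hypothetical parser standing in for TestInjection's parseValue/Pair helpers:

    import java.util.Random;

    public class FaultInjectionSketch {

      /**
       * Decides whether to inject a failure for a value like "true:60", i.e. "enabled:chanceIn100" -
       * the convention used by splitFailureBeforeReplicaCreation / splitFailureAfterReplicaCreation above.
       */
      static boolean shouldInject(String probability, Random rand) {
        if (probability == null) {
          return false; // injection not configured
        }
        String[] parts = probability.split(":");
        boolean enabled = Boolean.parseBoolean(parts[0]);
        int chanceIn100 = Integer.parseInt(parts[1]);
        // same decision as the patch: fire in roughly chanceIn100 out of 100 calls
        return enabled && rand.nextInt(100) >= (100 - chanceIn100);
      }

      public static void main(String[] args) {
        Random rand = new Random(42);
        System.out.println(shouldInject(null, rand));        // false - not configured
        System.out.println(shouldInject("true:100", rand));  // true  - always fires
        System.out.println(shouldInject("false:60", rand));  // false - explicitly disabled
      }
    }

A test enables the new hook with TestInjection.splitFailureAfterReplicaCreation = "true:100" and restores the defaults with TestInjection.reset() in a finally block, as the ShardSplitTest change below does.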
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/ShardSplitTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/ShardSplitTest.java
index 73adc619f0b..4411bc4e115 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/ShardSplitTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/ShardSplitTest.java
@@ -28,6 +28,7 @@ import java.util.Set;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;

 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.solr.client.solrj.SolrClient;
@@ -279,13 +280,12 @@ public class ShardSplitTest extends BasicDistributedZkTest {
       ClusterState state = zkStateReader.getClusterState();
       DocCollection collection = state.getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);

+      // should be cleaned up
       Slice shard10 = collection.getSlice(SHARD1_0);
-      assertEquals(Slice.State.CONSTRUCTION, shard10.getState());
-      assertEquals(1, shard10.getReplicas().size());
+      assertNull(shard10);

       Slice shard11 = collection.getSlice(SHARD1_1);
-      assertEquals(Slice.State.CONSTRUCTION, shard11.getState());
-      assertEquals(1, shard11.getReplicas().size());
+      assertNull(shard11);

       // lets retry the split
       TestInjection.reset(); // let the split succeed
@@ -305,6 +305,95 @@ public class ShardSplitTest extends BasicDistributedZkTest {
   }

   @Test
+  public void testSplitAfterFailedSplit2() throws Exception {
+    waitForThingsToLevelOut(15);
+
+    TestInjection.splitFailureAfterReplicaCreation = "true:100"; // we definitely want split to fail
+    try {
+      try {
+        CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+        splitShard.setShardName(SHARD1);
+        splitShard.process(cloudClient);
+        fail("Shard split was not supposed to succeed after failure injection!");
+      } catch (Exception e) {
+        // expected
+      }
+
+      // assert that the sub-shards have been cleaned up after the failed split
+      ZkStateReader zkStateReader = cloudClient.getZkStateReader();
+      zkStateReader.forceUpdateCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+      ClusterState state = zkStateReader.getClusterState();
+      DocCollection collection = state.getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+
+      // should be cleaned up
+      Slice shard10 = collection.getSlice(SHARD1_0);
+      assertNull(shard10);
+
+      Slice shard11 = collection.getSlice(SHARD1_1);
+      assertNull(shard11);
+
+      // lets retry the split
+      TestInjection.reset(); // let the split succeed
+      try {
+        CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
+        splitShard.setShardName(SHARD1);
+        splitShard.process(cloudClient);
+        // Yay!
+      } catch (Exception e) {
+        log.error("Shard split failed", e);
+        fail("Shard split did not succeed after a previous failed split attempt left sub-shards in construction state");
+      }
+
+    } finally {
+      TestInjection.reset();
+    }
+  }
+
+  @Test
+  public void testSplitMixedReplicaTypes() throws Exception {
+    waitForThingsToLevelOut(15);
+    String collectionName = "testSplitMixedReplicaTypes";
+    CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionName, "conf1", 1, 2, 2, 2);
+    create.setMaxShardsPerNode(5); // some high number so we can create replicas without hindrance
+    create.process(cloudClient);
+    waitForRecoveriesToFinish(collectionName, false);
+
+    CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(collectionName);
+    splitShard.setShardName(SHARD1);
+    splitShard.process(cloudClient);
+    waitForThingsToLevelOut(15);
+
+    cloudClient.getZkStateReader().forceUpdateCollection(collectionName);
+    ClusterState clusterState = cloudClient.getZkStateReader().getClusterState();
+    DocCollection coll = clusterState.getCollection(collectionName);
+    log.info("coll: " + coll);
+
+    // verify the original shard
+    verifyShard(coll, SHARD1, Slice.State.INACTIVE, 2, 2, 2);
+    // verify new sub-shards
+    verifyShard(coll, SHARD1_0, Slice.State.ACTIVE, 2, 2, 2);
+    verifyShard(coll, SHARD1_1, Slice.State.ACTIVE, 2, 2, 2);
+  }
+
+  private void verifyShard(DocCollection coll, String shard, Slice.State expectedState, int numNrt, int numTlog, int numPull) throws Exception {
+    Slice s = coll.getSlice(shard);
+    assertEquals("unexpected shard state", expectedState, s.getState());
+    AtomicInteger actualNrt = new AtomicInteger();
+    AtomicInteger actualTlog = new AtomicInteger();
+    AtomicInteger actualPull = new AtomicInteger();
+    s.getReplicas().forEach(r -> {
+      switch (r.getType()) {
+        case NRT: actualNrt.incrementAndGet(); break;
+        case TLOG: actualTlog.incrementAndGet(); break;
+        case PULL: actualPull.incrementAndGet(); break;
+      }
+    });
+    assertEquals("actual NRT", numNrt, actualNrt.get());
+    assertEquals("actual TLOG", numTlog, actualTlog.get());
+    assertEquals("actual PULL", numPull, actualPull.get());
+  }
+
+  @Test
   @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
   public void testSplitWithChaosMonkey() throws Exception {
     waitForThingsToLevelOut(15);
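
For reference, the same mixed-replica-type scenario can be driven outside the test framework with plain SolrJ. A hedged sketch built from the collection-admin calls used in the test above; the collection name, config set name and the already-constructed SolrClient are assumptions, not part of the patch:

    import java.io.IOException;

    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.request.CollectionAdminRequest;

    public class MixedTypeSplitSketch {

      /**
       * Creates a 1-shard collection with 2 NRT + 2 TLOG + 2 PULL replicas and then
       * splits shard1, mirroring testSplitMixedReplicaTypes above.
       */
      static void createAndSplit(SolrClient client) throws SolrServerException, IOException {
        CollectionAdminRequest.Create create =
            CollectionAdminRequest.createCollection("splitDemo", "conf1", 1, 2, 2, 2);
        create.setMaxShardsPerNode(5); // leave room for the sub-shard replicas
        create.process(client);

        CollectionAdminRequest.SplitShard split = CollectionAdminRequest.splitShard("splitDemo");
        split.setShardName("shard1");
        split.process(client);
        // after the split completes, shard1 goes INACTIVE and shard1_0 / shard1_1
        // come up ACTIVE with the same 2/2/2 replica mix as the parent
      }
    }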
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
index 4943874bb71..c4fbf2f5986 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
@@ -969,7 +969,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {

     opDelay(collectionName, CollectionParams.CollectionAction.SPLITSHARD.name());

-    SplitShardCmd.fillRanges(cloudManager, message, collection, parentSlice, subRanges, subSlices, subShardNames);
+    SplitShardCmd.fillRanges(cloudManager, message, collection, parentSlice, subRanges, subSlices, subShardNames, true);
     // add replicas for new subShards
     int repFactor = parentSlice.getReplicas().size();
     List<ReplicaPosition> replicaPositions = Assign.identifyNodes(cloudManager,
diff --git a/solr/solr-ref-guide/src/images/diagrams/splitshard-seq.png b/solr/solr-ref-guide/src/images/diagrams/splitshard-seq.png
new file mode 100644
index 00000000000..edb4ebbaf6f
Binary files /dev/null and b/solr/solr-ref-guide/src/images/diagrams/splitshard-seq.png differ
diff --git a/solr/solr-ref-guide/src/images/diagrams/splitshard-seq.puml b/solr/solr-ref-guide/src/images/diagrams/splitshard-seq.puml
new file mode 100644
index 00000000000..feb7b4199d1
--- /dev/null
+++ b/solr/solr-ref-guide/src/images/diagrams/splitshard-seq.puml
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+@startuml
+
+' Note: in TEOZ mode some skinparams don't work
+' and currently notes on messages can't be customized -
+' but this mode creates more compact layout, enable if needed
+
+'!pragma teoz true
+
+skinparam noteFontSize 10
+skinparam ParticipantPadding 5
+skinparam SequenceGroupBorderThickness 1
+skinparam SequenceGroupBorderColor #blue
+skinparam SequenceDividerBorderThickness 1
+skinparam SequenceDividerFontStyle plain
+skinparam sequence {
+  MessageAlign center
+}
+
+title Split shard into N parts with RF replicas
+
+actor User
+control Overseer
+entity ShardLeader
+collections OtherNodes
+
+User -> Overseer : SPLITSHARD
+activate Overseer
+== Prepare ==
+Overseer -> Overseer : checkLeaderDiskSpace
+Overseer -> Overseer : fillRanges(N)
+loop 1..N
+Overseer -> Overseer : deleteOldSubShard
+rnote right: left-overs from a\nprevious failed split
+end
+== Sub shards construction ==
+loop 1..N
+Overseer -> Overseer : createSubShard
+rnote right : subShard state\nCONSTRUCTION
+Overseer -> Overseer : waitForSubShard
+Overseer --> Overseer : addSubLeader
+rnote right : placeholder, no core,\nsame node as parent
+end
+Overseer -> Overseer : waitForSubLeadersVisible
+loop 1..N
+Overseer -> Overseer : waitForCoreNodeVisible
+Overseer --> Overseer : setSubLeaderActive
+end
+Overseer -> Overseer : waitForSubLeadersActive
+== Split parent shard leader ==
+Overseer --> ShardLeader : splitParentCore
+activate ShardLeader
+ShardLeader --> Overseer : splitComplete
+deactivate ShardLeader
+loop 1..N
+Overseer --> ShardLeader : applyBufferedUpdates
+activate ShardLeader
+end
+ShardLeader --> Overseer : updatesComplete
+deactivate ShardLeader
+== Create sub replicas ==
+loop N * (RF-1)
+Overseer --> Overseer : addSubReplica
+rnote right : placeholder, no core,\nstate DOWN, other node
+end
+alt
+Overseer -> Overseer : checkParentStillLeader
+else
+Overseer -> User : changed / error
+end
+Overseer -> Overseer : updateSubShardStates
+rnote right : subShards in RECOVERY or\nACTIVE if RF=1
+loop N * (RF-1)
+Overseer --> OtherNodes : createSubReplicaCore
+activate OtherNodes
+end
+OtherNodes --> Overseer : createComplete
+Overseer -> User : success
+deactivate Overseer
+...
+OtherNodes ---> Overseer : replicasRecovering
+...
+alt
+OtherNodes --> Overseer : allReplicasActive
+activate Overseer
+Overseer -> Overseer : switchShards
+rnote right : parent shard INACTIVE\nsub shards ACTIVE
+else
+alt
+OtherNodes --> Overseer : someReplicasFailed
+else
+Overseer --> Overseer : parentShardLeaderChanged
+end
+deactivate OtherNodes
+Overseer -> Overseer : shardRecoveryFailed
+rnote right : parent shard ACTIVE\nsub shards RECOVERY_FAILED
+end
+deactivate Overseer
+@enduml
\ No newline at end of file
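
The diagram's final switchShards step is what the new test asserts on: the parent slice ends up INACTIVE while both sub-slices become ACTIVE. A small sketch of that check using cluster-state accessors already exercised elsewhere in this patch; the shard names assume a two-way split of shard1:

    import org.apache.solr.common.cloud.ClusterState;
    import org.apache.solr.common.cloud.DocCollection;
    import org.apache.solr.common.cloud.Slice;

    public class SplitStateCheckSketch {

      /** Returns true once the split of "shard1" has been fully switched over. */
      static boolean splitSwitchedOver(ClusterState clusterState, String collectionName) {
        DocCollection coll = clusterState.getCollectionOrNull(collectionName);
        if (coll == null) {
          return false;
        }
        Slice parent = coll.getSlice("shard1");
        Slice sub0 = coll.getSlice("shard1_0");
        Slice sub1 = coll.getSlice("shard1_1");
        return parent != null && parent.getState() == Slice.State.INACTIVE
            && sub0 != null && sub0.getState() == Slice.State.ACTIVE
            && sub1 != null && sub1.getState() == Slice.State.ACTIVE;
      }
    }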