diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/ConcurrentCreateCollectionTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/ConcurrentCreateCollectionTest.java
index 8842b3c48f7..6d103f14165 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/ConcurrentCreateCollectionTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/ConcurrentCreateCollectionTest.java
@@ -16,48 +16,31 @@
  */
 package org.apache.solr.cloud.api.collections;
 
-import java.io.IOException;
 import java.lang.invoke.MethodHandles;
-import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicReference;
 
-import org.apache.lucene.util.LuceneTestCase.Nightly;
-import org.apache.solr.SolrTestCaseJ4;
-import org.apache.solr.client.solrj.SolrClient;
-import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.embedded.JettySolrRunner;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
-import org.apache.solr.client.solrj.request.UpdateRequest;
-import org.apache.solr.client.solrj.response.CollectionAdminResponse;
-import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.apache.solr.cloud.CloudTestUtils;
-import org.apache.solr.cloud.MiniSolrCloudCluster;
 import org.apache.solr.cloud.SolrCloudTestCase;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
-import org.apache.solr.common.util.IOUtils;
-import org.apache.solr.common.util.TimeSource;
-import org.apache.solr.util.TimeOut;
-import org.apache.zookeeper.KeeperException;
+import org.apache.solr.common.cloud.Slice;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+
+
 public class ConcurrentCreateCollectionTest extends SolrCloudTestCase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
@@ -86,22 +69,94 @@ public class ConcurrentCreateCollectionTest extends SolrCloudTestCase {
   }
 
 
+  private CollectionAdminRequest.Create createCollectionRequest(String cname, int numShards, int numReplicas) throws Exception {
+    CollectionAdminRequest.Create creq = CollectionAdminRequest
+        // nocommit .createCollection(cname, "conf", NODES - 1, NODES - 1)
+        .createCollection(cname, "conf", numShards, numReplicas)
+        .setMaxShardsPerNode(100);
+    creq.setWaitForFinalState(true);
+    creq.setAutoAddReplicas(true);
+    return creq;
+  }
 
   public void testConcurrentCreatePlacement() throws Exception {
-    final int nThreads = 20;
+    final int nThreads = 2;
     final int createsPerThread = 1;
-    final int repFactor = 1;
-    final boolean useClusterPolicy = true;
-    final boolean useCollectionPolicy = false;
+    final int nShards = 1;
+    final int repFactor = 2;
+    final boolean useClusterPolicy = false;
+    final boolean useCollectionPolicy = true;
+    final boolean startUnbalanced = true; // can help make a smaller test that can still reproduce an issue.
+    final int unbalancedSize = 1; // the number of replicas to create first
+    final boolean stopNode = false; // only applicable when startUnbalanced==true... stops a node during first collection creation, then restarts
 
     final CloudSolrClient client = cluster.getSolrClient();
 
+    if (startUnbalanced) {
+      /*** This produces a failure (multiple replicas of single shard on same node) when run with NODES=4 and
+       final int nThreads = 2;
+       final int createsPerThread = 1;
+       final int nShards = 2;
+       final int repFactor = 2;
+       final boolean useClusterPolicy = false;
+       final boolean useCollectionPolicy = true;
+       final boolean startUnbalanced = true;
+       // NOTE: useClusterPolicy=true seems to fix it!  So does putting both creates in a single thread!
+       // NOTE: even creating a single replica to start with causes failure later on.
+
+       Also reproduced with smaller cluster: NODES=2 and
+       final int nThreads = 2;
+       final int createsPerThread = 1;
+       final int nShards = 1;
+       final int repFactor = 2;
+       final boolean useClusterPolicy = false;
+       final boolean useCollectionPolicy = true;
+       final boolean startUnbalanced = true;
+
+       Also, with NODES=3:
+       final int nThreads = 2;
+       final int createsPerThread = 1;
+       final int nShards = 1;
+       final int repFactor = 2;
+       final boolean useClusterPolicy = false;
+       final boolean useCollectionPolicy = true;
+       final boolean startUnbalanced = false;
+
+       // Also succeeded in replicating a bug where all 5 replicas were on a single node: CORES=5, nThreads=5, repFactor=5,
+       // unbalancedSize = 16 (4 replicas on each of the up nodes), stopNode=true
+       ***/
+
+
+      JettySolrRunner downJetty = cluster.getJettySolrRunners().get(0);
+      if (stopNode) {
+        cluster.stopJettySolrRunner(downJetty);
+      }
+
+      String cname = "STARTCOLLECTION";
+      CollectionAdminRequest.Create creq = CollectionAdminRequest
+          // nocommit .createCollection(cname, "conf", NODES - 1, NODES - 1)
+          .createCollection(cname, "conf", unbalancedSize, 1)
+          .setMaxShardsPerNode(100);
+      creq.setWaitForFinalState(true);
+      // creq.setAutoAddReplicas(true);
+      if (useCollectionPolicy) { creq.setPolicy("policy1"); }
+      creq.process(client);
+
+      if (stopNode) {
+        // this will start it with a new port.... does it matter?
+        cluster.startJettySolrRunner(downJetty);
+      }
+    }
+
+
     if (useClusterPolicy) {
       String setClusterPolicyCommand = "{" +
           " 'set-cluster-policy': [" +
           // " {'cores':'<100', 'node':'#ANY'}," +
           " {'replica':'<2', 'shard': '#EACH', 'node': '#ANY'}," +
+          // " {'replica':'<2', 'node': '#ANY'}," +
           " ]" +
           "}";
@@ -110,12 +165,36 @@ public class ConcurrentCreateCollectionTest extends SolrCloudTestCase {
     }
 
     if (useCollectionPolicy) {
-      // NOTE: the meer act of setting this named policy prevents LegacyAssignStrategy from being used, even if the policy is
+      // NOTE: the mere act of setting this named policy prevents LegacyAssignStrategy from being used, even if the policy is
       // not used during collection creation.
-      String commands = "{set-policy :{policy1 : [{replica:'<2' , node:'#ANY'}]}}";
+      String commands = "{set-policy : {" +
+          " policy1 : [{replica:'<2' , node:'#ANY'}]" +
+          ",policy2 : [{replica:'<2' , shard:'#EACH', node:'#ANY'}]" +
+          "}}";
       client.request(CloudTestUtils.AutoScalingRequest.create(SolrRequest.METHOD.POST, commands));
+
+      /*** take defaults for cluster preferences
+      String cmd = "{" +
+          " 'set-cluster-preferences': [" +
+          // " {'cores':'<100', 'node':'#ANY'}," +
+          " {minimize:cores}" +
+          " ]" +
+          "}";
+
+      SolrRequest req = CloudTestUtils.AutoScalingRequest.create(SolrRequest.METHOD.POST, cmd);
+      client.request(req);
+      ***/
     }
 
+    /***
+    SolrRequest req = CloudTestUtils.AutoScalingRequest.create(SolrRequest.METHOD.GET, null);
+    SolrResponse response = req.process(client);
+    log.info("######### AUTOSCALE " + response);
+    ***/
+
+
+    byte[] data = client.getZkStateReader().getZkClient().getData("/autoscaling.json", null, null, true);
+    log.info("AUTOSCALE DATA: " + new String(data, "UTF-8"));
 
     final AtomicInteger collectionNum = new AtomicInteger();
     Thread[] indexThreads = new Thread[nThreads];
@@ -125,14 +204,17 @@ public class ConcurrentCreateCollectionTest extends SolrCloudTestCase {
       try {
         for (int j=0; j<createsPerThread; j++) {
 
     Map<String, List<Replica>> replicaMap = new HashMap<>();
     ClusterState cstate = client.getZkStateReader().getClusterState();
 
@@ -173,11 +256,28 @@ public class ConcurrentCreateCollectionTest extends SolrCloudTestCase {
     boolean failed = false;
     for (List<Replica> replicas : replicaMap.values()) {
       if (replicas.size() != expectedPerNode ) {
-        failed = true;
+        if (expectBalanced) {
+          failed = true;
+        }
         log.error("UNBALANCED CLUSTER: expected replicas per node " + expectedPerNode + " but got " + replicas.size());
       }
     }
 
+    // check if there were multiple replicas of the same shard placed on the same node
+    for (DocCollection collection : cstate.getCollectionsMap().values()) {
+      for (Slice slice : collection.getSlices()) {
+        Map<String, Replica> nodeToReplica = new HashMap<>();
+        for (Replica replica : slice.getReplicas()) {
+          Replica prev = nodeToReplica.put(replica.getBaseUrl(), replica);
+          if (prev != null) {
+            failed = true;
+            // NOTE: with a replication factor > 2, this will print multiple times per bad slice.
+            log.error("MULTIPLE REPLICAS OF SINGLE SHARD ON SAME NODE: r1=" + prev + " r2=" + replica);
+          }
+        }
+      }
+    }
+
     if (failed) {
       log.error("Cluster state " + cstate.getCollectionsMap());
     }