diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java index cd9f1778af1..a842a870422 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java @@ -26,10 +26,12 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; +import java.util.TreeMap; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import org.apache.solr.client.solrj.SolrClient; @@ -305,6 +307,21 @@ public class SimCloudManager implements SolrCloudManager { return systemColl; } + public Map> simGetEventCounts() { + TreeMap> counts = new TreeMap<>(); + synchronized (systemColl) { + for (SolrInputDocument d : systemColl) { + if (!"autoscaling_event".equals(d.getFieldValue("type"))) { + continue; + } + counts.computeIfAbsent((String)d.getFieldValue("event.source_s"), s -> new TreeMap<>()) + .computeIfAbsent((String)d.getFieldValue("stage_s"), s -> new AtomicInteger()) + .incrementAndGet(); + } + } + return counts; + } + /** * Get a {@link SolrClient} implementation where calls are forwarded to this * instance of the cluster. 
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java index 48d46b65849..122ff2c810f 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimSolrCloudTestCase.java @@ -35,7 +35,6 @@ import java.util.function.Predicate; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.client.solrj.cloud.autoscaling.ReplicaInfo; -import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.CollectionStatePredicate; import org.apache.solr.common.cloud.DocCollection; @@ -137,19 +136,7 @@ public class SimSolrCloudTestCase extends SolrTestCaseJ4 { log.info("######### Final Solr op counts ##########"); cluster.simGetOpCounts().forEach((k, cnt) -> log.info("##\t\t- " + String.format(Locale.ROOT, "%-14s %4d", k, cnt.get()))); log.info("######### Autoscaling event counts ###########"); - TreeMap> counts = new TreeMap<>(); - - List solrInputDocuments = cluster.simGetSystemCollection(); - synchronized (solrInputDocuments) { - for (SolrInputDocument d : solrInputDocuments) { - if (!"autoscaling_event".equals(d.getFieldValue("type"))) { - continue; - } - counts.computeIfAbsent((String)d.getFieldValue("event.source_s"), s -> new TreeMap<>()) - .computeIfAbsent((String)d.getFieldValue("stage_s"), s -> new AtomicInteger()) - .incrementAndGet(); - } - } + Map> counts = cluster.simGetEventCounts(); counts.forEach((trigger, map) -> { log.info("## * Trigger: " + trigger); map.forEach((s, cnt) -> log.info("##\t\t- " + String.format(Locale.ROOT, "%-11s %4d", s, cnt.get()))); diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java index 15cb226b284..3adf6526d15 100644 --- 
a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestLargeCluster.java @@ -30,6 +30,7 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrRequest; @@ -61,6 +62,7 @@ import static org.apache.solr.cloud.autoscaling.AutoScalingHandlerTest.createAut /** * */ +@TimeoutSuite(millis = 4 * 3600 * 1000) @LogLevel("org.apache.solr.cloud.autoscaling=DEBUG") public class TestLargeCluster extends SimSolrCloudTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -313,6 +315,11 @@ public class TestLargeCluster extends SimSolrCloudTestCase { 10 }; private static final int[] renard5x = new int[] { + 1, 2, 3, 4, 6, + 10, 16, 25, 40, 63, + 100 + }; + private static final int[] renard5xx = new int[] { 1, 2, 3, 4, 6, 10, 16, 25, 40, 63, 100, 158, 251, 398, 631, @@ -330,12 +337,18 @@ public class TestLargeCluster extends SimSolrCloudTestCase { 100 }; + private static final AtomicInteger ZERO = new AtomicInteger(0); + //@Test public void benchmarkNodeLost() throws Exception { List results = new ArrayList<>(); - for (int wait : renard5) { - for (int delay : renard5) { - SummaryStatistics stat = new SummaryStatistics(); + for (int wait : renard5x) { + for (int delay : renard5x) { + SummaryStatistics totalTime = new SummaryStatistics(); + SummaryStatistics ignoredOurEvents = new SummaryStatistics(); + SummaryStatistics ignoredOtherEvents = new SummaryStatistics(); + SummaryStatistics startedOurEvents = new SummaryStatistics(); + SummaryStatistics startedOtherEvents = new SummaryStatistics(); for (int i = 0; i < 5; i++) { if (cluster != 
null) { cluster.close(); @@ -344,14 +357,29 @@ public class TestLargeCluster extends SimSolrCloudTestCase { setUp(); setupTest(); long total = doTestNodeLost(wait, delay * 1000, 0); - stat.addValue(total); + totalTime.addValue(total); + // get event counts + Map> counts = cluster.simGetEventCounts(); + Map map = counts.remove("node_lost_trigger"); + startedOurEvents.addValue(map.getOrDefault("STARTED", ZERO).get()); + ignoredOurEvents.addValue(map.getOrDefault("IGNORED", ZERO).get()); + int otherStarted = 0; + int otherIgnored = 0; + for (Map m : counts.values()) { + otherStarted += m.getOrDefault("STARTED", ZERO).get(); + otherIgnored += m.getOrDefault("IGNORED", ZERO).get(); + } + startedOtherEvents.addValue(otherStarted); + ignoredOtherEvents.addValue(otherIgnored); } - results.add(String.format(Locale.ROOT, "%d\t%d\t%6.0f\t%6.0f\t%6.0f\t%6.0f\t%6.0f", wait, delay, - stat.getMin(), stat.getMax(), stat.getMean(), stat.getVariance(), stat.getStandardDeviation())); + results.add(String.format(Locale.ROOT, "%d\t%d\t%4.0f\t%4.0f\t%4.0f\t%4.0f\t%6.0f\t%6.0f\t%6.0f\t%6.0f\t%6.0f", + wait, delay, startedOurEvents.getMean(), ignoredOurEvents.getMean(), + startedOtherEvents.getMean(), ignoredOtherEvents.getMean(), + totalTime.getMin(), totalTime.getMax(), totalTime.getMean(), totalTime.getStandardDeviation(), totalTime.getVariance())); } } log.info("===== RESULTS ======"); - log.info("waitFor\tkillDelay\tmin\tmax\tmean\tvar\tstdev"); + log.info("waitFor\tdelay\tSTRT\tIGN\toSTRT\toIGN\tmin\tmax\tmean\tstdev\tvar"); results.forEach(s -> log.info(s)); } @@ -364,6 +392,7 @@ public class TestLargeCluster extends SimSolrCloudTestCase { "'waitFor' : '" + waitFor + "s'," + "'enabled' : true," + "'actions' : [" + + "{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}," + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" + "]" + @@ -391,6 +420,11 @@ public class 
TestLargeCluster extends SimSolrCloudTestCase { cluster.simRemoveNode(nodes.get(i), false); cluster.getTimeSource().sleep(killDelay); } + // wait for the trigger to fire + boolean await = triggerFiredLatch.await(10 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS); + assertTrue("trigger did not fire within timeout, " + + "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, + await); List systemColl = cluster.simGetSystemCollection(); int startedEventPos = -1; for (int i = 0; i < systemColl.size(); i++) { @@ -404,7 +438,9 @@ public class TestLargeCluster extends SimSolrCloudTestCase { break; } } - assertTrue("no STARTED event: " + systemColl, startedEventPos > -1); + assertTrue("no STARTED event: " + systemColl + ", " + + "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, + startedEventPos > -1); SolrInputDocument startedEvent = systemColl.get(startedEventPos); int ignored = 0; int lastIgnoredPos = startedEventPos; @@ -420,9 +456,13 @@ public class TestLargeCluster extends SimSolrCloudTestCase { } } } - assertTrue("should be at least " + minIgnored + " IGNORED events", ignored >= minIgnored); + assertTrue("should be at least " + minIgnored + " IGNORED events, " + + "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, + ignored >= minIgnored); // make sure some replicas have been moved - assertTrue("no MOVEREPLICA ops?", cluster.simGetOpCount("MOVEREPLICA") > 0); + assertTrue("no MOVEREPLICA ops? 
" + + "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, + cluster.simGetOpCount("MOVEREPLICA") > 0); log.info("Ready after " + waitForState(collectionName, 20 * NUM_NODES, TimeUnit.SECONDS, clusterShape(NUM_NODES / 5, NUM_NODES / 10)) + " ms"); @@ -450,12 +490,16 @@ public class TestLargeCluster extends SimSolrCloudTestCase { } } - assertTrue("did not finish processing changes", finishedEvent != null); + assertTrue("did not finish processing changes, " + + "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, + finishedEvent != null); long delta = (Long)finishedEvent.getFieldValue("event.time_l") - (Long)startedEvent.getFieldValue("event.time_l"); delta = TimeUnit.NANOSECONDS.toMillis(delta); log.info("#### System stabilized after " + delta + " ms"); long ops = cluster.simGetOpCount("MOVEREPLICA"); - assertTrue("unexpected number of MOVEREPLICA ops: " + ops, ops >= 40); + assertTrue("unexpected number of MOVEREPLICA ops: " + ops + ", " + + "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, + ops >= 40); return delta; } diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/package-info.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/package-info.java index 0b412cb80a6..8c1ee8642c3 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/package-info.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/package-info.java @@ -15,7 +15,84 @@ * limitations under the License. */ /** - * Simulated environment for autoscaling tests. + *

Simulated environment for autoscaling tests.

+ * + *

Goals

+ *
    + *
  • Use the actual unchanged autoscaling code for cluster state monitoring and autoscaling plan execution.
  • + *
  • Support testing large clusters (> 100 nodes).
  • + *
  • Support fast testing using accelerated time (e.g. 100x faster).
  • + *
  • Support enough of other Solr functionality for the test results to be meaningful.
  • + *
+ * + *

Simulated SolrCloudManager - {@link org.apache.solr.cloud.autoscaling.sim.SimCloudManager}

+ * This implementation of {@link org.apache.solr.client.solrj.cloud.autoscaling.SolrCloudManager} + * uses the following simulated components: + *
    + *
  • {@link org.apache.solr.cloud.autoscaling.sim.SimDistribStateManager} - in-memory ZK look-alike, with support for Watcher-s, ephemeral and sequential nodes.
  • + *
  • {@link org.apache.solr.cloud.autoscaling.sim.SimClusterStateProvider} - manages collection, replica infos, states and replica metrics.
  • + *
  • {@link org.apache.solr.cloud.autoscaling.sim.SimNodeStateProvider} - manages node metrics.
  • + *
  • {@link org.apache.solr.cloud.autoscaling.sim.GenericDistributedQueue} - DistributedQueue that uses SimDistribStateManager.
  • + *
+ * SimCloudManager also maintains an up-to-date /live_nodes in SimDistribStateManager, provides a SolrClient instance for use in tests, + * and provides several convenience methods for setting up simulated clusters, populating node and replica metrics, collecting + * autoscaling-related event history, collecting autoscaling event statistics, etc. + * + * SimCloudManager runs the actual {@link org.apache.solr.cloud.autoscaling.OverseerTriggerThread} so that it + * uses real trigger and trigger action implementations, as well as real event scheduling and processing code. + * It also provides methods for simulating Overseer leader change. + * + * An important part of the SimCloudManager is also a request handler that processes common autoscaling + * and collection admin requests. Autoscaling requests are processed by an instance of + * {@link org.apache.solr.cloud.autoscaling.AutoScalingHandler} (and result in changes to the respective + * data stored in {@link org.apache.solr.cloud.autoscaling.sim.SimDistribStateManager}). Collection + * admin commands are simulated, i.e. they don't use the actual {@link org.apache.solr.handler.admin.CollectionsHandler} + * due to its complex dependencies on real components. + * + *

{@link org.apache.solr.cloud.autoscaling.sim.SimClusterStateProvider}

+ * This component maintains collection and replica states: + *
    + *
  • Simulates delays between request and the actual cluster state changes
  • + *
  • Marks replicas as down when a node goes down (optionally preserving the replica metrics in order to simulate a node coming back), and keeps track of per-node cores and disk space.
  • + *
  • Runs a shard leader election look-alike on collection state updates.
  • + *
  • Maintains up-to-date /clusterstate.json and /clusterprops.json in SimDistribStateManager (which in turn notifies Watcher-s about collection updates). + * Currently for simplicity it uses the old single /clusterstate.json format for representing ClusterState.
  • + *
+ * + *

{@link org.apache.solr.cloud.autoscaling.sim.SimNodeStateProvider}

+ * This component maintains node metrics. When a simulated cluster is set up using e.g. the + * {@link org.apache.solr.cloud.autoscaling.sim.SimCloudManager#createCluster(int, org.apache.solr.common.util.TimeSource)} + * method, each simulated node is initialized with some basic metrics that are expected by the autoscaling + * framework, such as node name, fake system load average, heap usage and disk usage. + * + * The number of cores and disk space metrics may be used in autoscaling calculations, so they are + * tracked and adjusted by {@link org.apache.solr.cloud.autoscaling.sim.SimClusterStateProvider} according + * to the currently active replicas located on each node. + * + *

Limitations of the simulation framework

+ * Currently the simulation framework is limited to testing the core autoscaling API in a single JVM. + * Using it for other purposes would require extensive modifications in Solr and in the framework code. + * + * Specifically, the framework supports testing the following autoscaling components: + *
    + *
  • OverseerTriggerThread and components that it uses.
  • + *
  • Autoscaling config, triggers, trigger listeners, ScheduledTriggers, trigger event queues, ComputePlanAction / ExecutePlanAction, etc.
  • + *
+ * Overseer and CollectionsHandler Cmd implementations are NOT used, so they cannot be properly tested - some of their functionality is simulated. + * Other SolrCloud components make too many direct references to ZkStateReader, or direct HTTP requests, or rely on too many other components and require much more complex functionality - they may be refactored later but the effort may be too high. + * + * The simulation framework definitely does not support the following functionality: + *
    + *
  • Solr searching and indexing
  • + *
  • Any component that uses ZkController (eg. CoreContainer)
  • + *
  • Any component that uses ShardHandler (eg. CollectionsHandler Cmd-s)
  • + *
+ * + *

Testing framework

+ * A base class {@link org.apache.solr.cloud.autoscaling.sim.SimSolrCloudTestCase} is provided; it + * offers helper methods similar to the ones in {@link org.apache.solr.cloud.SolrCloudTestCase}. + * */ package org.apache.solr.cloud.autoscaling.sim; +