From 20de3d2ee0d234d04bbcf7c2cddc18ff67a09f8b Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Thu, 7 Mar 2019 15:40:57 -0700 Subject: [PATCH] SOLR-12923: harden TestSimLargeCluster - added logging - ensure start/finish trigger action counters are incremented before latches are released - replace arbitrary sleep calls with a trigger listener countdown latch - increase all await() times: This means that 'real' failures (which should be rare and hopefully reproducible) will be 'slow', but the trade off will be less hard to reproduce 'false failures' due to thread contention on slow or heavily loaded (ie: jenkins) machines --- .../autoscaling/sim/TestSimLargeCluster.java | 156 +++++++++--------- 1 file changed, 76 insertions(+), 80 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimLargeCluster.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimLargeCluster.java index 624888adca6..42d307a68a6 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimLargeCluster.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimLargeCluster.java @@ -32,14 +32,10 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.solr.client.solrj.SolrClient; -import org.apache.solr.client.solrj.SolrRequest; -import org.apache.solr.client.solrj.cloud.SolrCloudManager; -import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig; import org.apache.solr.client.solrj.cloud.autoscaling.Suggester; import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventProcessorStage; import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType; import org.apache.solr.client.solrj.request.CollectionAdminRequest; -import org.apache.solr.cloud.CloudTestUtils.AutoScalingRequest; import org.apache.solr.cloud.CloudTestUtils; import org.apache.solr.cloud.autoscaling.ActionContext; import org.apache.solr.cloud.autoscaling.CapturedEvent; @@ -49,14 +45,11 @@ import org.apache.solr.cloud.autoscaling.SearchRateTrigger; import org.apache.solr.cloud.autoscaling.TriggerActionBase; import org.apache.solr.cloud.autoscaling.TriggerEvent; import org.apache.solr.cloud.autoscaling.TriggerListenerBase; -import org.apache.solr.cloud.autoscaling.TriggerValidationException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.params.CollectionParams; -import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.Pair; import org.apache.solr.common.util.TimeSource; -import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.util.LogLevel; import org.apache.solr.util.TimeOut; import org.junit.After; @@ -76,12 +69,13 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { public static final int NUM_NODES = 100; - static Map> listenerEvents = new ConcurrentHashMap<>(); - static AtomicInteger triggerFinishedCount = new AtomicInteger(); - static AtomicInteger triggerStartedCount = new AtomicInteger(); - static CountDownLatch triggerStartedLatch; - static CountDownLatch triggerFinishedLatch; - static int waitForSeconds; + static final Map> listenerEvents = new ConcurrentHashMap<>(); + static final AtomicInteger triggerFinishedCount = new AtomicInteger(); + static final AtomicInteger triggerStartedCount = new AtomicInteger(); + static volatile CountDownLatch triggerStartedLatch; + static volatile CountDownLatch triggerFinishedLatch; + static volatile CountDownLatch listenerEventLatch; + static volatile int waitForSeconds; @After public void tearDownTest() throws Exception { @@ -105,20 +99,28 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { triggerFinishedCount.set(0); triggerStartedLatch = new CountDownLatch(1); triggerFinishedLatch = new CountDownLatch(1); + + // by default assume we want to allow a (virtually) unbounded amount of events, + // tests that expect a specific number can override + listenerEventLatch = new CountDownLatch(Integer.MAX_VALUE); listenerEvents.clear(); } public static class TestTriggerListener extends TriggerListenerBase { - @Override - public void configure(SolrResourceLoader loader, SolrCloudManager cloudManager, AutoScalingConfig.TriggerListenerConfig config) throws TriggerValidationException { - super.configure(loader, cloudManager, config); - } - @Override public synchronized void onEvent(TriggerEvent event, TriggerEventProcessorStage stage, String actionName, ActionContext context, Throwable error, String message) { - List lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>()); - lst.add(new CapturedEvent(cluster.getTimeSource().getTimeNs(), context, config, stage, actionName, event, message)); + CapturedEvent ev = new CapturedEvent(cluster.getTimeSource().getTimeNs(), context, config, stage, actionName, event, message); + final CountDownLatch latch = listenerEventLatch; + synchronized (latch) { + if (0 == latch.getCount()) { + log.warn("Ignoring captured event since latch is 'full': {}", ev); + } else { + List lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>()); + lst.add(ev); + latch.countDown(); + } + } } } @@ -133,8 +135,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { public static class StartTriggerAction extends TriggerActionBase { @Override public void process(TriggerEvent event, ActionContext context) throws Exception { - triggerStartedLatch.countDown(); triggerStartedCount.incrementAndGet(); + triggerStartedLatch.countDown(); } } @@ -142,7 +144,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test hits a timeout easily public void testBasic() throws Exception { SolrClient solrClient = cluster.simGetSolrClient(); - String setTriggerCommand = "{" + + assertAutoScalingRequest + ( "{" + "'set-trigger' : {" + "'name' : 'node_lost_trigger1'," + "'event' : 'nodeLost'," + @@ -154,12 +157,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + - "}}"; - SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand); - NamedList response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + "}}"); - String setListenerCommand = "{" + + assertAutoScalingRequest + ( "{" + "'set-listener' : " + "{" + "'name' : 'foo'," + @@ -169,15 +170,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { "'afterAction' : ['compute', 'execute']," + "'class' : '" + TestTriggerListener.class.getName() + "'" + "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + "}"); assertAutoscalingUpdateComplete(); - cluster.getTimeSource().sleep(5000); - // pick a few random nodes List nodes = new ArrayList<>(); int limit = 75; @@ -240,7 +236,6 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get()); } - log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS, CloudTestUtils.clusterShape(5, 15, false, true)) + "ms"); long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name()); @@ -254,7 +249,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { @Test public void testAddNode() throws Exception { SolrClient solrClient = cluster.simGetSolrClient(); - String setTriggerCommand = "{" + + assertAutoScalingRequest + ( "{" + "'set-trigger' : {" + "'name' : 'node_added_trigger2'," + "'event' : 'nodeAdded'," + @@ -266,10 +262,7 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + - "}}"; - SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand); - NamedList response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + "}}"); assertAutoscalingUpdateComplete(); @@ -289,16 +282,19 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { List addNodesList = new ArrayList<>(numAddNode); for (int i = 0; i < numAddNode; i++) { addNodesList.add(cluster.simAddNode()); - cluster.getTimeSource().sleep(5000); } // wait until at least one event is generated - boolean await = triggerStartedLatch.await(20000 / SPEED, TimeUnit.MILLISECONDS); - assertTrue("trigger did not fire", await); + assertTrue("Trigger did not start even after await()ing an excessive amount of time", + triggerStartedLatch.await(60, TimeUnit.SECONDS)); // wait until started == finished TimeOut timeOut = new TimeOut(45 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource()); while (!timeOut.hasTimedOut()) { + final int started = triggerStartedCount.get(); + final int finished = triggerFinishedCount.get(); + log.info("started={} =?= finished={}", started, finished); if (triggerStartedCount.get() == triggerFinishedCount.get()) { + log.info("started == finished: {} == {}", started, finished); break; } timeOut.sleep(1000); @@ -324,18 +320,22 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { SolrInputDocument startedEvent = systemColl.get(startedEventPos); int lastIgnoredPos = startedEventPos; // make sure some replicas have been moved - assertTrue("no MOVEREPLICA ops?", cluster.simGetOpCount("MOVEREPLICA") > 0); + long lastNumOps = cluster.simGetOpCount("MOVEREPLICA"); + log.info("1st check: lastNumOps (MOVEREPLICA) = {}", lastNumOps); + assertTrue("no MOVEREPLICA ops?", lastNumOps > 0); log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS, CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms"); int count = 1000; SolrInputDocument finishedEvent = null; - long lastNumOps = cluster.simGetOpCount("MOVEREPLICA"); + lastNumOps = cluster.simGetOpCount("MOVEREPLICA"); + log.info("2nd check: lastNumOps (MOVEREPLICA) = {}", lastNumOps); while (count-- > 0) { cluster.getTimeSource().sleep(10000); if (cluster.simGetOpCount("MOVEREPLICA") < 2) { + log.info("MOVEREPLICA < 2"); continue; } @@ -349,24 +349,25 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { } if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) { finishedEvent = d; + log.info("finishedEvent = {}", finishedEvent); break; } } + log.info("breaking because currentNumOps == lastNumOps == {}", currentNumOps); break; } else { lastNumOps = currentNumOps; } } - assertTrue("did not finish processing changes", finishedEvent != null); + assertNotNull("did not finish processing changes", finishedEvent); long delta = (Long)finishedEvent.getFieldValue("event.time_l") - (Long)startedEvent.getFieldValue("event.time_l"); log.info("#### System stabilized after " + TimeUnit.NANOSECONDS.toMillis(delta) + " ms"); - assertTrue("unexpected number of MOVEREPLICA ops", cluster.simGetOpCount("MOVEREPLICA") > 1); + assertTrue("unexpected number of MOVEREPLICA ops: " + cluster.simGetOpCount("MOVEREPLICA"), + cluster.simGetOpCount("MOVEREPLICA") > 1); } @Test - // commented 4-Sep-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2018-06-18 - // commented out on: 17-Feb-2019 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018 public void testNodeLost() throws Exception { doTestNodeLost(waitForSeconds, 5000, 0); } @@ -446,7 +447,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { private long doTestNodeLost(int waitFor, long killDelay, int minIgnored) throws Exception { SolrClient solrClient = cluster.simGetSolrClient(); - String setTriggerCommand = "{" + + assertAutoScalingRequest + ( "{" + "'set-trigger' : {" + "'name' : 'node_lost_trigger3'," + "'event' : 'nodeLost'," + @@ -458,12 +460,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + - "}}"; - SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand); - NamedList response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + "}}"); - String setListenerCommand = "{" + + assertAutoScalingRequest + ( "{" + "'set-listener' : " + "{" + "'name' : 'failures'," + @@ -471,10 +471,7 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { "'stage' : ['FAILED']," + "'class' : '" + TestTriggerListener.class.getName() + "'" + "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + "}"); assertAutoscalingUpdateComplete(); @@ -499,10 +496,9 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { cluster.getTimeSource().sleep(killDelay); } // wait for the trigger to fire and complete at least once - boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS); - assertTrue("trigger did not fire within timeout, " + - "waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored, - await); + assertTrue("Trigger did not finish even after await()ing an excessive amount of time", + triggerFinishedLatch.await(60, TimeUnit.SECONDS)); + List systemColl = cluster.simGetSystemCollection(); int startedEventPos = -1; @@ -646,7 +642,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { // now define the trigger. doing it earlier may cause partial events to be generated (where only some // nodes / replicas exceeded the threshold). - String setTriggerCommand = "{" + + assertAutoScalingRequest + ( "{" + "'set-trigger' : {" + "'name' : 'search_rate_trigger'," + "'event' : 'searchRate'," + @@ -659,11 +656,13 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "]" + - "}}"; - SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand); - NamedList response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - String setListenerCommand1 = "{" + + "}}"); + + + // we're going to expect our trigger listener to process exactly one captured event + listenerEventLatch = new CountDownLatch(1); + assertAutoScalingRequest + ( "{" + "'set-listener' : " + "{" + "'name' : 'srt'," + @@ -671,23 +670,20 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase { "'stage' : ['FAILED','SUCCEEDED']," + "'class' : '" + TestTriggerListener.class.getName() + "'" + "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand1); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + "}"); assertAutoscalingUpdateComplete(); - boolean await = triggerFinishedLatch.await(waitForSeconds * 45000 / SPEED, TimeUnit.MILLISECONDS); - assertTrue("The trigger did not fire at all", await); - // wait for listener to capture the SUCCEEDED stage - cluster.getTimeSource().sleep(25000); + assertTrue("Trigger did not finish even after await()ing an excessive amount of time", + triggerFinishedLatch.await(60, TimeUnit.SECONDS)); - assertNotNull(listenerEvents.entrySet().iterator().toString(), listenerEvents.get("srt")); + assertTrue("The listener didn't record the event even after await()ing an excessive amount of time", + listenerEventLatch.await(60, TimeUnit.SECONDS)); + List events = listenerEvents.get("srt"); + assertNotNull("no srt events: " + listenerEvents.toString(), events); + assertEquals(events.toString(), 1, events.size()); - assertTrue(listenerEvents.entrySet().iterator().toString(), listenerEvents.get("srt").size() >= 1); - - CapturedEvent ev = listenerEvents.get("srt").get(0); + CapturedEvent ev = events.get(0); assertEquals(TriggerEventType.SEARCHRATE, ev.event.getEventType()); Map m = (Map)ev.event.getProperty(SearchRateTrigger.HOT_NODES); assertNotNull(m);