From 15e5ca999ff7e912653db897781b21642d5307f0 Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Tue, 22 Jan 2019 11:47:11 -0700 Subject: [PATCH] SOLR-13140: harden SearchRateTriggerIntegrationTest by using more absolute rate thresholds and latches to track when all events have been processed so we don't need to 'guess' about sleep calls This commit also disables testDeleteNode pending an AwaitsFix on SOLR-13163 --- .../SearchRateTriggerIntegrationTest.java | 652 +++++++++--------- 1 file changed, 323 insertions(+), 329 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java index 08b45ca6eaa..a53a3890563 100644 --- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/SearchRateTriggerIntegrationTest.java @@ -29,7 +29,6 @@ import java.util.concurrent.atomic.AtomicInteger; import com.carrotsearch.randomizedtesting.annotations.Nightly; import com.google.common.util.concurrent.AtomicDouble; import org.apache.lucene.util.LuceneTestCase; -import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.cloud.SolrCloudManager; import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig; import org.apache.solr.client.solrj.cloud.autoscaling.ReplicaInfo; @@ -37,7 +36,6 @@ import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventProcessorStage import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.cloud.CloudTestUtils; -import org.apache.solr.cloud.CloudTestUtils.AutoScalingRequest; import org.apache.solr.cloud.SolrCloudTestCase; import org.apache.solr.common.MapWriter; import org.apache.solr.common.cloud.Replica; @@ -71,10 +69,11 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final TimeSource timeSource = TimeSource.NANO_TIME; - private static CountDownLatch listenerCreated = new CountDownLatch(1); - private static Map> listenerEvents = new HashMap<>(); - private static CountDownLatch finished = new CountDownLatch(1); - private static CountDownLatch started = new CountDownLatch(1); + private static volatile CountDownLatch listenerCreated = new CountDownLatch(1); + private static volatile CountDownLatch listenerEventLatch = new CountDownLatch(0); + private static volatile Map> listenerEvents = new HashMap<>(); + private static volatile CountDownLatch finished = new CountDownLatch(1); + private static volatile CountDownLatch started = new CountDownLatch(1); private static SolrCloudManager cloudManager; private int waitForSeconds; @@ -99,15 +98,17 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { // clear any persisted auto scaling configuration Stat stat = zkClient().setData(SOLR_AUTOSCALING_CONF_PATH, Utils.toJSON(new ZkNodeProps()), true); log.info(SOLR_AUTOSCALING_CONF_PATH + " reset, new znode version {}", stat.getVersion()); - timeSource.sleep(5000); deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_EVENTS_PATH); deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH); deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH); deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH); - + finished = new CountDownLatch(1); started = new CountDownLatch(1); + listenerCreated = new CountDownLatch(1); listenerEvents = new HashMap<>(); + listenerEventLatch = new CountDownLatch(0); + waitForSeconds = 3 + random().nextInt(5); } @@ -126,100 +127,94 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { CloudTestUtils.waitForState(cloudManager, COLL1, 60, TimeUnit.SECONDS, CloudTestUtils.clusterShape(1, 2)); - // the trigger is initially disabled so that we have the time to set up listeners - // and generate the traffic - String setTriggerCommand = "{" + - "'set-trigger' : {" + - "'name' : 'search_rate_trigger1'," + - "'event' : 'searchRate'," + - "'waitFor' : '" + waitForSeconds + "s'," + - "'enabled' : false," + - "'collections' : '" + COLL1 + "'," + - "'aboveRate' : 1.0," + - "'belowRate' : 0.1," + - "'actions' : [" + - "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + - "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" + - "]" + - "}}"; - SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand); - NamedList response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + // the trigger is initially disabled so that we have the time to set up listeners + // and generate the traffic + "{" + + "'set-trigger' : {" + + "'name' : 'search_rate_trigger1'," + + "'event' : 'searchRate'," + + "'waitFor' : '" + waitForSeconds + "s'," + + "'enabled' : false," + + "'collections' : '" + COLL1 + "'," + + "'aboveRate' : 1.0," + + "'belowRate' : 0.1," + + "'actions' : [" + + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" + + "]" + + "}}"); - String setListenerCommand = "{" + - "'set-listener' : " + - "{" + - "'name' : 'started'," + - "'trigger' : 'search_rate_trigger1'," + - "'stage' : ['STARTED']," + - "'class' : '" + StartedProcessingListener.class.getName() + "'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-listener' : " + + "{" + + "'name' : 'started'," + + "'trigger' : 'search_rate_trigger1'," + + "'stage' : ['STARTED']," + + "'class' : '" + StartedProcessingListener.class.getName() + "'" + + "}" + + "}"); - setListenerCommand = "{" + - "'set-listener' : " + - "{" + - "'name' : 'srt'," + - "'trigger' : 'search_rate_trigger1'," + - "'stage' : ['FAILED','SUCCEEDED']," + - "'afterAction': ['compute', 'execute']," + - "'class' : '" + CapturingTriggerListener.class.getName() + "'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-listener' : " + + "{" + + "'name' : 'srt'," + + "'trigger' : 'search_rate_trigger1'," + + "'stage' : ['FAILED','SUCCEEDED']," + + "'afterAction': ['compute', 'execute']," + + "'class' : '" + CapturingTriggerListener.class.getName() + "'" + + "}" + + "}"); + listenerEventLatch = new CountDownLatch(3); - setListenerCommand = "{" + - "'set-listener' : " + - "{" + - "'name' : 'finished'," + - "'trigger' : 'search_rate_trigger1'," + - "'stage' : ['SUCCEEDED']," + - "'class' : '" + FinishedProcessingListener.class.getName() + "'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-listener' : " + + "{" + + "'name' : 'finished'," + + "'trigger' : 'search_rate_trigger1'," + + "'stage' : ['SUCCEEDED']," + + "'class' : '" + FinishedProcessingListener.class.getName() + "'" + + "}" + + "}"); SolrParams query = params(CommonParams.Q, "*:*"); for (int i = 0; i < 500; i++) { solrClient.query(COLL1, query); } - + // enable the trigger - String resumeTriggerCommand = "{" + - "'resume-trigger' : {" + - "'name' : 'search_rate_trigger1'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, resumeTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'resume-trigger' : {" + + "'name' : 'search_rate_trigger1'" + + "}" + + "}"); - timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS)); - - boolean await = started.await(20, TimeUnit.SECONDS); - assertTrue("The trigger did not fire at all", await); - - await = finished.await(60, TimeUnit.SECONDS); - assertTrue("The trigger did not finish processing", await); + assertTrue("The trigger did not start in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("The trigger did not finish in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("the listener should have recorded all events w/in a reasonable amount of time", + listenerEventLatch.await(60, TimeUnit.SECONDS)); // suspend the trigger - String suspendTriggerCommand = "{" + - "'suspend-trigger' : {" + - "'name' : 'search_rate_trigger1'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - - timeSource.sleep(5000); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'suspend-trigger' : {" + + "'name' : 'search_rate_trigger1'" + + "}" + + "}"); List events = listenerEvents.get("srt"); assertEquals(listenerEvents.toString(), 3, events.size()); @@ -274,8 +269,6 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { } @Test - //17-Aug-2018 commented @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 21-May-2018 - // commented out on: 24-Dec-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // added 15-Sep-2018 public void testBelowSearchRate() throws Exception { CloudSolrClient solrClient = cluster.getSolrClient(); String COLL1 = "belowRate_collection"; @@ -295,100 +288,87 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { CloudTestUtils.waitForState(cloudManager, COLL1, 60, TimeUnit.SECONDS, CloudTestUtils.clusterShape(1, 5)); - String setTriggerCommand = "{" + - "'set-trigger' : {" + - "'name' : 'search_rate_trigger2'," + - "'event' : 'searchRate'," + - "'waitFor' : '" + waitForSeconds + "s'," + - "'enabled' : false," + - "'collections' : '" + COLL1 + "'," + - "'aboveRate' : 1.0," + - // RecoveryStrategy calls /admin/ping, which calls /select so this may not be zero - // even when no external requests were made - "'belowRate' : 0.3," + - "'aboveNodeRate' : 1.0," + - "'belowNodeRate' : 0.3," + - // do nothing but generate an op - "'belowNodeOp' : 'none'," + - "'actions' : [" + - "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + - "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" + - "]" + - "}}"; - SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand); - NamedList response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-trigger' : {" + + "'name' : 'search_rate_trigger2'," + + "'event' : 'searchRate'," + + "'waitFor' : '" + waitForSeconds + "s'," + + "'enabled' : false," + + "'collections' : '" + COLL1 + "'," + + "'aboveRate' : 1.0," + + "'aboveNodeRate' : 1.0," + + // RecoveryStrategy calls /admin/ping, which calls /select so the rate may not be zero + // even when no external requests were made .. but it's hard to predict exactly + // what it will be. use an insanely high rate so all shards/nodes are suspect + // and produce an Op regardless of how much internal traffic is produced... + "'belowRate' : 1.0," + + "'belowNodeRate' : 1.0," + + // ...but do absolutely nothing to nodes except generate an 'NONE' Op + "'belowNodeOp' : 'none'," + + "'actions' : [" + + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" + + "]" + + "}}"); - String setListenerCommand = "{" + - "'set-listener' : " + - "{" + - "'name' : 'started'," + - "'trigger' : 'search_rate_trigger2'," + - "'stage' : ['STARTED']," + - "'class' : '" + StartedProcessingListener.class.getName() + "'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-listener' : " + + "{" + + "'name' : 'started'," + + "'trigger' : 'search_rate_trigger2'," + + "'stage' : ['STARTED']," + + "'class' : '" + StartedProcessingListener.class.getName() + "'" + + "}" + + "}"); - setListenerCommand = "{" + - "'set-listener' : " + - "{" + - "'name' : 'srt'," + - "'trigger' : 'search_rate_trigger2'," + - "'stage' : ['FAILED','SUCCEEDED']," + - "'afterAction': ['compute', 'execute']," + - "'class' : '" + CapturingTriggerListener.class.getName() + "'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-listener' : " + + "{" + + "'name' : 'srt'," + + "'trigger' : 'search_rate_trigger2'," + + "'stage' : ['FAILED','SUCCEEDED']," + + "'afterAction': ['compute', 'execute']," + + "'class' : '" + CapturingTriggerListener.class.getName() + "'" + + "}" + + "}"); + listenerEventLatch = new CountDownLatch(3); + + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-listener' : " + + "{" + + "'name' : 'finished'," + + "'trigger' : 'search_rate_trigger2'," + + "'stage' : ['SUCCEEDED']," + + "'class' : '" + FinishedProcessingListener.class.getName() + "'" + + "}" + + "}"); - setListenerCommand = "{" + - "'set-listener' : " + - "{" + - "'name' : 'finished'," + - "'trigger' : 'search_rate_trigger2'," + - "'stage' : ['SUCCEEDED']," + - "'class' : '" + FinishedProcessingListener.class.getName() + "'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - - timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS)); + // Explicitly Do Nothing Here // enable the trigger - String resumeTriggerCommand = "{" + - "'resume-trigger' : {" + - "'name' : 'search_rate_trigger2'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, resumeTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + final String resumeTriggerCommand = "{ 'resume-trigger' : { 'name' : 'search_rate_trigger2' } }"; + CloudTestUtils.assertAutoScalingRequest(cloudManager, resumeTriggerCommand); - timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS)); - - boolean await = started.await(20, TimeUnit.SECONDS); - assertTrue("The trigger did not fire at all", await); - await = finished.await(60, TimeUnit.SECONDS); - assertTrue("The trigger did not finish processing", await); + assertTrue("The trigger did not start in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("The trigger did not finish in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("the listener should have recorded all events w/in a reasonable amount of time", + listenerEventLatch.await(60, TimeUnit.SECONDS)); // suspend the trigger - String suspendTriggerCommand = "{" + - "'suspend-trigger' : {" + - "'name' : 'search_rate_trigger2'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - - timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS)); + final String suspendTriggerCommand = "{ 'suspend-trigger' : { 'name' : 'search_rate_trigger2' } }"; + CloudTestUtils.assertAutoScalingRequest(cloudManager, suspendTriggerCommand); List events = listenerEvents.get("srt"); assertEquals(events.toString(), 3, events.size()); @@ -396,8 +376,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { assertEquals(ev.toString(), "compute", ev.actionName); List ops = (List)ev.event.getProperty(TriggerEvent.REQUESTED_OPS); assertNotNull("there should be some requestedOps: " + ev.toString(), ops); - // 4 cold nodes, 3 cold replicas - assertEquals(ops.toString(), 7, ops.size()); + // 5 cold nodes, 3 cold replicas + assertEquals(ops.toString(), 5 + 3, ops.size()); AtomicInteger coldNodes = new AtomicInteger(); AtomicInteger coldReplicas = new AtomicInteger(); ops.forEach(op -> { @@ -409,7 +389,7 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { fail("unexpected op: " + op); } }); - assertEquals("cold nodes", 4, coldNodes.get()); + assertEquals("cold nodes", 5, coldNodes.get()); assertEquals("cold replicas", 3, coldReplicas.get()); // now the collection should be down to RF = 2 @@ -417,28 +397,26 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { CloudTestUtils.clusterShape(1, 2)); listenerEvents.clear(); + listenerEventLatch = new CountDownLatch(3); finished = new CountDownLatch(1); started = new CountDownLatch(1); // resume trigger - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, resumeTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest(cloudManager, resumeTriggerCommand); + + assertTrue("The trigger did not start in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("The trigger did not finish in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("the listener should have recorded all events w/in a reasonable amount of time", + listenerEventLatch.await(60, TimeUnit.SECONDS)); + + // suspend the trigger + CloudTestUtils.assertAutoScalingRequest(cloudManager, suspendTriggerCommand); // there should be only coldNode ops now, and no coldReplica ops since searchable RF == collection RF - timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS)); - - await = started.await(20, TimeUnit.SECONDS); - assertTrue("The trigger did not fire at all", await); - await = finished.await(60, TimeUnit.SECONDS); - assertTrue("The trigger did not finish processing", await); - - // suspend trigger - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - - timeSource.sleep(5000); events = listenerEvents.get("srt"); assertEquals(events.toString(), 3, events.size()); @@ -451,51 +429,47 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { assertEquals(ops.toString(), CollectionParams.CollectionAction.NONE, ops.get(0).getAction()); assertEquals(ops.toString(), CollectionParams.CollectionAction.NONE, ops.get(1).getAction()); - // wait for waitFor to elapse for all types of violations - timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds * 2, TimeUnit.SECONDS)); listenerEvents.clear(); + listenerEventLatch = new CountDownLatch(3); finished = new CountDownLatch(1); started = new CountDownLatch(1); log.info("## test single replicas."); // now allow single replicas - setTriggerCommand = "{" + - "'set-trigger' : {" + - "'name' : 'search_rate_trigger2'," + - "'event' : 'searchRate'," + - "'waitFor' : '" + waitForSeconds + "s'," + - "'enabled' : true," + - "'collections' : '" + COLL1 + "'," + - "'aboveRate' : 1.0," + - "'belowRate' : 0.3," + - "'aboveNodeRate' : 1.0," + - "'belowNodeRate' : 0.3," + - "'minReplicas' : 1," + - "'belowNodeOp' : 'none'," + - "'actions' : [" + - "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + - "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" + - "]" + - "}}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-trigger' : {" + + "'name' : 'search_rate_trigger2'," + + "'event' : 'searchRate'," + + "'waitFor' : '" + waitForSeconds + "s'," + + "'enabled' : true," + + "'collections' : '" + COLL1 + "'," + + "'aboveRate' : 1.0," + + "'aboveNodeRate' : 1.0," + + "'belowRate' : 1.0," + // same excessively high values + "'belowNodeRate' : 1.0," + + "'minReplicas' : 1," + // NEW: force lower replicas + "'belowNodeOp' : 'none'," + // still do nothing to nodes + "'actions' : [" + + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" + + "]" + + "}}"); - timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS)); + assertTrue("The trigger did not start in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("The trigger did not finish in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("the listener should have recorded all events w/in a reasonable amount of time", + listenerEventLatch.await(60, TimeUnit.SECONDS)); - await = started.await(20, TimeUnit.SECONDS); - assertTrue("The trigger did not fire at all", await); - await = finished.await(60, TimeUnit.SECONDS); - assertTrue("The trigger did not finish processing", await); - - // suspend trigger - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - - timeSource.sleep(5000); + // suspend the trigger + CloudTestUtils.assertAutoScalingRequest(cloudManager, suspendTriggerCommand); events = listenerEvents.get("srt"); assertEquals(events.toString(), 3, events.size()); @@ -525,7 +499,7 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { } @Test - @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 21-May-2018 + @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13163") public void testDeleteNode() throws Exception { CloudSolrClient solrClient = cluster.getSolrClient(); String COLL1 = "deleteNode_collection"; @@ -546,103 +520,101 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { CloudTestUtils.waitForState(cloudManager, COLL1, 60, TimeUnit.SECONDS, CloudTestUtils.clusterShape(1, 5)); - String setTriggerCommand = "{" + - "'set-trigger' : {" + - "'name' : 'search_rate_trigger3'," + - "'event' : 'searchRate'," + - "'waitFor' : '" + waitForSeconds + "s'," + - "'enabled' : false," + - "'collections' : '" + COLL1 + "'," + - "'aboveRate' : 1.0," + - "'belowRate' : 0.1," + - // set limits to node rates - "'aboveNodeRate' : 1.0," + - "'belowNodeRate' : 0.1," + - // allow deleting all spare replicas - "'minReplicas' : 1," + - // allow requesting all deletions in one event - "'maxOps' : 10," + - // delete underutilised nodes - "'belowNodeOp' : 'DELETENODE'," + - "'actions' : [" + - "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + - "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" + - "]" + - "}}"; - SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand); - NamedList response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-trigger' : {" + + "'name' : 'search_rate_trigger3'," + + "'event' : 'searchRate'," + + "'waitFor' : '" + waitForSeconds + "s'," + + "'enabled' : false," + + "'collections' : '" + COLL1 + "'," + + "'aboveRate' : 1.0," + + "'aboveNodeRate' : 1.0," + + // RecoveryStrategy calls /admin/ping, which calls /select so the rate may not be zero + // even when no external requests were made .. but it's hard to predict exactly + // what it will be. use an insanely high rate so all shards/nodes are suspect + // and produce an Op regardless of how much internal traffic is produced... + "'belowRate' : 1.0," + + "'belowNodeRate' : 1.0," + + // ...our Ops should be to delete underutilised nodes... + "'belowNodeOp' : 'DELETENODE'," + + // ...allow deleting all spare replicas... + "'minReplicas' : 1," + + // ...and allow requesting all deletions in one event. + "'maxOps' : 10," + + "'actions' : [" + + "{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," + + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" + + "]" + + "}}"); - String setListenerCommand = "{" + - "'set-listener' : " + - "{" + - "'name' : 'started'," + - "'trigger' : 'search_rate_trigger3'," + - "'stage' : ['STARTED']," + - "'class' : '" + StartedProcessingListener.class.getName() + "'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-listener' : " + + "{" + + "'name' : 'started'," + + "'trigger' : 'search_rate_trigger3'," + + "'stage' : ['STARTED']," + + "'class' : '" + StartedProcessingListener.class.getName() + "'" + + "}" + + "}"); - setListenerCommand = "{" + - "'set-listener' : " + - "{" + - "'name' : 'srt'," + - "'trigger' : 'search_rate_trigger3'," + - "'stage' : ['FAILED','SUCCEEDED']," + - "'afterAction': ['compute', 'execute']," + - "'class' : '" + CapturingTriggerListener.class.getName() + "'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - - setListenerCommand = "{" + - "'set-listener' : " + - "{" + - "'name' : 'finished'," + - "'trigger' : 'search_rate_trigger3'," + - "'stage' : ['SUCCEEDED']," + - "'class' : '" + FinishedProcessingListener.class.getName() + "'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - - timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS)); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-listener' : " + + "{" + + "'name' : 'srt'," + + "'trigger' : 'search_rate_trigger3'," + + "'stage' : ['FAILED','SUCCEEDED']," + + "'afterAction': ['compute', 'execute']," + + "'class' : '" + CapturingTriggerListener.class.getName() + "'" + + "}" + + "}"); + listenerEventLatch = new CountDownLatch(3); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'set-listener' : " + + "{" + + "'name' : 'finished'," + + "'trigger' : 'search_rate_trigger3'," + + "'stage' : ['SUCCEEDED']," + + "'class' : '" + FinishedProcessingListener.class.getName() + "'" + + "}" + + "}"); + + // Explicitly Do Nothing Here + // enable the trigger - String resumeTriggerCommand = "{" + - "'resume-trigger' : {" + - "'name' : 'search_rate_trigger3'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, resumeTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - - timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS)); - - boolean await = started.await(20, TimeUnit.SECONDS); - assertTrue("The trigger did not fire at all", await); - await = finished.await(90, TimeUnit.SECONDS); - assertTrue("The trigger did not finish processing", await); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'resume-trigger' : {" + + "'name' : 'search_rate_trigger3'" + + "}" + + "}"); + assertTrue("The trigger did not start in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("The trigger did not finish in a reasonable amount of time", + started.await(60, TimeUnit.SECONDS)); + + assertTrue("the listener should have recorded all events w/in a reasonable amount of time", + listenerEventLatch.await(60, TimeUnit.SECONDS)); + // suspend the trigger - String suspendTriggerCommand = "{" + - "'suspend-trigger' : {" + - "'name' : 'search_rate_trigger3'" + - "}" + - "}"; - req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand); - response = solrClient.request(req); - assertEquals(response.get("result").toString(), "success"); - - timeSource.sleep(5000); + CloudTestUtils.assertAutoScalingRequest + (cloudManager, + "{" + + "'suspend-trigger' : {" + + "'name' : 'search_rate_trigger3'" + + "}" + + "}"); List events = listenerEvents.get("srt"); assertEquals(events.toString(), 3, events.size()); @@ -651,8 +623,22 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { assertEquals(ev.toString(), "compute", ev.actionName); List ops = (List)ev.event.getProperty(TriggerEvent.REQUESTED_OPS); assertNotNull("there should be some requestedOps: " + ev.toString(), ops); - // 4 DELETEREPLICA, 4 DELETENODE - assertEquals(ops.toString(), 8, ops.size()); + // 4 DELETEREPLICA, 4 DELETENODE (minReplicas==1 & leader should be protected) + assertEquals(ops.toString(), 4 + 4, ops.size()); + // The above assert can fail with actual==9 because all 5 nodes are resulting in a DELETENODE + // Which is problemtatic for 2 reasons: + // 1) it means that the leader node has not been protected from the 'belowNodeOp':'DELETENODE' + // - definitely a bug that needs fixed + // 2) it suggests that minReplicas isn't being respected by 'belowNodeOp':'DELETENODE' + // - something that needs more rigerous testing + // - ie: if belowRate==0 && belowNodeRate==1 && minReplicas==2, will leader + 1 be protected? + // + // In general, to adequately trust testing of 'belowNodeOp':'DELETENODE' we should also test: + // - some nodes with multiple replicas of the shard to ensure best nodes are picked + // - node nodes hosting replicas of multiple shards/collection, only some of which are belowNodeRate + + + AtomicInteger replicas = new AtomicInteger(); AtomicInteger nodes = new AtomicInteger(); ops.forEach(op -> { @@ -715,10 +701,18 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase { @Override public synchronized void onEvent(TriggerEvent event, TriggerEventProcessorStage stage, String actionName, ActionContext context, Throwable error, String message) { - List lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>()); CapturedEvent ev = new CapturedEvent(timeSource.getTimeNs(), context, config, stage, actionName, event, message); - log.info("=======> " + ev); - lst.add(ev); + final CountDownLatch latch = listenerEventLatch; + synchronized (latch) { + if (0 == latch.getCount()) { + log.warn("Ignoring captured event since latch is 'full': {}", ev); + } else { + List lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>()); + log.info("=======> " + ev); + lst.add(ev); + latch.countDown(); + } + } } }