SOLR-13140: harden SearchRateTriggerIntegrationTest by using more absolute rate thresholds and latches to track when all events have been processed so we don't need to 'guess' about sleep calls

This commit also disables testDeleteNode pending an AwaitsFix on SOLR-13163
This commit is contained in:
Chris Hostetter 2019-01-22 11:47:11 -07:00
parent 452ffa3626
commit 15e5ca999f

View File

@ -29,7 +29,6 @@ import java.util.concurrent.atomic.AtomicInteger;
import com.carrotsearch.randomizedtesting.annotations.Nightly;
import com.google.common.util.concurrent.AtomicDouble;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig;
import org.apache.solr.client.solrj.cloud.autoscaling.ReplicaInfo;
@ -37,7 +36,6 @@ import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventProcessorStage
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.cloud.CloudTestUtils;
import org.apache.solr.cloud.CloudTestUtils.AutoScalingRequest;
import org.apache.solr.cloud.SolrCloudTestCase;
import org.apache.solr.common.MapWriter;
import org.apache.solr.common.cloud.Replica;
@ -71,10 +69,11 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final TimeSource timeSource = TimeSource.NANO_TIME;
private static CountDownLatch listenerCreated = new CountDownLatch(1);
private static Map<String, List<CapturedEvent>> listenerEvents = new HashMap<>();
private static CountDownLatch finished = new CountDownLatch(1);
private static CountDownLatch started = new CountDownLatch(1);
private static volatile CountDownLatch listenerCreated = new CountDownLatch(1);
private static volatile CountDownLatch listenerEventLatch = new CountDownLatch(0);
private static volatile Map<String, List<CapturedEvent>> listenerEvents = new HashMap<>();
private static volatile CountDownLatch finished = new CountDownLatch(1);
private static volatile CountDownLatch started = new CountDownLatch(1);
private static SolrCloudManager cloudManager;
private int waitForSeconds;
@ -99,15 +98,17 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
// clear any persisted auto scaling configuration
Stat stat = zkClient().setData(SOLR_AUTOSCALING_CONF_PATH, Utils.toJSON(new ZkNodeProps()), true);
log.info(SOLR_AUTOSCALING_CONF_PATH + " reset, new znode version {}", stat.getVersion());
timeSource.sleep(5000);
deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_EVENTS_PATH);
deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH);
deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH);
deleteChildrenRecursively(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH);
finished = new CountDownLatch(1);
started = new CountDownLatch(1);
listenerCreated = new CountDownLatch(1);
listenerEvents = new HashMap<>();
listenerEventLatch = new CountDownLatch(0);
waitForSeconds = 3 + random().nextInt(5);
}
@ -126,100 +127,94 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
CloudTestUtils.waitForState(cloudManager, COLL1, 60, TimeUnit.SECONDS,
CloudTestUtils.clusterShape(1, 2));
// the trigger is initially disabled so that we have the time to set up listeners
// and generate the traffic
String setTriggerCommand = "{" +
"'set-trigger' : {" +
"'name' : 'search_rate_trigger1'," +
"'event' : 'searchRate'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : false," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
"'belowRate' : 0.1," +
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
// the trigger is initially disabled so that we have the time to set up listeners
// and generate the traffic
"{" +
"'set-trigger' : {" +
"'name' : 'search_rate_trigger1'," +
"'event' : 'searchRate'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : false," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
"'belowRate' : 0.1," +
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +
"]" +
"}}");
String setListenerCommand = "{" +
"'set-listener' : " +
"{" +
"'name' : 'started'," +
"'trigger' : 'search_rate_trigger1'," +
"'stage' : ['STARTED']," +
"'class' : '" + StartedProcessingListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-listener' : " +
"{" +
"'name' : 'started'," +
"'trigger' : 'search_rate_trigger1'," +
"'stage' : ['STARTED']," +
"'class' : '" + StartedProcessingListener.class.getName() + "'" +
"}" +
"}");
setListenerCommand = "{" +
"'set-listener' : " +
"{" +
"'name' : 'srt'," +
"'trigger' : 'search_rate_trigger1'," +
"'stage' : ['FAILED','SUCCEEDED']," +
"'afterAction': ['compute', 'execute']," +
"'class' : '" + CapturingTriggerListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-listener' : " +
"{" +
"'name' : 'srt'," +
"'trigger' : 'search_rate_trigger1'," +
"'stage' : ['FAILED','SUCCEEDED']," +
"'afterAction': ['compute', 'execute']," +
"'class' : '" + CapturingTriggerListener.class.getName() + "'" +
"}" +
"}");
listenerEventLatch = new CountDownLatch(3);
setListenerCommand = "{" +
"'set-listener' : " +
"{" +
"'name' : 'finished'," +
"'trigger' : 'search_rate_trigger1'," +
"'stage' : ['SUCCEEDED']," +
"'class' : '" + FinishedProcessingListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-listener' : " +
"{" +
"'name' : 'finished'," +
"'trigger' : 'search_rate_trigger1'," +
"'stage' : ['SUCCEEDED']," +
"'class' : '" + FinishedProcessingListener.class.getName() + "'" +
"}" +
"}");
SolrParams query = params(CommonParams.Q, "*:*");
for (int i = 0; i < 500; i++) {
solrClient.query(COLL1, query);
}
// enable the trigger
String resumeTriggerCommand = "{" +
"'resume-trigger' : {" +
"'name' : 'search_rate_trigger1'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, resumeTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'resume-trigger' : {" +
"'name' : 'search_rate_trigger1'" +
"}" +
"}");
timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
boolean await = started.await(20, TimeUnit.SECONDS);
assertTrue("The trigger did not fire at all", await);
await = finished.await(60, TimeUnit.SECONDS);
assertTrue("The trigger did not finish processing", await);
assertTrue("The trigger did not start in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("The trigger did not finish in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("the listener should have recorded all events w/in a reasonable amount of time",
listenerEventLatch.await(60, TimeUnit.SECONDS));
// suspend the trigger
String suspendTriggerCommand = "{" +
"'suspend-trigger' : {" +
"'name' : 'search_rate_trigger1'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
timeSource.sleep(5000);
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'suspend-trigger' : {" +
"'name' : 'search_rate_trigger1'" +
"}" +
"}");
List<CapturedEvent> events = listenerEvents.get("srt");
assertEquals(listenerEvents.toString(), 3, events.size());
@ -274,8 +269,6 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
}
@Test
//17-Aug-2018 commented @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 21-May-2018
// commented out on: 24-Dec-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // added 15-Sep-2018
public void testBelowSearchRate() throws Exception {
CloudSolrClient solrClient = cluster.getSolrClient();
String COLL1 = "belowRate_collection";
@ -295,100 +288,87 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
CloudTestUtils.waitForState(cloudManager, COLL1, 60, TimeUnit.SECONDS,
CloudTestUtils.clusterShape(1, 5));
String setTriggerCommand = "{" +
"'set-trigger' : {" +
"'name' : 'search_rate_trigger2'," +
"'event' : 'searchRate'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : false," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
// RecoveryStrategy calls /admin/ping, which calls /select so this may not be zero
// even when no external requests were made
"'belowRate' : 0.3," +
"'aboveNodeRate' : 1.0," +
"'belowNodeRate' : 0.3," +
// do nothing but generate an op
"'belowNodeOp' : 'none'," +
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-trigger' : {" +
"'name' : 'search_rate_trigger2'," +
"'event' : 'searchRate'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : false," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
"'aboveNodeRate' : 1.0," +
// RecoveryStrategy calls /admin/ping, which calls /select so the rate may not be zero
// even when no external requests were made .. but it's hard to predict exactly
// what it will be. use an insanely high rate so all shards/nodes are suspect
// and produce an Op regardless of how much internal traffic is produced...
"'belowRate' : 1.0," +
"'belowNodeRate' : 1.0," +
// ...but do absolutely nothing to nodes except generate an 'NONE' Op
"'belowNodeOp' : 'none'," +
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +
"]" +
"}}");
String setListenerCommand = "{" +
"'set-listener' : " +
"{" +
"'name' : 'started'," +
"'trigger' : 'search_rate_trigger2'," +
"'stage' : ['STARTED']," +
"'class' : '" + StartedProcessingListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-listener' : " +
"{" +
"'name' : 'started'," +
"'trigger' : 'search_rate_trigger2'," +
"'stage' : ['STARTED']," +
"'class' : '" + StartedProcessingListener.class.getName() + "'" +
"}" +
"}");
setListenerCommand = "{" +
"'set-listener' : " +
"{" +
"'name' : 'srt'," +
"'trigger' : 'search_rate_trigger2'," +
"'stage' : ['FAILED','SUCCEEDED']," +
"'afterAction': ['compute', 'execute']," +
"'class' : '" + CapturingTriggerListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-listener' : " +
"{" +
"'name' : 'srt'," +
"'trigger' : 'search_rate_trigger2'," +
"'stage' : ['FAILED','SUCCEEDED']," +
"'afterAction': ['compute', 'execute']," +
"'class' : '" + CapturingTriggerListener.class.getName() + "'" +
"}" +
"}");
listenerEventLatch = new CountDownLatch(3);
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-listener' : " +
"{" +
"'name' : 'finished'," +
"'trigger' : 'search_rate_trigger2'," +
"'stage' : ['SUCCEEDED']," +
"'class' : '" + FinishedProcessingListener.class.getName() + "'" +
"}" +
"}");
setListenerCommand = "{" +
"'set-listener' : " +
"{" +
"'name' : 'finished'," +
"'trigger' : 'search_rate_trigger2'," +
"'stage' : ['SUCCEEDED']," +
"'class' : '" + FinishedProcessingListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
// Explicitly Do Nothing Here
// enable the trigger
String resumeTriggerCommand = "{" +
"'resume-trigger' : {" +
"'name' : 'search_rate_trigger2'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, resumeTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
final String resumeTriggerCommand = "{ 'resume-trigger' : { 'name' : 'search_rate_trigger2' } }";
CloudTestUtils.assertAutoScalingRequest(cloudManager, resumeTriggerCommand);
timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
boolean await = started.await(20, TimeUnit.SECONDS);
assertTrue("The trigger did not fire at all", await);
await = finished.await(60, TimeUnit.SECONDS);
assertTrue("The trigger did not finish processing", await);
assertTrue("The trigger did not start in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("The trigger did not finish in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("the listener should have recorded all events w/in a reasonable amount of time",
listenerEventLatch.await(60, TimeUnit.SECONDS));
// suspend the trigger
String suspendTriggerCommand = "{" +
"'suspend-trigger' : {" +
"'name' : 'search_rate_trigger2'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
final String suspendTriggerCommand = "{ 'suspend-trigger' : { 'name' : 'search_rate_trigger2' } }";
CloudTestUtils.assertAutoScalingRequest(cloudManager, suspendTriggerCommand);
List<CapturedEvent> events = listenerEvents.get("srt");
assertEquals(events.toString(), 3, events.size());
@ -396,8 +376,8 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
assertEquals(ev.toString(), "compute", ev.actionName);
List<TriggerEvent.Op> ops = (List<TriggerEvent.Op>)ev.event.getProperty(TriggerEvent.REQUESTED_OPS);
assertNotNull("there should be some requestedOps: " + ev.toString(), ops);
// 4 cold nodes, 3 cold replicas
assertEquals(ops.toString(), 7, ops.size());
// 5 cold nodes, 3 cold replicas
assertEquals(ops.toString(), 5 + 3, ops.size());
AtomicInteger coldNodes = new AtomicInteger();
AtomicInteger coldReplicas = new AtomicInteger();
ops.forEach(op -> {
@ -409,7 +389,7 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
fail("unexpected op: " + op);
}
});
assertEquals("cold nodes", 4, coldNodes.get());
assertEquals("cold nodes", 5, coldNodes.get());
assertEquals("cold replicas", 3, coldReplicas.get());
// now the collection should be down to RF = 2
@ -417,28 +397,26 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
CloudTestUtils.clusterShape(1, 2));
listenerEvents.clear();
listenerEventLatch = new CountDownLatch(3);
finished = new CountDownLatch(1);
started = new CountDownLatch(1);
// resume trigger
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, resumeTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest(cloudManager, resumeTriggerCommand);
assertTrue("The trigger did not start in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("The trigger did not finish in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("the listener should have recorded all events w/in a reasonable amount of time",
listenerEventLatch.await(60, TimeUnit.SECONDS));
// suspend the trigger
CloudTestUtils.assertAutoScalingRequest(cloudManager, suspendTriggerCommand);
// there should be only coldNode ops now, and no coldReplica ops since searchable RF == collection RF
timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
await = started.await(20, TimeUnit.SECONDS);
assertTrue("The trigger did not fire at all", await);
await = finished.await(60, TimeUnit.SECONDS);
assertTrue("The trigger did not finish processing", await);
// suspend trigger
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
timeSource.sleep(5000);
events = listenerEvents.get("srt");
assertEquals(events.toString(), 3, events.size());
@ -451,51 +429,47 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
assertEquals(ops.toString(), CollectionParams.CollectionAction.NONE, ops.get(0).getAction());
assertEquals(ops.toString(), CollectionParams.CollectionAction.NONE, ops.get(1).getAction());
// wait for waitFor to elapse for all types of violations
timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds * 2, TimeUnit.SECONDS));
listenerEvents.clear();
listenerEventLatch = new CountDownLatch(3);
finished = new CountDownLatch(1);
started = new CountDownLatch(1);
log.info("## test single replicas.");
// now allow single replicas
setTriggerCommand = "{" +
"'set-trigger' : {" +
"'name' : 'search_rate_trigger2'," +
"'event' : 'searchRate'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : true," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
"'belowRate' : 0.3," +
"'aboveNodeRate' : 1.0," +
"'belowNodeRate' : 0.3," +
"'minReplicas' : 1," +
"'belowNodeOp' : 'none'," +
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +
"]" +
"}}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-trigger' : {" +
"'name' : 'search_rate_trigger2'," +
"'event' : 'searchRate'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : true," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
"'aboveNodeRate' : 1.0," +
"'belowRate' : 1.0," + // same excessively high values
"'belowNodeRate' : 1.0," +
"'minReplicas' : 1," + // NEW: force lower replicas
"'belowNodeOp' : 'none'," + // still do nothing to nodes
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +
"]" +
"}}");
timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
assertTrue("The trigger did not start in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("The trigger did not finish in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("the listener should have recorded all events w/in a reasonable amount of time",
listenerEventLatch.await(60, TimeUnit.SECONDS));
await = started.await(20, TimeUnit.SECONDS);
assertTrue("The trigger did not fire at all", await);
await = finished.await(60, TimeUnit.SECONDS);
assertTrue("The trigger did not finish processing", await);
// suspend trigger
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
timeSource.sleep(5000);
// suspend the trigger
CloudTestUtils.assertAutoScalingRequest(cloudManager, suspendTriggerCommand);
events = listenerEvents.get("srt");
assertEquals(events.toString(), 3, events.size());
@ -525,7 +499,7 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
}
@Test
@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 21-May-2018
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13163")
public void testDeleteNode() throws Exception {
CloudSolrClient solrClient = cluster.getSolrClient();
String COLL1 = "deleteNode_collection";
@ -546,103 +520,101 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
CloudTestUtils.waitForState(cloudManager, COLL1, 60, TimeUnit.SECONDS,
CloudTestUtils.clusterShape(1, 5));
String setTriggerCommand = "{" +
"'set-trigger' : {" +
"'name' : 'search_rate_trigger3'," +
"'event' : 'searchRate'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : false," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
"'belowRate' : 0.1," +
// set limits to node rates
"'aboveNodeRate' : 1.0," +
"'belowNodeRate' : 0.1," +
// allow deleting all spare replicas
"'minReplicas' : 1," +
// allow requesting all deletions in one event
"'maxOps' : 10," +
// delete underutilised nodes
"'belowNodeOp' : 'DELETENODE'," +
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-trigger' : {" +
"'name' : 'search_rate_trigger3'," +
"'event' : 'searchRate'," +
"'waitFor' : '" + waitForSeconds + "s'," +
"'enabled' : false," +
"'collections' : '" + COLL1 + "'," +
"'aboveRate' : 1.0," +
"'aboveNodeRate' : 1.0," +
// RecoveryStrategy calls /admin/ping, which calls /select so the rate may not be zero
// even when no external requests were made .. but it's hard to predict exactly
// what it will be. use an insanely high rate so all shards/nodes are suspect
// and produce an Op regardless of how much internal traffic is produced...
"'belowRate' : 1.0," +
"'belowNodeRate' : 1.0," +
// ...our Ops should be to delete underutilised nodes...
"'belowNodeOp' : 'DELETENODE'," +
// ...allow deleting all spare replicas...
"'minReplicas' : 1," +
// ...and allow requesting all deletions in one event.
"'maxOps' : 10," +
"'actions' : [" +
"{'name':'compute','class':'" + ComputePlanAction.class.getName() + "'}," +
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}" +
"]" +
"}}");
String setListenerCommand = "{" +
"'set-listener' : " +
"{" +
"'name' : 'started'," +
"'trigger' : 'search_rate_trigger3'," +
"'stage' : ['STARTED']," +
"'class' : '" + StartedProcessingListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-listener' : " +
"{" +
"'name' : 'started'," +
"'trigger' : 'search_rate_trigger3'," +
"'stage' : ['STARTED']," +
"'class' : '" + StartedProcessingListener.class.getName() + "'" +
"}" +
"}");
setListenerCommand = "{" +
"'set-listener' : " +
"{" +
"'name' : 'srt'," +
"'trigger' : 'search_rate_trigger3'," +
"'stage' : ['FAILED','SUCCEEDED']," +
"'afterAction': ['compute', 'execute']," +
"'class' : '" + CapturingTriggerListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
setListenerCommand = "{" +
"'set-listener' : " +
"{" +
"'name' : 'finished'," +
"'trigger' : 'search_rate_trigger3'," +
"'stage' : ['SUCCEEDED']," +
"'class' : '" + FinishedProcessingListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-listener' : " +
"{" +
"'name' : 'srt'," +
"'trigger' : 'search_rate_trigger3'," +
"'stage' : ['FAILED','SUCCEEDED']," +
"'afterAction': ['compute', 'execute']," +
"'class' : '" + CapturingTriggerListener.class.getName() + "'" +
"}" +
"}");
listenerEventLatch = new CountDownLatch(3);
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'set-listener' : " +
"{" +
"'name' : 'finished'," +
"'trigger' : 'search_rate_trigger3'," +
"'stage' : ['SUCCEEDED']," +
"'class' : '" + FinishedProcessingListener.class.getName() + "'" +
"}" +
"}");
// Explicitly Do Nothing Here
// enable the trigger
String resumeTriggerCommand = "{" +
"'resume-trigger' : {" +
"'name' : 'search_rate_trigger3'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, resumeTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
timeSource.sleep(TimeUnit.MILLISECONDS.convert(waitForSeconds + 1, TimeUnit.SECONDS));
boolean await = started.await(20, TimeUnit.SECONDS);
assertTrue("The trigger did not fire at all", await);
await = finished.await(90, TimeUnit.SECONDS);
assertTrue("The trigger did not finish processing", await);
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'resume-trigger' : {" +
"'name' : 'search_rate_trigger3'" +
"}" +
"}");
assertTrue("The trigger did not start in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("The trigger did not finish in a reasonable amount of time",
started.await(60, TimeUnit.SECONDS));
assertTrue("the listener should have recorded all events w/in a reasonable amount of time",
listenerEventLatch.await(60, TimeUnit.SECONDS));
// suspend the trigger
String suspendTriggerCommand = "{" +
"'suspend-trigger' : {" +
"'name' : 'search_rate_trigger3'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, suspendTriggerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
timeSource.sleep(5000);
CloudTestUtils.assertAutoScalingRequest
(cloudManager,
"{" +
"'suspend-trigger' : {" +
"'name' : 'search_rate_trigger3'" +
"}" +
"}");
List<CapturedEvent> events = listenerEvents.get("srt");
assertEquals(events.toString(), 3, events.size());
@ -651,8 +623,22 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
assertEquals(ev.toString(), "compute", ev.actionName);
List<TriggerEvent.Op> ops = (List<TriggerEvent.Op>)ev.event.getProperty(TriggerEvent.REQUESTED_OPS);
assertNotNull("there should be some requestedOps: " + ev.toString(), ops);
// 4 DELETEREPLICA, 4 DELETENODE
assertEquals(ops.toString(), 8, ops.size());
// 4 DELETEREPLICA, 4 DELETENODE (minReplicas==1 & leader should be protected)
assertEquals(ops.toString(), 4 + 4, ops.size());
// The above assert can fail with actual==9 because all 5 nodes are resulting in a DELETENODE
// Which is problemtatic for 2 reasons:
// 1) it means that the leader node has not been protected from the 'belowNodeOp':'DELETENODE'
// - definitely a bug that needs fixed
// 2) it suggests that minReplicas isn't being respected by 'belowNodeOp':'DELETENODE'
// - something that needs more rigerous testing
// - ie: if belowRate==0 && belowNodeRate==1 && minReplicas==2, will leader + 1 be protected?
//
// In general, to adequately trust testing of 'belowNodeOp':'DELETENODE' we should also test:
// - some nodes with multiple replicas of the shard to ensure best nodes are picked
// - node nodes hosting replicas of multiple shards/collection, only some of which are belowNodeRate
AtomicInteger replicas = new AtomicInteger();
AtomicInteger nodes = new AtomicInteger();
ops.forEach(op -> {
@ -715,10 +701,18 @@ public class SearchRateTriggerIntegrationTest extends SolrCloudTestCase {
@Override
public synchronized void onEvent(TriggerEvent event, TriggerEventProcessorStage stage, String actionName,
ActionContext context, Throwable error, String message) {
List<CapturedEvent> lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>());
CapturedEvent ev = new CapturedEvent(timeSource.getTimeNs(), context, config, stage, actionName, event, message);
log.info("=======> " + ev);
lst.add(ev);
final CountDownLatch latch = listenerEventLatch;
synchronized (latch) {
if (0 == latch.getCount()) {
log.warn("Ignoring captured event since latch is 'full': {}", ev);
} else {
List<CapturedEvent> lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>());
log.info("=======> " + ev);
lst.add(ev);
latch.countDown();
}
}
}
}