SOLR-12923: harden TestSimLargeCluster

- added logging
- ensure start/finish trigger action counters are incremented before latches are released
- replace arbitrary sleep calls with a trigger listener countdown latch
- increase all await() times: This means that 'real' failures (which should be rare and hopefully
  reproducible) will be 'slow', but the trade off will be less hard to reproduce 'false failures'
  due to thread contention on slow or heavily loaded (ie: jenkins) machines
This commit is contained in:
Chris Hostetter 2019-03-07 15:40:57 -07:00
parent 683aa3d3e9
commit 20de3d2ee0
1 changed files with 76 additions and 80 deletions

View File

@ -32,14 +32,10 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig;
import org.apache.solr.client.solrj.cloud.autoscaling.Suggester; import org.apache.solr.client.solrj.cloud.autoscaling.Suggester;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventProcessorStage; import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventProcessorStage;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType; import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.cloud.CloudTestUtils.AutoScalingRequest;
import org.apache.solr.cloud.CloudTestUtils; import org.apache.solr.cloud.CloudTestUtils;
import org.apache.solr.cloud.autoscaling.ActionContext; import org.apache.solr.cloud.autoscaling.ActionContext;
import org.apache.solr.cloud.autoscaling.CapturedEvent; import org.apache.solr.cloud.autoscaling.CapturedEvent;
@ -49,14 +45,11 @@ import org.apache.solr.cloud.autoscaling.SearchRateTrigger;
import org.apache.solr.cloud.autoscaling.TriggerActionBase; import org.apache.solr.cloud.autoscaling.TriggerActionBase;
import org.apache.solr.cloud.autoscaling.TriggerEvent; import org.apache.solr.cloud.autoscaling.TriggerEvent;
import org.apache.solr.cloud.autoscaling.TriggerListenerBase; import org.apache.solr.cloud.autoscaling.TriggerListenerBase;
import org.apache.solr.cloud.autoscaling.TriggerValidationException;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.params.CollectionParams; import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Pair; import org.apache.solr.common.util.Pair;
import org.apache.solr.common.util.TimeSource; import org.apache.solr.common.util.TimeSource;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.util.LogLevel; import org.apache.solr.util.LogLevel;
import org.apache.solr.util.TimeOut; import org.apache.solr.util.TimeOut;
import org.junit.After; import org.junit.After;
@ -76,12 +69,13 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
public static final int NUM_NODES = 100; public static final int NUM_NODES = 100;
static Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>(); static final Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
static AtomicInteger triggerFinishedCount = new AtomicInteger(); static final AtomicInteger triggerFinishedCount = new AtomicInteger();
static AtomicInteger triggerStartedCount = new AtomicInteger(); static final AtomicInteger triggerStartedCount = new AtomicInteger();
static CountDownLatch triggerStartedLatch; static volatile CountDownLatch triggerStartedLatch;
static CountDownLatch triggerFinishedLatch; static volatile CountDownLatch triggerFinishedLatch;
static int waitForSeconds; static volatile CountDownLatch listenerEventLatch;
static volatile int waitForSeconds;
@After @After
public void tearDownTest() throws Exception { public void tearDownTest() throws Exception {
@ -105,20 +99,28 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
triggerFinishedCount.set(0); triggerFinishedCount.set(0);
triggerStartedLatch = new CountDownLatch(1); triggerStartedLatch = new CountDownLatch(1);
triggerFinishedLatch = new CountDownLatch(1); triggerFinishedLatch = new CountDownLatch(1);
// by default assume we want to allow a (virtually) unbounded amount of events,
// tests that expect a specific number can override
listenerEventLatch = new CountDownLatch(Integer.MAX_VALUE);
listenerEvents.clear(); listenerEvents.clear();
} }
public static class TestTriggerListener extends TriggerListenerBase { public static class TestTriggerListener extends TriggerListenerBase {
@Override
public void configure(SolrResourceLoader loader, SolrCloudManager cloudManager, AutoScalingConfig.TriggerListenerConfig config) throws TriggerValidationException {
super.configure(loader, cloudManager, config);
}
@Override @Override
public synchronized void onEvent(TriggerEvent event, TriggerEventProcessorStage stage, String actionName, public synchronized void onEvent(TriggerEvent event, TriggerEventProcessorStage stage, String actionName,
ActionContext context, Throwable error, String message) { ActionContext context, Throwable error, String message) {
CapturedEvent ev = new CapturedEvent(cluster.getTimeSource().getTimeNs(), context, config, stage, actionName, event, message);
final CountDownLatch latch = listenerEventLatch;
synchronized (latch) {
if (0 == latch.getCount()) {
log.warn("Ignoring captured event since latch is 'full': {}", ev);
} else {
List<CapturedEvent> lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>()); List<CapturedEvent> lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>());
lst.add(new CapturedEvent(cluster.getTimeSource().getTimeNs(), context, config, stage, actionName, event, message)); lst.add(ev);
latch.countDown();
}
}
} }
} }
@ -133,8 +135,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
public static class StartTriggerAction extends TriggerActionBase { public static class StartTriggerAction extends TriggerActionBase {
@Override @Override
public void process(TriggerEvent event, ActionContext context) throws Exception { public void process(TriggerEvent event, ActionContext context) throws Exception {
triggerStartedLatch.countDown();
triggerStartedCount.incrementAndGet(); triggerStartedCount.incrementAndGet();
triggerStartedLatch.countDown();
} }
} }
@ -142,7 +144,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test hits a timeout easily @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test hits a timeout easily
public void testBasic() throws Exception { public void testBasic() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
String setTriggerCommand = "{" + assertAutoScalingRequest
( "{" +
"'set-trigger' : {" + "'set-trigger' : {" +
"'name' : 'node_lost_trigger1'," + "'name' : 'node_lost_trigger1'," +
"'event' : 'nodeLost'," + "'event' : 'nodeLost'," +
@ -154,12 +157,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" + "]" +
"}}"; "}}");
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
String setListenerCommand = "{" + assertAutoScalingRequest
( "{" +
"'set-listener' : " + "'set-listener' : " +
"{" + "{" +
"'name' : 'foo'," + "'name' : 'foo'," +
@ -169,15 +170,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"'afterAction' : ['compute', 'execute']," + "'afterAction' : ['compute', 'execute']," +
"'class' : '" + TestTriggerListener.class.getName() + "'" + "'class' : '" + TestTriggerListener.class.getName() + "'" +
"}" + "}" +
"}"; "}");
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
assertAutoscalingUpdateComplete(); assertAutoscalingUpdateComplete();
cluster.getTimeSource().sleep(5000);
// pick a few random nodes // pick a few random nodes
List<String> nodes = new ArrayList<>(); List<String> nodes = new ArrayList<>();
int limit = 75; int limit = 75;
@ -240,7 +236,6 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get()); fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get());
} }
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS, log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
CloudTestUtils.clusterShape(5, 15, false, true)) + "ms"); CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name()); long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
@ -254,7 +249,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
@Test @Test
public void testAddNode() throws Exception { public void testAddNode() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
String setTriggerCommand = "{" + assertAutoScalingRequest
( "{" +
"'set-trigger' : {" + "'set-trigger' : {" +
"'name' : 'node_added_trigger2'," + "'name' : 'node_added_trigger2'," +
"'event' : 'nodeAdded'," + "'event' : 'nodeAdded'," +
@ -266,10 +262,7 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" + "]" +
"}}"; "}}");
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
assertAutoscalingUpdateComplete(); assertAutoscalingUpdateComplete();
@ -289,16 +282,19 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
List<String> addNodesList = new ArrayList<>(numAddNode); List<String> addNodesList = new ArrayList<>(numAddNode);
for (int i = 0; i < numAddNode; i++) { for (int i = 0; i < numAddNode; i++) {
addNodesList.add(cluster.simAddNode()); addNodesList.add(cluster.simAddNode());
cluster.getTimeSource().sleep(5000);
} }
// wait until at least one event is generated // wait until at least one event is generated
boolean await = triggerStartedLatch.await(20000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("Trigger did not start even after await()ing an excessive amount of time",
assertTrue("trigger did not fire", await); triggerStartedLatch.await(60, TimeUnit.SECONDS));
// wait until started == finished // wait until started == finished
TimeOut timeOut = new TimeOut(45 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource()); TimeOut timeOut = new TimeOut(45 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource());
while (!timeOut.hasTimedOut()) { while (!timeOut.hasTimedOut()) {
final int started = triggerStartedCount.get();
final int finished = triggerFinishedCount.get();
log.info("started={} =?= finished={}", started, finished);
if (triggerStartedCount.get() == triggerFinishedCount.get()) { if (triggerStartedCount.get() == triggerFinishedCount.get()) {
log.info("started == finished: {} == {}", started, finished);
break; break;
} }
timeOut.sleep(1000); timeOut.sleep(1000);
@ -324,18 +320,22 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
SolrInputDocument startedEvent = systemColl.get(startedEventPos); SolrInputDocument startedEvent = systemColl.get(startedEventPos);
int lastIgnoredPos = startedEventPos; int lastIgnoredPos = startedEventPos;
// make sure some replicas have been moved // make sure some replicas have been moved
assertTrue("no MOVEREPLICA ops?", cluster.simGetOpCount("MOVEREPLICA") > 0); long lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
log.info("1st check: lastNumOps (MOVEREPLICA) = {}", lastNumOps);
assertTrue("no MOVEREPLICA ops?", lastNumOps > 0);
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS, log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms"); CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms");
int count = 1000; int count = 1000;
SolrInputDocument finishedEvent = null; SolrInputDocument finishedEvent = null;
long lastNumOps = cluster.simGetOpCount("MOVEREPLICA"); lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
log.info("2nd check: lastNumOps (MOVEREPLICA) = {}", lastNumOps);
while (count-- > 0) { while (count-- > 0) {
cluster.getTimeSource().sleep(10000); cluster.getTimeSource().sleep(10000);
if (cluster.simGetOpCount("MOVEREPLICA") < 2) { if (cluster.simGetOpCount("MOVEREPLICA") < 2) {
log.info("MOVEREPLICA < 2");
continue; continue;
} }
@ -349,24 +349,25 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
} }
if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) { if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) {
finishedEvent = d; finishedEvent = d;
log.info("finishedEvent = {}", finishedEvent);
break; break;
} }
} }
log.info("breaking because currentNumOps == lastNumOps == {}", currentNumOps);
break; break;
} else { } else {
lastNumOps = currentNumOps; lastNumOps = currentNumOps;
} }
} }
assertTrue("did not finish processing changes", finishedEvent != null); assertNotNull("did not finish processing changes", finishedEvent);
long delta = (Long)finishedEvent.getFieldValue("event.time_l") - (Long)startedEvent.getFieldValue("event.time_l"); long delta = (Long)finishedEvent.getFieldValue("event.time_l") - (Long)startedEvent.getFieldValue("event.time_l");
log.info("#### System stabilized after " + TimeUnit.NANOSECONDS.toMillis(delta) + " ms"); log.info("#### System stabilized after " + TimeUnit.NANOSECONDS.toMillis(delta) + " ms");
assertTrue("unexpected number of MOVEREPLICA ops", cluster.simGetOpCount("MOVEREPLICA") > 1); assertTrue("unexpected number of MOVEREPLICA ops: " + cluster.simGetOpCount("MOVEREPLICA"),
cluster.simGetOpCount("MOVEREPLICA") > 1);
} }
@Test @Test
// commented 4-Sep-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2018-06-18
// commented out on: 17-Feb-2019 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018
public void testNodeLost() throws Exception { public void testNodeLost() throws Exception {
doTestNodeLost(waitForSeconds, 5000, 0); doTestNodeLost(waitForSeconds, 5000, 0);
} }
@ -446,7 +447,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
private long doTestNodeLost(int waitFor, long killDelay, int minIgnored) throws Exception { private long doTestNodeLost(int waitFor, long killDelay, int minIgnored) throws Exception {
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
String setTriggerCommand = "{" + assertAutoScalingRequest
( "{" +
"'set-trigger' : {" + "'set-trigger' : {" +
"'name' : 'node_lost_trigger3'," + "'name' : 'node_lost_trigger3'," +
"'event' : 'nodeLost'," + "'event' : 'nodeLost'," +
@ -458,12 +460,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" + "]" +
"}}"; "}}");
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
String setListenerCommand = "{" + assertAutoScalingRequest
( "{" +
"'set-listener' : " + "'set-listener' : " +
"{" + "{" +
"'name' : 'failures'," + "'name' : 'failures'," +
@ -471,10 +471,7 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"'stage' : ['FAILED']," + "'stage' : ['FAILED']," +
"'class' : '" + TestTriggerListener.class.getName() + "'" + "'class' : '" + TestTriggerListener.class.getName() + "'" +
"}" + "}" +
"}"; "}");
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
assertAutoscalingUpdateComplete(); assertAutoscalingUpdateComplete();
@ -499,10 +496,9 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
cluster.getTimeSource().sleep(killDelay); cluster.getTimeSource().sleep(killDelay);
} }
// wait for the trigger to fire and complete at least once // wait for the trigger to fire and complete at least once
boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("Trigger did not finish even after await()ing an excessive amount of time",
assertTrue("trigger did not fire within timeout, " + triggerFinishedLatch.await(60, TimeUnit.SECONDS));
"waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
await);
List<SolrInputDocument> systemColl = cluster.simGetSystemCollection(); List<SolrInputDocument> systemColl = cluster.simGetSystemCollection();
int startedEventPos = -1; int startedEventPos = -1;
@ -646,7 +642,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
// now define the trigger. doing it earlier may cause partial events to be generated (where only some // now define the trigger. doing it earlier may cause partial events to be generated (where only some
// nodes / replicas exceeded the threshold). // nodes / replicas exceeded the threshold).
String setTriggerCommand = "{" + assertAutoScalingRequest
( "{" +
"'set-trigger' : {" + "'set-trigger' : {" +
"'name' : 'search_rate_trigger'," + "'name' : 'search_rate_trigger'," +
"'event' : 'searchRate'," + "'event' : 'searchRate'," +
@ -659,11 +656,13 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," + "{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" + "{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" + "]" +
"}}"; "}}");
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success"); // we're going to expect our trigger listener to process exactly one captured event
String setListenerCommand1 = "{" + listenerEventLatch = new CountDownLatch(1);
assertAutoScalingRequest
( "{" +
"'set-listener' : " + "'set-listener' : " +
"{" + "{" +
"'name' : 'srt'," + "'name' : 'srt'," +
@ -671,23 +670,20 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"'stage' : ['FAILED','SUCCEEDED']," + "'stage' : ['FAILED','SUCCEEDED']," +
"'class' : '" + TestTriggerListener.class.getName() + "'" + "'class' : '" + TestTriggerListener.class.getName() + "'" +
"}" + "}" +
"}"; "}");
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand1);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
assertAutoscalingUpdateComplete(); assertAutoscalingUpdateComplete();
boolean await = triggerFinishedLatch.await(waitForSeconds * 45000 / SPEED, TimeUnit.MILLISECONDS); assertTrue("Trigger did not finish even after await()ing an excessive amount of time",
assertTrue("The trigger did not fire at all", await); triggerFinishedLatch.await(60, TimeUnit.SECONDS));
// wait for listener to capture the SUCCEEDED stage
cluster.getTimeSource().sleep(25000);
assertNotNull(listenerEvents.entrySet().iterator().toString(), listenerEvents.get("srt")); assertTrue("The listener didn't record the event even after await()ing an excessive amount of time",
listenerEventLatch.await(60, TimeUnit.SECONDS));
List<CapturedEvent> events = listenerEvents.get("srt");
assertNotNull("no srt events: " + listenerEvents.toString(), events);
assertEquals(events.toString(), 1, events.size());
assertTrue(listenerEvents.entrySet().iterator().toString(), listenerEvents.get("srt").size() >= 1); CapturedEvent ev = events.get(0);
CapturedEvent ev = listenerEvents.get("srt").get(0);
assertEquals(TriggerEventType.SEARCHRATE, ev.event.getEventType()); assertEquals(TriggerEventType.SEARCHRATE, ev.event.getEventType());
Map<String, Number> m = (Map<String, Number>)ev.event.getProperty(SearchRateTrigger.HOT_NODES); Map<String, Number> m = (Map<String, Number>)ev.event.getProperty(SearchRateTrigger.HOT_NODES);
assertNotNull(m); assertNotNull(m);