SOLR-12923: harden TestSimLargeCluster

- added logging
- ensure start/finish trigger action counters are incremented before latches are released
- replace arbitrary sleep calls with a trigger listener countdown latch
- increase all await() times: This means that 'real' failures (which should be rare and hopefully
  reproducible) will be 'slow', but the trade off will be less hard to reproduce 'false failures'
  due to thread contention on slow or heavily loaded (ie: jenkins) machines

(cherry picked from commit 20de3d2ee0)
This commit is contained in:
Chris Hostetter 2019-03-07 15:40:57 -07:00
parent c55415cda8
commit 27aeb11a37
1 changed files with 76 additions and 80 deletions

View File

@ -32,14 +32,10 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig;
import org.apache.solr.client.solrj.cloud.autoscaling.Suggester;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventProcessorStage;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.cloud.CloudTestUtils.AutoScalingRequest;
import org.apache.solr.cloud.CloudTestUtils;
import org.apache.solr.cloud.autoscaling.ActionContext;
import org.apache.solr.cloud.autoscaling.CapturedEvent;
@ -49,14 +45,11 @@ import org.apache.solr.cloud.autoscaling.SearchRateTrigger;
import org.apache.solr.cloud.autoscaling.TriggerActionBase;
import org.apache.solr.cloud.autoscaling.TriggerEvent;
import org.apache.solr.cloud.autoscaling.TriggerListenerBase;
import org.apache.solr.cloud.autoscaling.TriggerValidationException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Pair;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.util.LogLevel;
import org.apache.solr.util.TimeOut;
import org.junit.After;
@ -76,12 +69,13 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
public static final int NUM_NODES = 100;
static Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
static AtomicInteger triggerFinishedCount = new AtomicInteger();
static AtomicInteger triggerStartedCount = new AtomicInteger();
static CountDownLatch triggerStartedLatch;
static CountDownLatch triggerFinishedLatch;
static int waitForSeconds;
static final Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
static final AtomicInteger triggerFinishedCount = new AtomicInteger();
static final AtomicInteger triggerStartedCount = new AtomicInteger();
static volatile CountDownLatch triggerStartedLatch;
static volatile CountDownLatch triggerFinishedLatch;
static volatile CountDownLatch listenerEventLatch;
static volatile int waitForSeconds;
@After
public void tearDownTest() throws Exception {
@ -105,20 +99,28 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
triggerFinishedCount.set(0);
triggerStartedLatch = new CountDownLatch(1);
triggerFinishedLatch = new CountDownLatch(1);
// by default assume we want to allow a (virtually) unbounded amount of events,
// tests that expect a specific number can override
listenerEventLatch = new CountDownLatch(Integer.MAX_VALUE);
listenerEvents.clear();
}
public static class TestTriggerListener extends TriggerListenerBase {
@Override
public void configure(SolrResourceLoader loader, SolrCloudManager cloudManager, AutoScalingConfig.TriggerListenerConfig config) throws TriggerValidationException {
super.configure(loader, cloudManager, config);
}
@Override
public synchronized void onEvent(TriggerEvent event, TriggerEventProcessorStage stage, String actionName,
ActionContext context, Throwable error, String message) {
List<CapturedEvent> lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>());
lst.add(new CapturedEvent(cluster.getTimeSource().getTimeNs(), context, config, stage, actionName, event, message));
CapturedEvent ev = new CapturedEvent(cluster.getTimeSource().getTimeNs(), context, config, stage, actionName, event, message);
final CountDownLatch latch = listenerEventLatch;
synchronized (latch) {
if (0 == latch.getCount()) {
log.warn("Ignoring captured event since latch is 'full': {}", ev);
} else {
List<CapturedEvent> lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>());
lst.add(ev);
latch.countDown();
}
}
}
}
@ -133,8 +135,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
public static class StartTriggerAction extends TriggerActionBase {
@Override
public void process(TriggerEvent event, ActionContext context) throws Exception {
triggerStartedLatch.countDown();
triggerStartedCount.incrementAndGet();
triggerStartedLatch.countDown();
}
}
@ -142,7 +144,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test hits a timeout easily
public void testBasic() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient();
String setTriggerCommand = "{" +
assertAutoScalingRequest
( "{" +
"'set-trigger' : {" +
"'name' : 'node_lost_trigger1'," +
"'event' : 'nodeLost'," +
@ -154,12 +157,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
"}}");
String setListenerCommand = "{" +
assertAutoScalingRequest
( "{" +
"'set-listener' : " +
"{" +
"'name' : 'foo'," +
@ -169,15 +170,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"'afterAction' : ['compute', 'execute']," +
"'class' : '" + TestTriggerListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
"}");
assertAutoscalingUpdateComplete();
cluster.getTimeSource().sleep(5000);
// pick a few random nodes
List<String> nodes = new ArrayList<>();
int limit = 75;
@ -240,7 +236,6 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get());
}
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
@ -254,7 +249,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
@Test
public void testAddNode() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient();
String setTriggerCommand = "{" +
assertAutoScalingRequest
( "{" +
"'set-trigger' : {" +
"'name' : 'node_added_trigger2'," +
"'event' : 'nodeAdded'," +
@ -266,10 +262,7 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
"}}");
assertAutoscalingUpdateComplete();
@ -289,16 +282,19 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
List<String> addNodesList = new ArrayList<>(numAddNode);
for (int i = 0; i < numAddNode; i++) {
addNodesList.add(cluster.simAddNode());
cluster.getTimeSource().sleep(5000);
}
// wait until at least one event is generated
boolean await = triggerStartedLatch.await(20000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("trigger did not fire", await);
assertTrue("Trigger did not start even after await()ing an excessive amount of time",
triggerStartedLatch.await(60, TimeUnit.SECONDS));
// wait until started == finished
TimeOut timeOut = new TimeOut(45 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource());
while (!timeOut.hasTimedOut()) {
final int started = triggerStartedCount.get();
final int finished = triggerFinishedCount.get();
log.info("started={} =?= finished={}", started, finished);
if (triggerStartedCount.get() == triggerFinishedCount.get()) {
log.info("started == finished: {} == {}", started, finished);
break;
}
timeOut.sleep(1000);
@ -324,18 +320,22 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
SolrInputDocument startedEvent = systemColl.get(startedEventPos);
int lastIgnoredPos = startedEventPos;
// make sure some replicas have been moved
assertTrue("no MOVEREPLICA ops?", cluster.simGetOpCount("MOVEREPLICA") > 0);
long lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
log.info("1st check: lastNumOps (MOVEREPLICA) = {}", lastNumOps);
assertTrue("no MOVEREPLICA ops?", lastNumOps > 0);
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms");
int count = 1000;
SolrInputDocument finishedEvent = null;
long lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
log.info("2nd check: lastNumOps (MOVEREPLICA) = {}", lastNumOps);
while (count-- > 0) {
cluster.getTimeSource().sleep(10000);
if (cluster.simGetOpCount("MOVEREPLICA") < 2) {
log.info("MOVEREPLICA < 2");
continue;
}
@ -349,24 +349,25 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
}
if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) {
finishedEvent = d;
log.info("finishedEvent = {}", finishedEvent);
break;
}
}
log.info("breaking because currentNumOps == lastNumOps == {}", currentNumOps);
break;
} else {
lastNumOps = currentNumOps;
}
}
assertTrue("did not finish processing changes", finishedEvent != null);
assertNotNull("did not finish processing changes", finishedEvent);
long delta = (Long)finishedEvent.getFieldValue("event.time_l") - (Long)startedEvent.getFieldValue("event.time_l");
log.info("#### System stabilized after " + TimeUnit.NANOSECONDS.toMillis(delta) + " ms");
assertTrue("unexpected number of MOVEREPLICA ops", cluster.simGetOpCount("MOVEREPLICA") > 1);
assertTrue("unexpected number of MOVEREPLICA ops: " + cluster.simGetOpCount("MOVEREPLICA"),
cluster.simGetOpCount("MOVEREPLICA") > 1);
}
@Test
// commented 4-Sep-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2018-06-18
// commented out on: 17-Feb-2019 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018
public void testNodeLost() throws Exception {
doTestNodeLost(waitForSeconds, 5000, 0);
}
@ -446,7 +447,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
private long doTestNodeLost(int waitFor, long killDelay, int minIgnored) throws Exception {
SolrClient solrClient = cluster.simGetSolrClient();
String setTriggerCommand = "{" +
assertAutoScalingRequest
( "{" +
"'set-trigger' : {" +
"'name' : 'node_lost_trigger3'," +
"'event' : 'nodeLost'," +
@ -458,12 +460,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
"}}");
String setListenerCommand = "{" +
assertAutoScalingRequest
( "{" +
"'set-listener' : " +
"{" +
"'name' : 'failures'," +
@ -471,10 +471,7 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"'stage' : ['FAILED']," +
"'class' : '" + TestTriggerListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
"}");
assertAutoscalingUpdateComplete();
@ -499,10 +496,9 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
cluster.getTimeSource().sleep(killDelay);
}
// wait for the trigger to fire and complete at least once
boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("trigger did not fire within timeout, " +
"waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
await);
assertTrue("Trigger did not finish even after await()ing an excessive amount of time",
triggerFinishedLatch.await(60, TimeUnit.SECONDS));
List<SolrInputDocument> systemColl = cluster.simGetSystemCollection();
int startedEventPos = -1;
@ -646,7 +642,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
// now define the trigger. doing it earlier may cause partial events to be generated (where only some
// nodes / replicas exceeded the threshold).
String setTriggerCommand = "{" +
assertAutoScalingRequest
( "{" +
"'set-trigger' : {" +
"'name' : 'search_rate_trigger'," +
"'event' : 'searchRate'," +
@ -659,11 +656,13 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
"]" +
"}}";
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
NamedList<Object> response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
String setListenerCommand1 = "{" +
"}}");
// we're going to expect our trigger listener to process exactly one captured event
listenerEventLatch = new CountDownLatch(1);
assertAutoScalingRequest
( "{" +
"'set-listener' : " +
"{" +
"'name' : 'srt'," +
@ -671,23 +670,20 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
"'stage' : ['FAILED','SUCCEEDED']," +
"'class' : '" + TestTriggerListener.class.getName() + "'" +
"}" +
"}";
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand1);
response = solrClient.request(req);
assertEquals(response.get("result").toString(), "success");
"}");
assertAutoscalingUpdateComplete();
boolean await = triggerFinishedLatch.await(waitForSeconds * 45000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("The trigger did not fire at all", await);
// wait for listener to capture the SUCCEEDED stage
cluster.getTimeSource().sleep(25000);
assertTrue("Trigger did not finish even after await()ing an excessive amount of time",
triggerFinishedLatch.await(60, TimeUnit.SECONDS));
assertNotNull(listenerEvents.entrySet().iterator().toString(), listenerEvents.get("srt"));
assertTrue("The listener didn't record the event even after await()ing an excessive amount of time",
listenerEventLatch.await(60, TimeUnit.SECONDS));
List<CapturedEvent> events = listenerEvents.get("srt");
assertNotNull("no srt events: " + listenerEvents.toString(), events);
assertEquals(events.toString(), 1, events.size());
assertTrue(listenerEvents.entrySet().iterator().toString(), listenerEvents.get("srt").size() >= 1);
CapturedEvent ev = listenerEvents.get("srt").get(0);
CapturedEvent ev = events.get(0);
assertEquals(TriggerEventType.SEARCHRATE, ev.event.getEventType());
Map<String, Number> m = (Map<String, Number>)ev.event.getProperty(SearchRateTrigger.HOT_NODES);
assertNotNull(m);