mirror of https://github.com/apache/lucene.git
SOLR-12923: harden TestSimLargeCluster
- added logging - ensure start/finish trigger action counters are incremented before latches are released - replace arbitrary sleep calls with a trigger listener countdown latch - increase all await() times: This means that 'real' failures (which should be rare and hopefully reproducible) will be 'slow', but the trade off will be less hard to reproduce 'false failures' due to thread contention on slow or heavily loaded (ie: jenkins) machines
This commit is contained in:
parent
683aa3d3e9
commit
20de3d2ee0
|
@ -32,14 +32,10 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||
|
||||
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
|
||||
import org.apache.solr.client.solrj.SolrClient;
|
||||
import org.apache.solr.client.solrj.SolrRequest;
|
||||
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
|
||||
import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig;
|
||||
import org.apache.solr.client.solrj.cloud.autoscaling.Suggester;
|
||||
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventProcessorStage;
|
||||
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
|
||||
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
|
||||
import org.apache.solr.cloud.CloudTestUtils.AutoScalingRequest;
|
||||
import org.apache.solr.cloud.CloudTestUtils;
|
||||
import org.apache.solr.cloud.autoscaling.ActionContext;
|
||||
import org.apache.solr.cloud.autoscaling.CapturedEvent;
|
||||
|
@ -49,14 +45,11 @@ import org.apache.solr.cloud.autoscaling.SearchRateTrigger;
|
|||
import org.apache.solr.cloud.autoscaling.TriggerActionBase;
|
||||
import org.apache.solr.cloud.autoscaling.TriggerEvent;
|
||||
import org.apache.solr.cloud.autoscaling.TriggerListenerBase;
|
||||
import org.apache.solr.cloud.autoscaling.TriggerValidationException;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.cloud.Replica;
|
||||
import org.apache.solr.common.params.CollectionParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.Pair;
|
||||
import org.apache.solr.common.util.TimeSource;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.solr.util.LogLevel;
|
||||
import org.apache.solr.util.TimeOut;
|
||||
import org.junit.After;
|
||||
|
@ -76,12 +69,13 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
|
||||
public static final int NUM_NODES = 100;
|
||||
|
||||
static Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
|
||||
static AtomicInteger triggerFinishedCount = new AtomicInteger();
|
||||
static AtomicInteger triggerStartedCount = new AtomicInteger();
|
||||
static CountDownLatch triggerStartedLatch;
|
||||
static CountDownLatch triggerFinishedLatch;
|
||||
static int waitForSeconds;
|
||||
static final Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
|
||||
static final AtomicInteger triggerFinishedCount = new AtomicInteger();
|
||||
static final AtomicInteger triggerStartedCount = new AtomicInteger();
|
||||
static volatile CountDownLatch triggerStartedLatch;
|
||||
static volatile CountDownLatch triggerFinishedLatch;
|
||||
static volatile CountDownLatch listenerEventLatch;
|
||||
static volatile int waitForSeconds;
|
||||
|
||||
@After
|
||||
public void tearDownTest() throws Exception {
|
||||
|
@ -105,20 +99,28 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
triggerFinishedCount.set(0);
|
||||
triggerStartedLatch = new CountDownLatch(1);
|
||||
triggerFinishedLatch = new CountDownLatch(1);
|
||||
|
||||
// by default assume we want to allow a (virtually) unbounded amount of events,
|
||||
// tests that expect a specific number can override
|
||||
listenerEventLatch = new CountDownLatch(Integer.MAX_VALUE);
|
||||
listenerEvents.clear();
|
||||
}
|
||||
|
||||
public static class TestTriggerListener extends TriggerListenerBase {
|
||||
@Override
|
||||
public void configure(SolrResourceLoader loader, SolrCloudManager cloudManager, AutoScalingConfig.TriggerListenerConfig config) throws TriggerValidationException {
|
||||
super.configure(loader, cloudManager, config);
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void onEvent(TriggerEvent event, TriggerEventProcessorStage stage, String actionName,
|
||||
ActionContext context, Throwable error, String message) {
|
||||
List<CapturedEvent> lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>());
|
||||
lst.add(new CapturedEvent(cluster.getTimeSource().getTimeNs(), context, config, stage, actionName, event, message));
|
||||
CapturedEvent ev = new CapturedEvent(cluster.getTimeSource().getTimeNs(), context, config, stage, actionName, event, message);
|
||||
final CountDownLatch latch = listenerEventLatch;
|
||||
synchronized (latch) {
|
||||
if (0 == latch.getCount()) {
|
||||
log.warn("Ignoring captured event since latch is 'full': {}", ev);
|
||||
} else {
|
||||
List<CapturedEvent> lst = listenerEvents.computeIfAbsent(config.name, s -> new ArrayList<>());
|
||||
lst.add(ev);
|
||||
latch.countDown();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -133,8 +135,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
public static class StartTriggerAction extends TriggerActionBase {
|
||||
@Override
|
||||
public void process(TriggerEvent event, ActionContext context) throws Exception {
|
||||
triggerStartedLatch.countDown();
|
||||
triggerStartedCount.incrementAndGet();
|
||||
triggerStartedLatch.countDown();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -142,7 +144,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test hits a timeout easily
|
||||
public void testBasic() throws Exception {
|
||||
SolrClient solrClient = cluster.simGetSolrClient();
|
||||
String setTriggerCommand = "{" +
|
||||
assertAutoScalingRequest
|
||||
( "{" +
|
||||
"'set-trigger' : {" +
|
||||
"'name' : 'node_lost_trigger1'," +
|
||||
"'event' : 'nodeLost'," +
|
||||
|
@ -154,12 +157,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
|
||||
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
|
||||
"]" +
|
||||
"}}";
|
||||
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
|
||||
NamedList<Object> response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
"}}");
|
||||
|
||||
String setListenerCommand = "{" +
|
||||
assertAutoScalingRequest
|
||||
( "{" +
|
||||
"'set-listener' : " +
|
||||
"{" +
|
||||
"'name' : 'foo'," +
|
||||
|
@ -169,15 +170,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
"'afterAction' : ['compute', 'execute']," +
|
||||
"'class' : '" + TestTriggerListener.class.getName() + "'" +
|
||||
"}" +
|
||||
"}";
|
||||
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
|
||||
response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
"}");
|
||||
|
||||
assertAutoscalingUpdateComplete();
|
||||
|
||||
cluster.getTimeSource().sleep(5000);
|
||||
|
||||
// pick a few random nodes
|
||||
List<String> nodes = new ArrayList<>();
|
||||
int limit = 75;
|
||||
|
@ -240,7 +236,6 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
fail("did not finish processing all events in time: started=" + triggerStartedCount.get() + ", finished=" + triggerFinishedCount.get());
|
||||
}
|
||||
|
||||
|
||||
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 30 * nodes.size(), TimeUnit.SECONDS,
|
||||
CloudTestUtils.clusterShape(5, 15, false, true)) + "ms");
|
||||
long newMoveReplicaOps = cluster.simGetOpCount(CollectionParams.CollectionAction.MOVEREPLICA.name());
|
||||
|
@ -254,7 +249,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
@Test
|
||||
public void testAddNode() throws Exception {
|
||||
SolrClient solrClient = cluster.simGetSolrClient();
|
||||
String setTriggerCommand = "{" +
|
||||
assertAutoScalingRequest
|
||||
( "{" +
|
||||
"'set-trigger' : {" +
|
||||
"'name' : 'node_added_trigger2'," +
|
||||
"'event' : 'nodeAdded'," +
|
||||
|
@ -266,10 +262,7 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
|
||||
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
|
||||
"]" +
|
||||
"}}";
|
||||
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
|
||||
NamedList<Object> response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
"}}");
|
||||
|
||||
assertAutoscalingUpdateComplete();
|
||||
|
||||
|
@ -289,16 +282,19 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
List<String> addNodesList = new ArrayList<>(numAddNode);
|
||||
for (int i = 0; i < numAddNode; i++) {
|
||||
addNodesList.add(cluster.simAddNode());
|
||||
cluster.getTimeSource().sleep(5000);
|
||||
}
|
||||
// wait until at least one event is generated
|
||||
boolean await = triggerStartedLatch.await(20000 / SPEED, TimeUnit.MILLISECONDS);
|
||||
assertTrue("trigger did not fire", await);
|
||||
assertTrue("Trigger did not start even after await()ing an excessive amount of time",
|
||||
triggerStartedLatch.await(60, TimeUnit.SECONDS));
|
||||
|
||||
// wait until started == finished
|
||||
TimeOut timeOut = new TimeOut(45 * waitForSeconds * NUM_NODES, TimeUnit.SECONDS, cluster.getTimeSource());
|
||||
while (!timeOut.hasTimedOut()) {
|
||||
final int started = triggerStartedCount.get();
|
||||
final int finished = triggerFinishedCount.get();
|
||||
log.info("started={} =?= finished={}", started, finished);
|
||||
if (triggerStartedCount.get() == triggerFinishedCount.get()) {
|
||||
log.info("started == finished: {} == {}", started, finished);
|
||||
break;
|
||||
}
|
||||
timeOut.sleep(1000);
|
||||
|
@ -324,18 +320,22 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
SolrInputDocument startedEvent = systemColl.get(startedEventPos);
|
||||
int lastIgnoredPos = startedEventPos;
|
||||
// make sure some replicas have been moved
|
||||
assertTrue("no MOVEREPLICA ops?", cluster.simGetOpCount("MOVEREPLICA") > 0);
|
||||
long lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
|
||||
log.info("1st check: lastNumOps (MOVEREPLICA) = {}", lastNumOps);
|
||||
assertTrue("no MOVEREPLICA ops?", lastNumOps > 0);
|
||||
|
||||
log.info("Ready after " + CloudTestUtils.waitForState(cluster, collectionName, 20 * NUM_NODES, TimeUnit.SECONDS,
|
||||
CloudTestUtils.clusterShape(NUM_NODES / 10, NUM_NODES / 8 * 3, false, true)) + " ms");
|
||||
|
||||
int count = 1000;
|
||||
SolrInputDocument finishedEvent = null;
|
||||
long lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
|
||||
lastNumOps = cluster.simGetOpCount("MOVEREPLICA");
|
||||
log.info("2nd check: lastNumOps (MOVEREPLICA) = {}", lastNumOps);
|
||||
while (count-- > 0) {
|
||||
cluster.getTimeSource().sleep(10000);
|
||||
|
||||
if (cluster.simGetOpCount("MOVEREPLICA") < 2) {
|
||||
log.info("MOVEREPLICA < 2");
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -349,24 +349,25 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
}
|
||||
if ("SUCCEEDED".equals(d.getFieldValue("stage_s"))) {
|
||||
finishedEvent = d;
|
||||
log.info("finishedEvent = {}", finishedEvent);
|
||||
break;
|
||||
}
|
||||
}
|
||||
log.info("breaking because currentNumOps == lastNumOps == {}", currentNumOps);
|
||||
break;
|
||||
} else {
|
||||
lastNumOps = currentNumOps;
|
||||
}
|
||||
}
|
||||
|
||||
assertTrue("did not finish processing changes", finishedEvent != null);
|
||||
assertNotNull("did not finish processing changes", finishedEvent);
|
||||
long delta = (Long)finishedEvent.getFieldValue("event.time_l") - (Long)startedEvent.getFieldValue("event.time_l");
|
||||
log.info("#### System stabilized after " + TimeUnit.NANOSECONDS.toMillis(delta) + " ms");
|
||||
assertTrue("unexpected number of MOVEREPLICA ops", cluster.simGetOpCount("MOVEREPLICA") > 1);
|
||||
assertTrue("unexpected number of MOVEREPLICA ops: " + cluster.simGetOpCount("MOVEREPLICA"),
|
||||
cluster.simGetOpCount("MOVEREPLICA") > 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
// commented 4-Sep-2018 @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2018-06-18
|
||||
// commented out on: 17-Feb-2019 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018
|
||||
public void testNodeLost() throws Exception {
|
||||
doTestNodeLost(waitForSeconds, 5000, 0);
|
||||
}
|
||||
|
@ -446,7 +447,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
|
||||
private long doTestNodeLost(int waitFor, long killDelay, int minIgnored) throws Exception {
|
||||
SolrClient solrClient = cluster.simGetSolrClient();
|
||||
String setTriggerCommand = "{" +
|
||||
assertAutoScalingRequest
|
||||
( "{" +
|
||||
"'set-trigger' : {" +
|
||||
"'name' : 'node_lost_trigger3'," +
|
||||
"'event' : 'nodeLost'," +
|
||||
|
@ -458,12 +460,10 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
|
||||
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
|
||||
"]" +
|
||||
"}}";
|
||||
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
|
||||
NamedList<Object> response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
"}}");
|
||||
|
||||
String setListenerCommand = "{" +
|
||||
assertAutoScalingRequest
|
||||
( "{" +
|
||||
"'set-listener' : " +
|
||||
"{" +
|
||||
"'name' : 'failures'," +
|
||||
|
@ -471,10 +471,7 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
"'stage' : ['FAILED']," +
|
||||
"'class' : '" + TestTriggerListener.class.getName() + "'" +
|
||||
"}" +
|
||||
"}";
|
||||
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand);
|
||||
response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
"}");
|
||||
|
||||
assertAutoscalingUpdateComplete();
|
||||
|
||||
|
@ -499,10 +496,9 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
cluster.getTimeSource().sleep(killDelay);
|
||||
}
|
||||
// wait for the trigger to fire and complete at least once
|
||||
boolean await = triggerFinishedLatch.await(20 * waitFor * 1000 / SPEED, TimeUnit.MILLISECONDS);
|
||||
assertTrue("trigger did not fire within timeout, " +
|
||||
"waitFor=" + waitFor + ", killDelay=" + killDelay + ", minIgnored=" + minIgnored,
|
||||
await);
|
||||
assertTrue("Trigger did not finish even after await()ing an excessive amount of time",
|
||||
triggerFinishedLatch.await(60, TimeUnit.SECONDS));
|
||||
|
||||
List<SolrInputDocument> systemColl = cluster.simGetSystemCollection();
|
||||
int startedEventPos = -1;
|
||||
|
||||
|
@ -646,7 +642,8 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
|
||||
// now define the trigger. doing it earlier may cause partial events to be generated (where only some
|
||||
// nodes / replicas exceeded the threshold).
|
||||
String setTriggerCommand = "{" +
|
||||
assertAutoScalingRequest
|
||||
( "{" +
|
||||
"'set-trigger' : {" +
|
||||
"'name' : 'search_rate_trigger'," +
|
||||
"'event' : 'searchRate'," +
|
||||
|
@ -659,11 +656,13 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
"{'name':'execute','class':'" + ExecutePlanAction.class.getName() + "'}," +
|
||||
"{'name':'test','class':'" + FinishTriggerAction.class.getName() + "'}" +
|
||||
"]" +
|
||||
"}}";
|
||||
SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
|
||||
NamedList<Object> response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
String setListenerCommand1 = "{" +
|
||||
"}}");
|
||||
|
||||
|
||||
// we're going to expect our trigger listener to process exactly one captured event
|
||||
listenerEventLatch = new CountDownLatch(1);
|
||||
assertAutoScalingRequest
|
||||
( "{" +
|
||||
"'set-listener' : " +
|
||||
"{" +
|
||||
"'name' : 'srt'," +
|
||||
|
@ -671,23 +670,20 @@ public class TestSimLargeCluster extends SimSolrCloudTestCase {
|
|||
"'stage' : ['FAILED','SUCCEEDED']," +
|
||||
"'class' : '" + TestTriggerListener.class.getName() + "'" +
|
||||
"}" +
|
||||
"}";
|
||||
req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setListenerCommand1);
|
||||
response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
"}");
|
||||
|
||||
assertAutoscalingUpdateComplete();
|
||||
|
||||
boolean await = triggerFinishedLatch.await(waitForSeconds * 45000 / SPEED, TimeUnit.MILLISECONDS);
|
||||
assertTrue("The trigger did not fire at all", await);
|
||||
// wait for listener to capture the SUCCEEDED stage
|
||||
cluster.getTimeSource().sleep(25000);
|
||||
assertTrue("Trigger did not finish even after await()ing an excessive amount of time",
|
||||
triggerFinishedLatch.await(60, TimeUnit.SECONDS));
|
||||
|
||||
assertNotNull(listenerEvents.entrySet().iterator().toString(), listenerEvents.get("srt"));
|
||||
assertTrue("The listener didn't record the event even after await()ing an excessive amount of time",
|
||||
listenerEventLatch.await(60, TimeUnit.SECONDS));
|
||||
List<CapturedEvent> events = listenerEvents.get("srt");
|
||||
assertNotNull("no srt events: " + listenerEvents.toString(), events);
|
||||
assertEquals(events.toString(), 1, events.size());
|
||||
|
||||
assertTrue(listenerEvents.entrySet().iterator().toString(), listenerEvents.get("srt").size() >= 1);
|
||||
|
||||
CapturedEvent ev = listenerEvents.get("srt").get(0);
|
||||
CapturedEvent ev = events.get(0);
|
||||
assertEquals(TriggerEventType.SEARCHRATE, ev.event.getEventType());
|
||||
Map<String, Number> m = (Map<String, Number>)ev.event.getProperty(SearchRateTrigger.HOT_NODES);
|
||||
assertNotNull(m);
|
||||
|
|
Loading…
Reference in New Issue