SOLR-13072: Wait for autoscaling config refresh to finish before modifying the cluster

and enable the tests for now.
This commit is contained in:
Andrzej Bialecki 2019-01-08 16:16:55 +01:00
parent 825e63c00b
commit a37e2c609c
3 changed files with 100 additions and 28 deletions

View File

@ -74,6 +74,8 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
private Map<String, AutoScaling.Trigger> activeTriggers = new HashMap<>(); private Map<String, AutoScaling.Trigger> activeTriggers = new HashMap<>();
private volatile int processedZnodeVersion = -1;
private volatile boolean isClosed = false; private volatile boolean isClosed = false;
private AutoScalingConfig autoScalingConfig; private AutoScalingConfig autoScalingConfig;
@ -109,6 +111,16 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
return scheduledTriggers; return scheduledTriggers;
} }
/**
 * Test hook: reports how far this thread has progressed in handling updates of /autoscaling.json.
 * @lucene.internal
 * @return the /autoscaling.json version whose configuration updates &amp; processing have all been
 * completed. While an update is still being processed this value stays smaller than the current
 * znodeVersion of /autoscaling.json.
 */
public int getProcessedZnodeVersion() {
  // single read of the volatile field
  final int lastProcessed = processedZnodeVersion;
  return lastProcessed;
}
@Override @Override
public boolean isClosed() { public boolean isClosed() {
return isClosed; return isClosed;
@ -248,6 +260,7 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
log.debug("-- cleaning old nodeLost / nodeAdded markers"); log.debug("-- cleaning old nodeLost / nodeAdded markers");
removeMarkers(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH); removeMarkers(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH);
removeMarkers(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH); removeMarkers(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH);
processedZnodeVersion = znodeVersion;
} }
} }

View File

@ -372,6 +372,10 @@ public class SimClusterStateProvider implements ClusterStateProvider {
} }
} }
/**
 * Returns the value currently held in {@code overseerLeader}.
 * The read is performed while holding this instance's monitor.
 */
public synchronized String simGetOverseerLeader() {
  final String currentLeader = overseerLeader;
  return currentLeader;
}
// this method needs to be called under a lock // this method needs to be called under a lock
private void setReplicaStates(String nodeId, Replica.State state, Set<String> changedCollections) { private void setReplicaStates(String nodeId, Replica.State state, Set<String> changedCollections) {
List<ReplicaInfo> replicas = nodeReplicaMap.get(nodeId); List<ReplicaInfo> replicas = nodeReplicaMap.get(nodeId);

View File

@ -35,13 +35,11 @@ import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.ReentrantLock;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.cloud.SolrCloudManager; import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig; import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig;
import org.apache.solr.client.solrj.cloud.autoscaling.ReplicaInfo; import org.apache.solr.client.solrj.cloud.autoscaling.ReplicaInfo;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventProcessorStage; import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventProcessorStage;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.cloud.CloudTestUtils; import org.apache.solr.cloud.CloudTestUtils;
import org.apache.solr.cloud.autoscaling.ActionContext; import org.apache.solr.cloud.autoscaling.ActionContext;
@ -147,7 +145,7 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
} }
@Test @Test
@LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2018-06-18 //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072")
public void testTriggerThrottling() throws Exception { public void testTriggerThrottling() throws Exception {
// for this test we want to create two triggers so we must assert that the actions were created twice // for this test we want to create two triggers so we must assert that the actions were created twice
actionInitCalled = new CountDownLatch(2); actionInitCalled = new CountDownLatch(2);
@ -177,6 +175,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + ThrottlingTesterAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + ThrottlingTesterAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
// wait until the two instances of action are created // wait until the two instances of action are created
if (!actionInitCalled.await(10000 / SPEED, TimeUnit.MILLISECONDS)) { if (!actionInitCalled.await(10000 / SPEED, TimeUnit.MILLISECONDS)) {
fail("Two TriggerAction instances should have been created by now"); fail("Two TriggerAction instances should have been created by now");
@ -213,6 +213,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + ThrottlingTesterAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + ThrottlingTesterAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
// wait until the two instances of action are created // wait until the two instances of action are created
if (!actionInitCalled.await(3000 / SPEED, TimeUnit.MILLISECONDS)) { if (!actionInitCalled.await(3000 / SPEED, TimeUnit.MILLISECONDS)) {
fail("Two TriggerAction instances should have been created by now"); fail("Two TriggerAction instances should have been created by now");
@ -286,6 +288,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time", assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
actionInitCalled.await(60, TimeUnit.SECONDS)); actionInitCalled.await(60, TimeUnit.SECONDS));
assertAutoscalingUpdateComplete();
// start a new node that we can kill later // start a new node that we can kill later
final String nodeName = cluster.simAddNode(); final String nodeName = cluster.simAddNode();
@ -372,6 +376,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time", assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
actionInitCalled.await(60, TimeUnit.SECONDS)); actionInitCalled.await(60, TimeUnit.SECONDS));
@ -419,6 +425,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time", assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
actionInitCalled.await(60, TimeUnit.SECONDS)); actionInitCalled.await(60, TimeUnit.SECONDS));
@ -433,7 +441,7 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
} }
@Test @Test
@LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2018-06-18 //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072")
public void testNodeAddedTrigger() throws Exception { public void testNodeAddedTrigger() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
assertAutoScalingRequest assertAutoScalingRequest
@ -446,6 +454,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
if (!actionInitCalled.await(5000 / SPEED, TimeUnit.MILLISECONDS)) { if (!actionInitCalled.await(5000 / SPEED, TimeUnit.MILLISECONDS)) {
fail("The TriggerAction should have been created by now"); fail("The TriggerAction should have been created by now");
} }
@ -474,6 +484,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
// this should be a no-op so the action should have been created but init should not be called // this should be a no-op so the action should have been created but init should not be called
if (!actionConstructorCalled.await(3000 / SPEED, TimeUnit.MILLISECONDS)) { if (!actionConstructorCalled.await(3000 / SPEED, TimeUnit.MILLISECONDS)) {
fail("The TriggerAction should have been created by now"); fail("The TriggerAction should have been created by now");
@ -483,8 +495,7 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
} }
@Test @Test
// commented 4-Sep-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 26-Mar-2018 //@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072")
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
public void testNodeLostTrigger() throws Exception { public void testNodeLostTrigger() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
assertAutoScalingRequest assertAutoScalingRequest
@ -497,6 +508,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
if (!actionInitCalled.await(5000 / SPEED, TimeUnit.MILLISECONDS)) { if (!actionInitCalled.await(5000 / SPEED, TimeUnit.MILLISECONDS)) {
fail("The TriggerAction should have been created by now"); fail("The TriggerAction should have been created by now");
} }
@ -526,6 +539,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
// this should be a no-op so the action should have been created but init should not be called // this should be a no-op so the action should have been created but init should not be called
if (!actionConstructorCalled.await(3000 / SPEED, TimeUnit.MILLISECONDS)) { if (!actionConstructorCalled.await(3000 / SPEED, TimeUnit.MILLISECONDS)) {
fail("The TriggerAction should have been created by now"); fail("The TriggerAction should have been created by now");
@ -654,7 +669,7 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
public static long eventQueueActionWait = 5000; public static long eventQueueActionWait = 5000;
@Test @Test
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test fails easily //@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072") // this test fails easily
public void testEventQueue() throws Exception { public void testEventQueue() throws Exception {
waitForSeconds = 1; waitForSeconds = 1;
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
@ -670,6 +685,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + TestEventQueueAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + TestEventQueueAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
if (!actionInitCalled.await(3000 / SPEED, TimeUnit.MILLISECONDS)) { if (!actionInitCalled.await(3000 / SPEED, TimeUnit.MILLISECONDS)) {
fail("The TriggerAction should have been created by now"); fail("The TriggerAction should have been created by now");
} }
@ -692,6 +709,9 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
// kill overseer // kill overseer
cluster.simRestartOverseer(overseerLeader); cluster.simRestartOverseer(overseerLeader);
cluster.getTimeSource().sleep(5000); cluster.getTimeSource().sleep(5000);
assertAutoscalingUpdateComplete();
// new overseer leader should be elected and run triggers // new overseer leader should be elected and run triggers
await = actionInterrupted.await(3000 / SPEED, TimeUnit.MILLISECONDS); await = actionInterrupted.await(3000 / SPEED, TimeUnit.MILLISECONDS);
assertTrue("action wasn't interrupted", await); assertTrue("action wasn't interrupted", await);
@ -707,8 +727,7 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
} }
@Test @Test
// commented 4-Sep-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") //2018-03-10 // @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072")
@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018
public void testEventFromRestoredState() throws Exception { public void testEventFromRestoredState() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
assertAutoScalingRequest assertAutoScalingRequest
@ -721,6 +740,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + TestTriggerAction.class.getName() + "'}]" +
"}}"); "}}");
assertAutoscalingUpdateComplete();
if (!actionInitCalled.await(10000 / SPEED, TimeUnit.MILLISECONDS)) { if (!actionInitCalled.await(10000 / SPEED, TimeUnit.MILLISECONDS)) {
fail("The TriggerAction should have been created by now"); fail("The TriggerAction should have been created by now");
} }
@ -814,7 +835,7 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
} }
@Test @Test
@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072")
public void testNodeMarkersRegistration() throws Exception { public void testNodeMarkersRegistration() throws Exception {
// for this test we want to create two triggers so we must assert that the actions were created twice // for this test we want to create two triggers so we must assert that the actions were created twice
actionInitCalled = new CountDownLatch(2); actionInitCalled = new CountDownLatch(2);
@ -824,8 +845,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
// pick overseer node // get overseer node
String overseerLeader = cluster.getSimClusterStateProvider().simGetRandomNode(); String overseerLeader = cluster.getSimClusterStateProvider().simGetOverseerLeader();
// add a node // add a node
String node = cluster.simAddNode(); String node = cluster.simAddNode();
@ -842,6 +863,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
// stop overseer // stop overseer
log.info("====== KILL OVERSEER 1"); log.info("====== KILL OVERSEER 1");
cluster.simRestartOverseer(overseerLeader); cluster.simRestartOverseer(overseerLeader);
assertAutoscalingUpdateComplete();
if (!listener.onChangeLatch.await(10000, TimeUnit.MILLISECONDS)) { if (!listener.onChangeLatch.await(10000, TimeUnit.MILLISECONDS)) {
fail("onChange listener didn't execute on cluster change"); fail("onChange listener didn't execute on cluster change");
} }
@ -892,7 +915,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"'actions' : [{'name':'test','class':'" + TestEventMarkerAction.class.getName() + "'}]" + "'actions' : [{'name':'test','class':'" + TestEventMarkerAction.class.getName() + "'}]" +
"}}"); "}}");
overseerLeader = cluster.getSimClusterStateProvider().simGetRandomNode(); assertAutoscalingUpdateComplete();
overseerLeader = cluster.getSimClusterStateProvider().simGetOverseerLeader();
// create another node // create another node
log.info("====== ADD NODE 1"); log.info("====== ADD NODE 1");
@ -906,14 +930,10 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
pathAdded = ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH + "/" + node1; pathAdded = ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH + "/" + node1;
assertTrue("Path " + pathAdded + " wasn't created", cluster.getDistribStateManager().hasData(pathAdded)); assertTrue("Path " + pathAdded + " wasn't created", cluster.getDistribStateManager().hasData(pathAdded));
cluster.getTimeSource().sleep(60000);
// nodeAdded marker should be consumed now by nodeAdded trigger
assertFalse("Path " + pathAdded + " should have been deleted",
cluster.getDistribStateManager().hasData(pathAdded));
listener.reset(); listener.reset();
events.clear(); events.clear();
triggerFiredLatch = new CountDownLatch(1); // one nodeAdded (not cleared yet) and one nodeLost
triggerFiredLatch = new CountDownLatch(2);
// kill overseer again // kill overseer again
log.info("====== KILL OVERSEER 2"); log.info("====== KILL OVERSEER 2");
cluster.simRestartOverseer(overseerLeader); cluster.simRestartOverseer(overseerLeader);
@ -921,15 +941,32 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
fail("onChange listener didn't execute on cluster change"); fail("onChange listener didn't execute on cluster change");
} }
assertAutoscalingUpdateComplete();
if (!triggerFiredLatch.await(120000 / SPEED, TimeUnit.MILLISECONDS)) { if (!triggerFiredLatch.await(120000 / SPEED, TimeUnit.MILLISECONDS)) {
fail("Trigger should have fired by now"); fail("Trigger should have fired by now");
} }
assertEquals(1, events.size()); assertEquals(2, events.size());
TriggerEvent ev = events.iterator().next(); TriggerEvent nodeAdded = null;
List<String> nodeNames = (List<String>)ev.getProperty(TriggerEvent.NODE_NAMES); TriggerEvent nodeLost = null;
for (TriggerEvent ev : events) {
switch (ev.getEventType()) {
case NODEADDED:
nodeAdded = ev;
break;
case NODELOST:
nodeLost = ev;
break;
default:
fail("unexpected event type: " + ev);
}
}
assertNotNull("expected nodeAdded event", nodeAdded);
assertNotNull("expected nodeLost event", nodeLost);
List<String> nodeNames = (List<String>)nodeLost.getProperty(TriggerEvent.NODE_NAMES);
assertTrue(nodeNames.contains(overseerLeader)); assertTrue(nodeNames.contains(overseerLeader));
assertEquals(TriggerEventType.NODELOST, ev.getEventType()); nodeNames = (List<String>)nodeAdded.getProperty(TriggerEvent.NODE_NAMES);
assertTrue(nodeNames.contains(node1));
} }
static final Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>(); static final Map<String, List<CapturedEvent>> listenerEvents = new ConcurrentHashMap<>();
@ -1011,6 +1048,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"}" + "}" +
"}"); "}");
assertAutoscalingUpdateComplete();
listenerEvents.clear(); listenerEvents.clear();
failDummyAction = false; failDummyAction = false;
@ -1121,7 +1160,7 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
} }
@Test @Test
@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") //@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072")
public void testCooldown() throws Exception { public void testCooldown() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
failDummyAction = false; failDummyAction = false;
@ -1149,6 +1188,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"}" + "}" +
"}"); "}");
assertAutoscalingUpdateComplete();
listenerCreated = new CountDownLatch(1); listenerCreated = new CountDownLatch(1);
listenerEvents.clear(); listenerEvents.clear();
@ -1226,7 +1267,7 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
@Test @Test
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // this test is way too sensitive to timing, must be beasted before returned //@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072") // this test is way too sensitive to timing, must be beasted before returned
public void testSearchRate() throws Exception { public void testSearchRate() throws Exception {
SolrClient solrClient = cluster.simGetSolrClient(); SolrClient solrClient = cluster.simGetSolrClient();
String COLL1 = "collection1"; String COLL1 = "collection1";
@ -1265,6 +1306,8 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
"}" + "}" +
"}"); "}");
assertAutoscalingUpdateComplete();
// SolrParams query = params(CommonParams.Q, "*:*"); // SolrParams query = params(CommonParams.Q, "*:*");
// for (int i = 0; i < 500; i++) { // for (int i = 0; i < 500; i++) {
// solrClient.query(COLL1, query); // solrClient.query(COLL1, query);
@ -1381,4 +1424,16 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
return event; return event;
} }
/**
 * Waits until the znode version reported by the overseer trigger thread as processed equals the
 * ZK version of the autoscaling config currently returned by the cluster's state manager.
 * The wait is bounded by a 30 second {@code TimeOut} driven by the cluster's time source.
 * NOTE(review): on expiry {@code waitFor} presumably fails with the supplied message -- confirm
 * against the {@code TimeOut} implementation.
 *
 * @throws Exception whatever {@code waitFor} propagates; any exception raised while evaluating the
 *         condition is rethrown wrapped in a {@link RuntimeException}
 */
private static void assertAutoscalingUpdateComplete() throws Exception {
  final TimeOut deadline = new TimeOut(30, TimeUnit.SECONDS, cluster.getTimeSource());
  deadline.waitFor("OverseerTriggerThread never caught up to the latest znodeVersion", () -> {
    try {
      // re-read the config on every poll so the comparison always uses the latest version
      final AutoScalingConfig currentConfig = cluster.getDistribStateManager().getAutoScalingConfig();
      return currentConfig.getZkVersion()
          == cluster.getOverseerTriggerThread().getProcessedZnodeVersion();
    } catch (Exception failure) {
      // the condition lambda cannot throw checked exceptions, so tunnel them out unchecked
      throw new RuntimeException("FAILED", failure);
    }
  });
}
} }