mirror of https://github.com/apache/lucene.git
SOLR-13410: Designated overseer wasn't able to rejoin election queue upon restart
This commit is contained in:
parent
cdd130ccb6
commit
1882a17115
|
@ -249,6 +249,9 @@ Bug Fixes
|
||||||
* SOLR-12291: prematurely reporting not yet finished async Collections API call as completed
|
* SOLR-12291: prematurely reporting not yet finished async Collections API call as completed
|
||||||
when collection's replicas are collocated at least at one node (Varun Thacker, Mikhail Khludnev)
|
when collection's replicas are collocated at least at one node (Varun Thacker, Mikhail Khludnev)
|
||||||
|
|
||||||
|
* SOLR-13410: Designated overseer wasn't able to rejoin election queue upon restart (Ishan Chattopadhyaya,
|
||||||
|
Kesharee Nandan Vishwakarma)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -2138,13 +2138,9 @@ public class ZkController implements Closeable {
|
||||||
public void rejoinOverseerElection(String electionNode, boolean joinAtHead) {
|
public void rejoinOverseerElection(String electionNode, boolean joinAtHead) {
|
||||||
try {
|
try {
|
||||||
if (electionNode != null) {
|
if (electionNode != null) {
|
||||||
//this call is from inside the JVM . not from CoreAdminHandler
|
// Check whether we came to this node by mistake
|
||||||
if (overseerElector.getContext() == null || overseerElector.getContext().leaderSeqPath == null) {
|
if ( overseerElector.getContext() != null && overseerElector.getContext().leaderSeqPath == null
|
||||||
overseerElector.retryElection(new OverseerElectionContext(zkClient,
|
&& !overseerElector.getContext().leaderSeqPath.endsWith(electionNode)) {
|
||||||
overseer, getNodeName()), joinAtHead);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!overseerElector.getContext().leaderSeqPath.endsWith(electionNode)) {
|
|
||||||
log.warn("Asked to rejoin with wrong election node : {}, current node is {}", electionNode, overseerElector.getContext().leaderSeqPath);
|
log.warn("Asked to rejoin with wrong election node : {}, current node is {}", electionNode, overseerElector.getContext().leaderSeqPath);
|
||||||
//however delete it . This is possible when the last attempt at deleting the election node failed.
|
//however delete it . This is possible when the last attempt at deleting the election node failed.
|
||||||
if (electionNode.startsWith(getNodeName())) {
|
if (electionNode.startsWith(getNodeName())) {
|
||||||
|
@ -2158,6 +2154,10 @@ public class ZkController implements Closeable {
|
||||||
log.warn("Old election node exists , could not be removed ", e);
|
log.warn("Old election node exists , could not be removed ", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else { // We're in the right place, now attempt to rejoin
|
||||||
|
overseerElector.retryElection(new OverseerElectionContext(zkClient,
|
||||||
|
overseer, getNodeName()), joinAtHead);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
overseerElector.retryElection(overseerElector.getContext(), joinAtHead);
|
overseerElector.retryElection(overseerElector.getContext(), joinAtHead);
|
||||||
|
|
|
@ -46,25 +46,33 @@ public class OverseerRolesTest extends SolrCloudTestCase {
|
||||||
|
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void setupCluster() throws Exception {
|
public static void setupCluster() throws Exception {
|
||||||
configureCluster(4)
|
configureCluster(6)
|
||||||
.addConfig("conf", configset("cloud-minimal"))
|
.addConfig("conf", configset("cloud-minimal"))
|
||||||
.configure();
|
.configure();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void waitForNewOverseer(int seconds, Predicate<String> state) throws Exception {
|
|
||||||
|
private void waitForNewOverseer(int seconds, Predicate<String> state, boolean failOnIntermediateTransition) throws Exception {
|
||||||
TimeOut timeout = new TimeOut(seconds, TimeUnit.SECONDS, TimeSource.NANO_TIME);
|
TimeOut timeout = new TimeOut(seconds, TimeUnit.SECONDS, TimeSource.NANO_TIME);
|
||||||
String current = null;
|
String current = null;
|
||||||
while (timeout.hasTimedOut() == false) {
|
while (timeout.hasTimedOut() == false) {
|
||||||
|
String prev = current;
|
||||||
current = OverseerCollectionConfigSetProcessor.getLeaderNode(zkClient());
|
current = OverseerCollectionConfigSetProcessor.getLeaderNode(zkClient());
|
||||||
if (state.test(current))
|
if (state.test(current))
|
||||||
return;
|
return;
|
||||||
|
else if (failOnIntermediateTransition) {
|
||||||
|
if (prev != null && !current.equals(prev)) {
|
||||||
|
fail ("There was an intermediate transition, previous: "+prev+", intermediate transition: "+current);
|
||||||
|
}
|
||||||
|
}
|
||||||
Thread.sleep(100);
|
Thread.sleep(100);
|
||||||
}
|
}
|
||||||
fail("Timed out waiting for overseer state change");
|
fail("Timed out waiting for overseer state change. The current overseer is: "+current);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void waitForNewOverseer(int seconds, String expected) throws Exception {
|
private void waitForNewOverseer(int seconds, String expected, boolean failOnIntermediateTransition) throws Exception {
|
||||||
waitForNewOverseer(seconds, s -> Objects.equals(s, expected));
|
log.info("Expecting node: "+expected);
|
||||||
|
waitForNewOverseer(seconds, s -> Objects.equals(s, expected), failOnIntermediateTransition);
|
||||||
}
|
}
|
||||||
|
|
||||||
private JettySolrRunner getOverseerJetty() throws Exception {
|
private JettySolrRunner getOverseerJetty() throws Exception {
|
||||||
|
@ -93,6 +101,10 @@ public class OverseerRolesTest extends SolrCloudTestCase {
|
||||||
|
|
||||||
logOverseerState();
|
logOverseerState();
|
||||||
List<String> nodes = OverseerCollectionConfigSetProcessor.getSortedOverseerNodeNames(zkClient());
|
List<String> nodes = OverseerCollectionConfigSetProcessor.getSortedOverseerNodeNames(zkClient());
|
||||||
|
// Remove the OVERSEER role, in case it was already assigned by another test in this suite
|
||||||
|
for (String node: nodes) {
|
||||||
|
CollectionAdminRequest.removeRole(node, "overseer").process(cluster.getSolrClient());
|
||||||
|
}
|
||||||
String overseer1 = OverseerCollectionConfigSetProcessor.getLeaderNode(zkClient());
|
String overseer1 = OverseerCollectionConfigSetProcessor.getLeaderNode(zkClient());
|
||||||
nodes.remove(overseer1);
|
nodes.remove(overseer1);
|
||||||
|
|
||||||
|
@ -102,7 +114,7 @@ public class OverseerRolesTest extends SolrCloudTestCase {
|
||||||
|
|
||||||
CollectionAdminRequest.addRole(overseer2, "overseer").process(cluster.getSolrClient());
|
CollectionAdminRequest.addRole(overseer2, "overseer").process(cluster.getSolrClient());
|
||||||
|
|
||||||
waitForNewOverseer(15, overseer2);
|
waitForNewOverseer(15, overseer2, false);
|
||||||
|
|
||||||
//add another node as overseer
|
//add another node as overseer
|
||||||
nodes.remove(overseer2);
|
nodes.remove(overseer2);
|
||||||
|
@ -117,7 +129,7 @@ public class OverseerRolesTest extends SolrCloudTestCase {
|
||||||
logOverseerState();
|
logOverseerState();
|
||||||
|
|
||||||
leaderJetty.stop();
|
leaderJetty.stop();
|
||||||
waitForNewOverseer(10, overseer3);
|
waitForNewOverseer(10, overseer3, false);
|
||||||
|
|
||||||
// add another node as overseer
|
// add another node as overseer
|
||||||
nodes.remove(overseer3);
|
nodes.remove(overseer3);
|
||||||
|
@ -129,7 +141,7 @@ public class OverseerRolesTest extends SolrCloudTestCase {
|
||||||
|
|
||||||
// remove the overseer role from the current overseer
|
// remove the overseer role from the current overseer
|
||||||
CollectionAdminRequest.removeRole(overseer3, "overseer").process(cluster.getSolrClient());
|
CollectionAdminRequest.removeRole(overseer3, "overseer").process(cluster.getSolrClient());
|
||||||
waitForNewOverseer(15, overseer4);
|
waitForNewOverseer(15, overseer4, false);
|
||||||
|
|
||||||
// Add it back again - we now have two delegates, 4 and 3
|
// Add it back again - we now have two delegates, 4 and 3
|
||||||
CollectionAdminRequest.addRole(overseer3, "overseer").process(cluster.getSolrClient());
|
CollectionAdminRequest.addRole(overseer3, "overseer").process(cluster.getSolrClient());
|
||||||
|
@ -142,7 +154,7 @@ public class OverseerRolesTest extends SolrCloudTestCase {
|
||||||
.offer(Utils.toJSON(new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.QUIT.toLower(),
|
.offer(Utils.toJSON(new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.QUIT.toLower(),
|
||||||
"id", leaderId)));
|
"id", leaderId)));
|
||||||
|
|
||||||
waitForNewOverseer(15, s -> Objects.equals(leader, s) == false);
|
waitForNewOverseer(15, s -> Objects.equals(leader, s) == false, false);
|
||||||
|
|
||||||
Thread.sleep(1000);
|
Thread.sleep(1000);
|
||||||
|
|
||||||
|
@ -150,6 +162,37 @@ public class OverseerRolesTest extends SolrCloudTestCase {
|
||||||
assertTrue("The old leader should have rejoined election",
|
assertTrue("The old leader should have rejoined election",
|
||||||
OverseerCollectionConfigSetProcessor.getSortedOverseerNodeNames(zkClient()).contains(leader));
|
OverseerCollectionConfigSetProcessor.getSortedOverseerNodeNames(zkClient()).contains(leader));
|
||||||
|
|
||||||
|
leaderJetty.start(); // starting this back, just for good measure
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDesignatedOverseerRestarts() throws Exception {
|
||||||
|
logOverseerState();
|
||||||
|
List<String> nodes = OverseerCollectionConfigSetProcessor.getSortedOverseerNodeNames(zkClient());
|
||||||
|
// Remove the OVERSEER role, in case it was already assigned by another test in this suite
|
||||||
|
for (String node: nodes) {
|
||||||
|
CollectionAdminRequest.removeRole(node, "overseer").process(cluster.getSolrClient());
|
||||||
|
}
|
||||||
|
String overseer1 = OverseerCollectionConfigSetProcessor.getLeaderNode(zkClient());
|
||||||
|
nodes.remove(overseer1);
|
||||||
|
|
||||||
|
// Setting overseer role to the current overseer
|
||||||
|
CollectionAdminRequest.addRole(overseer1, "overseer").process(cluster.getSolrClient());
|
||||||
|
waitForNewOverseer(15, overseer1, false);
|
||||||
|
|
||||||
|
// kill the current overseer, and check that the next node in the election queue assumes leadership
|
||||||
|
JettySolrRunner leaderJetty = getOverseerJetty();
|
||||||
|
logOverseerState();
|
||||||
|
leaderJetty.stop();
|
||||||
|
log.info("Killing designated overseer: "+overseer1);
|
||||||
|
waitForNewOverseer(10, nodes.get(0), false);
|
||||||
|
|
||||||
|
// after 5 seconds, bring back dead designated overseer and assert that it assumes leadership "right away",
|
||||||
|
// i.e. without any other node assuming leadership before this node becomes leader.
|
||||||
|
Thread.sleep(5);
|
||||||
|
logOverseerState();
|
||||||
|
log.info("Starting back the prioritized overseer..");
|
||||||
|
leaderJetty.start();
|
||||||
|
waitForNewOverseer(1500000, overseer1, true); // assert that there is just a single leadership transition
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue