mirror of https://github.com/apache/lucene.git
SOLR-6530: Commits under network partitions can put any node in down state
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1628945 13f79535-47bb-0310-9956-ffa450edef68
commit 14a98ba5d5
parent 88f7e7c687
CHANGES.txt
@@ -200,6 +200,9 @@ Bug Fixes
 
 * SOLR-6511: Fencepost error in LeaderInitiatedRecoveryThread (Timothy Potter)
 
+* SOLR-6530: Commits under network partitions can put any node in down state.
+  (Ramkumar Aiyengar, Alan Woodward, Mark Miller, shalin)
+
 Other Changes
 ----------------------
 

DistributedUpdateProcessor.java
@@ -808,6 +808,11 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
       if (phase != DistribPhase.FROMLEADER)
         continue; // don't have non-leaders try to recover other nodes
 
+      // commits are special -- they can run on any node irrespective of whether it is a leader or not
+      // we don't want to run recovery on a node which missed a commit command
+      if (error.req.uReq.getParams().get(COMMIT_END_POINT) != null)
+        continue;
+
       final String replicaUrl = error.req.node.getUrl();
 
       // if the remote replica failed the request because of leader change (SOLR-6511), then fail the request
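
The guard above is the heart of the fix: a commit is fanned out to every node, leader or not, so a failed commit response by itself is no evidence that a replica missed updates and needs recovery. A minimal, self-contained sketch of that intent follows; the Map stands in for the update request's SolrParams, and the constant's exact value here is an assumption, not taken from the patch.

import java.util.HashMap;
import java.util.Map;

public class CommitGuardSketch {
  // Assumed stand-in for DistributedUpdateProcessor.COMMIT_END_POINT
  static final String COMMIT_END_POINT = "commit_end_point";

  // Commits are special: they run on any node irrespective of leadership,
  // so an error on a commit must not trigger leader-initiated recovery.
  static boolean shouldSkipRecovery(Map<String, String> params) {
    return params.get(COMMIT_END_POINT) != null;
  }

  public static void main(String[] args) {
    Map<String, String> commitReq = new HashMap<>();
    commitReq.put(COMMIT_END_POINT, "true");
    System.out.println(shouldSkipRecovery(commitReq));        // true: skip recovery
    System.out.println(shouldSkipRecovery(new HashMap<>()));  // false: normal error handling
  }
}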

@@ -839,7 +844,17 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
                 " " + shardId + " before putting " + replicaUrl + " into leader-initiated recovery due to: " + exc);
           }
 
-          if (cloudDesc.getCoreNodeName().equals(leaderCoreNodeName)) {
+          List<ZkCoreNodeProps> myReplicas = zkController.getZkStateReader().getReplicaProps(collection,
+              cloudDesc.getShardId(), cloudDesc.getCoreNodeName());
+          boolean foundErrorNodeInReplicaList = false;
+          for (ZkCoreNodeProps replicaProp : myReplicas) {
+            if (((Replica) replicaProp.getNodeProps()).getName().equals(((Replica) stdNode.getNodeProps().getNodeProps()).getName())) {
+              foundErrorNodeInReplicaList = true;
+              break;
+            }
+          }
+
+          if (cloudDesc.getCoreNodeName().equals(leaderCoreNodeName) && foundErrorNodeInReplicaList) {
             try {
               // if false, then the node is probably not "live" anymore
               sendRecoveryCommand =
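
Before this change, being the shard leader was the only precondition for putting the error'd node into leader-initiated recovery; the loop added above also verifies that the node actually appears in this shard's replica list. A small self-contained sketch of that membership check, with hypothetical core-node names:

import java.util.List;

public class ReplicaMembershipSketch {
  // Mirrors the foundErrorNodeInReplicaList loop: only a node that is one of
  // this shard's replicas is eligible for leader-initiated recovery.
  static boolean isMyReplica(List<String> myReplicaNames, String errorNodeName) {
    for (String name : myReplicaNames) {
      if (name.equals(errorNodeName)) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    List<String> myReplicas = List.of("core_node2", "core_node3"); // hypothetical names
    System.out.println(isMyReplica(myReplicas, "core_node3")); // true: recovery may proceed
    System.out.println(isMyReplica(myReplicas, "core_node9")); // false: log a warning, send nothing
  }
}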

@@ -866,11 +881,17 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
               // will go ahead and try to send the recovery command once after this error
             }
           } else {
-            // not the leader anymore maybe?
+            // not the leader anymore maybe or the error'd node is not my replica?
             sendRecoveryCommand = false;
+            if (!foundErrorNodeInReplicaList) {
+              log.warn("Core "+cloudDesc.getCoreNodeName()+" belonging to "+collection+" "+
+                  shardId+", does not have error'd node " + stdNode.getNodeProps().getCoreUrl() + " as a replica. " +
+                  "No request recovery command will be sent!");
+            } else {
               log.warn("Core "+cloudDesc.getCoreNodeName()+" is no longer the leader for "+collection+" "+
                   shardId+", no request recovery command will be sent!");
+            }
           }
+        }
         } // else not a StdNode, recovery command still gets sent once
 
         if (!sendRecoveryCommand)

LeaderInitiatedRecoveryOnCommitTest.java (new file)
@@ -0,0 +1,163 @@
package org.apache.solr.cloud;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.util.List;

import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.junit.After;
import org.junit.Before;

public class LeaderInitiatedRecoveryOnCommitTest extends BasicDistributedZkTest {

  private static final long sleepMsBeforeHealPartition = 2000L;

  public LeaderInitiatedRecoveryOnCommitTest() {
    super();
    sliceCount = 1;
    shardCount = 4;
  }

  @Before
  @Override
  public void setUp() throws Exception {
    super.setUp();
    System.setProperty("numShards", Integer.toString(sliceCount));
  }

  @Override
  @After
  public void tearDown() throws Exception {
    System.clearProperty("numShards");

    try {
      super.tearDown();
    } catch (Exception exc) {
      // ignore exceptions on tearDown
    }

    resetExceptionIgnores();

    // close socket proxies after super.tearDown
    if (!proxies.isEmpty()) {
      for (SocketProxy proxy : proxies.values()) {
        proxy.close();
      }
    }
  }

  @Override
  public void doTest() throws Exception {
    oneShardTest();
    multiShardTest();
  }

  private void multiShardTest() throws Exception {
    // create a collection that has 2 shards with 2 replicas each
    String testCollectionName = "c8n_2x2_commits";
    createCollection(testCollectionName, 2, 2, 1);
    cloudClient.setDefaultCollection(testCollectionName);

    List<Replica> notLeaders =
        ensureAllReplicasAreActive(testCollectionName, "shard1", 2, 2, 30);
    assertTrue("Expected 1 non-leader replica for collection " + testCollectionName
        + " but found " + notLeaders.size() + "; clusterState: "
        + printClusterStateInfo(),
        notLeaders.size() == 1);

    // let's put the leader in its own partition, no replicas can contact it now
    Replica leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1");
    SocketProxy leaderProxy = getProxyForReplica(leader);
    leaderProxy.close();

    // let's find the leader of shard2 and ask it to commit
    Replica shard2Leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard2");
    HttpSolrServer server = new HttpSolrServer(ZkCoreNodeProps.getCoreUrl(shard2Leader.getStr("base_url"), shard2Leader.getStr("core")));
    server.commit();

    Thread.sleep(sleepMsBeforeHealPartition);

    cloudClient.getZkStateReader().updateClusterState(true); // get the latest state
    leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1");
    assertEquals("Leader was not active", "active", leader.getStr("state"));

    // try to clean up
    try {
      CollectionAdminRequest req = new CollectionAdminRequest.Delete();
      req.setCollectionName(testCollectionName);
      req.process(cloudClient);
    } catch (Exception e) {
      // don't fail the test
      log.warn("Could not delete collection {} after test completed", testCollectionName);
    }
  }

  private void oneShardTest() throws Exception {
    // create a collection that has 1 shard and 3 replicas
    String testCollectionName = "c8n_1x3_commits";
    createCollection(testCollectionName, 1, 3, 1);
    cloudClient.setDefaultCollection(testCollectionName);

    List<Replica> notLeaders =
        ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, 30);
    assertTrue("Expected 2 non-leader replicas for collection " + testCollectionName
        + " but found " + notLeaders.size() + "; clusterState: "
        + printClusterStateInfo(),
        notLeaders.size() == 2);

    // let's put the leader in its own partition, no replicas can contact it now
    Replica leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1");
    SocketProxy leaderProxy = getProxyForReplica(leader);
    leaderProxy.close();

    // ask a replica to commit while the leader is partitioned
    Replica replica = notLeaders.get(0);
    HttpSolrServer server = new HttpSolrServer(ZkCoreNodeProps.getCoreUrl(replica.getStr("base_url"), replica.getStr("core")));
    server.commit();

    Thread.sleep(sleepMsBeforeHealPartition);

    cloudClient.getZkStateReader().updateClusterState(true); // get the latest state
    leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1");
    assertEquals("Leader was not active", "active", leader.getStr("state"));

    // try to clean up
    try {
      CollectionAdminRequest req = new CollectionAdminRequest.Delete();
      req.setCollectionName(testCollectionName);
      req.process(cloudClient);
    } catch (Exception e) {
      // don't fail the test
      log.warn("Could not delete collection {} after test completed", testCollectionName);
    }
  }

  /**
   * Overrides the parent implementation to install a SocketProxy in front of the Jetty server.
   */
  @Override
  public JettySolrRunner createJetty(File solrHome, String dataDir,
      String shardList, String solrConfigOverride, String schemaOverride)
      throws Exception {
    return createProxiedJetty(solrHome, dataDir, shardList, solrConfigOverride, schemaOverride);
  }

}
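
The partition in both tests is simulated by the test framework's SocketProxy, installed in front of each Jetty via createProxiedJetty; closing the proxy makes the node unreachable to its peers. A toy, self-contained version of that idea (this is not Solr's actual SocketProxy, which also pumps bytes in both directions):

import java.io.IOException;
import java.net.ServerSocket;
import java.net.Socket;

public class TinyPartitionProxy {
  private final ServerSocket listener;
  private final String targetHost;
  private final int targetPort;
  private volatile boolean open = true;

  public TinyPartitionProxy(int listenPort, String targetHost, int targetPort) throws IOException {
    this.listener = new ServerSocket(listenPort);
    this.targetHost = targetHost;
    this.targetPort = targetPort;
    // Forward one connection at a time, client -> server only (a simplification).
    new Thread(this::acceptLoop, "tiny-proxy-accept").start();
  }

  private void acceptLoop() {
    while (open) {
      try (Socket client = listener.accept();
           Socket upstream = new Socket(targetHost, targetPort)) {
        client.getInputStream().transferTo(upstream.getOutputStream());
      } catch (IOException e) {
        // Listener closed or upstream unreachable: the connection just drops,
        // which is what a partitioned node looks like to its peers.
      }
    }
  }

  // "Partition" the node: stop accepting traffic on its advertised port.
  public void close() throws IOException {
    open = false;
    listener.close();
  }
}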