SOLR-3993: If multiple SolrCores for a shard coexist on a node, on cluster restart, leader election would stall until timeout, waiting to see all of the replicas come up.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1408313 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2012-11-12 15:10:47 +00:00
parent 267df1a190
commit f902314d74
5 changed files with 80 additions and 27 deletions

View File

@ -163,6 +163,10 @@ Bug Fixes
cause actions to be targeted at the wrong SolrCores.
(Raintung Li via Mark Miller)
* SOLR-3993: If multiple SolrCores for a shard coexist on a node, on cluster
restart, leader election would stall until timeout, waiting to see all of
the replicas come up. (Mark Miller, Alexey Kudinov)
Other Changes
----------------------

View File

@ -22,7 +22,7 @@
persistent: Save changes made via the API to this file
sharedLib: path to a lib directory that will be shared across all cores
-->
<solr persistent="false">
<solr persistent="${solr.xml.persist:false}">
<!--
adminPath: RequestHandler path to manage cores.

View File

@ -126,6 +126,7 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
// Test fixture setup: runs the parent setup, then publishes the system
// properties this test relies on.
public void setUp() throws Exception {
super.setUp();
// Expose the configured slice count under the "numShards" system property.
System.setProperty("numShards", Integer.toString(sliceCount));
// The test solr.xml reads ${solr.xml.persist:false} for its "persistent"
// attribute, so setting this to "true" turns persistence on for the test
// (presumably so cores created via the API survive a Jetty restart -- confirm).
System.setProperty("solr.xml.persist", "true");
}
@ -341,6 +342,7 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
testCollectionsAPI();
testCoreUnloadAndLeaders();
testUnloadLotsOfCores();
testStopAndStartCoresInOneInstance();
// Thread.sleep(10000000000L);
if (DEBUG) {
super.printLayout();
@ -550,29 +552,10 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
5, TimeUnit.SECONDS, new SynchronousQueue<Runnable>(),
new DefaultSolrThreadFactory("testExecutor"));
int cnt = atLeast(6);
for (int i = 0; i < cnt; i++) {
final int freezeI = i;
executor.execute(new Runnable() {
@Override
public void run() {
Create createCmd = new Create();
createCmd.setCoreName("multiunload" + freezeI);
createCmd.setCollection("multiunload");
String core3dataDir = dataDir.getAbsolutePath() + File.separator
+ System.currentTimeMillis() + "unloadcollection" + "_3n" + freezeI;
createCmd.setDataDir(core3dataDir);
try {
server.request(createCmd);
} catch (SolrServerException e) {
throw new RuntimeException(e);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
});
}
// create the 6 cores
createCores(server, executor, "multiunload", 2, cnt);
executor.shutdown();
executor.awaitTermination(120, TimeUnit.SECONDS);
executor = new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5,
@ -599,6 +582,68 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
executor.shutdown();
executor.awaitTermination(120, TimeUnit.SECONDS);
}
/**
 * Creates several cores of one single-shard collection in the first Jetty
 * instance, stops and restarts that instance, and then requires that a
 * leader for {@code shard1} of the collection becomes visible within 30
 * seconds.
 *
 * @throws Exception if core creation, the restart, or the leader lookup fails;
 *         a {@link SolrException} from the leader lookup is rethrown after
 *         the ZooKeeper layout has been printed for diagnosis
 */
private void testStopAndStartCoresInOneInstance() throws Exception {
  SolrServer client = clients.get(0);
  String url3 = getBaseUrl(client);
  final HttpSolrServer server = new HttpSolrServer(url3);

  ThreadPoolExecutor executor = new ThreadPoolExecutor(0, Integer.MAX_VALUE,
      5, TimeUnit.SECONDS, new SynchronousQueue<Runnable>(),
      new DefaultSolrThreadFactory("testExecutor"));
  int cnt = 3;

  // create the cores, all in the same collection with a single shard
  createCores(server, executor, "multiunload2", 1, cnt);

  // wait for the asynchronous create requests to finish before restarting
  executor.shutdown();
  executor.awaitTermination(120, TimeUnit.SECONDS);

  ChaosMonkey.stop(cloudJettys.get(0).jetty);
  printLayout();

  // NOTE(review): fixed pause between stop and start; presumably gives the
  // cluster time to notice the node is gone -- confirm whether it is needed.
  Thread.sleep(5000);
  ChaosMonkey.start(cloudJettys.get(0).jetty);
  cloudClient.getZkStateReader().updateClusterState(true);
  try {
    // the leader must show up within 30s of the restart
    cloudClient.getZkStateReader().getLeaderProps("multiunload2", "shard1", 30000);
  } catch (SolrException e) {
    // dump the layout to help diagnose why no leader was found
    printLayout();
    throw e;
  }

  printLayout();
}
/**
 * Submits {@code cnt} core-creation tasks to {@code executor}. Task
 * {@code i} sends a {@code Create} request to {@code server} for a core named
 * {@code collection + i}, belonging to {@code collection}, with the given
 * {@code numShards} and a data directory located under {@code dataDir}.
 *
 * <p>This method only queues the work and does not wait for it to complete.
 * A {@code SolrServerException} or {@code IOException} raised by a request is
 * rethrown as a {@code RuntimeException} on the thread running that task.
 *
 * @param server     client the create requests are sent through
 * @param executor   executor that runs one task per core
 * @param collection collection name, also used as the core name prefix
 * @param numShards  shard count set on every create request
 * @param cnt        number of cores to create
 */
private void createCores(final HttpSolrServer server,
    ThreadPoolExecutor executor, final String collection, final int numShards, int cnt) {
  int submitted = 0;
  while (submitted < cnt) {
    // anonymous classes can only capture final locals
    final int coreIndex = submitted++;
    final Runnable createTask = new Runnable() {
      @Override
      public void run() {
        // the timestamp is taken when the task runs, not when it is queued
        final String coreDataDir = dataDir.getAbsolutePath() + File.separator
            + System.currentTimeMillis() + collection + "_3n" + coreIndex;

        final Create request = new Create();
        request.setCoreName(collection + coreIndex);
        request.setCollection(collection);
        request.setDataDir(coreDataDir);
        request.setNumShards(numShards);

        try {
          server.request(request);
        } catch (SolrServerException e) {
          throw new RuntimeException(e);
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    };
    executor.execute(createTask);
  }
}
private String getBaseUrl(SolrServer client) {
@ -1439,6 +1484,7 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
}
System.clearProperty("numShards");
System.clearProperty("zkHost");
System.clearProperty("solr.xml.persist");
// insurance
DirectUpdateHandler2.commitOnClose = true;

View File

@ -407,7 +407,7 @@ public class ZkStateReader {
long timeoutAt = System.currentTimeMillis() + timeout;
while (System.currentTimeMillis() < timeoutAt) {
if (clusterState != null) {
final ZkNodeProps nodeProps = clusterState.getLeader(collection, shard);
final ZkNodeProps nodeProps = clusterState.getLeader(collection, shard);
if (nodeProps != null && getClusterState().liveNodesContain((String) nodeProps.get(ZkStateReader.NODE_NAME_PROP))) {
return nodeProps;
}

View File

@ -337,9 +337,12 @@ public abstract class AbstractFullDistribZkTestBase extends AbstractDistribZkTes
return jettys;
}
protected int getNumShards(String defaultCollection) {
protected int getNumShards(String collection) {
ZkStateReader zkStateReader = cloudClient.getZkStateReader();
Map<String,Slice> slices = zkStateReader.getClusterState().getSlices(defaultCollection);
Map<String,Slice> slices = zkStateReader.getClusterState().getSlices(collection);
if (slices == null) {
throw new IllegalArgumentException("Could not find collection:" + collection);
}
int cnt = 0;
for (Map.Entry<String,Slice> entry : slices.entrySet()) {
cnt += entry.getValue().getReplicasMap().size();