SOLR-10279: The autoAddReplica feature can result in SolrCores being assigned new shards when using legacyCloud=false, and it will also fail a state check when taking over a core registration with a new core.

This commit is contained in:
markrmiller 2017-03-14 06:01:06 -04:00 committed by Shalin Shekhar Mangar
parent 83772c6f99
commit cab7e1a3d7
4 changed files with 38 additions and 9 deletions

View File

@ -243,6 +243,10 @@ Bug Fixes
* SOLR-10269: MetricsHandler JSON output incorrect. (ab) * SOLR-10269: MetricsHandler JSON output incorrect. (ab)
* SOLR-10279: The autoAddReplica feature can result in SolrCores being assigned new shards when using
legacyCloud=false, and it will also fail a state check when taking over a core registration with a new
core. (Mark Miller, Hrishikesh Gadre, Patrick Dvorack)
Optimizations Optimizations
---------------------- ----------------------

View File

@ -243,13 +243,14 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable {
final String dataDir = badReplica.replica.getStr("dataDir"); final String dataDir = badReplica.replica.getStr("dataDir");
final String ulogDir = badReplica.replica.getStr("ulogDir"); final String ulogDir = badReplica.replica.getStr("ulogDir");
final String coreNodeName = badReplica.replica.getName(); final String coreNodeName = badReplica.replica.getName();
final String shardId = badReplica.slice.getName();
if (dataDir != null) { if (dataDir != null) {
// need an async request - full shard goes down leader election // need an async request - full shard goes down leader election
final String coreName = badReplica.replica.getStr(ZkStateReader.CORE_NAME_PROP); final String coreName = badReplica.replica.getStr(ZkStateReader.CORE_NAME_PROP);
log.debug("submit call to {}", createUrl); log.debug("submit call to {}", createUrl);
MDC.put("OverseerAutoReplicaFailoverThread.createUrl", createUrl); MDC.put("OverseerAutoReplicaFailoverThread.createUrl", createUrl);
try { try {
updateExecutor.submit(() -> createSolrCore(collection, createUrl, dataDir, ulogDir, coreNodeName, coreName)); updateExecutor.submit(() -> createSolrCore(collection, createUrl, dataDir, ulogDir, coreNodeName, coreName, shardId));
} finally { } finally {
MDC.remove("OverseerAutoReplicaFailoverThread.createUrl"); MDC.remove("OverseerAutoReplicaFailoverThread.createUrl");
} }
@ -440,7 +441,7 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable {
private boolean createSolrCore(final String collection, private boolean createSolrCore(final String collection,
final String createUrl, final String dataDir, final String ulogDir, final String createUrl, final String dataDir, final String ulogDir,
final String coreNodeName, final String coreName) { final String coreNodeName, final String coreName, final String shardId) {
try (HttpSolrClient client = new HttpSolrClient.Builder(createUrl).build()) { try (HttpSolrClient client = new HttpSolrClient.Builder(createUrl).build()) {
log.debug("create url={}", createUrl); log.debug("create url={}", createUrl);
@ -451,6 +452,7 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable {
createCmd.setCoreNodeName(coreNodeName); createCmd.setCoreNodeName(coreNodeName);
// TODO: how do we ensure unique coreName // TODO: how do we ensure unique coreName
// for now, the collections API will use unique names // for now, the collections API will use unique names
createCmd.setShardId(shardId);
createCmd.setCoreName(coreName); createCmd.setCoreName(coreName);
createCmd.setDataDir(dataDir); createCmd.setDataDir(dataDir);
createCmd.setUlogDir(ulogDir.substring(0, ulogDir.length() - "/tlog".length())); createCmd.setUlogDir(ulogDir.substring(0, ulogDir.length() - "/tlog".length()));

View File

@ -1449,13 +1449,7 @@ public class ZkController {
errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId()); errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId());
return false; return false;
} }
String baseUrl = replica.getStr(BASE_URL_PROP); return true;
String coreName = replica.getStr(CORE_NAME_PROP);
if (baseUrl.equals(this.baseURL) && coreName.equals(cd.getName())) {
return true;
}
errorMessage.set("coreNodeName " + coreNodeName + " exists, but does not match expected node or core name");
return false;
}); });
} catch (TimeoutException e) { } catch (TimeoutException e) {
String error = errorMessage.get(); String error = errorMessage.get();

View File

@ -38,6 +38,7 @@ import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.CollectionAdminRequest.Create; import org.apache.solr.client.solrj.request.CollectionAdminRequest.Create;
import org.apache.solr.client.solrj.request.QueryRequest; import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse; import org.apache.solr.client.solrj.response.CollectionAdminResponse;
@ -103,6 +104,11 @@ public class SharedFSAutoReplicaFailoverTest extends AbstractFullDistribZkTestBa
public void setUp() throws Exception { public void setUp() throws Exception {
super.setUp(); super.setUp();
collectionUlogDirMap.clear(); collectionUlogDirMap.clear();
if (random().nextBoolean()) {
CollectionAdminRequest.setClusterProperty("legacyCloud", "false").process(cloudClient);
} else {
CollectionAdminRequest.setClusterProperty("legacyCloud", "true").process(cloudClient);
}
} }
@Override @Override
@ -313,6 +319,29 @@ public class SharedFSAutoReplicaFailoverTest extends AbstractFullDistribZkTestBa
assertSliceAndReplicaCount(collection1); assertSliceAndReplicaCount(collection1);
assertUlogDir(collections); assertUlogDir(collections);
// restart all to test core saved state
ChaosMonkey.stop(jettys);
ChaosMonkey.stop(controlJetty);
assertTrue("Timeout waiting for all not live", ClusterStateUtil.waitForAllReplicasNotLive(cloudClient.getZkStateReader(), 45000));
ChaosMonkey.start(jettys);
ChaosMonkey.start(controlJetty);
assertTrue("Timeout waiting for all live and active", ClusterStateUtil.waitForAllActiveAndLiveReplicas(cloudClient.getZkStateReader(), collection1, 120000));
assertSliceAndReplicaCount(collection1);
assertUlogDir(collections);
assertSliceAndReplicaCount(collection1);
assertSingleReplicationAndShardSize(collection3, 5);
// all docs should be queried
assertSingleReplicationAndShardSize(collection4, 5);
queryAndAssertResultSize(collection4, numDocs, 10000);
} }
private void queryAndAssertResultSize(String collection, int expectedResultSize, int timeoutMS) private void queryAndAssertResultSize(String collection, int expectedResultSize, int timeoutMS)