SOLR-9563: Collection creation can fail if a node doesn't update its state quickly enough

This commit is contained in:
Alan Woodward 2016-10-04 12:26:17 +01:00
parent 23485c3377
commit 1e3781500c
3 changed files with 35 additions and 21 deletions

View File

@ -121,6 +121,10 @@ Bug Fixes
* SOLR-9556: OverseerAutoFailoverReplicaThread was not exiting on interrupt
(Alan Woodward)
* SOLR-9563: Collection creation could fail if an ADDREPLICA subrequest arrived
at a node before that node's local state had been updated with the new
collection data
(Alan Woodward)
Optimizations
----------------------

View File

@ -43,6 +43,7 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import com.google.common.base.Strings;
import org.apache.commons.lang.StringUtils;
@ -1524,34 +1525,43 @@ public class ZkController {
if (!Overseer.isLegacy(zkStateReader)) {
CloudDescriptor cloudDesc = cd.getCloudDescriptor();
String coreNodeName = cloudDesc.getCoreNodeName();
assert coreNodeName != null : "SolrCore: " + cd.getName() + " has no coreNodeName";
if (coreNodeName == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "No coreNodeName for " + cd);
if (cloudDesc.getShardId() == null) {
throw new SolrException(ErrorCode.SERVER_ERROR, "No shard id for :" + cd);
throw new SolrException(ErrorCode.SERVER_ERROR, "No shard id for " + cd);
}
long endTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(3, TimeUnit.SECONDS);
String errMessage = null;
while (System.nanoTime() < endTime) {
Slice slice = zkStateReader.getClusterState().getSlice(cd.getCollectionName(), cloudDesc.getShardId());
if (slice == null) {
errMessage = "Invalid slice : " + cloudDesc.getShardId();
continue;
}
if (slice.getReplica(coreNodeName) != null) {
AtomicReference<String> errorMessage = new AtomicReference<>();
AtomicReference<DocCollection> collectionState = new AtomicReference<>();
try {
zkStateReader.waitForState(cd.getCollectionName(), 3, TimeUnit.SECONDS, (n, c) -> {
collectionState.set(c);
if (c == null)
return false;
Slice slice = c.getSlice(cloudDesc.getShardId());
if (slice == null) {
errorMessage.set("Invalid shard: " + cloudDesc.getShardId());
return false;
}
Replica replica = slice.getReplica(coreNodeName);
if (replica == null) {
errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId());
return false;
}
String baseUrl = replica.getStr(BASE_URL_PROP);
String coreName = replica.getStr(CORE_NAME_PROP);
if (baseUrl.equals(this.baseURL) && coreName.equals(cd.getName())) {
return;
} else {
errMessage = "replica with coreNodeName " + coreNodeName + " exists but with a different name or base_url";
return true;
}
}
Thread.sleep(100);
errorMessage.set("coreNodeName " + coreNodeName + " exists, but does not match expected node or core name");
return false;
});
} catch (TimeoutException e) {
String error = errorMessage.get();
if (error == null)
error = "Replica " + coreNodeName + " is not present in cluster state";
throw new SolrException(ErrorCode.SERVER_ERROR, error + ": " + collectionState.get());
}
if (errMessage == null) {
errMessage = "replica " + coreNodeName + " is not present in cluster state";
}
throw new SolrException(ErrorCode.SERVER_ERROR, errMessage + ". state : " + zkStateReader.getClusterState().getCollection(cd.getCollectionName()));
}
}

View File

@ -85,7 +85,7 @@ public class DeleteInactiveReplicaTest extends SolrCloudTestCase {
CoreContainer.CoreLoadFailure loadFailure = cc.getCoreInitFailures().get(replica.getCoreName());
assertNotNull("Deleted core was still loaded!", loadFailure);
assertTrue("Unexpected load failure message: " + loadFailure.exception.getMessage(),
loadFailure.exception.getMessage().contains("not present in cluster state"));
loadFailure.exception.getMessage().contains("does not exist in shard"));
// Check that we can't create a core with no coreNodeName
try (SolrClient queryClient = getHttpSolrClient(jetty.getBaseUrl().toString())) {