mirror of https://github.com/apache/lucene.git
SOLR-9563: Collection creation can fail if a node doesn't update its state quickly enough
This commit is contained in:
parent
23485c3377
commit
1e3781500c
|
@@ -121,6 +121,10 @@ Bug Fixes
|
|||
* SOLR-9556: OverseerAutoFailoverReplicaThread was not exiting on interrupt
|
||||
(Alan Woodward)
|
||||
|
||||
* SOLR-9563: Collection creation could fail if an ADDREPLICA subrequest arrived
|
||||
at a node before its local state had updated with the new collection data
|
||||
(Alan Woodward)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@@ -43,6 +43,7 @@ import java.util.concurrent.ExecutorService;
|
|||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
@@ -1524,34 +1525,43 @@ public class ZkController {
|
|||
if (!Overseer.isLegacy(zkStateReader)) {
|
||||
CloudDescriptor cloudDesc = cd.getCloudDescriptor();
|
||||
String coreNodeName = cloudDesc.getCoreNodeName();
|
||||
assert coreNodeName != null : "SolrCore: " + cd.getName() + " has no coreNodeName";
|
||||
if (coreNodeName == null)
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "No coreNodeName for " + cd);
|
||||
if (cloudDesc.getShardId() == null) {
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "No shard id for :" + cd);
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "No shard id for " + cd);
|
||||
}
|
||||
long endTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(3, TimeUnit.SECONDS);
|
||||
String errMessage = null;
|
||||
while (System.nanoTime() < endTime) {
|
||||
Slice slice = zkStateReader.getClusterState().getSlice(cd.getCollectionName(), cloudDesc.getShardId());
|
||||
if (slice == null) {
|
||||
errMessage = "Invalid slice : " + cloudDesc.getShardId();
|
||||
continue;
|
||||
}
|
||||
if (slice.getReplica(coreNodeName) != null) {
|
||||
|
||||
AtomicReference<String> errorMessage = new AtomicReference<>();
|
||||
AtomicReference<DocCollection> collectionState = new AtomicReference<>();
|
||||
try {
|
||||
zkStateReader.waitForState(cd.getCollectionName(), 3, TimeUnit.SECONDS, (n, c) -> {
|
||||
collectionState.set(c);
|
||||
if (c == null)
|
||||
return false;
|
||||
Slice slice = c.getSlice(cloudDesc.getShardId());
|
||||
if (slice == null) {
|
||||
errorMessage.set("Invalid shard: " + cloudDesc.getShardId());
|
||||
return false;
|
||||
}
|
||||
Replica replica = slice.getReplica(coreNodeName);
|
||||
if (replica == null) {
|
||||
errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId());
|
||||
return false;
|
||||
}
|
||||
String baseUrl = replica.getStr(BASE_URL_PROP);
|
||||
String coreName = replica.getStr(CORE_NAME_PROP);
|
||||
if (baseUrl.equals(this.baseURL) && coreName.equals(cd.getName())) {
|
||||
return;
|
||||
} else {
|
||||
errMessage = "replica with coreNodeName " + coreNodeName + " exists but with a different name or base_url";
|
||||
return true;
|
||||
}
|
||||
}
|
||||
Thread.sleep(100);
|
||||
errorMessage.set("coreNodeName " + coreNodeName + " exists, but does not match expected node or core name");
|
||||
return false;
|
||||
});
|
||||
} catch (TimeoutException e) {
|
||||
String error = errorMessage.get();
|
||||
if (error == null)
|
||||
error = "Replica " + coreNodeName + " is not present in cluster state";
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, error + ": " + collectionState.get());
|
||||
}
|
||||
if (errMessage == null) {
|
||||
errMessage = "replica " + coreNodeName + " is not present in cluster state";
|
||||
}
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, errMessage + ". state : " + zkStateReader.getClusterState().getCollection(cd.getCollectionName()));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@@ -85,7 +85,7 @@ public class DeleteInactiveReplicaTest extends SolrCloudTestCase {
|
|||
CoreContainer.CoreLoadFailure loadFailure = cc.getCoreInitFailures().get(replica.getCoreName());
|
||||
assertNotNull("Deleted core was still loaded!", loadFailure);
|
||||
assertTrue("Unexpected load failure message: " + loadFailure.exception.getMessage(),
|
||||
loadFailure.exception.getMessage().contains("not present in cluster state"));
|
||||
loadFailure.exception.getMessage().contains("does not exist in shard"));
|
||||
|
||||
// Check that we can't create a core with no coreNodeName
|
||||
try (SolrClient queryClient = getHttpSolrClient(jetty.getBaseUrl().toString())) {
|
||||
|
|
Loading…
Reference in New Issue