SOLR-13834: ZkController#getSolrCloudManager() now uses the same ZkStateReader instance instead of instantiating a new one

ZkController#getSolrCloudManager() created a new instance of ZkStateReader, thereby causing mismatch in the
visibility of the cluster state and, as a result, undesired race conditions.
This commit is contained in:
Ishan Chattopadhyaya 2019-10-11 22:14:11 +05:30
parent b8648c60e7
commit e2b160b865
6 changed files with 27 additions and 14 deletions

View File

@ -326,6 +326,9 @@ Bug Fixes
the number of replicas. This avoids making too many cascading calls to remote servers, which, if not restricted, can the number of replicas. This avoids making too many cascading calls to remote servers, which, if not restricted, can
bring down nodes containing the said collection (Kesharee Nandan Vishwakarma, Ishan Chattopadhyaya) bring down nodes containing the said collection (Kesharee Nandan Vishwakarma, Ishan Chattopadhyaya)
* SOLR-13834: ZkController#getSolrCloudManager() created a new instance of ZkStateReader, thereby causing mismatch in the
visibility of the cluster state and, as a result, undesired race conditions (Clay Goddard via Ishan Chattopadhyaya)
Other Changes Other Changes
---------------------- ----------------------

View File

@ -40,7 +40,6 @@ import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.concurrent.Callable; import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
@ -59,6 +58,7 @@ import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient.Builder; import org.apache.solr.client.solrj.impl.HttpSolrClient.Builder;
import org.apache.solr.client.solrj.impl.SolrClientCloudManager; import org.apache.solr.client.solrj.impl.SolrClientCloudManager;
import org.apache.solr.client.solrj.impl.ZkClientClusterStateProvider;
import org.apache.solr.client.solrj.request.CoreAdminRequest.WaitForState; import org.apache.solr.client.solrj.request.CoreAdminRequest.WaitForState;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType; import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
import org.apache.solr.cloud.overseer.OverseerAction; import org.apache.solr.cloud.overseer.OverseerAction;
@ -746,7 +746,7 @@ public class ZkController implements Closeable {
if (cloudManager != null) { if (cloudManager != null) {
return cloudManager; return cloudManager;
} }
cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(zkServerAddress), Optional.empty()).withSocketTimeout(30000).withConnectionTimeout(15000) cloudSolrClient = new CloudSolrClient.Builder(new ZkClientClusterStateProvider(zkStateReader)).withSocketTimeout(30000).withConnectionTimeout(15000)
.withHttpClient(cc.getUpdateShardHandler().getDefaultHttpClient()) .withHttpClient(cc.getUpdateShardHandler().getDefaultHttpClient())
.withConnectionTimeout(15000).withSocketTimeout(30000).build(); .withConnectionTimeout(15000).withSocketTimeout(30000).build();
cloudManager = new SolrClientCloudManager(new ZkDistributedQueueFactory(zkClient), cloudSolrClient); cloudManager = new SolrClientCloudManager(new ZkDistributedQueueFactory(zkClient), cloudSolrClient);

View File

@ -77,10 +77,13 @@ public class CreateShardCmd implements OverseerCollectionMessageHandler.Cmd {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, NRT_REPLICAS + " + " + TLOG_REPLICAS + " must be greater than 0"); throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, NRT_REPLICAS + " + " + TLOG_REPLICAS + " must be greater than 0");
} }
ZkStateReader zkStateReader = ocmh.zkStateReader; //ZkStateReader zkStateReader = ocmh.zkStateReader;
ocmh.overseer.offerStateUpdate(Utils.toJSON(message)); ocmh.overseer.offerStateUpdate(Utils.toJSON(message));
// wait for a while until we see the shard // wait for a while until we see the shard
ocmh.waitForNewShard(collectionName, sliceName); //ocmh.waitForNewShard(collectionName, sliceName);
// wait for a while until we see the shard and update the local view of the cluster state
clusterState = ocmh.waitForNewShard(collectionName, sliceName);
String async = message.getStr(ASYNC); String async = message.getStr(ASYNC);
ZkNodeProps addReplicasProps = new ZkNodeProps( ZkNodeProps addReplicasProps = new ZkNodeProps(
COLLECTION_PROP, collectionName, COLLECTION_PROP, collectionName,
@ -97,7 +100,8 @@ public class CreateShardCmd implements OverseerCollectionMessageHandler.Cmd {
if (async != null) addReplicasProps.getProperties().put(ASYNC, async); if (async != null) addReplicasProps.getProperties().put(ASYNC, async);
final NamedList addResult = new NamedList(); final NamedList addResult = new NamedList();
try { try {
ocmh.addReplica(zkStateReader.getClusterState(), addReplicasProps, addResult, () -> { //ocmh.addReplica(zkStateReader.getClusterState(), addReplicasProps, addResult, () -> {
ocmh.addReplica(clusterState, addReplicasProps, addResult, () -> {
Object addResultFailure = addResult.get("failure"); Object addResultFailure = addResult.get("failure");
if (addResultFailure != null) { if (addResultFailure != null) {
SimpleOrderedMap failure = (SimpleOrderedMap) results.get("failure"); SimpleOrderedMap failure = (SimpleOrderedMap) results.get("failure");

View File

@ -553,12 +553,14 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
throw new SolrException(ErrorCode.SERVER_ERROR, "Could not find coreNodeName"); throw new SolrException(ErrorCode.SERVER_ERROR, "Could not find coreNodeName");
} }
void waitForNewShard(String collectionName, String sliceName) throws KeeperException, InterruptedException { ClusterState waitForNewShard(String collectionName, String sliceName) throws KeeperException, InterruptedException {
log.debug("Waiting for slice {} of collection {} to be available", sliceName, collectionName); log.debug("Waiting for slice {} of collection {} to be available", sliceName, collectionName);
RTimer timer = new RTimer(); RTimer timer = new RTimer();
int retryCount = 320; int retryCount = 320;
while (retryCount-- > 0) { while (retryCount-- > 0) {
DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); ClusterState clusterState = zkStateReader.getClusterState();
DocCollection collection = clusterState.getCollection(collectionName);
if (collection == null) { if (collection == null) {
throw new SolrException(ErrorCode.SERVER_ERROR, throw new SolrException(ErrorCode.SERVER_ERROR,
"Unable to find collection: " + collectionName + " in clusterstate"); "Unable to find collection: " + collectionName + " in clusterstate");
@ -567,7 +569,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
if (slice != null) { if (slice != null) {
log.debug("Waited for {}ms for slice {} of collection {} to be available", log.debug("Waited for {}ms for slice {} of collection {} to be available",
timer.getTime(), sliceName, collectionName); timer.getTime(), sliceName, collectionName);
return; return clusterState;
} }
Thread.sleep(1000); Thread.sleep(1000);
} }

View File

@ -310,11 +310,8 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
ocmh.overseer.offerStateUpdate(Utils.toJSON(new ZkNodeProps(propMap))); ocmh.overseer.offerStateUpdate(Utils.toJSON(new ZkNodeProps(propMap)));
// wait until we are able to see the new shard in cluster state // wait until we are able to see the new shard in cluster state and refresh the local view of the cluster state
ocmh.waitForNewShard(collectionName, subSlice); clusterState = ocmh.waitForNewShard(collectionName, subSlice);
// refresh cluster state
clusterState = zkStateReader.getClusterState();
log.debug("Adding first replica " + subShardName + " as part of slice " + subSlice + " of collection " + collectionName log.debug("Adding first replica " + subShardName + " as part of slice " + subSlice + " of collection " + collectionName
+ " on " + nodeName); + " on " + nodeName);

View File

@ -254,6 +254,13 @@ public class CloudSolrClient extends BaseCloudSolrClient {
this.solrUrls = solrUrls; this.solrUrls = solrUrls;
} }
/**
* Provide an already created {@link ClusterStateProvider} instance
*/
public Builder(ClusterStateProvider stateProvider) {
this.stateProvider = stateProvider;
}
/** /**
* Provide a series of ZK hosts which will be used when configuring {@link CloudSolrClient} instances. * Provide a series of ZK hosts which will be used when configuring {@link CloudSolrClient} instances.
* *