mirror of https://github.com/apache/lucene.git
Revert "SOLR-9140: Replace some zk state polling with CollectionStateWatchers"
Alan's comments (via Uwe) in SOLR-9140 jira comments suggest that he thought he had already
reverted this on both branches, but that is not the case. Reverting on his behalf due to the
likelyhood that this is causing SOLR-9189.
Alan's comments regarding the master equivilent revert...
"There's still some places where notifications can be missed, so I'm reverting
this until those are fixed."
This reverts commit 9f299bb6ad
.
This commit is contained in:
parent
87d46225cd
commit
4e3884bec7
|
@ -252,9 +252,6 @@ Optimizations
|
||||||
|
|
||||||
* SOLR-8744: Overseer operations performed with fine grained mutual exclusion (noble, Scott Blum)
|
* SOLR-8744: Overseer operations performed with fine grained mutual exclusion (noble, Scott Blum)
|
||||||
|
|
||||||
* SOLR-9140: Replace zk polling in ZkController with CollectionStateWatchers
|
|
||||||
(Alan Woodward)
|
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
----------------------
|
----------------------
|
||||||
* SOLR-7516: Improve javadocs for JavaBinCodec, ObjectResolver and enforce the single-usage policy.
|
* SOLR-7516: Improve javadocs for JavaBinCodec, ObjectResolver and enforce the single-usage policy.
|
||||||
|
|
|
@ -27,7 +27,6 @@ import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.concurrent.Callable;
|
import java.util.concurrent.Callable;
|
||||||
import java.util.concurrent.CountDownLatch;
|
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
@ -684,22 +683,34 @@ public final class ZkController {
|
||||||
|
|
||||||
publishNodeAsDown(getNodeName());
|
publishNodeAsDown(getNodeName());
|
||||||
|
|
||||||
Set<String> collections = cc.getLocalCollections();
|
// now wait till the updates are in our state
|
||||||
CountDownLatch latch = new CountDownLatch(collections.size());
|
long now = System.nanoTime();
|
||||||
|
long timeout = now + TimeUnit.NANOSECONDS.convert(WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
|
||||||
|
boolean foundStates = true;
|
||||||
|
|
||||||
for (String collection : collections) {
|
while (System.nanoTime() < timeout) {
|
||||||
zkStateReader.registerCollectionStateWatcher(collection, (nodes, state) -> {
|
ClusterState clusterState = zkStateReader.getClusterState();
|
||||||
for (Replica replica : state.getReplicasOnNode(getNodeName())) {
|
Map<String, DocCollection> collections = clusterState.getCollectionsMap();
|
||||||
if (replica.getState() != Replica.State.DOWN)
|
for (Map.Entry<String, DocCollection> entry : collections.entrySet()) {
|
||||||
return false;
|
DocCollection collection = entry.getValue();
|
||||||
|
Collection<Slice> slices = collection.getSlices();
|
||||||
|
for (Slice slice : slices) {
|
||||||
|
Collection<Replica> replicas = slice.getReplicas();
|
||||||
|
for (Replica replica : replicas) {
|
||||||
|
if (getNodeName().equals(replica.getNodeName()) && replica.getState() != Replica.State.DOWN) {
|
||||||
|
foundStates = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
latch.countDown();
|
|
||||||
return true;
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (latch.await(WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS) == false) {
|
if (foundStates) {
|
||||||
// TODO should we abort here?
|
Thread.sleep(1000);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Thread.sleep(1000);
|
||||||
|
}
|
||||||
|
if (!foundStates) {
|
||||||
log.warn("Timed out waiting to see all nodes published as DOWN in our cluster state.");
|
log.warn("Timed out waiting to see all nodes published as DOWN in our cluster state.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1355,7 +1366,7 @@ public final class ZkController {
|
||||||
return zkStateReader;
|
return zkStateReader;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void doGetShardIdAndNodeNameProcess(CoreDescriptor cd) throws InterruptedException {
|
private void doGetShardIdAndNodeNameProcess(CoreDescriptor cd) {
|
||||||
final String coreNodeName = cd.getCloudDescriptor().getCoreNodeName();
|
final String coreNodeName = cd.getCloudDescriptor().getCoreNodeName();
|
||||||
|
|
||||||
if (coreNodeName != null) {
|
if (coreNodeName != null) {
|
||||||
|
@ -1367,45 +1378,58 @@ public final class ZkController {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void waitForCoreNodeName(CoreDescriptor descriptor) throws InterruptedException {
|
private void waitForCoreNodeName(CoreDescriptor descriptor) {
|
||||||
log.info("Waiting for coreNodeName for core {} in collection {} to be assigned",
|
int retryCount = 320;
|
||||||
descriptor.getName(), descriptor.getCollectionName());
|
log.info("look for our core node name");
|
||||||
final String thisNode = getNodeName();
|
while (retryCount-- > 0) {
|
||||||
|
Map<String, Slice> slicesMap = zkStateReader.getClusterState()
|
||||||
|
.getSlicesMap(descriptor.getCloudDescriptor().getCollectionName());
|
||||||
|
if (slicesMap != null) {
|
||||||
|
|
||||||
|
for (Slice slice : slicesMap.values()) {
|
||||||
|
for (Replica replica : slice.getReplicas()) {
|
||||||
|
// TODO: for really large clusters, we could 'index' on this
|
||||||
|
|
||||||
|
String nodeName = replica.getStr(ZkStateReader.NODE_NAME_PROP);
|
||||||
|
String core = replica.getStr(ZkStateReader.CORE_NAME_PROP);
|
||||||
|
|
||||||
|
String msgNodeName = getNodeName();
|
||||||
|
String msgCore = descriptor.getName();
|
||||||
|
|
||||||
|
if (msgNodeName.equals(nodeName) && core.equals(msgCore)) {
|
||||||
|
descriptor.getCloudDescriptor()
|
||||||
|
.setCoreNodeName(replica.getName());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
zkStateReader.waitForState(descriptor.getCollectionName(), 320, TimeUnit.SECONDS, (n, c) -> {
|
Thread.sleep(1000);
|
||||||
if (c == null)
|
} catch (InterruptedException e) {
|
||||||
return false;
|
Thread.currentThread().interrupt();
|
||||||
for (Replica replica : c.getReplicasOnNode(thisNode)) {
|
|
||||||
if (descriptor.getName().equals(replica.getCoreName())) {
|
|
||||||
descriptor.getCloudDescriptor().setCoreNodeName(replica.getName());
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
|
||||||
});
|
|
||||||
} catch (TimeoutException e) {
|
|
||||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Timed out getting coreNodeName for " + descriptor.getName());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void waitForShardId(CoreDescriptor cd) throws InterruptedException {
|
private void waitForShardId(CoreDescriptor cd) {
|
||||||
log.info("waiting to find shard id in clusterstate for " + cd.getName());
|
log.info("waiting to find shard id in clusterstate for " + cd.getName());
|
||||||
final String thisNode = getNodeName();
|
int retryCount = 320;
|
||||||
try {
|
while (retryCount-- > 0) {
|
||||||
zkStateReader.waitForState(cd.getCollectionName(), 320, TimeUnit.SECONDS, (n, c) -> {
|
final String shardId = zkStateReader.getClusterState().getShardId(cd.getCollectionName(), getNodeName(), cd.getName());
|
||||||
if (c == null)
|
|
||||||
return false;
|
|
||||||
String shardId = c.getShardId(thisNode, cd.getName());
|
|
||||||
if (shardId != null) {
|
if (shardId != null) {
|
||||||
cd.getCloudDescriptor().setShardId(shardId);
|
cd.getCloudDescriptor().setShardId(shardId);
|
||||||
return true;
|
return;
|
||||||
}
|
}
|
||||||
return false;
|
try {
|
||||||
});
|
Thread.sleep(1000);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
}
|
}
|
||||||
catch (TimeoutException e) {
|
|
||||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Timed out getting shard id for core: " + cd.getName());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR,
|
||||||
|
"Could not get shard id for core: " + cd.getName());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1419,7 +1443,7 @@ public final class ZkController {
|
||||||
return coreNodeName;
|
return coreNodeName;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void preRegister(CoreDescriptor cd) throws InterruptedException {
|
public void preRegister(CoreDescriptor cd) {
|
||||||
|
|
||||||
String coreNodeName = getCoreNodeName(cd);
|
String coreNodeName = getCoreNodeName(cd);
|
||||||
|
|
||||||
|
|
|
@ -26,12 +26,11 @@ import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
import java.util.Set;
|
import java.util.concurrent.Callable;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.concurrent.ExecutorService;
|
||||||
import java.util.concurrent.Future;
|
import java.util.concurrent.Future;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import com.google.common.collect.ImmutableMap;
|
import com.google.common.collect.ImmutableMap;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
|
@ -42,7 +41,6 @@ import org.apache.solr.cloud.Overseer;
|
||||||
import org.apache.solr.cloud.ZkController;
|
import org.apache.solr.cloud.ZkController;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.common.SolrException.ErrorCode;
|
import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
import org.apache.solr.common.cloud.SolrZkClient;
|
|
||||||
import org.apache.solr.common.cloud.ZkStateReader;
|
import org.apache.solr.common.cloud.ZkStateReader;
|
||||||
import org.apache.solr.common.util.ExecutorUtil;
|
import org.apache.solr.common.util.ExecutorUtil;
|
||||||
import org.apache.solr.common.util.IOUtils;
|
import org.apache.solr.common.util.IOUtils;
|
||||||
|
@ -822,7 +820,6 @@ public class CoreContainer {
|
||||||
|
|
||||||
return core;
|
return core;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
SolrZkClient.checkInterrupted(e);
|
|
||||||
coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
|
coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
|
||||||
log.error("Error creating core [{}]: {}", dcore.getName(), e.getMessage(), e);
|
log.error("Error creating core [{}]: {}", dcore.getName(), e.getMessage(), e);
|
||||||
final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
|
final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
|
||||||
|
@ -872,17 +869,6 @@ public class CoreContainer {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* @return a Set containing the names of all collections with a core hosted in this container
|
|
||||||
*/
|
|
||||||
public Set<String> getLocalCollections() {
|
|
||||||
Set<String> collections = getCoreDescriptors().stream()
|
|
||||||
.filter(cd -> cd.getCollectionName() != null)
|
|
||||||
.map(CoreDescriptor::getCollectionName)
|
|
||||||
.collect(Collectors.toSet());
|
|
||||||
return collections;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns an immutable Map of Exceptions that occured when initializing
|
* Returns an immutable Map of Exceptions that occured when initializing
|
||||||
* SolrCores (either at startup, or do to runtime requests to create cores)
|
* SolrCores (either at startup, or do to runtime requests to create cores)
|
||||||
|
|
|
@ -1104,7 +1104,7 @@ public abstract class CollectionAdminRequest<T extends CollectionAdminResponse>
|
||||||
deleteAsyncId(requestId).process(client);
|
deleteAsyncId(requestId).process(client);
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
TimeUnit.MILLISECONDS.sleep(100);
|
TimeUnit.SECONDS.sleep(1);
|
||||||
}
|
}
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,7 +25,6 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.common.SolrException.ErrorCode;
|
import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
|
@ -260,26 +259,4 @@ public class DocCollection extends ZkNodeProps implements Iterable<Slice> {
|
||||||
}
|
}
|
||||||
return replicas;
|
return replicas;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Get all the replicas on a particular node
|
|
||||||
*/
|
|
||||||
public List<Replica> getReplicasOnNode(String nodeName) {
|
|
||||||
return getReplicas().stream()
|
|
||||||
.filter(replica -> replica.getNodeName().equals(nodeName))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the shardId of a core on a specific node
|
|
||||||
*/
|
|
||||||
public String getShardId(String nodeName, String coreName) {
|
|
||||||
for (Slice slice : this) {
|
|
||||||
for (Replica replica : slice) {
|
|
||||||
if (Objects.equals(replica.getNodeName(), nodeName) && Objects.equals(replica.getCoreName(), coreName))
|
|
||||||
return slice.getName();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,9 +51,8 @@ public class TestCollectionStateWatchers extends SolrCloudTestCase {
|
||||||
@BeforeClass
|
@BeforeClass
|
||||||
public static void startCluster() throws Exception {
|
public static void startCluster() throws Exception {
|
||||||
configureCluster(CLUSTER_SIZE)
|
configureCluster(CLUSTER_SIZE)
|
||||||
.addConfig("config", getFile("solrj/solr/configsets/streaming/conf").toPath())
|
.addConfig("config", getFile("solrj/solr/collection1/conf").toPath())
|
||||||
.configure();
|
.configure();
|
||||||
cluster.getSolrClient().connect();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@AfterClass
|
@AfterClass
|
||||||
|
@ -260,7 +259,7 @@ public class TestCollectionStateWatchers extends SolrCloudTestCase {
|
||||||
|
|
||||||
final CloudSolrClient client = cluster.getSolrClient();
|
final CloudSolrClient client = cluster.getSolrClient();
|
||||||
|
|
||||||
Future<Boolean> future = waitInBackground("stateformat1", 30, TimeUnit.SECONDS,
|
Future<Boolean> future = waitInBackground("stateformat1", 10, TimeUnit.SECONDS,
|
||||||
(n, c) -> DocCollection.isFullyActive(n, c, 1, 1));
|
(n, c) -> DocCollection.isFullyActive(n, c, 1, 1));
|
||||||
|
|
||||||
CollectionAdminRequest.createCollection("stateformat1", "config", 1, 1).setStateFormat(1)
|
CollectionAdminRequest.createCollection("stateformat1", "config", 1, 1).setStateFormat(1)
|
||||||
|
|
Loading…
Reference in New Issue