mirror of https://github.com/apache/lucene.git
SOLR-6665: ZkController.publishAndWaitForDownStates can return before all local cores are marked as 'down' if multiple replicas with the same core name exist in the cluster
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675030 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d33eb8eb44
commit
9eae452cfb
|
@ -140,6 +140,10 @@ Bug Fixes
|
||||||
|
|
||||||
* SOLR-7426: SolrConfig#getConfigOverlay does not clean up it's resources. (Mark Miller)
|
* SOLR-7426: SolrConfig#getConfigOverlay does not clean up it's resources. (Mark Miller)
|
||||||
|
|
||||||
|
* SOLR-6665: ZkController.publishAndWaitForDownStates can return before all local cores are
|
||||||
|
marked as 'down' if multiple replicas with the same core name exist in the cluster.
|
||||||
|
(shalin)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -107,6 +107,7 @@ import org.slf4j.MDC;
|
||||||
public final class ZkController {
|
public final class ZkController {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(ZkController.class);
|
private static final Logger log = LoggerFactory.getLogger(ZkController.class);
|
||||||
|
static final int WAIT_DOWN_STATES_TIMEOUT_SECONDS = 60;
|
||||||
|
|
||||||
private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
|
private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
|
||||||
|
|
||||||
|
@ -662,7 +663,7 @@ public final class ZkController {
|
||||||
|
|
||||||
ClusterState clusterState = zkStateReader.getClusterState();
|
ClusterState clusterState = zkStateReader.getClusterState();
|
||||||
Set<String> collections = clusterState.getCollections();
|
Set<String> collections = clusterState.getCollections();
|
||||||
List<String> updatedNodes = new ArrayList<>();
|
Set<String> updatedCoreNodeNames = new HashSet<>();
|
||||||
for (String collectionName : collections) {
|
for (String collectionName : collections) {
|
||||||
DocCollection collection = clusterState.getCollection(collectionName);
|
DocCollection collection = clusterState.getCollection(collectionName);
|
||||||
Collection<Slice> slices = collection.getSlices();
|
Collection<Slice> slices = collection.getSlices();
|
||||||
|
@ -683,7 +684,7 @@ public final class ZkController {
|
||||||
replica.getStr(ZkStateReader.SHARD_ID_PROP),
|
replica.getStr(ZkStateReader.SHARD_ID_PROP),
|
||||||
ZkStateReader.COLLECTION_PROP, collectionName,
|
ZkStateReader.COLLECTION_PROP, collectionName,
|
||||||
ZkStateReader.CORE_NODE_NAME_PROP, replica.getName());
|
ZkStateReader.CORE_NODE_NAME_PROP, replica.getName());
|
||||||
updatedNodes.add(replica.getStr(ZkStateReader.CORE_NAME_PROP));
|
updatedCoreNodeNames.add(replica.getName());
|
||||||
overseerJobQueue.offer(ZkStateReader.toJSON(m));
|
overseerJobQueue.offer(ZkStateReader.toJSON(m));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -692,7 +693,7 @@ public final class ZkController {
|
||||||
|
|
||||||
// now wait till the updates are in our state
|
// now wait till the updates are in our state
|
||||||
long now = System.nanoTime();
|
long now = System.nanoTime();
|
||||||
long timeout = now + TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS);
|
long timeout = now + TimeUnit.NANOSECONDS.convert(WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
|
||||||
boolean foundStates = false;
|
boolean foundStates = false;
|
||||||
while (System.nanoTime() < timeout) {
|
while (System.nanoTime() < timeout) {
|
||||||
clusterState = zkStateReader.getClusterState();
|
clusterState = zkStateReader.getClusterState();
|
||||||
|
@ -704,14 +705,14 @@ public final class ZkController {
|
||||||
Collection<Replica> replicas = slice.getReplicas();
|
Collection<Replica> replicas = slice.getReplicas();
|
||||||
for (Replica replica : replicas) {
|
for (Replica replica : replicas) {
|
||||||
if (replica.getState() == Replica.State.DOWN) {
|
if (replica.getState() == Replica.State.DOWN) {
|
||||||
updatedNodes.remove(replica.getStr(ZkStateReader.CORE_NAME_PROP));
|
updatedCoreNodeNames.remove(replica.getName());
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (updatedNodes.size() == 0) {
|
if (updatedCoreNodeNames.size() == 0) {
|
||||||
foundStates = true;
|
foundStates = true;
|
||||||
Thread.sleep(1000);
|
Thread.sleep(1000);
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -19,7 +19,11 @@ package org.apache.solr.cloud;
|
||||||
|
|
||||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
import org.apache.lucene.util.LuceneTestCase.Slow;
|
||||||
import org.apache.solr.SolrTestCaseJ4;
|
import org.apache.solr.SolrTestCaseJ4;
|
||||||
|
import org.apache.solr.common.cloud.ClusterState;
|
||||||
|
import org.apache.solr.common.cloud.DocCollection;
|
||||||
|
import org.apache.solr.common.cloud.DocRouter;
|
||||||
import org.apache.solr.common.cloud.Replica;
|
import org.apache.solr.common.cloud.Replica;
|
||||||
|
import org.apache.solr.common.cloud.Slice;
|
||||||
import org.apache.solr.common.cloud.SolrZkClient;
|
import org.apache.solr.common.cloud.SolrZkClient;
|
||||||
import org.apache.solr.common.cloud.ZkConfigManager;
|
import org.apache.solr.common.cloud.ZkConfigManager;
|
||||||
import org.apache.solr.common.cloud.ZkCoreNodeProps;
|
import org.apache.solr.common.cloud.ZkCoreNodeProps;
|
||||||
|
@ -41,6 +45,7 @@ import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
@Slow
|
@Slow
|
||||||
public class ZkControllerTest extends SolrTestCaseJ4 {
|
public class ZkControllerTest extends SolrTestCaseJ4 {
|
||||||
|
@ -246,6 +251,14 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Test that:
|
||||||
|
1) LIR state to 'down' is not set unless publishing node is a leader
|
||||||
|
1a) Test that leader can publish when LIR node already exists in zk
|
||||||
|
1b) Test that leader can publish when LIR node does not exist - TODO
|
||||||
|
2) LIR state to 'active' or 'recovery' can be set regardless of whether publishing
|
||||||
|
node is leader or not - TODO
|
||||||
|
*/
|
||||||
public void testEnsureReplicaInLeaderInitiatedRecovery() throws Exception {
|
public void testEnsureReplicaInLeaderInitiatedRecovery() throws Exception {
|
||||||
String zkDir = createTempDir("testEnsureReplicaInLeaderInitiatedRecovery").toFile().getAbsolutePath();
|
String zkDir = createTempDir("testEnsureReplicaInLeaderInitiatedRecovery").toFile().getAbsolutePath();
|
||||||
CoreContainer cc = null;
|
CoreContainer cc = null;
|
||||||
|
@ -296,14 +309,69 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
public void testPublishAndWaitForDownStates() throws Exception {
|
||||||
Test that:
|
String zkDir = createTempDir("testPublishAndWaitForDownStates").toFile().getAbsolutePath();
|
||||||
1) LIR state to 'down' is not set unless publishing node is a leader
|
CoreContainer cc = null;
|
||||||
1a) Test that leader can publish when LIR node already exists in zk
|
|
||||||
1b) Test that leader can publish when LIR node does not exist
|
ZkTestServer server = new ZkTestServer(zkDir);
|
||||||
2) LIR state to 'active' or 'recovery' can be set regardless of whether publishing
|
try {
|
||||||
node is leader or not
|
server.run();
|
||||||
*/
|
|
||||||
|
AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
|
||||||
|
AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
|
||||||
|
|
||||||
|
cc = getCoreContainer();
|
||||||
|
ZkController zkController = null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
CloudConfig cloudConfig = new CloudConfig.CloudConfigBuilder("127.0.0.1", 8983, "solr").build();
|
||||||
|
zkController = new ZkController(cc, server.getZkAddress(), TIMEOUT, cloudConfig, new CurrentCoreDescriptorProvider() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<CoreDescriptor> getCurrentDescriptors() {
|
||||||
|
// do nothing
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
HashMap<String, DocCollection> collectionStates = new HashMap<>();
|
||||||
|
HashMap<String, Replica> replicas = new HashMap<>();
|
||||||
|
// add two replicas with the same core name but one of them should be on a different node
|
||||||
|
// than this ZkController instance
|
||||||
|
for (int i=1; i<=2; i++) {
|
||||||
|
Replica r = new Replica("core_node" + i,
|
||||||
|
map(ZkStateReader.STATE_PROP, i == 1 ? "active" : "down",
|
||||||
|
ZkStateReader.NODE_NAME_PROP, i == 1 ? "127.0.0.1:8983_solr" : "non_existent_host",
|
||||||
|
ZkStateReader.CORE_NAME_PROP, "collection1"));
|
||||||
|
replicas.put("core_node" + i, r);
|
||||||
|
}
|
||||||
|
HashMap<String, Object> sliceProps = new HashMap<>();
|
||||||
|
sliceProps.put("state", Slice.State.ACTIVE.toString());
|
||||||
|
Slice slice = new Slice("shard1", replicas, sliceProps);
|
||||||
|
DocCollection c = new DocCollection("testPublishAndWaitForDownStates", map("shard1", slice), Collections.emptyMap(), DocRouter.DEFAULT);
|
||||||
|
ClusterState state = new ClusterState(0, Collections.emptySet(), map("testPublishAndWaitForDownStates", c));
|
||||||
|
byte[] bytes = ZkStateReader.toJSON(state);
|
||||||
|
zkController.getZkClient().makePath(ZkStateReader.getCollectionPath("testPublishAndWaitForDownStates"), bytes, CreateMode.PERSISTENT, true);
|
||||||
|
|
||||||
|
zkController.getZkStateReader().updateClusterState(true);
|
||||||
|
assertTrue(zkController.getZkStateReader().getClusterState().hasCollection("testPublishAndWaitForDownStates"));
|
||||||
|
assertNotNull(zkController.getZkStateReader().getClusterState().getCollection("testPublishAndWaitForDownStates"));
|
||||||
|
|
||||||
|
long now = System.nanoTime();
|
||||||
|
long timeout = now + TimeUnit.NANOSECONDS.convert(ZkController.WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
|
||||||
|
zkController.publishAndWaitForDownStates();
|
||||||
|
assertTrue("The ZkController.publishAndWaitForDownStates should have timed out but it didn't", System.nanoTime() >= timeout);
|
||||||
|
} finally {
|
||||||
|
if (zkController != null)
|
||||||
|
zkController.close();
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
if (cc != null) {
|
||||||
|
cc.shutdown();
|
||||||
|
}
|
||||||
|
server.shutdown();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private CoreContainer getCoreContainer() {
|
private CoreContainer getCoreContainer() {
|
||||||
return new MockCoreContainer();
|
return new MockCoreContainer();
|
||||||
|
|
Loading…
Reference in New Issue