mirror of https://github.com/apache/lucene.git
SOLR-6665: ZkController.publishAndWaitForDownStates can return before all local cores are marked as 'down' if multiple replicas with the same core name exist in the cluster
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675030 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d33eb8eb44
commit
9eae452cfb
|
@ -140,6 +140,10 @@ Bug Fixes
|
|||
|
||||
* SOLR-7426: SolrConfig#getConfigOverlay does not clean up it's resources. (Mark Miller)
|
||||
|
||||
* SOLR-6665: ZkController.publishAndWaitForDownStates can return before all local cores are
|
||||
marked as 'down' if multiple replicas with the same core name exist in the cluster.
|
||||
(shalin)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -107,6 +107,7 @@ import org.slf4j.MDC;
|
|||
public final class ZkController {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ZkController.class);
|
||||
static final int WAIT_DOWN_STATES_TIMEOUT_SECONDS = 60;
|
||||
|
||||
private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
|
||||
|
||||
|
@ -662,7 +663,7 @@ public final class ZkController {
|
|||
|
||||
ClusterState clusterState = zkStateReader.getClusterState();
|
||||
Set<String> collections = clusterState.getCollections();
|
||||
List<String> updatedNodes = new ArrayList<>();
|
||||
Set<String> updatedCoreNodeNames = new HashSet<>();
|
||||
for (String collectionName : collections) {
|
||||
DocCollection collection = clusterState.getCollection(collectionName);
|
||||
Collection<Slice> slices = collection.getSlices();
|
||||
|
@ -683,7 +684,7 @@ public final class ZkController {
|
|||
replica.getStr(ZkStateReader.SHARD_ID_PROP),
|
||||
ZkStateReader.COLLECTION_PROP, collectionName,
|
||||
ZkStateReader.CORE_NODE_NAME_PROP, replica.getName());
|
||||
updatedNodes.add(replica.getStr(ZkStateReader.CORE_NAME_PROP));
|
||||
updatedCoreNodeNames.add(replica.getName());
|
||||
overseerJobQueue.offer(ZkStateReader.toJSON(m));
|
||||
}
|
||||
}
|
||||
|
@ -692,7 +693,7 @@ public final class ZkController {
|
|||
|
||||
// now wait till the updates are in our state
|
||||
long now = System.nanoTime();
|
||||
long timeout = now + TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS);
|
||||
long timeout = now + TimeUnit.NANOSECONDS.convert(WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
|
||||
boolean foundStates = false;
|
||||
while (System.nanoTime() < timeout) {
|
||||
clusterState = zkStateReader.getClusterState();
|
||||
|
@ -704,14 +705,14 @@ public final class ZkController {
|
|||
Collection<Replica> replicas = slice.getReplicas();
|
||||
for (Replica replica : replicas) {
|
||||
if (replica.getState() == Replica.State.DOWN) {
|
||||
updatedNodes.remove(replica.getStr(ZkStateReader.CORE_NAME_PROP));
|
||||
updatedCoreNodeNames.remove(replica.getName());
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (updatedNodes.size() == 0) {
|
||||
if (updatedCoreNodeNames.size() == 0) {
|
||||
foundStates = true;
|
||||
Thread.sleep(1000);
|
||||
break;
|
||||
|
|
|
@ -19,7 +19,11 @@ package org.apache.solr.cloud;
|
|||
|
||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.cloud.ClusterState;
|
||||
import org.apache.solr.common.cloud.DocCollection;
|
||||
import org.apache.solr.common.cloud.DocRouter;
|
||||
import org.apache.solr.common.cloud.Replica;
|
||||
import org.apache.solr.common.cloud.Slice;
|
||||
import org.apache.solr.common.cloud.SolrZkClient;
|
||||
import org.apache.solr.common.cloud.ZkConfigManager;
|
||||
import org.apache.solr.common.cloud.ZkCoreNodeProps;
|
||||
|
@ -41,6 +45,7 @@ import java.util.Collections;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@Slow
|
||||
public class ZkControllerTest extends SolrTestCaseJ4 {
|
||||
|
@ -173,7 +178,7 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
|
|||
props.put("configName", actualConfigName);
|
||||
ZkNodeProps zkProps = new ZkNodeProps(props);
|
||||
zkClient.makePath(ZkStateReader.COLLECTIONS_ZKNODE + "/"
|
||||
+ COLLECTION_NAME, ZkStateReader.toJSON(zkProps),
|
||||
+ COLLECTION_NAME, ZkStateReader.toJSON(zkProps),
|
||||
CreateMode.PERSISTENT, true);
|
||||
|
||||
if (DEBUG) {
|
||||
|
@ -246,6 +251,14 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Test that:
|
||||
1) LIR state to 'down' is not set unless publishing node is a leader
|
||||
1a) Test that leader can publish when LIR node already exists in zk
|
||||
1b) Test that leader can publish when LIR node does not exist - TODO
|
||||
2) LIR state to 'active' or 'recovery' can be set regardless of whether publishing
|
||||
node is leader or not - TODO
|
||||
*/
|
||||
public void testEnsureReplicaInLeaderInitiatedRecovery() throws Exception {
|
||||
String zkDir = createTempDir("testEnsureReplicaInLeaderInitiatedRecovery").toFile().getAbsolutePath();
|
||||
CoreContainer cc = null;
|
||||
|
@ -296,14 +309,69 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Test that:
|
||||
1) LIR state to 'down' is not set unless publishing node is a leader
|
||||
1a) Test that leader can publish when LIR node already exists in zk
|
||||
1b) Test that leader can publish when LIR node does not exist
|
||||
2) LIR state to 'active' or 'recovery' can be set regardless of whether publishing
|
||||
node is leader or not
|
||||
*/
|
||||
public void testPublishAndWaitForDownStates() throws Exception {
|
||||
String zkDir = createTempDir("testPublishAndWaitForDownStates").toFile().getAbsolutePath();
|
||||
CoreContainer cc = null;
|
||||
|
||||
ZkTestServer server = new ZkTestServer(zkDir);
|
||||
try {
|
||||
server.run();
|
||||
|
||||
AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
|
||||
AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
|
||||
|
||||
cc = getCoreContainer();
|
||||
ZkController zkController = null;
|
||||
|
||||
try {
|
||||
CloudConfig cloudConfig = new CloudConfig.CloudConfigBuilder("127.0.0.1", 8983, "solr").build();
|
||||
zkController = new ZkController(cc, server.getZkAddress(), TIMEOUT, cloudConfig, new CurrentCoreDescriptorProvider() {
|
||||
|
||||
@Override
|
||||
public List<CoreDescriptor> getCurrentDescriptors() {
|
||||
// do nothing
|
||||
return null;
|
||||
}
|
||||
});
|
||||
|
||||
HashMap<String, DocCollection> collectionStates = new HashMap<>();
|
||||
HashMap<String, Replica> replicas = new HashMap<>();
|
||||
// add two replicas with the same core name but one of them should be on a different node
|
||||
// than this ZkController instance
|
||||
for (int i=1; i<=2; i++) {
|
||||
Replica r = new Replica("core_node" + i,
|
||||
map(ZkStateReader.STATE_PROP, i == 1 ? "active" : "down",
|
||||
ZkStateReader.NODE_NAME_PROP, i == 1 ? "127.0.0.1:8983_solr" : "non_existent_host",
|
||||
ZkStateReader.CORE_NAME_PROP, "collection1"));
|
||||
replicas.put("core_node" + i, r);
|
||||
}
|
||||
HashMap<String, Object> sliceProps = new HashMap<>();
|
||||
sliceProps.put("state", Slice.State.ACTIVE.toString());
|
||||
Slice slice = new Slice("shard1", replicas, sliceProps);
|
||||
DocCollection c = new DocCollection("testPublishAndWaitForDownStates", map("shard1", slice), Collections.emptyMap(), DocRouter.DEFAULT);
|
||||
ClusterState state = new ClusterState(0, Collections.emptySet(), map("testPublishAndWaitForDownStates", c));
|
||||
byte[] bytes = ZkStateReader.toJSON(state);
|
||||
zkController.getZkClient().makePath(ZkStateReader.getCollectionPath("testPublishAndWaitForDownStates"), bytes, CreateMode.PERSISTENT, true);
|
||||
|
||||
zkController.getZkStateReader().updateClusterState(true);
|
||||
assertTrue(zkController.getZkStateReader().getClusterState().hasCollection("testPublishAndWaitForDownStates"));
|
||||
assertNotNull(zkController.getZkStateReader().getClusterState().getCollection("testPublishAndWaitForDownStates"));
|
||||
|
||||
long now = System.nanoTime();
|
||||
long timeout = now + TimeUnit.NANOSECONDS.convert(ZkController.WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
|
||||
zkController.publishAndWaitForDownStates();
|
||||
assertTrue("The ZkController.publishAndWaitForDownStates should have timed out but it didn't", System.nanoTime() >= timeout);
|
||||
} finally {
|
||||
if (zkController != null)
|
||||
zkController.close();
|
||||
}
|
||||
} finally {
|
||||
if (cc != null) {
|
||||
cc.shutdown();
|
||||
}
|
||||
server.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
private CoreContainer getCoreContainer() {
|
||||
return new MockCoreContainer();
|
||||
|
|
Loading…
Reference in New Issue