SOLR-6665: ZkController.publishAndWaitForDownStates can return before all local cores are marked as 'down' if multiple replicas with the same core name exist in the cluster

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675030 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2015-04-21 05:49:39 +00:00
parent d33eb8eb44
commit 9eae452cfb
3 changed files with 87 additions and 14 deletions

View File

@ -140,6 +140,10 @@ Bug Fixes
* SOLR-7426: SolrConfig#getConfigOverlay does not clean up it's resources. (Mark Miller) * SOLR-7426: SolrConfig#getConfigOverlay does not clean up it's resources. (Mark Miller)
* SOLR-6665: ZkController.publishAndWaitForDownStates can return before all local cores are
marked as 'down' if multiple replicas with the same core name exist in the cluster.
(shalin)
Optimizations Optimizations
---------------------- ----------------------

View File

@ -107,6 +107,7 @@ import org.slf4j.MDC;
public final class ZkController { public final class ZkController {
private static final Logger log = LoggerFactory.getLogger(ZkController.class); private static final Logger log = LoggerFactory.getLogger(ZkController.class);
static final int WAIT_DOWN_STATES_TIMEOUT_SECONDS = 60;
private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery"); private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
@ -662,7 +663,7 @@ public final class ZkController {
ClusterState clusterState = zkStateReader.getClusterState(); ClusterState clusterState = zkStateReader.getClusterState();
Set<String> collections = clusterState.getCollections(); Set<String> collections = clusterState.getCollections();
List<String> updatedNodes = new ArrayList<>(); Set<String> updatedCoreNodeNames = new HashSet<>();
for (String collectionName : collections) { for (String collectionName : collections) {
DocCollection collection = clusterState.getCollection(collectionName); DocCollection collection = clusterState.getCollection(collectionName);
Collection<Slice> slices = collection.getSlices(); Collection<Slice> slices = collection.getSlices();
@ -683,7 +684,7 @@ public final class ZkController {
replica.getStr(ZkStateReader.SHARD_ID_PROP), replica.getStr(ZkStateReader.SHARD_ID_PROP),
ZkStateReader.COLLECTION_PROP, collectionName, ZkStateReader.COLLECTION_PROP, collectionName,
ZkStateReader.CORE_NODE_NAME_PROP, replica.getName()); ZkStateReader.CORE_NODE_NAME_PROP, replica.getName());
updatedNodes.add(replica.getStr(ZkStateReader.CORE_NAME_PROP)); updatedCoreNodeNames.add(replica.getName());
overseerJobQueue.offer(ZkStateReader.toJSON(m)); overseerJobQueue.offer(ZkStateReader.toJSON(m));
} }
} }
@ -692,7 +693,7 @@ public final class ZkController {
// now wait till the updates are in our state // now wait till the updates are in our state
long now = System.nanoTime(); long now = System.nanoTime();
long timeout = now + TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); long timeout = now + TimeUnit.NANOSECONDS.convert(WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
boolean foundStates = false; boolean foundStates = false;
while (System.nanoTime() < timeout) { while (System.nanoTime() < timeout) {
clusterState = zkStateReader.getClusterState(); clusterState = zkStateReader.getClusterState();
@ -704,14 +705,14 @@ public final class ZkController {
Collection<Replica> replicas = slice.getReplicas(); Collection<Replica> replicas = slice.getReplicas();
for (Replica replica : replicas) { for (Replica replica : replicas) {
if (replica.getState() == Replica.State.DOWN) { if (replica.getState() == Replica.State.DOWN) {
updatedNodes.remove(replica.getStr(ZkStateReader.CORE_NAME_PROP)); updatedCoreNodeNames.remove(replica.getName());
} }
} }
} }
} }
if (updatedNodes.size() == 0) { if (updatedCoreNodeNames.size() == 0) {
foundStates = true; foundStates = true;
Thread.sleep(1000); Thread.sleep(1000);
break; break;

View File

@ -19,7 +19,11 @@ package org.apache.solr.cloud;
import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.DocRouter;
import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkConfigManager; import org.apache.solr.common.cloud.ZkConfigManager;
import org.apache.solr.common.cloud.ZkCoreNodeProps; import org.apache.solr.common.cloud.ZkCoreNodeProps;
@ -41,6 +45,7 @@ import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.TimeUnit;
@Slow @Slow
public class ZkControllerTest extends SolrTestCaseJ4 { public class ZkControllerTest extends SolrTestCaseJ4 {
@ -246,6 +251,14 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
} }
} }
/*
Test that:
1) LIR state to 'down' is not set unless publishing node is a leader
1a) Test that leader can publish when LIR node already exists in zk
1b) Test that leader can publish when LIR node does not exist - TODO
2) LIR state to 'active' or 'recovery' can be set regardless of whether publishing
node is leader or not - TODO
*/
public void testEnsureReplicaInLeaderInitiatedRecovery() throws Exception { public void testEnsureReplicaInLeaderInitiatedRecovery() throws Exception {
String zkDir = createTempDir("testEnsureReplicaInLeaderInitiatedRecovery").toFile().getAbsolutePath(); String zkDir = createTempDir("testEnsureReplicaInLeaderInitiatedRecovery").toFile().getAbsolutePath();
CoreContainer cc = null; CoreContainer cc = null;
@ -296,14 +309,69 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
} }
} }
/* public void testPublishAndWaitForDownStates() throws Exception {
Test that: String zkDir = createTempDir("testPublishAndWaitForDownStates").toFile().getAbsolutePath();
1) LIR state to 'down' is not set unless publishing node is a leader CoreContainer cc = null;
1a) Test that leader can publish when LIR node already exists in zk
1b) Test that leader can publish when LIR node does not exist ZkTestServer server = new ZkTestServer(zkDir);
2) LIR state to 'active' or 'recovery' can be set regardless of whether publishing try {
node is leader or not server.run();
*/
AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
cc = getCoreContainer();
ZkController zkController = null;
try {
CloudConfig cloudConfig = new CloudConfig.CloudConfigBuilder("127.0.0.1", 8983, "solr").build();
zkController = new ZkController(cc, server.getZkAddress(), TIMEOUT, cloudConfig, new CurrentCoreDescriptorProvider() {
@Override
public List<CoreDescriptor> getCurrentDescriptors() {
// do nothing
return null;
}
});
HashMap<String, DocCollection> collectionStates = new HashMap<>();
HashMap<String, Replica> replicas = new HashMap<>();
// add two replicas with the same core name but one of them should be on a different node
// than this ZkController instance
for (int i=1; i<=2; i++) {
Replica r = new Replica("core_node" + i,
map(ZkStateReader.STATE_PROP, i == 1 ? "active" : "down",
ZkStateReader.NODE_NAME_PROP, i == 1 ? "127.0.0.1:8983_solr" : "non_existent_host",
ZkStateReader.CORE_NAME_PROP, "collection1"));
replicas.put("core_node" + i, r);
}
HashMap<String, Object> sliceProps = new HashMap<>();
sliceProps.put("state", Slice.State.ACTIVE.toString());
Slice slice = new Slice("shard1", replicas, sliceProps);
DocCollection c = new DocCollection("testPublishAndWaitForDownStates", map("shard1", slice), Collections.emptyMap(), DocRouter.DEFAULT);
ClusterState state = new ClusterState(0, Collections.emptySet(), map("testPublishAndWaitForDownStates", c));
byte[] bytes = ZkStateReader.toJSON(state);
zkController.getZkClient().makePath(ZkStateReader.getCollectionPath("testPublishAndWaitForDownStates"), bytes, CreateMode.PERSISTENT, true);
zkController.getZkStateReader().updateClusterState(true);
assertTrue(zkController.getZkStateReader().getClusterState().hasCollection("testPublishAndWaitForDownStates"));
assertNotNull(zkController.getZkStateReader().getClusterState().getCollection("testPublishAndWaitForDownStates"));
long now = System.nanoTime();
long timeout = now + TimeUnit.NANOSECONDS.convert(ZkController.WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
zkController.publishAndWaitForDownStates();
assertTrue("The ZkController.publishAndWaitForDownStates should have timed out but it didn't", System.nanoTime() >= timeout);
} finally {
if (zkController != null)
zkController.close();
}
} finally {
if (cc != null) {
cc.shutdown();
}
server.shutdown();
}
}
private CoreContainer getCoreContainer() { private CoreContainer getCoreContainer() {
return new MockCoreContainer(); return new MockCoreContainer();