From 990573689887cd0e6df0313984b3051aa74ba76f Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Wed, 4 Oct 2017 15:25:17 +0700 Subject: [PATCH] SOLR-11424: When legacyCloud=false, cores should not publish itself as DOWN on startup --- solr/CHANGES.txt | 2 + .../org/apache/solr/cloud/ZkController.java | 32 +++++++++--- .../org/apache/solr/core/CoreContainer.java | 2 +- ...a.java => TestSkipOverseerOperations.java} | 49 +++++++++++++++++-- 4 files changed, 72 insertions(+), 13 deletions(-) rename solr/core/src/test/org/apache/solr/cloud/{TestShardsWithSingleReplica.java => TestSkipOverseerOperations.java} (62%) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index c14c9b6e358..0d61cc24c6c 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -174,6 +174,8 @@ Optimizations * SOLR-10285: Skip LEADER messages when there are leader only shards (Cao Manh Dat, Joshua Humphries) +* SOLR-11424: When legacyCloud=false, cores should not publish itself as DOWN on startup. (Cao Manh Dat) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index dcce2c82422..068a1308826 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -1471,7 +1471,7 @@ public class ZkController { return coreNodeName; } - public void preRegister(CoreDescriptor cd) { + public void preRegister(CoreDescriptor cd, boolean publishState) { String coreNodeName = getCoreNodeName(cd); @@ -1487,7 +1487,10 @@ public class ZkController { cloudDesc.setCoreNodeName(coreNodeName); } - publish(cd, Replica.State.DOWN, false, true); + // publishState == false on startup + if (publishState || isPublishAsDownOnStartup(cloudDesc)) { + publish(cd, Replica.State.DOWN, false, true); + } String collectionName = cd.getCloudDescriptor().getCollectionName(); DocCollection collection = 
zkStateReader.getClusterState().getCollectionOrNull(collectionName); log.debug(collection == null ? @@ -1504,15 +1507,28 @@ public class ZkController { throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e); } - if (cd.getCloudDescriptor().getShardId() == null && needsToBeAssignedShardId(cd, zkStateReader.getClusterState(), coreNodeName)) { - doGetShardIdAndNodeNameProcess(cd); - } else { - // still wait till we see us in local state - doGetShardIdAndNodeNameProcess(cd); - } + doGetShardIdAndNodeNameProcess(cd); } + /** + * On startup, the node has already published all of its replicas as DOWN, + * so when legacyCloud=false (the replica must already be present in ZooKeeper) + * we can skip publishing the replica as DOWN again. + * @return whether the replica should be published as DOWN on startup + */ + private boolean isPublishAsDownOnStartup(CloudDescriptor cloudDesc) { + if (!Overseer.isLegacy(zkStateReader)) { + Replica replica = zkStateReader.getClusterState().getCollection(cloudDesc.getCollectionName()) + .getSlice(cloudDesc.getShardId()) + .getReplica(cloudDesc.getCoreNodeName()); + if (replica.getNodeName().equals(getNodeName())) { + return false; + } + } + return true; + } + private void checkStateInZk(CoreDescriptor cd) throws InterruptedException { if (!Overseer.isLegacy(zkStateReader)) { CloudDescriptor cloudDesc = cd.getCloudDescriptor(); diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index f3f3c49ea08..4467bc03ba4 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -1018,7 +1018,7 @@ public class CoreContainer { MDCLoggingContext.setCoreDescriptor(this, dcore); SolrIdentifierValidator.validateCoreName(dcore.getName()); if (zkSys.getZkController() != null) { - zkSys.getZkController().preRegister(dcore); + zkSys.getZkController().preRegister(dcore, publishState); } ConfigSet coreConfig = 
coreConfigService.getConfig(dcore); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestShardsWithSingleReplica.java b/solr/core/src/test/org/apache/solr/cloud/TestSkipOverseerOperations.java similarity index 62% rename from solr/core/src/test/org/apache/solr/cloud/TestShardsWithSingleReplica.java rename to solr/core/src/test/org/apache/solr/cloud/TestSkipOverseerOperations.java index a4c51ba3718..2039249ced3 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestShardsWithSingleReplica.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestSkipOverseerOperations.java @@ -27,7 +27,7 @@ import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.response.CollectionAdminResponse; import org.junit.BeforeClass; -public class TestShardsWithSingleReplica extends SolrCloudTestCase { +public class TestSkipOverseerOperations extends SolrCloudTestCase { @BeforeClass public static void setupCluster() throws Exception { @@ -56,26 +56,67 @@ public class TestShardsWithSingleReplica extends SolrCloudTestCase { .process(cluster.getSolrClient()); for (JettySolrRunner solrRunner : notOverseerNodes) { - cluster.stopJettySolrRunner(solrRunner); + solrRunner.stop(); } - waitForState("Expected empty liveNodes", collection, + waitForState("Expected single liveNode", collection, (liveNodes, collectionState) -> liveNodes.size() == 1); CollectionAdminResponse resp = CollectionAdminRequest.getOverseerStatus().process(cluster.getSolrClient()); for (JettySolrRunner solrRunner : notOverseerNodes) { - cluster.startJettySolrRunner(solrRunner); + solrRunner.start(); } waitForState("Expected 2x1 for collection: " + collection, collection, clusterShape(2, 1)); CollectionAdminResponse resp2 = CollectionAdminRequest.getOverseerStatus().process(cluster.getSolrClient()); assertEquals(getNumLeaderOpeations(resp), getNumLeaderOpeations(resp2)); + CollectionAdminRequest.deleteCollection(collection).process(cluster.getSolrClient()); + } + + public void 
testSkipDownOperations() throws Exception { + String overseerLeader = getOverseerLeader(); + List notOverseerNodes = cluster.getJettySolrRunners() + .stream() + .filter(solrRunner -> !solrRunner.getNodeName().equals(overseerLeader)) + .collect(Collectors.toList()); + String collection = "collection2"; + CollectionAdminRequest + .createCollection(collection, 2, 2) + .setCreateNodeSet(notOverseerNodes + .stream() + .map(JettySolrRunner::getNodeName) + .collect(Collectors.joining(",")) + ) + .setMaxShardsPerNode(2) + .process(cluster.getSolrClient()); + + for (JettySolrRunner solrRunner : notOverseerNodes) { + solrRunner.stop(); + } + waitForState("Expected single liveNode", collection, + (liveNodes, collectionState) -> liveNodes.size() == 1); + + CollectionAdminResponse resp = CollectionAdminRequest.getOverseerStatus().process(cluster.getSolrClient()); + for (JettySolrRunner solrRunner : notOverseerNodes) { + solrRunner.start(); + } + + waitForState("Expected 2x2 for collection: " + collection, collection, + clusterShape(2, 2)); + CollectionAdminResponse resp2 = CollectionAdminRequest.getOverseerStatus().process(cluster.getSolrClient()); + // 2 for recovering state, 4 for active state + assertEquals(getNumStateOpeations(resp) + 6, getNumStateOpeations(resp2)); + CollectionAdminRequest.deleteCollection(collection).process(cluster.getSolrClient()); } private int getNumLeaderOpeations(CollectionAdminResponse resp) { return (int) resp.getResponse().findRecursive("overseer_operations", "leader", "requests"); } + private int getNumStateOpeations(CollectionAdminResponse resp) { + return (int) resp.getResponse().findRecursive("overseer_operations", "state", "requests"); + } + private String getOverseerLeader() throws IOException, SolrServerException { CollectionAdminResponse resp = CollectionAdminRequest.getOverseerStatus().process(cluster.getSolrClient()); return (String) resp.getResponse().get("leader");