From 83fa9d80fa1fa2cc2d4fb7288a2fe46bfe2058f6 Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Sun, 12 Feb 2012 02:45:06 +0000 Subject: [PATCH] SOLR-3125: When a SolrCore registers in zk there may be stale leader state in the clusterstate.json. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1243195 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/solr/cloud/ZkController.java | 55 ++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index b605abc4d8b..b6a1642e768 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -50,6 +50,7 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.update.UpdateLog; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.KeeperException.NoNodeException; import org.apache.zookeeper.data.Stat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -464,11 +465,6 @@ public final class ZkController { final CloudDescriptor cloudDesc = desc.getCloudDescriptor(); final String collection = cloudDesc.getCollectionName(); - - - log.info("Attempting to update " + ZkStateReader.CLUSTER_STATE + " version " - + null); - CloudState state = CloudState.load(zkClient, zkStateReader.getCloudState().getLiveNodes()); final String coreZkNodeName = getNodeName() + "_" + coreName; @@ -495,8 +491,25 @@ public final class ZkController { joinElection(collection, coreZkNodeName, shardId, leaderProps); - String leaderUrl = zkStateReader.getLeaderUrl(collection, - cloudDesc.getShardId(), 30000); + // rather than look in the cluster state file, we go straight to the zknodes + // here, because on cluster restart there could be stale leader info in the + // cluster state node that won't be updated for a moment + String leaderUrl = getLeaderUrl(collection, cloudDesc.getShardId()); + + // now wait until our currently cloud state contains the latest leader + String cloudStateLeader = zkStateReader.getLeaderUrl(collection, cloudDesc.getShardId(), 30000); + int tries = 0; + while (!leaderUrl.equals(cloudStateLeader)) { + if (tries == 60) { + throw new SolrException(ErrorCode.SERVER_ERROR, + "There is conflicting information about the leader of shard: " + + cloudDesc.getShardId()); + } + Thread.sleep(1000); + tries++; + cloudStateLeader = zkStateReader.getLeaderUrl(collection, + cloudDesc.getShardId(), 30000); + } String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName); log.info("We are " + ourUrl + " and leader is " + leaderUrl); @@ -551,6 +564,32 @@ public final class ZkController { return shardId; } + + /** + * Get leader URL directly from zk nodes. + * + * @param collection + * @param slice + * @return + * @throws KeeperException + * @throws InterruptedException + */ + private String getLeaderUrl(final String collection, final String slice) + throws KeeperException, InterruptedException { + int iterCount = 60; + while (iterCount-- > 0) + try { + byte[] data = zkClient.getData( + ZkStateReader.getShardLeadersPath(collection, slice), null, null, + true); + ZkCoreNodeProps leaderProps = new ZkCoreNodeProps( + ZkNodeProps.load(data)); + return leaderProps.getCoreUrl(); + } catch (NoNodeException e) { + Thread.sleep(500); + } + throw new RuntimeException("Could not get leader props"); + } private void joinElection(final String collection, @@ -600,6 +639,8 @@ public final class ZkController { core.getUpdateHandler().getSolrCoreState().doRecovery(core); return true; } + } else { + log.info("I am the leader, no recovery necessary"); } return false;