From ac7621e607f2b71d197f3b1c381e0caf2a6111d5 Mon Sep 17 00:00:00 2001 From: Timothy Potter Date: Mon, 6 Oct 2014 17:50:00 +0000 Subject: [PATCH] SOLR-6592: Avoid waiting for the leader to see the down state if that leader is not live. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1629719 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/solr/cloud/ZkController.java | 22 +++++++++++++++++++ .../apache/solr/cloud/HttpPartitionTest.java | 11 ++++++++++ 2 files changed, 33 insertions(+) diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index 88a48aa9943..d149d94b7ce 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -20,8 +20,11 @@ package org.apache.solr.cloud; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; +import java.net.ConnectException; import java.net.InetAddress; import java.net.NetworkInterface; +import java.net.SocketException; +import java.net.SocketTimeoutException; import java.net.URLEncoder; import java.net.UnknownHostException; import java.util.ArrayList; @@ -41,6 +44,8 @@ import java.util.concurrent.TimeoutException; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; +import org.apache.http.NoHttpResponseException; +import org.apache.http.conn.ConnectTimeoutException; import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.request.CoreAdminRequest.WaitForState; import org.apache.solr.common.SolrException; @@ -1623,6 +1628,23 @@ public final class ZkController { server.request(prepCmd); break; } catch (Exception e) { + + // if the core container is shutdown, don't wait + if (cc.isShutDown()) { + throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, + "Core container is shutdown."); + } + + Throwable rootCause = SolrException.getRootCause(e); + if (rootCause instanceof IOException) { + // if there was a communication error talking to the leader, see if the leader is even alive + if (!zkStateReader.getClusterState().liveNodesContain(leaderProps.getNodeName())) { + throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, + "Node "+leaderProps.getNodeName()+" hosting leader for "+ + shard+" in "+collection+" is not live!"); + } + } + SolrException.log(log, "There was a problem making a request to the leader", e); try { diff --git a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java index 1027a88d492..c7b002b6c05 100644 --- a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java @@ -117,6 +117,10 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { // have the leader lose its Zk session temporarily testLeaderZkSessionLoss(); + + waitForThingsToLevelOut(30000); + + log.info("HttpParitionTest succeeded ... shutting down now!"); } protected void testRf2() throws Exception { @@ -187,6 +191,8 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { // verify all docs received assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, numDocs + 3); + log.info("testRf2 succeeded ... deleting the "+testCollectionName+" collection"); + // try to clean up try { CollectionAdminRequest req = new CollectionAdminRequest.Delete(); @@ -240,6 +246,9 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { sendDoc(4); assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 4); + + log.info("testRf3 succeeded ... deleting the "+testCollectionName+" collection"); + // try to clean up try { CollectionAdminRequest req = new CollectionAdminRequest.Delete(); @@ -334,6 +343,8 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { waitToSeeReplicasActive(testCollectionName, "shard1", replicasToCheck, 20); assertDocsExistInAllReplicas(participatingReplicas, testCollectionName, 1, 2); + log.info("testLeaderZkSessionLoss succeeded ... deleting the "+testCollectionName+" collection"); + // try to clean up try { CollectionAdminRequest req = new CollectionAdminRequest.Delete();