From 15fbdbd3a6bf0c48197f93f845f087df42beb2a4 Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Mon, 30 Dec 2013 03:24:25 +0000 Subject: [PATCH] SOLR-5588: PeerSync doesn't count all connect failures as success. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1554129 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 3 +++ .../java/org/apache/solr/update/PeerSync.java | 24 ++++++++++++++++++- .../solr/cloud/BasicDistributedZk2Test.java | 8 ------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index c4dbbb828ac..a0d1516a86c 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -324,6 +324,9 @@ Bug Fixes * SOLR-5503: Retry 'forward to leader' requests less aggressively - rather than on IOException and status 500, ConnectException. (Mark Miller) + +* SOLR-5588: PeerSync doesn't count all connect failures as success. + (Mark Miller) Optimizations ---------------------- diff --git a/solr/core/src/java/org/apache/solr/update/PeerSync.java b/solr/core/src/java/org/apache/solr/update/PeerSync.java index 57f8e74f761..86b19106332 100644 --- a/solr/core/src/java/org/apache/solr/update/PeerSync.java +++ b/solr/core/src/java/org/apache/solr/update/PeerSync.java @@ -291,7 +291,8 @@ public class PeerSync { if (cantReachIsSuccess && sreq.purpose == 1 && srsp.getException() instanceof SolrServerException) { Throwable solrException = ((SolrServerException) srsp.getException()) .getRootCause(); - if (solrException instanceof ConnectException || solrException instanceof ConnectTimeoutException + boolean connectTimeoutExceptionInChain = connectTimeoutExceptionInChain(srsp.getException()); + if (connectTimeoutExceptionInChain || solrException instanceof ConnectException || solrException instanceof ConnectTimeoutException || solrException instanceof NoHttpResponseException || solrException instanceof SocketException) { log.warn(msg() + " couldn't connect to " + srsp.getShardAddress() + ", counting as success"); @@ -309,6 +310,10 @@ public class PeerSync { "Perhaps /get is not registered?"); return true; } + + // TODO: we should return the above information so that when we can request a recovery through zookeeper, we do + // that for these nodes + // TODO: at least log??? // srsp.getException().printStackTrace(System.out); @@ -324,6 +329,23 @@ public class PeerSync { } } + // sometimes the root exception is a SocketTimeoutException, but ConnectTimeoutException + // is in the chain + private boolean connectTimeoutExceptionInChain(Throwable exception) { + Throwable t = exception; + while (true) { + if (t instanceof ConnectTimeoutException) { + return true; + } + Throwable cause = t.getCause(); + if (cause != null) { + t = cause; + } else { + return false; + } + } + } + private boolean handleVersions(ShardResponse srsp) { // we retrieved the last N updates from the replica List otherVersions = (List)srsp.getSolrResponse().getResponse().get("versions"); diff --git a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java index 7a9bb992faa..1db16aec4d6 100644 --- a/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java +++ b/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java @@ -282,14 +282,6 @@ public class BasicDistributedZk2Test extends AbstractFullDistribZkTestBase { // kill a shard CloudJettyRunner deadShard = chaosMonkey.stopShard(SHARD1, 0); - - - // we are careful to make sure the downed node is no longer in the state, - // because on some systems (especially freebsd w/ blackhole enabled), trying - // to talk to a downed node causes grief - Set jetties = new HashSet(); - jetties.addAll(shardToJetty.get(SHARD1)); - jetties.remove(deadShard); // ensure shard is dead try {