SOLR-5588: PeerSync doesn't count all connect failures as success.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1554129 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2013-12-30 03:24:25 +00:00
parent f3812e8410
commit 15fbdbd3a6
3 changed files with 26 additions and 9 deletions

View File

@ -325,6 +325,9 @@ Bug Fixes
* SOLR-5503: Retry 'forward to leader' requests less aggressively - rather
than on IOException and status 500, ConnectException. (Mark Miller)
* SOLR-5588: PeerSync doesn't count all connect failures as success.
(Mark Miller)
Optimizations
----------------------

View File

@ -291,7 +291,8 @@ public class PeerSync {
if (cantReachIsSuccess && sreq.purpose == 1 && srsp.getException() instanceof SolrServerException) {
Throwable solrException = ((SolrServerException) srsp.getException())
.getRootCause();
if (solrException instanceof ConnectException || solrException instanceof ConnectTimeoutException
boolean connectTimeoutExceptionInChain = connectTimeoutExceptionInChain(srsp.getException());
if (connectTimeoutExceptionInChain || solrException instanceof ConnectException || solrException instanceof ConnectTimeoutException
|| solrException instanceof NoHttpResponseException || solrException instanceof SocketException) {
log.warn(msg() + " couldn't connect to " + srsp.getShardAddress() + ", counting as success");
@ -309,6 +310,10 @@ public class PeerSync {
"Perhaps /get is not registered?");
return true;
}
// TODO: we should return the above information so that when we can request a recovery through zookeeper, we do
// that for these nodes
// TODO: at least log???
// srsp.getException().printStackTrace(System.out);
@ -324,6 +329,23 @@ public class PeerSync {
}
}
/**
 * Reports whether a {@code ConnectTimeoutException} appears anywhere in the
 * cause chain of the given exception, starting with the exception itself.
 *
 * <p>Sometimes the root exception is a SocketTimeoutException while a
 * ConnectTimeoutException sits elsewhere in the chain, so inspecting only the
 * root cause is not enough to recognise a connect timeout.
 *
 * @param exception the throwable whose cause chain is searched; a
 *        {@code null} argument is treated as an empty chain
 * @return {@code true} if the exception or any of its causes is a
 *         ConnectTimeoutException, {@code false} otherwise
 */
private boolean connectTimeoutExceptionInChain(Throwable exception) {
  // Follow getCause() links until the chain ends; the loop condition also
  // guards against a null argument instead of throwing a NullPointerException.
  for (Throwable t = exception; t != null; t = t.getCause()) {
    if (t instanceof ConnectTimeoutException) {
      return true;
    }
  }
  return false;
}
private boolean handleVersions(ShardResponse srsp) {
// we retrieved the last N updates from the replica
List<Long> otherVersions = (List<Long>)srsp.getSolrResponse().getResponse().get("versions");

View File

@ -283,14 +283,6 @@ public class BasicDistributedZk2Test extends AbstractFullDistribZkTestBase {
// kill a shard
CloudJettyRunner deadShard = chaosMonkey.stopShard(SHARD1, 0);
// we are careful to make sure the downed node is no longer in the state,
// because on some systems (especially freebsd w/ blackhole enabled), trying
// to talk to a downed node causes grief
Set<CloudJettyRunner> jetties = new HashSet<CloudJettyRunner>();
jetties.addAll(shardToJetty.get(SHARD1));
jetties.remove(deadShard);
// ensure shard is dead
try {
index_specific(deadShard.client.solrClient, id, 999, i1, 107, t1,