Discovery: only retry join when other node is not (yet) a master

When a node tries to join a master, the master may not yet be ready to accept the join request. In such cases we retry sending the join request up to 3 times before going back to ping. To detect this, the current logic uses ExceptionsHelper.unwrapCause(t) to unwrap the incoming RemoteTransportException and inspect its source, looking for ElasticsearchIllegalStateException. However, a local ElasticsearchIllegalStateException can also be thrown when the join process should be cancelled (i.e., node shut down). In this case we shouldn't retry.

This commit adds an explicit NotMasterException to indicate that the remote node is not a master. A similarly named exception (but meaning something else) in the master fault detection code was given a better name. It also cleans up some other exceptions while at it.

Closes #8972
This commit is contained in:
Boaz Leskes 2014-12-16 13:01:50 +01:00
parent 154e9d90cd
commit 8f146f9ab0
4 changed files with 30 additions and 61 deletions

View File

@ -16,21 +16,31 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery.zen;
package org.elasticsearch.discovery;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.ElasticsearchException;
/**
*
* Thrown when a node join request or a master ping reaches a node which is not
* currently acting as a master.
*/
public class DiscoveryException extends ElasticsearchException {
public class NotMasterException extends ElasticsearchIllegalStateException {
public DiscoveryException(String message) {
super(message);
public NotMasterException() {
super(null);
}
public DiscoveryException(String message, Throwable cause) {
super(message, cause);
public NotMasterException(String msg) {
super(msg);
}
public NotMasterException(String msg, Throwable cause) {
super(msg, cause);
}
@Override
public Throwable fillInStackTrace() {
return null;
}
}

View File

@ -475,7 +475,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
return true;
} catch (Throwable t) {
Throwable unwrap = ExceptionsHelper.unwrapCause(t);
if (unwrap instanceof ElasticsearchIllegalStateException) {
if (unwrap instanceof NotMasterException) {
if (++joinAttempt == this.joinRetryAttempts) {
logger.info("failed to send join request to master [{}], reason [{}], tried [{}] times", masterNode, ExceptionsHelper.detailedMessage(t), joinAttempt);
return false;
@ -917,7 +917,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
public void onNoLongerMaster(String source) {
// we are rejected, so drain all pending task (execute never run)
processJoinRequests.drainTo(drainedJoinRequests);
Exception e = new ElasticsearchIllegalStateException("Node [" + clusterService.localNode() + "] not master for join request from [" + node + "]");
Exception e = new NotMasterException("Node [" + clusterService.localNode() + "] not master for join request from [" + node + "]");
innerOnFailure(e);
}

View File

@ -33,6 +33,7 @@ import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.zen.NotMasterException;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.*;
@ -255,11 +256,11 @@ public class MasterFaultDetection extends FaultDetection {
if (exp instanceof ConnectTransportException || exp.getCause() instanceof ConnectTransportException) {
handleTransportDisconnect(masterToPing);
return;
} else if (exp.getCause() instanceof NoLongerMasterException) {
} else if (exp.getCause() instanceof NotMasterException) {
logger.debug("[master] pinging a master {} that is no longer a master", masterNode);
notifyMasterFailure(masterToPing, "no longer master");
return;
} else if (exp.getCause() instanceof NotMasterException) {
} else if (exp.getCause() instanceof ThisIsNotTheMasterYouAreLookingForException) {
logger.debug("[master] pinging a master {} that is not the master", masterNode);
notifyMasterFailure(masterToPing, "not master");
return;
@ -292,20 +293,14 @@ public class MasterFaultDetection extends FaultDetection {
}
}
static class NoLongerMasterException extends ElasticsearchIllegalStateException {
@Override
public Throwable fillInStackTrace() {
return null;
}
}
/** Thrown when a ping reaches the wrong node */
static class ThisIsNotTheMasterYouAreLookingForException extends ElasticsearchIllegalStateException {
static class NotMasterException extends ElasticsearchIllegalStateException {
NotMasterException(String msg) {
ThisIsNotTheMasterYouAreLookingForException(String msg) {
super(msg);
}
NotMasterException() {
ThisIsNotTheMasterYouAreLookingForException() {
}
@Override
@ -334,13 +329,13 @@ public class MasterFaultDetection extends FaultDetection {
// check if we are really the same master as the one we seemed to be think we are
// this can happen if the master got "kill -9" and then another node started using the same port
if (!request.masterNodeId.equals(nodes.localNodeId())) {
throw new NotMasterException();
throw new ThisIsNotTheMasterYouAreLookingForException();
}
// ping from nodes of version < 1.4.0 will have the clustername set to null
if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
logger.trace("master fault detection ping request is targeted for a different [{}] cluster then us [{}]", request.clusterName, clusterName);
throw new NotMasterException("master fault detection ping request is targeted for a different [" + request.clusterName + "] cluster then us [" + clusterName + "]");
throw new ThisIsNotTheMasterYouAreLookingForException("master fault detection ping request is targeted for a different [" + request.clusterName + "] cluster then us [" + clusterName + "]");
}
// when we are elected as master or when a node joins, we use a cluster state update thread
@ -360,7 +355,7 @@ public class MasterFaultDetection extends FaultDetection {
// if we are no longer master, fail...
DiscoveryNodes nodes = currentState.nodes();
if (!nodes.localNodeMaster()) {
throw new NoLongerMasterException();
throw new NotMasterException();
}
if (!nodes.nodeExists(request.nodeId)) {
throw new NodeDoesNotExistOnMasterException();

View File

@ -1,36 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery.zen.ping;
import org.elasticsearch.discovery.DiscoveryException;
/**
*
*/
public class ZenPingException extends DiscoveryException {
public ZenPingException(String message) {
super(message);
}
public ZenPingException(String message, Throwable cause) {
super(message, cause);
}
}