[Discovery] Master fault detection and nodes fault detection should take cluster name into account.
Both master fault detection and nodes fault detection requests should also send the cluster name, so that the receiving side can fail the handling of these requests with an error when the cluster name does not match its own. This error can be caught on the sending side: for master fault detection the node can fail the master locally, and for nodes fault detection the node can be failed. Note that this validation will most likely never fail in a production cluster, but it can fail during automated tests, where clusters / nodes are created and destroyed very frequently.
This commit is contained in:
parent
364374dd03
commit
4b8456e954
|
@ -166,10 +166,10 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
||||||
this.electMaster = new ElectMasterService(settings);
|
this.electMaster = new ElectMasterService(settings);
|
||||||
nodeSettingsService.addListener(new ApplySettings());
|
nodeSettingsService.addListener(new ApplySettings());
|
||||||
|
|
||||||
this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this);
|
this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this, clusterName);
|
||||||
this.masterFD.addListener(new MasterNodeFailureListener());
|
this.masterFD.addListener(new MasterNodeFailureListener());
|
||||||
|
|
||||||
this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService);
|
this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService, clusterName);
|
||||||
this.nodesFD.addListener(new NodeFailureListener());
|
this.nodesFD.addListener(new NodeFailureListener());
|
||||||
|
|
||||||
this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings);
|
this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings);
|
||||||
|
|
|
@ -20,6 +20,8 @@
|
||||||
package org.elasticsearch.discovery.zen.fd;
|
package org.elasticsearch.discovery.zen.fd;
|
||||||
|
|
||||||
import org.elasticsearch.ElasticsearchIllegalStateException;
|
import org.elasticsearch.ElasticsearchIllegalStateException;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.cluster.ClusterName;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNode;
|
import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||||
import org.elasticsearch.common.component.AbstractComponent;
|
import org.elasticsearch.common.component.AbstractComponent;
|
||||||
|
@ -58,6 +60,8 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
|
|
||||||
private final DiscoveryNodesProvider nodesProvider;
|
private final DiscoveryNodesProvider nodesProvider;
|
||||||
|
|
||||||
|
private final ClusterName clusterName;
|
||||||
|
|
||||||
private final CopyOnWriteArrayList<Listener> listeners = new CopyOnWriteArrayList<>();
|
private final CopyOnWriteArrayList<Listener> listeners = new CopyOnWriteArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
|
@ -85,11 +89,13 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
|
|
||||||
private final AtomicBoolean notifiedMasterFailure = new AtomicBoolean();
|
private final AtomicBoolean notifiedMasterFailure = new AtomicBoolean();
|
||||||
|
|
||||||
public MasterFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, DiscoveryNodesProvider nodesProvider) {
|
public MasterFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService,
|
||||||
|
DiscoveryNodesProvider nodesProvider, ClusterName clusterName) {
|
||||||
super(settings);
|
super(settings);
|
||||||
this.threadPool = threadPool;
|
this.threadPool = threadPool;
|
||||||
this.transportService = transportService;
|
this.transportService = transportService;
|
||||||
this.nodesProvider = nodesProvider;
|
this.nodesProvider = nodesProvider;
|
||||||
|
this.clusterName = clusterName;
|
||||||
|
|
||||||
this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
|
this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
|
||||||
this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
|
this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
|
||||||
|
@ -270,8 +276,10 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
threadPool.schedule(pingInterval, ThreadPool.Names.SAME, MasterPinger.this);
|
threadPool.schedule(pingInterval, ThreadPool.Names.SAME, MasterPinger.this);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, new MasterPingRequest(nodesProvider.nodes().localNode().id(), masterToPing.id()), options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout),
|
final MasterPingRequest request = new MasterPingRequest(nodesProvider.nodes().localNode().id(), masterToPing.id(), clusterName);
|
||||||
new BaseTransportResponseHandler<MasterPingResponseResponse>() {
|
final TransportRequestOptions options = options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout);
|
||||||
|
transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, request, options, new BaseTransportResponseHandler<MasterPingResponseResponse>() {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public MasterPingResponseResponse newInstance() {
|
public MasterPingResponseResponse newInstance() {
|
||||||
return new MasterPingResponseResponse();
|
return new MasterPingResponseResponse();
|
||||||
|
@ -328,7 +336,7 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
notifyMasterFailure(masterToPing, "failed to ping, tried [" + pingRetryCount + "] times, each with maximum [" + pingRetryTimeout + "] timeout");
|
notifyMasterFailure(masterToPing, "failed to ping, tried [" + pingRetryCount + "] times, each with maximum [" + pingRetryTimeout + "] timeout");
|
||||||
} else {
|
} else {
|
||||||
// resend the request, not reschedule, rely on send timeout
|
// resend the request, not reschedule, rely on send timeout
|
||||||
transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, new MasterPingRequest(nodesProvider.nodes().localNode().id(), masterToPing.id()), options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout), this);
|
transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, request, options, this);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -351,6 +359,14 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
}
|
}
|
||||||
|
|
||||||
static class NotMasterException extends ElasticsearchIllegalStateException {
|
static class NotMasterException extends ElasticsearchIllegalStateException {
|
||||||
|
|
||||||
|
NotMasterException(String msg) {
|
||||||
|
super(msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
NotMasterException() {
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Throwable fillInStackTrace() {
|
public Throwable fillInStackTrace() {
|
||||||
return null;
|
return null;
|
||||||
|
@ -379,6 +395,12 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
if (!request.masterNodeId.equals(nodes.localNodeId())) {
|
if (!request.masterNodeId.equals(nodes.localNodeId())) {
|
||||||
throw new NotMasterException();
|
throw new NotMasterException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
|
||||||
|
logger.trace("master fault detection ping request is targeted for a different [{}] cluster then us [{}]", request.clusterName, clusterName);
|
||||||
|
throw new NotMasterException("master fault detection ping request is targeted for a different [" + request.clusterName + "] cluster then us [" + clusterName + "]");
|
||||||
|
}
|
||||||
|
|
||||||
// if we are no longer master, fail...
|
// if we are no longer master, fail...
|
||||||
if (!nodes.localNodeMaster()) {
|
if (!nodes.localNodeMaster()) {
|
||||||
throw new NoLongerMasterException();
|
throw new NoLongerMasterException();
|
||||||
|
@ -402,13 +424,15 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
private String nodeId;
|
private String nodeId;
|
||||||
|
|
||||||
private String masterNodeId;
|
private String masterNodeId;
|
||||||
|
private ClusterName clusterName;
|
||||||
|
|
||||||
private MasterPingRequest() {
|
private MasterPingRequest() {
|
||||||
}
|
}
|
||||||
|
|
||||||
private MasterPingRequest(String nodeId, String masterNodeId) {
|
private MasterPingRequest(String nodeId, String masterNodeId, ClusterName clusterName) {
|
||||||
this.nodeId = nodeId;
|
this.nodeId = nodeId;
|
||||||
this.masterNodeId = masterNodeId;
|
this.masterNodeId = masterNodeId;
|
||||||
|
this.clusterName = clusterName;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -416,6 +440,9 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
super.readFrom(in);
|
super.readFrom(in);
|
||||||
nodeId = in.readString();
|
nodeId = in.readString();
|
||||||
masterNodeId = in.readString();
|
masterNodeId = in.readString();
|
||||||
|
if (in.getVersion().onOrAfter(Version.V_1_4_0)) {
|
||||||
|
clusterName = ClusterName.readClusterName(in);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -423,6 +450,9 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
super.writeTo(out);
|
super.writeTo(out);
|
||||||
out.writeString(nodeId);
|
out.writeString(nodeId);
|
||||||
out.writeString(masterNodeId);
|
out.writeString(masterNodeId);
|
||||||
|
if (out.getVersion().onOrAfter(Version.V_1_4_0)) {
|
||||||
|
clusterName.writeTo(out);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,8 @@
|
||||||
package org.elasticsearch.discovery.zen.fd;
|
package org.elasticsearch.discovery.zen.fd;
|
||||||
|
|
||||||
import org.elasticsearch.ElasticsearchIllegalStateException;
|
import org.elasticsearch.ElasticsearchIllegalStateException;
|
||||||
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.cluster.ClusterName;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNode;
|
import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||||
import org.elasticsearch.common.component.AbstractComponent;
|
import org.elasticsearch.common.component.AbstractComponent;
|
||||||
|
@ -54,6 +56,7 @@ public class NodesFaultDetection extends AbstractComponent {
|
||||||
private final ThreadPool threadPool;
|
private final ThreadPool threadPool;
|
||||||
|
|
||||||
private final TransportService transportService;
|
private final TransportService transportService;
|
||||||
|
private final ClusterName clusterName;
|
||||||
|
|
||||||
|
|
||||||
private final boolean connectOnNetworkDisconnect;
|
private final boolean connectOnNetworkDisconnect;
|
||||||
|
@ -78,10 +81,11 @@ public class NodesFaultDetection extends AbstractComponent {
|
||||||
|
|
||||||
private volatile boolean running = false;
|
private volatile boolean running = false;
|
||||||
|
|
||||||
public NodesFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService) {
|
public NodesFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName) {
|
||||||
super(settings);
|
super(settings);
|
||||||
this.threadPool = threadPool;
|
this.threadPool = threadPool;
|
||||||
this.transportService = transportService;
|
this.transportService = transportService;
|
||||||
|
this.clusterName = clusterName;
|
||||||
|
|
||||||
this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
|
this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false);
|
||||||
this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
|
this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
|
||||||
|
@ -204,8 +208,9 @@ public class NodesFaultDetection extends AbstractComponent {
|
||||||
if (!running) {
|
if (!running) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
transportService.sendRequest(node, PING_ACTION_NAME, new PingRequest(node.id()), options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout),
|
final PingRequest pingRequest = new PingRequest(node.id(), clusterName);
|
||||||
new BaseTransportResponseHandler<PingResponse>() {
|
final TransportRequestOptions options = options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout);
|
||||||
|
transportService.sendRequest(node, PING_ACTION_NAME, pingRequest, options, new BaseTransportResponseHandler<PingResponse>() {
|
||||||
@Override
|
@Override
|
||||||
public PingResponse newInstance() {
|
public PingResponse newInstance() {
|
||||||
return new PingResponse();
|
return new PingResponse();
|
||||||
|
@ -252,8 +257,7 @@ public class NodesFaultDetection extends AbstractComponent {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// resend the request, not reschedule, rely on send timeout
|
// resend the request, not reschedule, rely on send timeout
|
||||||
transportService.sendRequest(node, PING_ACTION_NAME, new PingRequest(node.id()),
|
transportService.sendRequest(node, PING_ACTION_NAME, pingRequest, options, this);
|
||||||
options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout), this);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -298,6 +302,10 @@ public class NodesFaultDetection extends AbstractComponent {
|
||||||
if (!latestNodes.localNodeId().equals(request.nodeId)) {
|
if (!latestNodes.localNodeId().equals(request.nodeId)) {
|
||||||
throw new ElasticsearchIllegalStateException("Got pinged as node [" + request.nodeId + "], but I am node [" + latestNodes.localNodeId() + "]");
|
throw new ElasticsearchIllegalStateException("Got pinged as node [" + request.nodeId + "], but I am node [" + latestNodes.localNodeId() + "]");
|
||||||
}
|
}
|
||||||
|
if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
|
||||||
|
// Don't introduce new exception for bwc reasons
|
||||||
|
throw new ElasticsearchIllegalStateException("Got pinged with cluster name [" + request.clusterName + "], but I'm part of cluster [" + clusterName + "]");
|
||||||
|
}
|
||||||
channel.sendResponse(new PingResponse());
|
channel.sendResponse(new PingResponse());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -308,28 +316,45 @@ public class NodesFaultDetection extends AbstractComponent {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static class PingRequest extends TransportRequest {
|
public static class PingRequest extends TransportRequest {
|
||||||
|
|
||||||
// the (assumed) node id we are pinging
|
// the (assumed) node id we are pinging
|
||||||
private String nodeId;
|
private String nodeId;
|
||||||
|
|
||||||
|
private ClusterName clusterName;
|
||||||
|
|
||||||
PingRequest() {
|
PingRequest() {
|
||||||
}
|
}
|
||||||
|
|
||||||
PingRequest(String nodeId) {
|
PingRequest(String nodeId, ClusterName clusterName) {
|
||||||
this.nodeId = nodeId;
|
this.nodeId = nodeId;
|
||||||
|
this.clusterName = clusterName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String nodeId() {
|
||||||
|
return nodeId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ClusterName clusterName() {
|
||||||
|
return clusterName;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void readFrom(StreamInput in) throws IOException {
|
public void readFrom(StreamInput in) throws IOException {
|
||||||
super.readFrom(in);
|
super.readFrom(in);
|
||||||
nodeId = in.readString();
|
nodeId = in.readString();
|
||||||
|
if (in.getVersion().onOrAfter(Version.V_1_4_0)) {
|
||||||
|
clusterName = ClusterName.readClusterName(in);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void writeTo(StreamOutput out) throws IOException {
|
public void writeTo(StreamOutput out) throws IOException {
|
||||||
super.writeTo(out);
|
super.writeTo(out);
|
||||||
out.writeString(nodeId);
|
out.writeString(nodeId);
|
||||||
|
if (out.getVersion().onOrAfter(Version.V_1_4_0)) {
|
||||||
|
clusterName.writeTo(out);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ package org.elasticsearch.discovery;
|
||||||
|
|
||||||
import com.google.common.collect.ImmutableMap;
|
import com.google.common.collect.ImmutableMap;
|
||||||
import org.elasticsearch.Version;
|
import org.elasticsearch.Version;
|
||||||
|
import org.elasticsearch.cluster.ClusterName;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNode;
|
import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
|
@ -131,7 +132,7 @@ public class ZenFaultDetectionTests extends ElasticsearchTestCase {
|
||||||
boolean shouldRetry = randomBoolean();
|
boolean shouldRetry = randomBoolean();
|
||||||
// make sure we don't ping
|
// make sure we don't ping
|
||||||
settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
|
settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
|
||||||
NodesFaultDetection nodesFD = new NodesFaultDetection(settings.build(), threadPool, serviceA);
|
NodesFaultDetection nodesFD = new NodesFaultDetection(settings.build(), threadPool, serviceA, new ClusterName("test"));
|
||||||
nodesFD.start();
|
nodesFD.start();
|
||||||
nodesFD.updateNodes(buildNodesForA(true));
|
nodesFD.updateNodes(buildNodesForA(true));
|
||||||
final String[] failureReason = new String[1];
|
final String[] failureReason = new String[1];
|
||||||
|
@ -165,6 +166,7 @@ public class ZenFaultDetectionTests extends ElasticsearchTestCase {
|
||||||
boolean shouldRetry = randomBoolean();
|
boolean shouldRetry = randomBoolean();
|
||||||
// make sure we don't ping
|
// make sure we don't ping
|
||||||
settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
|
settings.put("discovery.zen.fd.connect_on_network_disconnect", shouldRetry).put("discovery.zen.fd.ping_interval", "5m");
|
||||||
|
ClusterName clusterName = new ClusterName(randomAsciiOfLengthBetween(3, 20));
|
||||||
final DiscoveryNodes nodes = buildNodesForA(false);
|
final DiscoveryNodes nodes = buildNodesForA(false);
|
||||||
MasterFaultDetection masterFD = new MasterFaultDetection(settings.build(), threadPool, serviceA,
|
MasterFaultDetection masterFD = new MasterFaultDetection(settings.build(), threadPool, serviceA,
|
||||||
new DiscoveryNodesProvider() {
|
new DiscoveryNodesProvider() {
|
||||||
|
@ -177,7 +179,8 @@ public class ZenFaultDetectionTests extends ElasticsearchTestCase {
|
||||||
public NodeService nodeService() {
|
public NodeService nodeService() {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
clusterName
|
||||||
);
|
);
|
||||||
masterFD.start(nodeB, "test");
|
masterFD.start(nodeB, "test");
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue