Zen Discovery: A node might get into an infinite state of trying to find a master (when client / non_master) nodes exists, closes #247.
This commit is contained in:
parent
87eb6bed8f
commit
d531d82cfb
|
@ -104,6 +104,11 @@ public class Lifecycle {
|
||||||
return state == State.CLOSED;
|
return state == State.CLOSED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean stoppedOrClosed() {
|
||||||
|
Lifecycle.State state = this.state;
|
||||||
|
return state == State.STOPPED || state == State.CLOSED;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean canMoveToStarted() throws ElasticSearchIllegalStateException {
|
public boolean canMoveToStarted() throws ElasticSearchIllegalStateException {
|
||||||
State localState = this.state;
|
State localState = this.state;
|
||||||
if (localState == State.INITIALIZED || localState == State.STOPPED) {
|
if (localState == State.INITIALIZED || localState == State.STOPPED) {
|
||||||
|
|
|
@ -142,16 +142,12 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
||||||
nodesFD.updateNodes(latestDiscoNodes);
|
nodesFD.updateNodes(latestDiscoNodes);
|
||||||
pingService.start();
|
pingService.start();
|
||||||
|
|
||||||
if (nodeAttributes.containsKey("zen.master") && nodeAttributes.get("zen.master").equals("false")) {
|
// do the join on a different thread, the DiscoveryService waits for 30s anyhow till it is discovered
|
||||||
// do the join on a different thread
|
|
||||||
threadPool.execute(new Runnable() {
|
threadPool.execute(new Runnable() {
|
||||||
@Override public void run() {
|
@Override public void run() {
|
||||||
initialJoin();
|
joinCluster();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
} else {
|
|
||||||
initialJoin();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override protected void doStop() throws ElasticSearchException {
|
@Override protected void doStop() throws ElasticSearchException {
|
||||||
|
@ -221,11 +217,18 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
||||||
publishClusterState.publish(clusterState);
|
publishClusterState.publish(clusterState);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initialJoin() {
|
private void joinCluster() {
|
||||||
boolean retry = true;
|
boolean retry = true;
|
||||||
while (retry) {
|
while (retry) {
|
||||||
|
if (lifecycle.stoppedOrClosed()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
retry = false;
|
retry = false;
|
||||||
DiscoveryNode masterNode = broadPingTillMasterResolved();
|
DiscoveryNode masterNode = findMaster();
|
||||||
|
if (masterNode == null) {
|
||||||
|
retry = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if (localNode.equals(masterNode)) {
|
if (localNode.equals(masterNode)) {
|
||||||
this.master = true;
|
this.master = true;
|
||||||
nodesFD.start(); // start the nodes FD
|
nodesFD.start(); // start the nodes FD
|
||||||
|
@ -274,11 +277,6 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
||||||
// cool, we found a master, start an FD on it
|
// cool, we found a master, start an FD on it
|
||||||
masterFD.start(masterNode, "initial_join");
|
masterFD.start(masterNode, "initial_join");
|
||||||
}
|
}
|
||||||
if (retry) {
|
|
||||||
if (!lifecycle.started()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -306,7 +304,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private void handleMasterGone(final DiscoveryNode masterNode, String reason) {
|
private void handleMasterGone(final DiscoveryNode masterNode, final String reason) {
|
||||||
if (lifecycleState() != Lifecycle.State.STARTED) {
|
if (lifecycleState() != Lifecycle.State.STARTED) {
|
||||||
// not started, ignore a master failure
|
// not started, ignore a master failure
|
||||||
return;
|
return;
|
||||||
|
@ -316,57 +314,73 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("master [{}] left, reason [{}]", masterNode, reason);
|
logger.info("master_left [{}], reason [{}]", masterNode, reason);
|
||||||
List<DiscoveryNode> nodes = newArrayList(latestDiscoNodes.nodes().values());
|
|
||||||
|
clusterService.submitStateUpdateTask("zen-disco-master_failed (" + masterNode + ")", new ProcessedClusterStateUpdateTask() {
|
||||||
|
@Override public ClusterState execute(ClusterState currentState) {
|
||||||
|
if (!masterNode.id().equals(currentState.nodes().masterNodeId())) {
|
||||||
|
// master got switched on us, no need to send anything
|
||||||
|
return currentState;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<DiscoveryNode> nodes = newArrayList(currentState.nodes().nodes().values());
|
||||||
nodes.remove(masterNode); // remove the master node from the list, it has failed
|
nodes.remove(masterNode); // remove the master node from the list, it has failed
|
||||||
DiscoveryNode electedMaster = electMaster.electMaster(nodes); // elect master
|
final DiscoveryNode electedMaster = electMaster.electMaster(nodes); // elect master
|
||||||
if (localNode.equals(electedMaster)) {
|
if (localNode.equals(electedMaster)) {
|
||||||
this.master = true;
|
master = true;
|
||||||
masterFD.stop("got elected as new master since master left (reason = " + reason + ")");
|
masterFD.stop("got elected as new master since master left (reason = " + reason + ")");
|
||||||
nodesFD.start();
|
nodesFD.start();
|
||||||
clusterService.submitStateUpdateTask("zen-disco-elected_as_master(old master [" + masterNode + "])", new ProcessedClusterStateUpdateTask() {
|
DiscoveryNodes.Builder builder = DiscoveryNodes.newNodesBuilder()
|
||||||
@Override public ClusterState execute(ClusterState currentState) {
|
|
||||||
DiscoveryNodes.Builder builder = new DiscoveryNodes.Builder()
|
|
||||||
.putAll(currentState.nodes())
|
.putAll(currentState.nodes())
|
||||||
// make sure the old master node, which has failed, is not part of the nodes we publish
|
// make sure the old master node, which has failed, is not part of the nodes we publish
|
||||||
.remove(masterNode.id())
|
.remove(masterNode.id())
|
||||||
.masterNodeId(localNode.id());
|
.masterNodeId(localNode.id());
|
||||||
// update the fact that we are the master...
|
|
||||||
latestDiscoNodes = builder.build();
|
latestDiscoNodes = builder.build();
|
||||||
return newClusterStateBuilder().state(currentState).nodes(latestDiscoNodes).build();
|
return newClusterStateBuilder().state(currentState).nodes(latestDiscoNodes).build();
|
||||||
|
} else {
|
||||||
|
nodesFD.stop();
|
||||||
|
DiscoveryNodes.Builder builder = DiscoveryNodes.newNodesBuilder()
|
||||||
|
.putAll(currentState.nodes()).remove(masterNode.id());
|
||||||
|
if (electedMaster != null) {
|
||||||
|
builder.masterNodeId(electedMaster.id());
|
||||||
|
masterFD.restart(electedMaster, "possible elected master since master left (reason = " + reason + ")");
|
||||||
|
} else {
|
||||||
|
builder.masterNodeId(null);
|
||||||
|
masterFD.stop("no master elected since master left (reason = " + reason + ")");
|
||||||
|
// try and join the cluster again...
|
||||||
|
threadPool.execute(new Runnable() {
|
||||||
|
@Override public void run() {
|
||||||
|
joinCluster();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
latestDiscoNodes = builder.build();
|
||||||
|
return newClusterStateBuilder().state(currentState).nodes(latestDiscoNodes).build();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public void clusterStateProcessed(ClusterState clusterState) {
|
@Override public void clusterStateProcessed(ClusterState clusterState) {
|
||||||
sendInitialStateEventIfNeeded();
|
sendInitialStateEventIfNeeded();
|
||||||
}
|
}
|
||||||
|
|
||||||
});
|
});
|
||||||
} else {
|
|
||||||
nodesFD.stop();
|
|
||||||
if (electedMaster != null) {
|
|
||||||
// we are not the master, start FD against the possible master
|
|
||||||
masterFD.restart(electedMaster, "possible elected master since master left (reason = " + reason + ")");
|
|
||||||
} else {
|
|
||||||
masterFD.stop("no master elected since master left (reason = " + reason + ")");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void handleNewClusterState(final ClusterState clusterState) {
|
void handleNewClusterStateFromMaster(final ClusterState clusterState) {
|
||||||
if (master) {
|
if (master) {
|
||||||
logger.warn("master should not receive new cluster state from [{}]", clusterState.nodes().masterNode());
|
logger.warn("master should not receive new cluster state from [{}]", clusterState.nodes().masterNode());
|
||||||
} else {
|
} else {
|
||||||
|
if (clusterState.nodes().localNode() == null) {
|
||||||
|
logger.warn("received a cluster state from [{}] and not part of the cluster, should not happen", clusterState.nodes().masterNode());
|
||||||
|
} else {
|
||||||
|
clusterService.submitStateUpdateTask("zen-disco-receive(from [" + clusterState.nodes().masterNode() + "])", new ProcessedClusterStateUpdateTask() {
|
||||||
|
@Override public ClusterState execute(ClusterState currentState) {
|
||||||
latestDiscoNodes = clusterState.nodes();
|
latestDiscoNodes = clusterState.nodes();
|
||||||
|
|
||||||
// check to see that we monitor the correct master of the cluster
|
// check to see that we monitor the correct master of the cluster
|
||||||
if (masterFD.masterNode() == null || !masterFD.masterNode().equals(latestDiscoNodes.masterNode())) {
|
if (masterFD.masterNode() == null || !masterFD.masterNode().equals(latestDiscoNodes.masterNode())) {
|
||||||
masterFD.restart(latestDiscoNodes.masterNode(), "new cluster stare received and we monitor the wrong master [" + masterFD.masterNode() + "]");
|
masterFD.restart(latestDiscoNodes.masterNode(), "new cluster stare received and we monitor the wrong master [" + masterFD.masterNode() + "]");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (clusterState.nodes().localNode() == null) {
|
|
||||||
logger.warn("received a cluster state from [{}] and not part of the cluster, should not happen", clusterState.nodes().masterNode());
|
|
||||||
} else {
|
|
||||||
clusterService.submitStateUpdateTask("zen-disco-receive(from [" + clusterState.nodes().masterNode() + "])", new ProcessedClusterStateUpdateTask() {
|
|
||||||
@Override public ClusterState execute(ClusterState currentState) {
|
|
||||||
return clusterState;
|
return clusterState;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -418,8 +432,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private DiscoveryNode broadPingTillMasterResolved() {
|
private DiscoveryNode findMaster() {
|
||||||
while (true) {
|
|
||||||
ZenPing.PingResponse[] pingResponses = pingService.pingAndWait(initialPingTimeout);
|
ZenPing.PingResponse[] pingResponses = pingService.pingAndWait(initialPingTimeout);
|
||||||
if (logger.isDebugEnabled()) {
|
if (logger.isDebugEnabled()) {
|
||||||
StringBuilder sb = new StringBuilder("ping responses:");
|
StringBuilder sb = new StringBuilder("ping responses:");
|
||||||
|
@ -455,7 +468,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
||||||
return electedMaster;
|
return electedMaster;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void sendInitialStateEventIfNeeded() {
|
private void sendInitialStateEventIfNeeded() {
|
||||||
|
@ -468,7 +481,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
||||||
|
|
||||||
private class NewClusterStateListener implements PublishClusterStateAction.NewClusterStateListener {
|
private class NewClusterStateListener implements PublishClusterStateAction.NewClusterStateListener {
|
||||||
@Override public void onNewClusterState(ClusterState clusterState) {
|
@Override public void onNewClusterState(ClusterState clusterState) {
|
||||||
handleNewClusterState(clusterState);
|
handleNewClusterStateFromMaster(clusterState);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -93,6 +93,9 @@ public interface ZenPing extends LifecycleComponent<ZenPing> {
|
||||||
|
|
||||||
@Override public void writeTo(StreamOutput out) throws IOException {
|
@Override public void writeTo(StreamOutput out) throws IOException {
|
||||||
clusterName.writeTo(out);
|
clusterName.writeTo(out);
|
||||||
|
if (target == null) {
|
||||||
|
System.out.println("ARGH!");
|
||||||
|
}
|
||||||
target.writeTo(out);
|
target.writeTo(out);
|
||||||
if (master == null) {
|
if (master == null) {
|
||||||
out.writeBoolean(false);
|
out.writeBoolean(false);
|
||||||
|
|
|
@ -345,14 +345,14 @@ public class MulticastZenPing extends AbstractLifecycleComponent<ZenPing> implem
|
||||||
// connect to the node if possible
|
// connect to the node if possible
|
||||||
try {
|
try {
|
||||||
transportService.connectToNode(requestingNode);
|
transportService.connectToNode(requestingNode);
|
||||||
} catch (Exception e) {
|
|
||||||
logger.warn("failed to connect to requesting node {}", e, requestingNode);
|
|
||||||
}
|
|
||||||
transportService.sendRequest(requestingNode, MulticastPingResponseRequestHandler.ACTION, multicastPingResponse, new VoidTransportResponseHandler(false) {
|
transportService.sendRequest(requestingNode, MulticastPingResponseRequestHandler.ACTION, multicastPingResponse, new VoidTransportResponseHandler(false) {
|
||||||
@Override public void handleException(RemoteTransportException exp) {
|
@Override public void handleException(RemoteTransportException exp) {
|
||||||
logger.warn("failed to receive confirmation on sent ping response to [{}]", exp, requestingNode);
|
logger.warn("failed to receive confirmation on sent ping response to [{}]", exp, requestingNode);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.warn("failed to connect to requesting node {}", e, requestingNode);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
|
|
Loading…
Reference in New Issue