From 3815a416265b00623dfc633b53b54a8a945743a2 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Wed, 19 Aug 2015 17:46:44 +0200 Subject: [PATCH 01/21] initial copy over from POC --- .../discovery/DiscoverySettings.java | 8 + .../discovery/zen/ZenDiscovery.java | 214 +++++++------- .../publish/PublishClusterStateAction.java | 267 +++++++++++++++--- ...va => PublishClusterStateActionTests.java} | 17 +- .../DiscoveryWithServiceDisruptionsIT.java | 6 +- .../discovery/zen/ZenDiscoveryIT.java | 2 +- 6 files changed, 366 insertions(+), 148 deletions(-) rename core/src/test/java/org/elasticsearch/cluster/{ClusterStateDiffPublishingTests.java => PublishClusterStateActionTests.java} (97%) diff --git a/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java index acce73ccd9f..e699489281b 100644 --- a/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java +++ b/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java @@ -36,10 +36,12 @@ import java.util.EnumSet; public class DiscoverySettings extends AbstractComponent { public static final String PUBLISH_TIMEOUT = "discovery.zen.publish_timeout"; + public static final String COMMIT_TIMEOUT = "discovery.zen.commit_timeout"; public static final String NO_MASTER_BLOCK = "discovery.zen.no_master_block"; public static final String PUBLISH_DIFF_ENABLE = "discovery.zen.publish_diff.enable"; public static final TimeValue DEFAULT_PUBLISH_TIMEOUT = TimeValue.timeValueSeconds(30); + public static final TimeValue DEFAULT_COMMIT_TIMEOUT = TimeValue.timeValueSeconds(1); public static final String DEFAULT_NO_MASTER_BLOCK = "write"; public final static int NO_MASTER_BLOCK_ID = 2; public final static boolean DEFAULT_PUBLISH_DIFF_ENABLE = true; @@ -49,6 +51,7 @@ public class DiscoverySettings extends AbstractComponent { private volatile ClusterBlock noMasterBlock; private volatile TimeValue publishTimeout = DEFAULT_PUBLISH_TIMEOUT; + private volatile TimeValue commitTimeout = DEFAULT_COMMIT_TIMEOUT; private volatile boolean publishDiff = DEFAULT_PUBLISH_DIFF_ENABLE; @Inject @@ -57,6 +60,7 @@ public class DiscoverySettings extends AbstractComponent { nodeSettingsService.addListener(new ApplySettings()); this.noMasterBlock = parseNoMasterBlock(settings.get(NO_MASTER_BLOCK, DEFAULT_NO_MASTER_BLOCK)); this.publishTimeout = settings.getAsTime(PUBLISH_TIMEOUT, publishTimeout); + this.commitTimeout = settings.getAsTime(PUBLISH_TIMEOUT, publishTimeout); this.publishDiff = settings.getAsBoolean(PUBLISH_DIFF_ENABLE, DEFAULT_PUBLISH_DIFF_ENABLE); } @@ -67,6 +71,10 @@ public class DiscoverySettings extends AbstractComponent { return publishTimeout; } + public TimeValue getCommitTimeout() { + return commitTimeout; + } + public ClusterBlock getNoMasterBlock() { return noMasterBlock; } diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java index b62db46aca9..f522468c3c1 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java @@ -36,6 +36,7 @@ import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.metadata.MetaData; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.cluster.routing.RoutingNode; import org.elasticsearch.cluster.routing.RoutingService; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; import org.elasticsearch.cluster.service.InternalClusterService; @@ -199,7 +200,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService, clusterName); this.nodesFD.addListener(new NodeFaultDetectionListener()); - this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings); + this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings, clusterName); this.pingService.setPingContextProvider(this); this.membership = new MembershipAction(settings, clusterService, transportService, this, new MembershipListener()); @@ -329,7 +330,24 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen throw new IllegalStateException("Shouldn't publish state when not master"); } nodesFD.updateNodesAndPing(clusterChangedEvent.state()); - publishClusterState.publish(clusterChangedEvent, ackListener); + try { + publishClusterState.publish(clusterChangedEvent, electMaster.minimumMasterNodes(), ackListener); + } catch (PublishClusterStateAction.FailedToCommitException t) { + logger.warn("failed to publish [{}] (not enough nodes acknowledged, min master nodes [{}])", clusterChangedEvent.state().version(), electMaster.minimumMasterNodes()); + clusterService.submitStateUpdateTask("zen-disco-failed-to-publish", Priority.IMMEDIATE, new ClusterStateUpdateTask() { + @Override + public ClusterState execute(ClusterState currentState) { + return rejoin(currentState, "failed to publish to min_master_nodes"); + } + + @Override + public void onFailure(String source, Throwable t) { + logger.error("unexpected failure during [{}]", t, source); + } + + }); + throw t; + } } /** @@ -677,12 +695,6 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen void handleNewClusterStateFromMaster(ClusterState newClusterState, final PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { final ClusterName incomingClusterName = newClusterState.getClusterName(); - /* The cluster name can still be null if the state comes from a node that is prev 1.1.1*/ - if (incomingClusterName != null && !incomingClusterName.equals(this.clusterName)) { - logger.warn("received cluster state from [{}] which is also master but with a different cluster name [{}]", newClusterState.nodes().masterNode(), incomingClusterName); - newStateProcessed.onNewClusterStateFailed(new IllegalStateException("received state from a node that is not part of the cluster")); - return; - } if (localNodeMaster()) { logger.debug("received cluster state from [{}] which is also master with cluster name [{}]", newClusterState.nodes().masterNode(), incomingClusterName); final ClusterState newState = newClusterState; @@ -705,101 +717,97 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen }); } else { - if (newClusterState.nodes().localNode() == null) { - logger.warn("received a cluster state from [{}] and not part of the cluster, should not happen", newClusterState.nodes().masterNode()); - newStateProcessed.onNewClusterStateFailed(new IllegalStateException("received state from a node that is not part of the cluster")); - } else { - - final ProcessClusterState processClusterState = new ProcessClusterState(newClusterState); - processNewClusterStates.add(processClusterState); - - assert newClusterState.nodes().masterNode() != null : "received a cluster state without a master"; - assert !newClusterState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock()) : "received a cluster state with a master block"; - - clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() { - @Override - public ClusterState execute(ClusterState currentState) { - // we already processed it in a previous event - if (processClusterState.processed) { - return currentState; - } - - // TODO: once improvement that we can do is change the message structure to include version and masterNodeId - // at the start, this will allow us to keep the "compressed bytes" around, and only parse the first page - // to figure out if we need to use it or not, and only once we picked the latest one, parse the whole state - ClusterState updatedState = selectNextStateToProcess(processNewClusterStates); - if (updatedState == null) { - updatedState = currentState; - } - if (shouldIgnoreOrRejectNewClusterState(logger, currentState, updatedState)) { - return currentState; - } + final ProcessClusterState processClusterState = new ProcessClusterState(newClusterState); + processNewClusterStates.add(processClusterState); - // we don't need to do this, since we ping the master, and get notified when it has moved from being a master - // because it doesn't have enough master nodes... - //if (!electMaster.hasEnoughMasterNodes(newState.nodes())) { - // return disconnectFromCluster(newState, "not enough master nodes on new cluster state wreceived from [" + newState.nodes().masterNode() + "]"); - //} + assert newClusterState.nodes().masterNode() != null : "received a cluster state without a master"; + assert !newClusterState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock()) : "received a cluster state with a master block"; - // check to see that we monitor the correct master of the cluster - if (masterFD.masterNode() == null || !masterFD.masterNode().equals(updatedState.nodes().masterNode())) { - masterFD.restart(updatedState.nodes().masterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]"); - } + clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() { + @Override + public ClusterState execute(ClusterState currentState) { + // we already processed it in a previous event + if (processClusterState.processed) { + return currentState; + } - if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) { - // its a fresh update from the master as we transition from a start of not having a master to having one - logger.debug("got first state from fresh master [{}]", updatedState.nodes().masterNodeId()); - long count = clusterJoinsCounter.incrementAndGet(); - logger.trace("updated cluster join cluster to [{}]", count); - - return updatedState; - } + // TODO: once improvement that we can do is change the message structure to include version and masterNodeId + // at the start, this will allow us to keep the "compressed bytes" around, and only parse the first page + // to figure out if we need to use it or not, and only once we picked the latest one, parse the whole state - // some optimizations to make sure we keep old objects where possible - ClusterState.Builder builder = ClusterState.builder(updatedState); + ClusterState updatedState = selectNextStateToProcess(processNewClusterStates); + if (updatedState == null) { + updatedState = currentState; + } + if (shouldIgnoreOrRejectNewClusterState(logger, currentState, updatedState)) { + return currentState; + } - // if the routing table did not change, use the original one - if (updatedState.routingTable().version() == currentState.routingTable().version()) { - builder.routingTable(currentState.routingTable()); - } - // same for metadata - if (updatedState.metaData().version() == currentState.metaData().version()) { - builder.metaData(currentState.metaData()); - } else { - // if its not the same version, only copy over new indices or ones that changed the version - MetaData.Builder metaDataBuilder = MetaData.builder(updatedState.metaData()).removeAllIndices(); - for (IndexMetaData indexMetaData : updatedState.metaData()) { - IndexMetaData currentIndexMetaData = currentState.metaData().index(indexMetaData.index()); - if (currentIndexMetaData != null && currentIndexMetaData.isSameUUID(indexMetaData.indexUUID()) && - currentIndexMetaData.version() == indexMetaData.version()) { - // safe to reuse - metaDataBuilder.put(currentIndexMetaData, false); - } else { - metaDataBuilder.put(indexMetaData, false); - } + // we don't need to do this, since we ping the master, and get notified when it has moved from being a master + // because it doesn't have enough master nodes... + //if (!electMaster.hasEnoughMasterNodes(newState.nodes())) { + // return disconnectFromCluster(newState, "not enough master nodes on new cluster state wreceived from [" + newState.nodes().masterNode() + "]"); + //} + + // check to see that we monitor the correct master of the cluster + if (masterFD.masterNode() == null || !masterFD.masterNode().equals(updatedState.nodes().masterNode())) { + masterFD.restart(updatedState.nodes().masterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]"); + } + + if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) { + // its a fresh update from the master as we transition from a start of not having a master to having one + logger.debug("got first state from fresh master [{}]", updatedState.nodes().masterNodeId()); + long count = clusterJoinsCounter.incrementAndGet(); + logger.trace("updated cluster join cluster to [{}]", count); + + return updatedState; + } + + + // some optimizations to make sure we keep old objects where possible + ClusterState.Builder builder = ClusterState.builder(updatedState); + + // if the routing table did not change, use the original one + if (updatedState.routingTable().version() == currentState.routingTable().version()) { + builder.routingTable(currentState.routingTable()); + } + // same for metadata + if (updatedState.metaData().version() == currentState.metaData().version()) { + builder.metaData(currentState.metaData()); + } else { + // if its not the same version, only copy over new indices or ones that changed the version + MetaData.Builder metaDataBuilder = MetaData.builder(updatedState.metaData()).removeAllIndices(); + for (IndexMetaData indexMetaData : updatedState.metaData()) { + IndexMetaData currentIndexMetaData = currentState.metaData().index(indexMetaData.index()); + if (currentIndexMetaData != null && currentIndexMetaData.isSameUUID(indexMetaData.indexUUID()) && + currentIndexMetaData.version() == indexMetaData.version()) { + // safe to reuse + metaDataBuilder.put(currentIndexMetaData, false); + } else { + metaDataBuilder.put(indexMetaData, false); } - builder.metaData(metaDataBuilder); } - - return builder.build(); + builder.metaData(metaDataBuilder); } - @Override - public void onFailure(String source, Throwable t) { - logger.error("unexpected failure during [{}]", t, source); - newStateProcessed.onNewClusterStateFailed(t); - } + return builder.build(); + } - @Override - public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { - sendInitialStateEventIfNeeded(); - newStateProcessed.onNewClusterStateProcessed(); - } - }); - } + @Override + public void onFailure(String source, Throwable t) { + logger.error("unexpected failure during [{}]", t, source); + newStateProcessed.onNewClusterStateFailed(t); + } + + @Override + public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { + sendInitialStateEventIfNeeded(); + newStateProcessed.onNewClusterStateProcessed(); + } + }); } } @@ -848,13 +856,8 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen * If the second condition fails we ignore the cluster state. */ static boolean shouldIgnoreOrRejectNewClusterState(ESLogger logger, ClusterState currentState, ClusterState newClusterState) { - if (currentState.nodes().masterNodeId() == null) { - return false; - } - if (!currentState.nodes().masterNodeId().equals(newClusterState.nodes().masterNodeId())) { - logger.warn("received a cluster state from a different master then the current one, rejecting (received {}, current {})", newClusterState.nodes().masterNode(), currentState.nodes().masterNode()); - throw new IllegalStateException("cluster state from a different master than the current one, rejecting (received " + newClusterState.nodes().masterNode() + ", current " + currentState.nodes().masterNode() + ")"); - } else if (newClusterState.version() < currentState.version()) { + rejectNewClusterStateIfNeeded(logger, currentState.nodes(), newClusterState); + if (currentState.nodes().masterNodeId() != null && newClusterState.version() < currentState.version()) { // if the new state has a smaller version, and it has the same master node, then no need to process it logger.debug("received a cluster state that has a lower version than the current one, ignoring (received {}, current {})", newClusterState.version(), currentState.version()); return true; @@ -863,6 +866,21 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen } } + /** + * In the case we follow an elected master the new cluster state needs to have the same elected master + * This method checks for this and throws an exception if needed + */ + + public static void rejectNewClusterStateIfNeeded(ESLogger logger, DiscoveryNodes currentNodes, ClusterState newClusterState) { + if (currentNodes.masterNodeId() == null) { + return; + } + if (!currentNodes.masterNodeId().equals(newClusterState.nodes().masterNodeId())) { + logger.warn("received a cluster state from a different master then the current one, rejecting (received {}, current {})", newClusterState.nodes().masterNode(), currentNodes.masterNode()); + throw new IllegalStateException("cluster state from a different master then the current one, rejecting (received " + newClusterState.nodes().masterNode() + ", current " + currentNodes.masterNode() + ")"); + } + } + void handleJoinRequest(final DiscoveryNode node, final MembershipAction.JoinCallback callback) { if (!transportService.addressSupported(node.address().getClass())) { @@ -1300,4 +1318,4 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen } } -} \ No newline at end of file +} diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index 78c13f8ce53..050805bcf5a 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -20,11 +20,9 @@ package org.elasticsearch.discovery.zen.publish; import com.google.common.collect.Maps; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; -import org.elasticsearch.cluster.ClusterChangedEvent; -import org.elasticsearch.cluster.ClusterState; -import org.elasticsearch.cluster.Diff; -import org.elasticsearch.cluster.IncompatibleClusterStateVersionException; +import org.elasticsearch.cluster.*; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.bytes.BytesReference; @@ -41,13 +39,17 @@ import org.elasticsearch.discovery.BlockingClusterStatePublishResponseHandler; import org.elasticsearch.discovery.Discovery; import org.elasticsearch.discovery.DiscoverySettings; import org.elasticsearch.discovery.zen.DiscoveryNodesProvider; +import org.elasticsearch.discovery.zen.ZenDiscovery; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.*; import java.io.IOException; +import java.util.ArrayList; import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; /** @@ -55,7 +57,8 @@ import java.util.concurrent.atomic.AtomicBoolean; */ public class PublishClusterStateAction extends AbstractComponent { - public static final String ACTION_NAME = "internal:discovery/zen/publish"; + public static final String SEND_ACTION_NAME = "internal:discovery/zen/publish/send"; + public static final String COMMIT_ACTION_NAME = "internal:discovery/zen/publish/commit"; public interface NewClusterStateListener { @@ -73,34 +76,41 @@ public class PublishClusterStateAction extends AbstractComponent { private final DiscoveryNodesProvider nodesProvider; private final NewClusterStateListener listener; private final DiscoverySettings discoverySettings; + private final ClusterName clusterName; public PublishClusterStateAction(Settings settings, TransportService transportService, DiscoveryNodesProvider nodesProvider, - NewClusterStateListener listener, DiscoverySettings discoverySettings) { + NewClusterStateListener listener, DiscoverySettings discoverySettings, ClusterName clusterName) { super(settings); this.transportService = transportService; this.nodesProvider = nodesProvider; this.listener = listener; this.discoverySettings = discoverySettings; - transportService.registerRequestHandler(ACTION_NAME, BytesTransportRequest.class, ThreadPool.Names.SAME, new PublishClusterStateRequestHandler()); + this.clusterName = clusterName; + transportService.registerRequestHandler(SEND_ACTION_NAME, BytesTransportRequest.class, ThreadPool.Names.SAME, new SendClusterStateRequestHandler()); + transportService.registerRequestHandler(COMMIT_ACTION_NAME, CommitClusterStateRequest.class, ThreadPool.Names.SAME, new CommitClusterStateRequestHandler()); } public void close() { - transportService.removeHandler(ACTION_NAME); + transportService.removeHandler(SEND_ACTION_NAME); + transportService.removeHandler(COMMIT_ACTION_NAME); } - public void publish(ClusterChangedEvent clusterChangedEvent, final Discovery.AckListener ackListener) { + public void publish(ClusterChangedEvent clusterChangedEvent, int minMasterNodes, final Discovery.AckListener ackListener) { Set nodesToPublishTo = new HashSet<>(clusterChangedEvent.state().nodes().size()); DiscoveryNode localNode = nodesProvider.nodes().localNode(); + int totalMasterNodes = 0; for (final DiscoveryNode node : clusterChangedEvent.state().nodes()) { - if (node.equals(localNode)) { - continue; + if (node.isMasterNode()) { + totalMasterNodes++; + } + if (node.equals(localNode) == false) { + nodesToPublishTo.add(node); } - nodesToPublishTo.add(node); } - publish(clusterChangedEvent, nodesToPublishTo, new AckClusterStatePublishResponseHandler(nodesToPublishTo, ackListener)); + publish(clusterChangedEvent, minMasterNodes, totalMasterNodes, nodesToPublishTo, new AckClusterStatePublishResponseHandler(nodesToPublishTo, ackListener)); } - private void publish(final ClusterChangedEvent clusterChangedEvent, final Set nodesToPublishTo, + private void publish(final ClusterChangedEvent clusterChangedEvent, int minMasterNodes, int totalMasterNodes, final Set nodesToPublishTo, final BlockingClusterStatePublishResponseHandler publishResponseHandler) { Map serializedStates = Maps.newHashMap(); @@ -111,6 +121,7 @@ public class PublishClusterStateAction extends AbstractComponent { final AtomicBoolean timedOutWaitingForNodes = new AtomicBoolean(false); final TimeValue publishTimeout = discoverySettings.getPublishTimeout(); final boolean sendFullVersion = !discoverySettings.getPublishDiff() || previousState == null; + final SendingController sendingController = new SendingController(clusterChangedEvent.state(), minMasterNodes, totalMasterNodes, publishResponseHandler); Diff diff = null; for (final DiscoveryNode node : nodesToPublishTo) { @@ -119,15 +130,17 @@ public class PublishClusterStateAction extends AbstractComponent { // per node when we send it over the wire, compress it while we are at it... // we don't send full version if node didn't exist in the previous version of cluster state if (sendFullVersion || !previousState.nodes().nodeExists(node.id())) { - sendFullClusterState(clusterState, serializedStates, node, timedOutWaitingForNodes, publishTimeout, publishResponseHandler); + sendFullClusterState(clusterState, serializedStates, node, timedOutWaitingForNodes, publishTimeout, sendingController); } else { if (diff == null) { diff = clusterState.diff(previousState); } - sendClusterStateDiff(clusterState, diff, serializedDiffs, node, timedOutWaitingForNodes, publishTimeout, publishResponseHandler); + sendClusterStateDiff(clusterState, diff, serializedDiffs, node, timedOutWaitingForNodes, publishTimeout, sendingController); } } + sendingController.waitForCommit(discoverySettings.getCommitTimeout()); + if (publishTimeout.millis() > 0) { // only wait if the publish timeout is configured... try { @@ -148,7 +161,7 @@ public class PublishClusterStateAction extends AbstractComponent { private void sendFullClusterState(ClusterState clusterState, @Nullable Map serializedStates, DiscoveryNode node, AtomicBoolean timedOutWaitingForNodes, TimeValue publishTimeout, - BlockingClusterStatePublishResponseHandler publishResponseHandler) { + SendingController sendingController) { BytesReference bytes = null; if (serializedStates != null) { bytes = serializedStates.get(node.version()); @@ -161,16 +174,16 @@ public class PublishClusterStateAction extends AbstractComponent { } } catch (Throwable e) { logger.warn("failed to serialize cluster_state before publishing it to node {}", e, node); - publishResponseHandler.onFailure(node, e); + sendingController.onNodeSendFailed(node, e); return; } } - publishClusterStateToNode(clusterState, bytes, node, timedOutWaitingForNodes, publishTimeout, publishResponseHandler, false); + sendClusterStateToNode(clusterState, bytes, node, timedOutWaitingForNodes, publishTimeout, sendingController, false); } private void sendClusterStateDiff(ClusterState clusterState, Diff diff, Map serializedDiffs, DiscoveryNode node, AtomicBoolean timedOutWaitingForNodes, TimeValue publishTimeout, - BlockingClusterStatePublishResponseHandler publishResponseHandler) { + SendingController sendingController) { BytesReference bytes = serializedDiffs.get(node.version()); if (bytes == null) { try { @@ -178,23 +191,23 @@ public class PublishClusterStateAction extends AbstractComponent { serializedDiffs.put(node.version(), bytes); } catch (Throwable e) { logger.warn("failed to serialize diff of cluster_state before publishing it to node {}", e, node); - publishResponseHandler.onFailure(node, e); + sendingController.onNodeSendFailed(node, e); return; } } - publishClusterStateToNode(clusterState, bytes, node, timedOutWaitingForNodes, publishTimeout, publishResponseHandler, true); + sendClusterStateToNode(clusterState, bytes, node, timedOutWaitingForNodes, publishTimeout, sendingController, true); } - private void publishClusterStateToNode(final ClusterState clusterState, BytesReference bytes, - final DiscoveryNode node, final AtomicBoolean timedOutWaitingForNodes, - final TimeValue publishTimeout, - final BlockingClusterStatePublishResponseHandler publishResponseHandler, - final boolean sendDiffs) { + private void sendClusterStateToNode(final ClusterState clusterState, BytesReference bytes, + final DiscoveryNode node, final AtomicBoolean timedOutWaitingForNodes, + final TimeValue publishTimeout, + final SendingController sendingController, + final boolean sendDiffs) { try { TransportRequestOptions options = TransportRequestOptions.options().withType(TransportRequestOptions.Type.STATE).withCompress(false); // no need to put a timeout on the options here, because we want the response to eventually be received // and not log an error if it arrives after the timeout - transportService.sendRequest(node, ACTION_NAME, + transportService.sendRequest(node, SEND_ACTION_NAME, new BytesTransportRequest(bytes, node.version()), options, // no need to compress, we already compressed the bytes @@ -205,26 +218,59 @@ public class PublishClusterStateAction extends AbstractComponent { if (timedOutWaitingForNodes.get()) { logger.debug("node {} responded for cluster state [{}] (took longer than [{}])", node, clusterState.version(), publishTimeout); } - publishResponseHandler.onResponse(node); + sendingController.onNodeSendAck(node); } @Override public void handleException(TransportException exp) { if (sendDiffs && exp.unwrapCause() instanceof IncompatibleClusterStateVersionException) { logger.debug("resending full cluster state to node {} reason {}", node, exp.getDetailedMessage()); - sendFullClusterState(clusterState, null, node, timedOutWaitingForNodes, publishTimeout, publishResponseHandler); + sendFullClusterState(clusterState, null, node, timedOutWaitingForNodes, publishTimeout, sendingController); } else { logger.debug("failed to send cluster state to {}", exp, node); - publishResponseHandler.onFailure(node, exp); + sendingController.onNodeSendFailed(node, exp); } } }); } catch (Throwable t) { logger.warn("error sending cluster state to {}", t, node); + sendingController.onNodeSendFailed(node, t); + } + } + + private void sendCommitToNode(final DiscoveryNode node, final ClusterState clusterState, final BlockingClusterStatePublishResponseHandler publishResponseHandler) { + try { + logger.trace("sending commit for cluster state (uuid: [{}], version [{}]) to [{}]", clusterState.stateUUID(), clusterState.version(), node); + TransportRequestOptions options = TransportRequestOptions.options().withType(TransportRequestOptions.Type.STATE).withCompress(false); + // no need to put a timeout on the options here, because we want the response to eventually be received + // and not log an error if it arrives after the timeout + transportService.sendRequest(node, COMMIT_ACTION_NAME, + new CommitClusterStateRequest(clusterState.stateUUID()), + options, // no need to compress, we already compressed the bytes + + new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { + + @Override + public void handleResponse(TransportResponse.Empty response) { +// if (timedOutWaitingForNodes.get()) { + logger.debug("node {} responded to cluster state commit [{}]", node, clusterState.version()); +// } + publishResponseHandler.onResponse(node); + } + + @Override + public void handleException(TransportException exp) { + logger.debug("failed to commit cluster state (uuid [{}], version [{}]) to {}", exp, clusterState.stateUUID(), clusterState.version(), node); + publishResponseHandler.onFailure(node, exp); + } + }); + } catch (Throwable t) { + logger.warn("error sending cluster state commit (uuid [{}], version [{}]) to {}", t, clusterState.stateUUID(), clusterState.version(), node); publishResponseHandler.onFailure(node, t); } } + public static BytesReference serializeFullClusterState(ClusterState clusterState, Version nodeVersion) throws IOException { BytesStreamOutput bStream = new BytesStreamOutput(); try (StreamOutput stream = CompressorFactory.defaultCompressor().streamOutput(bStream)) { @@ -245,8 +291,10 @@ public class PublishClusterStateAction extends AbstractComponent { return bStream.bytes(); } - private class PublishClusterStateRequestHandler implements TransportRequestHandler { - private ClusterState lastSeenClusterState; + private Object lastSeenClusterStateMutex = new Object(); + private ClusterState lastSeenClusterState; + + private class SendClusterStateRequestHandler implements TransportRequestHandler { @Override public void messageReceived(BytesTransportRequest request, final TransportChannel channel) throws Exception { @@ -258,24 +306,57 @@ public class PublishClusterStateAction extends AbstractComponent { in = request.bytes().streamInput(); } in.setVersion(request.version()); - synchronized (this) { + synchronized (lastSeenClusterStateMutex) { + final ClusterState incomingState; // If true we received full cluster state - otherwise diffs if (in.readBoolean()) { - lastSeenClusterState = ClusterState.Builder.readFrom(in, nodesProvider.nodes().localNode()); - logger.debug("received full cluster state version {} with size {}", lastSeenClusterState.version(), request.bytes().length()); + incomingState = ClusterState.Builder.readFrom(in, nodesProvider.nodes().localNode()); + logger.debug("received full cluster state version [{}] with size [{}]", incomingState.version(), request.bytes().length()); } else if (lastSeenClusterState != null) { Diff diff = lastSeenClusterState.readDiffFrom(in); - lastSeenClusterState = diff.apply(lastSeenClusterState); - logger.debug("received diff cluster state version {} with uuid {}, diff size {}", lastSeenClusterState.version(), lastSeenClusterState.stateUUID(), request.bytes().length()); + incomingState = diff.apply(lastSeenClusterState); + logger.debug("received diff cluster state version [{}] with uuid [{}], diff size [{}]", incomingState.version(), incomingState.stateUUID(), request.bytes().length()); } else { logger.debug("received diff for but don't have any local cluster state - requesting full state"); throw new IncompatibleClusterStateVersionException("have no local cluster state"); } + // sanity check incoming state + final ClusterName incomingClusterName = incomingState.getClusterName(); + if (!incomingClusterName.equals(PublishClusterStateAction.this.clusterName)) { + logger.warn("received cluster state from [{}] which is also master but with a different cluster name [{}]", incomingState.nodes().masterNode(), incomingClusterName); + throw new IllegalStateException("received state from a node that is not part of the cluster"); + } + if (incomingState.nodes().localNode() == null) { + logger.warn("received a cluster state from [{}] and not part of the cluster, should not happen", incomingState.nodes().masterNode()); + throw new IllegalStateException("received state from a node that is not part of the cluster"); + } + // state from another master requires more subtle checks, so we let it pass for now (it will be checked in ZenDiscovery) + if (nodesProvider.nodes().localNodeMaster() == false) { + ZenDiscovery.rejectNewClusterStateIfNeeded(logger, nodesProvider.nodes(), incomingState); + } + + lastSeenClusterState = incomingState; lastSeenClusterState.status(ClusterState.ClusterStateStatus.RECEIVED); } + channel.sendResponse(TransportResponse.Empty.INSTANCE); + } + } + + private class CommitClusterStateRequestHandler implements TransportRequestHandler { + @Override + public void messageReceived(CommitClusterStateRequest request, final TransportChannel channel) throws Exception { + ClusterState committedClusterState; + synchronized (lastSeenClusterStateMutex) { + committedClusterState = lastSeenClusterState; + } + if (committedClusterState.stateUUID().equals(request.stateUUID) == false) { + // nocommit: we need something better here + channel.sendResponse(TransportResponse.Empty.INSTANCE); + return; + } try { - listener.onNewClusterState(lastSeenClusterState, new NewClusterStateListener.NewStateProcessed() { + listener.onNewClusterState(committedClusterState, new NewClusterStateListener.NewStateProcessed() { @Override public void onNewClusterStateProcessed() { try { @@ -304,4 +385,110 @@ public class PublishClusterStateAction extends AbstractComponent { } } } -} + + static class CommitClusterStateRequest extends TransportRequest { + + String stateUUID; + + public CommitClusterStateRequest() { + } + + public CommitClusterStateRequest(String stateUUID) { + this.stateUUID = stateUUID; + } + + @Override + public void readFrom(StreamInput in) throws IOException { + super.readFrom(in); + stateUUID = in.readString(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeString(stateUUID); + } + } + + + public class FailedToCommitException extends ElasticsearchException { + + public FailedToCommitException(String msg) { + super(msg); + } + } + + class SendingController { + + private final ClusterState clusterState; + private final BlockingClusterStatePublishResponseHandler publishResponseHandler; + volatile int neededMastersToCommit; + int pendingMasterNodes; + final ArrayList sendAckedBeforeCommit = new ArrayList<>(); + final CountDownLatch comittedOrFailed; + final AtomicBoolean committed; + + private SendingController(ClusterState clusterState, int minMasterNodes, int totalMasterNodes, BlockingClusterStatePublishResponseHandler publishResponseHandler) { + this.clusterState = clusterState; + this.publishResponseHandler = publishResponseHandler; + this.neededMastersToCommit = Math.max(0, minMasterNodes - 1); // we are one of the master nodes + this.pendingMasterNodes = totalMasterNodes - 1; + this.committed = new AtomicBoolean(neededMastersToCommit == 0); + this.comittedOrFailed = new CountDownLatch(committed.get() ? 0 : 1); + } + + public void waitForCommit(TimeValue commitTimeout) { + try { + comittedOrFailed.await(commitTimeout.millis(), TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + + } + if (committed.get() == false) { + throw new FailedToCommitException("failed to get enough masters to ack sent cluster state. [" + neededMastersToCommit + "] left"); + } + } + + synchronized public void onNodeSendAck(DiscoveryNode node) { + if (committed.get() == false) { + sendAckedBeforeCommit.add(node); + if (node.isMasterNode()) { + onMasterNodeSendAck(node); + } + } else { + assert sendAckedBeforeCommit.isEmpty(); + sendCommitToNode(node, clusterState, publishResponseHandler); + } + + } + + private void onMasterNodeSendAck(DiscoveryNode node) { + neededMastersToCommit--; + if (neededMastersToCommit == 0) { + logger.trace("committing version [{}]", clusterState.version()); + for (DiscoveryNode nodeToCommit : sendAckedBeforeCommit) { + sendCommitToNode(nodeToCommit, clusterState, publishResponseHandler); + } + sendAckedBeforeCommit.clear(); + boolean success = committed.compareAndSet(false, true); + assert success; + comittedOrFailed.countDown(); + } + onMasterNodeDone(node); + } + + private void onMasterNodeDone(DiscoveryNode node) { + pendingMasterNodes--; + if (pendingMasterNodes == 0) { + comittedOrFailed.countDown(); + } + } + + synchronized public void onNodeSendFailed(DiscoveryNode node, Throwable t) { + if (node.isMasterNode()) { + onMasterNodeDone(node); + } + publishResponseHandler.onFailure(node, t); + } + + } +} \ No newline at end of file diff --git a/core/src/test/java/org/elasticsearch/cluster/ClusterStateDiffPublishingTests.java b/core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java similarity index 97% rename from core/src/test/java/org/elasticsearch/cluster/ClusterStateDiffPublishingTests.java rename to core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java index 5575ed93317..5bd51f8ec7a 100644 --- a/core/src/test/java/org/elasticsearch/cluster/ClusterStateDiffPublishingTests.java +++ b/core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java @@ -60,7 +60,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import static com.google.common.collect.Maps.newHashMap; import static org.hamcrest.Matchers.*; -public class ClusterStateDiffPublishingTests extends ESTestCase { +public class PublishClusterStateActionTests extends ESTestCase { protected ThreadPool threadPool; protected Map nodes = newHashMap(); @@ -177,7 +177,7 @@ public class ClusterStateDiffPublishingTests extends ESTestCase { protected PublishClusterStateAction buildPublishClusterStateAction(Settings settings, MockTransportService transportService, MockDiscoveryNodesProvider nodesProvider, PublishClusterStateAction.NewClusterStateListener listener) { DiscoverySettings discoverySettings = new DiscoverySettings(settings, new NodeSettingsService(settings)); - return new PublishClusterStateAction(settings, transportService, nodesProvider, listener, discoverySettings); + return new PublishClusterStateAction(settings, transportService, nodesProvider, listener, discoverySettings, ClusterName.DEFAULT); } @@ -217,7 +217,7 @@ public class ClusterStateDiffPublishingTests extends ESTestCase { // Initial cluster state DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); - ClusterState clusterState = ClusterState.builder(new ClusterName("test")).nodes(discoveryNodes).build(); + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); // cluster state update - add nodeB discoveryNodes = DiscoveryNodes.builder(discoveryNodes).put(nodeB.discoveryNode).build(); @@ -356,7 +356,7 @@ public class ClusterStateDiffPublishingTests extends ESTestCase { // Initial cluster state with both states - the second node still shouldn't get diff even though it's present in the previous cluster state DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).put(nodeB.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); - ClusterState previousClusterState = ClusterState.builder(new ClusterName("test")).nodes(discoveryNodes).build(); + ClusterState previousClusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); ClusterState clusterState = ClusterState.builder(previousClusterState).incrementVersion().build(); mockListenerB.add(new NewClusterStateExpectation() { @Override @@ -401,7 +401,7 @@ public class ClusterStateDiffPublishingTests extends ESTestCase { // Initial cluster state DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); - ClusterState clusterState = ClusterState.builder(new ClusterName("test")).nodes(discoveryNodes).build(); + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); // cluster state update - add nodeB discoveryNodes = DiscoveryNodes.builder(discoveryNodes).put(nodeB.discoveryNode).build(); @@ -447,7 +447,7 @@ public class ClusterStateDiffPublishingTests extends ESTestCase { AssertingAckListener[] listeners = new AssertingAckListener[numberOfIterations]; DiscoveryNodes discoveryNodes = discoveryNodesBuilder.build(); MetaData metaData = MetaData.EMPTY_META_DATA; - ClusterState clusterState = ClusterState.builder(new ClusterName("test")).metaData(metaData).build(); + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metaData(metaData).build(); ClusterState previousState; for (int i = 0; i < numberOfIterations; i++) { previousState = clusterState; @@ -477,7 +477,7 @@ public class ClusterStateDiffPublishingTests extends ESTestCase { // Initial cluster state with both states - the second node still shouldn't get diff even though it's present in the previous cluster state DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).put(nodeB.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); - ClusterState previousClusterState = ClusterState.builder(new ClusterName("test")).nodes(discoveryNodes).build(); + ClusterState previousClusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); ClusterState clusterState = ClusterState.builder(previousClusterState).incrementVersion().build(); mockListenerB.add(new NewClusterStateExpectation() { @Override @@ -545,7 +545,8 @@ public class ClusterStateDiffPublishingTests extends ESTestCase { public AssertingAckListener publishStateDiff(PublishClusterStateAction action, ClusterState state, ClusterState previousState) throws InterruptedException { AssertingAckListener assertingAckListener = new AssertingAckListener(state.nodes().getSize() - 1); ClusterChangedEvent changedEvent = new ClusterChangedEvent("test update", state, previousState); - action.publish(changedEvent, assertingAckListener); + int requiredNodes = randomIntBetween(-1, state.nodes().getSize() - 1); + action.publish(changedEvent, requiredNodes, assertingAckListener); return assertingAckListener; } diff --git a/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java b/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java index c18abdc5bef..6d4b0a9ee45 100644 --- a/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java +++ b/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java @@ -828,7 +828,11 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase { logger.info("blocking cluster state publishing from master [{}] to non master [{}]", masterNode, nonMasterNode); MockTransportService masterTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, masterNode); - masterTransportService.addFailToSendNoConnectRule(discoveryNodes.localNode(), PublishClusterStateAction.ACTION_NAME); + if (randomBoolean()) { + masterTransportService.addFailToSendNoConnectRule(discoveryNodes.localNode(), PublishClusterStateAction.SEND_ACTION_NAME); + } else { + masterTransportService.addFailToSendNoConnectRule(discoveryNodes.localNode(), PublishClusterStateAction.COMMIT_ACTION_NAME); + } logger.info("allowing requests from non master [{}] to master [{}], waiting for two join request", nonMasterNode, masterNode); final CountDownLatch countDownLatch = new CountDownLatch(2); diff --git a/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryIT.java b/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryIT.java index cc293375a2c..9b79ff9f7a7 100644 --- a/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryIT.java +++ b/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryIT.java @@ -205,7 +205,7 @@ public class ZenDiscoveryIT extends ESIntegTestCase { final CountDownLatch latch = new CountDownLatch(1); final AtomicReference reference = new AtomicReference<>(); - internalCluster().getInstance(TransportService.class, noneMasterNode).sendRequest(node, PublishClusterStateAction.ACTION_NAME, new BytesTransportRequest(bytes, Version.CURRENT), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { + internalCluster().getInstance(TransportService.class, noneMasterNode).sendRequest(node, PublishClusterStateAction.SEND_ACTION_NAME, new BytesTransportRequest(bytes, Version.CURRENT), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { @Override public void handleResponse(TransportResponse.Empty response) { From 81e07e81e0e13bb9eb255f33e14c728849c0d650 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Thu, 20 Aug 2015 15:49:35 +0200 Subject: [PATCH 02/21] simplified PublishClusterStateActionTests infra --- .../PublishClusterStateActionTests.java | 263 +++++++----------- 1 file changed, 93 insertions(+), 170 deletions(-) diff --git a/core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java b/core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java index 5bd51f8ec7a..a395ebdd3d4 100644 --- a/core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java +++ b/core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java @@ -31,6 +31,7 @@ import org.elasticsearch.common.collect.ImmutableOpenMap; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.discovery.Discovery; import org.elasticsearch.discovery.DiscoverySettings; @@ -65,45 +66,57 @@ public class PublishClusterStateActionTests extends ESTestCase { protected ThreadPool threadPool; protected Map nodes = newHashMap(); - public static class MockNode { + public static class MockNode implements PublishClusterStateAction.NewClusterStateListener { public final DiscoveryNode discoveryNode; public final MockTransportService service; - public final PublishClusterStateAction action; + public PublishClusterStateAction action; public final MockDiscoveryNodesProvider nodesProvider; + public final ClusterStateListener listener; - public MockNode(DiscoveryNode discoveryNode, MockTransportService service, PublishClusterStateAction action, MockDiscoveryNodesProvider nodesProvider) { + public volatile ClusterState clusterState; + + private final ESLogger logger; + + public MockNode(DiscoveryNode discoveryNode, MockTransportService service, MockDiscoveryNodesProvider nodesProvider, @Nullable ClusterStateListener listener, ESLogger logger) { this.discoveryNode = discoveryNode; this.service = service; - this.action = action; this.nodesProvider = nodesProvider; + this.listener = listener; + this.logger = logger; + this.clusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(DiscoveryNodes.builder().put(discoveryNode).localNodeId(discoveryNode.id()).build()).build(); } public void connectTo(DiscoveryNode node) { service.connectToNode(node); nodesProvider.addNode(node); } + + @Override + public void onNewClusterState(ClusterState newClusterState, NewStateProcessed newStateProcessed) { + logger.debug("[{}] received version [{}], uuid [{}]", discoveryNode.name(), newClusterState.version(), newClusterState.stateUUID()); + if (listener != null) { + ClusterChangedEvent event = new ClusterChangedEvent("", newClusterState, clusterState); + listener.clusterChanged(event); + } + clusterState = newClusterState; + newStateProcessed.onNewClusterStateProcessed(); + } } public MockNode createMockNode(final String name, Settings settings, Version version) throws Exception { - return createMockNode(name, settings, version, new PublishClusterStateAction.NewClusterStateListener() { - @Override - public void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) { - logger.debug("Node [{}] onNewClusterState version [{}], uuid [{}]", name, clusterState.version(), clusterState.stateUUID()); - newStateProcessed.onNewClusterStateProcessed(); - } - }); + return createMockNode(name, settings, version, null); } - public MockNode createMockNode(String name, Settings settings, Version version, PublishClusterStateAction.NewClusterStateListener listener) throws Exception { + public MockNode createMockNode(String name, Settings settings, Version version, @Nullable ClusterStateListener listener) throws Exception { MockTransportService service = buildTransportService( Settings.builder().put(settings).put("name", name, TransportService.SETTING_TRACE_LOG_INCLUDE, "", TransportService.SETTING_TRACE_LOG_EXCLUDE, "NOTHING").build(), version ); DiscoveryNode discoveryNode = new DiscoveryNode(name, name, service.boundAddress().publishAddress(), ImmutableMap.of(), version); MockDiscoveryNodesProvider nodesProvider = new MockDiscoveryNodesProvider(discoveryNode); - PublishClusterStateAction action = buildPublishClusterStateAction(settings, service, nodesProvider, listener); - MockNode node = new MockNode(discoveryNode, service, action, nodesProvider); + MockNode node = new MockNode(discoveryNode, service, nodesProvider, listener, logger); nodesProvider.addNode(discoveryNode); + node.action = buildPublishClusterStateAction(settings, service, nodesProvider, node); final CountDownLatch latch = new CountDownLatch(nodes.size() * 2 + 1); TransportConnectionListener waitForConnection = new TransportConnectionListener() { @Override @@ -209,11 +222,8 @@ public class PublishClusterStateActionTests extends ESTestCase { @Test @TestLogging("cluster:DEBUG,discovery.zen.publish:DEBUG") public void testSimpleClusterStatePublishing() throws Exception { - MockNewClusterStateListener mockListenerA = new MockNewClusterStateListener(); - MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT, mockListenerA); - - MockNewClusterStateListener mockListenerB = new MockNewClusterStateListener(); - MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT, mockListenerB); + MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT); + MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT); // Initial cluster state DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); @@ -223,103 +233,53 @@ public class PublishClusterStateActionTests extends ESTestCase { discoveryNodes = DiscoveryNodes.builder(discoveryNodes).put(nodeB.discoveryNode).build(); ClusterState previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).nodes(discoveryNodes).incrementVersion().build(); - mockListenerB.add(new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertFalse(clusterState.wasReadFromDiff()); - } - }); publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + assertSameStateFromFull(nodeB.clusterState, clusterState); // cluster state update - add block previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).blocks(ClusterBlocks.builder().addGlobalBlock(MetaData.CLUSTER_READ_ONLY_BLOCK)).incrementVersion().build(); - mockListenerB.add(new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertTrue(clusterState.wasReadFromDiff()); - assertThat(clusterState.blocks().global().size(), equalTo(1)); - } - }); publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + assertSameStateFromDiff(nodeB.clusterState, clusterState); + assertThat(nodeB.clusterState.blocks().global().size(), equalTo(1)); // cluster state update - remove block previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).blocks(ClusterBlocks.EMPTY_CLUSTER_BLOCK).incrementVersion().build(); - mockListenerB.add(new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertTrue(clusterState.wasReadFromDiff()); - assertThat(clusterState.blocks().global().size(), equalTo(0)); - } - }); publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + assertSameStateFromDiff(nodeB.clusterState, clusterState); + assertTrue(nodeB.clusterState.wasReadFromDiff()); // Adding new node - this node should get full cluster state while nodeB should still be getting diffs - MockNewClusterStateListener mockListenerC = new MockNewClusterStateListener(); - MockNode nodeC = createMockNode("nodeC", Settings.EMPTY, Version.CURRENT, mockListenerC); + MockNode nodeC = createMockNode("nodeC", Settings.EMPTY, Version.CURRENT); // cluster state update 3 - register node C previousClusterState = clusterState; discoveryNodes = DiscoveryNodes.builder(discoveryNodes).put(nodeC.discoveryNode).build(); clusterState = ClusterState.builder(clusterState).nodes(discoveryNodes).incrementVersion().build(); - mockListenerB.add(new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertTrue(clusterState.wasReadFromDiff()); - assertThat(clusterState.blocks().global().size(), equalTo(0)); - } - }); - mockListenerC.add(new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - // First state - assertFalse(clusterState.wasReadFromDiff()); - } - }); publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + assertSameStateFromDiff(nodeB.clusterState, clusterState); + // First state + assertSameStateFromFull(nodeC.clusterState, clusterState); // cluster state update 4 - update settings previousClusterState = clusterState; MetaData metaData = MetaData.builder(clusterState.metaData()).transientSettings(Settings.settingsBuilder().put("foo", "bar").build()).build(); clusterState = ClusterState.builder(clusterState).metaData(metaData).incrementVersion().build(); - NewClusterStateExpectation expectation = new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertTrue(clusterState.wasReadFromDiff()); - assertThat(clusterState.blocks().global().size(), equalTo(0)); - } - }; - mockListenerB.add(expectation); - mockListenerC.add(expectation); publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + assertSameStateFromDiff(nodeB.clusterState, clusterState); + assertThat(nodeB.clusterState.blocks().global().size(), equalTo(0)); + assertSameStateFromDiff(nodeC.clusterState, clusterState); + assertThat(nodeC.clusterState.blocks().global().size(), equalTo(0)); // cluster state update - skipping one version change - should request full cluster state previousClusterState = ClusterState.builder(clusterState).incrementVersion().build(); clusterState = ClusterState.builder(clusterState).incrementVersion().build(); - expectation = new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertFalse(clusterState.wasReadFromDiff()); - } - }; - mockListenerB.add(expectation); - mockListenerC.add(expectation); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); - - // cluster state update - skipping one version change - should request full cluster state - previousClusterState = ClusterState.builder(clusterState).incrementVersion().build(); - clusterState = ClusterState.builder(clusterState).incrementVersion().build(); - expectation = new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertFalse(clusterState.wasReadFromDiff()); - } - }; - mockListenerB.add(expectation); - mockListenerC.add(expectation); publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + assertSameStateFromFull(nodeB.clusterState, clusterState); + assertSameStateFromFull(nodeC.clusterState, clusterState); + assertFalse(nodeC.clusterState.wasReadFromDiff()); // node B becomes the master and sends a version of the cluster state that goes back discoveryNodes = DiscoveryNodes.builder(discoveryNodes) @@ -329,53 +289,36 @@ public class PublishClusterStateActionTests extends ESTestCase { .build(); previousClusterState = ClusterState.builder(new ClusterName("test")).nodes(discoveryNodes).build(); clusterState = ClusterState.builder(clusterState).nodes(discoveryNodes).incrementVersion().build(); - expectation = new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertFalse(clusterState.wasReadFromDiff()); - } - }; - mockListenerA.add(expectation); - mockListenerC.add(expectation); publishStateDiffAndWait(nodeB.action, clusterState, previousClusterState); + assertSameStateFromFull(nodeA.clusterState, clusterState); + assertSameStateFromFull(nodeC.clusterState, clusterState); } @Test @TestLogging("cluster:DEBUG,discovery.zen.publish:DEBUG") public void testUnexpectedDiffPublishing() throws Exception { - MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT, new PublishClusterStateAction.NewClusterStateListener() { + MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT, new ClusterStateListener() { @Override - public void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) { + public void clusterChanged(ClusterChangedEvent event) { fail("Shouldn't send cluster state to myself"); } }); - MockNewClusterStateListener mockListenerB = new MockNewClusterStateListener(); - MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT, mockListenerB); + MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT); // Initial cluster state with both states - the second node still shouldn't get diff even though it's present in the previous cluster state DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).put(nodeB.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); ClusterState previousClusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); ClusterState clusterState = ClusterState.builder(previousClusterState).incrementVersion().build(); - mockListenerB.add(new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertFalse(clusterState.wasReadFromDiff()); - } - }); publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + assertSameStateFromFull(nodeB.clusterState, clusterState); // cluster state update - add block previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).blocks(ClusterBlocks.builder().addGlobalBlock(MetaData.CLUSTER_READ_ONLY_BLOCK)).incrementVersion().build(); - mockListenerB.add(new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertTrue(clusterState.wasReadFromDiff()); - } - }); publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + assertSameStateFromDiff(nodeB.clusterState, clusterState); } @Test @@ -383,19 +326,17 @@ public class PublishClusterStateActionTests extends ESTestCase { public void testDisablingDiffPublishing() throws Exception { Settings noDiffPublishingSettings = Settings.builder().put(DiscoverySettings.PUBLISH_DIFF_ENABLE, false).build(); - MockNode nodeA = createMockNode("nodeA", noDiffPublishingSettings, Version.CURRENT, new PublishClusterStateAction.NewClusterStateListener() { + MockNode nodeA = createMockNode("nodeA", noDiffPublishingSettings, Version.CURRENT, new ClusterStateListener() { @Override - public void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) { + public void clusterChanged(ClusterChangedEvent event) { fail("Shouldn't send cluster state to myself"); } }); - MockNode nodeB = createMockNode("nodeB", noDiffPublishingSettings, Version.CURRENT, new PublishClusterStateAction.NewClusterStateListener() { + MockNode nodeB = createMockNode("nodeB", noDiffPublishingSettings, Version.CURRENT, new ClusterStateListener() { @Override - public void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) { - logger.debug("Got cluster state update, version [{}], guid [{}], from diff [{}]", clusterState.version(), clusterState.stateUUID(), clusterState.wasReadFromDiff()); - assertFalse(clusterState.wasReadFromDiff()); - newStateProcessed.onNewClusterStateProcessed(); + public void clusterChanged(ClusterChangedEvent event) { + assertFalse(event.state().wasReadFromDiff()); } }); @@ -418,33 +359,28 @@ public class PublishClusterStateActionTests extends ESTestCase { @Test @TestLogging("cluster:DEBUG,discovery.zen.publish:DEBUG") + /** + * Test concurrent publishing works correctly (although not strictly required, it's a good testamne + */ public void testSimultaneousClusterStatePublishing() throws Exception { int numberOfNodes = randomIntBetween(2, 10); int numberOfIterations = randomIntBetween(50, 200); - Settings settings = Settings.builder().put(DiscoverySettings.PUBLISH_TIMEOUT, "100ms").put(DiscoverySettings.PUBLISH_DIFF_ENABLE, true).build(); + Settings settings = Settings.builder().put(DiscoverySettings.PUBLISH_DIFF_ENABLE, randomBoolean()).build(); MockNode[] nodes = new MockNode[numberOfNodes]; DiscoveryNodes.Builder discoveryNodesBuilder = DiscoveryNodes.builder(); for (int i = 0; i < nodes.length; i++) { final String name = "node" + i; - nodes[i] = createMockNode(name, settings, Version.CURRENT, new PublishClusterStateAction.NewClusterStateListener() { + nodes[i] = createMockNode(name, settings, Version.CURRENT, new ClusterStateListener() { @Override - public synchronized void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) { - assertProperMetaDataForVersion(clusterState.metaData(), clusterState.version()); - if (randomInt(10) < 2) { - // Cause timeouts from time to time - try { - Thread.sleep(randomInt(110)); - } catch (InterruptedException ex) { - Thread.currentThread().interrupt(); - } - } - newStateProcessed.onNewClusterStateProcessed(); + public void clusterChanged(ClusterChangedEvent event) { + assertProperMetaDataForVersion(event.state().metaData(), event.state().version()); } }); discoveryNodesBuilder.put(nodes[i].discoveryNode); } AssertingAckListener[] listeners = new AssertingAckListener[numberOfIterations]; + discoveryNodesBuilder.localNodeId(nodes[0].discoveryNode.id()); DiscoveryNodes discoveryNodes = discoveryNodesBuilder.build(); MetaData metaData = MetaData.EMPTY_META_DATA; ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metaData(metaData).build(); @@ -459,43 +395,40 @@ public class PublishClusterStateActionTests extends ESTestCase { for (int i = 0; i < numberOfIterations; i++) { listeners[i].await(1, TimeUnit.SECONDS); } + + // fake node[0] - it is the master + nodes[0].clusterState = clusterState; + + for (MockNode node : nodes) { + assertThat(node.discoveryNode + " misses a cluster state", node.clusterState, notNullValue()); + assertThat(node.discoveryNode + " unexpected cluster state: " + node.clusterState, node.clusterState.version(), equalTo(clusterState.version())); + assertThat(node.clusterState.nodes().localNode(), equalTo(node.discoveryNode)); + } } @Test @TestLogging("cluster:DEBUG,discovery.zen.publish:DEBUG") public void testSerializationFailureDuringDiffPublishing() throws Exception { - MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT, new PublishClusterStateAction.NewClusterStateListener() { + MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT, new ClusterStateListener() { @Override - public void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) { + public void clusterChanged(ClusterChangedEvent event) { fail("Shouldn't send cluster state to myself"); } }); - MockNewClusterStateListener mockListenerB = new MockNewClusterStateListener(); - MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT, mockListenerB); + MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT); // Initial cluster state with both states - the second node still shouldn't get diff even though it's present in the previous cluster state DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).put(nodeB.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); ClusterState previousClusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); ClusterState clusterState = ClusterState.builder(previousClusterState).incrementVersion().build(); - mockListenerB.add(new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertFalse(clusterState.wasReadFromDiff()); - } - }); publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + assertSameStateFromFull(nodeB.clusterState, clusterState); // cluster state update - add block previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).blocks(ClusterBlocks.builder().addGlobalBlock(MetaData.CLUSTER_READ_ONLY_BLOCK)).incrementVersion().build(); - mockListenerB.add(new NewClusterStateExpectation() { - @Override - public void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - assertTrue(clusterState.wasReadFromDiff()); - } - }); ClusterState unserializableClusterState = new ClusterState(clusterState.version(), clusterState.stateUUID(), clusterState) { @Override @@ -589,31 +522,6 @@ public class PublishClusterStateActionTests extends ESTestCase { } - public interface NewClusterStateExpectation { - void check(ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed); - } - - public static class MockNewClusterStateListener implements PublishClusterStateAction.NewClusterStateListener { - CopyOnWriteArrayList expectations = new CopyOnWriteArrayList(); - - @Override - public void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) { - final NewClusterStateExpectation expectation; - try { - expectation = expectations.remove(0); - } catch (ArrayIndexOutOfBoundsException ex) { - fail("Unexpected cluster state update " + clusterState.prettyPrint()); - return; - } - expectation.check(clusterState, newStateProcessed); - newStateProcessed.onNewClusterStateProcessed(); - } - - public void add(NewClusterStateExpectation expectation) { - expectations.add(expectation); - } - } - public static class DelegatingClusterState extends ClusterState { public DelegatingClusterState(ClusterState clusterState) { @@ -623,4 +531,19 @@ public class PublishClusterStateActionTests extends ESTestCase { } + + void assertSameState(ClusterState actual, ClusterState expected) { + assertThat(actual, notNullValue()); + assertThat("\n--> actual ClusterState: " + actual.prettyPrint() + "\n--> expected ClusterState:" + expected.prettyPrint(), actual.stateUUID(), equalTo(expected.stateUUID())); + } + + void assertSameStateFromDiff(ClusterState actual, ClusterState expected) { + assertSameState(actual, expected); + assertTrue(actual.wasReadFromDiff()); + } + + void assertSameStateFromFull(ClusterState actual, ClusterState expected) { + assertSameState(actual, expected); + assertFalse(actual.wasReadFromDiff()); + } } From b702843fe9c11b4aa0b9a044ad331df06aae1a04 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Sun, 23 Aug 2015 18:13:36 +0200 Subject: [PATCH 03/21] beefed up testing... --- .../elasticsearch/cluster/ClusterModule.java | 1 + .../elasticsearch/cluster/ClusterState.java | 2 +- .../service/InternalClusterService.java | 10 +- .../discovery/DiscoverySettings.java | 9 +- .../discovery/zen/ZenDiscovery.java | 8 +- .../publish/PublishClusterStateAction.java | 326 +++++++----- .../cluster/MinimumMasterNodesIT.java | 78 +++ .../PublishClusterStateActionTests.java | 465 ++++++++++++++---- .../disruption/NetworkDelaysPartition.java | 6 +- 9 files changed, 664 insertions(+), 241 deletions(-) rename core/src/test/java/org/elasticsearch/{cluster => discovery/zen/publish}/PublishClusterStateActionTests.java (53%) diff --git a/core/src/main/java/org/elasticsearch/cluster/ClusterModule.java b/core/src/main/java/org/elasticsearch/cluster/ClusterModule.java index 4720bb087dc..64621999cf5 100644 --- a/core/src/main/java/org/elasticsearch/cluster/ClusterModule.java +++ b/core/src/main/java/org/elasticsearch/cluster/ClusterModule.java @@ -195,6 +195,7 @@ public class ClusterModule extends AbstractModule { registerClusterDynamicSetting(DestructiveOperations.REQUIRES_NAME, Validator.EMPTY); registerClusterDynamicSetting(DiscoverySettings.PUBLISH_TIMEOUT, Validator.TIME_NON_NEGATIVE); registerClusterDynamicSetting(DiscoverySettings.PUBLISH_DIFF_ENABLE, Validator.BOOLEAN); + registerClusterDynamicSetting(DiscoverySettings.COMMIT_TIMEOUT, Validator.TIME_NON_NEGATIVE); registerClusterDynamicSetting(HierarchyCircuitBreakerService.TOTAL_CIRCUIT_BREAKER_LIMIT_SETTING, Validator.MEMORY_SIZE); registerClusterDynamicSetting(HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_LIMIT_SETTING, Validator.MEMORY_SIZE); registerClusterDynamicSetting(HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_OVERHEAD_SETTING, Validator.NON_NEGATIVE_DOUBLE); diff --git a/core/src/main/java/org/elasticsearch/cluster/ClusterState.java b/core/src/main/java/org/elasticsearch/cluster/ClusterState.java index 2bae50771ae..5b05f199dfa 100644 --- a/core/src/main/java/org/elasticsearch/cluster/ClusterState.java +++ b/core/src/main/java/org/elasticsearch/cluster/ClusterState.java @@ -256,7 +256,7 @@ public class ClusterState implements ToXContent, Diffable { } // Used for testing and logging to determine how this cluster state was send over the wire - boolean wasReadFromDiff() { + public boolean wasReadFromDiff() { return wasReadFromDiff; } diff --git a/core/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java b/core/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java index b992c3612ee..76262a381b4 100644 --- a/core/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java +++ b/core/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java @@ -482,8 +482,14 @@ public class InternalClusterService extends AbstractLifecycleComponent implemen try { publishClusterState.publish(clusterChangedEvent, electMaster.minimumMasterNodes(), ackListener); } catch (PublishClusterStateAction.FailedToCommitException t) { - logger.warn("failed to publish [{}] (not enough nodes acknowledged, min master nodes [{}])", clusterChangedEvent.state().version(), electMaster.minimumMasterNodes()); + // cluster service logs a WARN message + logger.debug("failed to publish cluster state version [{}] (not enough nodes acknowledged, min master nodes [{}])", clusterChangedEvent.state().version(), electMaster.minimumMasterNodes()); clusterService.submitStateUpdateTask("zen-disco-failed-to-publish", Priority.IMMEDIATE, new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { @@ -856,7 +856,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen * If the second condition fails we ignore the cluster state. */ static boolean shouldIgnoreOrRejectNewClusterState(ESLogger logger, ClusterState currentState, ClusterState newClusterState) { - rejectNewClusterStateIfNeeded(logger, currentState.nodes(), newClusterState); + validateStateIsFromCurrentMaster(logger, currentState.nodes(), newClusterState); if (currentState.nodes().masterNodeId() != null && newClusterState.version() < currentState.version()) { // if the new state has a smaller version, and it has the same master node, then no need to process it logger.debug("received a cluster state that has a lower version than the current one, ignoring (received {}, current {})", newClusterState.version(), currentState.version()); @@ -871,7 +871,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen * This method checks for this and throws an exception if needed */ - public static void rejectNewClusterStateIfNeeded(ESLogger logger, DiscoveryNodes currentNodes, ClusterState newClusterState) { + public static void validateStateIsFromCurrentMaster(ESLogger logger, DiscoveryNodes currentNodes, ClusterState newClusterState) { if (currentNodes.masterNodeId() == null) { return; } diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index 050805bcf5a..1100eeb7680 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -24,6 +24,7 @@ import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; import org.elasticsearch.cluster.*; import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.component.AbstractComponent; @@ -96,13 +97,11 @@ public class PublishClusterStateAction extends AbstractComponent { } public void publish(ClusterChangedEvent clusterChangedEvent, int minMasterNodes, final Discovery.AckListener ackListener) { - Set nodesToPublishTo = new HashSet<>(clusterChangedEvent.state().nodes().size()); - DiscoveryNode localNode = nodesProvider.nodes().localNode(); - int totalMasterNodes = 0; - for (final DiscoveryNode node : clusterChangedEvent.state().nodes()) { - if (node.isMasterNode()) { - totalMasterNodes++; - } + final DiscoveryNodes nodes = clusterChangedEvent.state().nodes(); + Set nodesToPublishTo = new HashSet<>(nodes.size()); + DiscoveryNode localNode = nodes.localNode(); + final int totalMasterNodes = nodes.masterNodes().size(); + for (final DiscoveryNode node : nodes) { if (node.equals(localNode) == false) { nodesToPublishTo.add(node); } @@ -118,24 +117,24 @@ public class PublishClusterStateAction extends AbstractComponent { final ClusterState clusterState = clusterChangedEvent.state(); final ClusterState previousState = clusterChangedEvent.previousState(); - final AtomicBoolean timedOutWaitingForNodes = new AtomicBoolean(false); final TimeValue publishTimeout = discoverySettings.getPublishTimeout(); final boolean sendFullVersion = !discoverySettings.getPublishDiff() || previousState == null; final SendingController sendingController = new SendingController(clusterChangedEvent.state(), minMasterNodes, totalMasterNodes, publishResponseHandler); - Diff diff = null; + + // we build these early as a best effort not to commit in the case of error. + // sadly this is not water tight as it may that a failed diff based publishing to a node + // will cause a full serialization based on an older version, which may fail after the + // change has been committed. + buildDiffAndSerializeStates(clusterState, previousState, nodesToPublishTo, sendFullVersion, serializedStates, serializedDiffs); for (final DiscoveryNode node : nodesToPublishTo) { - // try and serialize the cluster state once (or per version), so we don't serialize it // per node when we send it over the wire, compress it while we are at it... // we don't send full version if node didn't exist in the previous version of cluster state if (sendFullVersion || !previousState.nodes().nodeExists(node.id())) { - sendFullClusterState(clusterState, serializedStates, node, timedOutWaitingForNodes, publishTimeout, sendingController); + sendFullClusterState(clusterState, serializedStates, node, publishTimeout, sendingController); } else { - if (diff == null) { - diff = clusterState.diff(previousState); - } - sendClusterStateDiff(clusterState, diff, serializedDiffs, node, timedOutWaitingForNodes, publishTimeout, sendingController); + sendClusterStateDiff(clusterState, serializedDiffs, serializedStates, node, publishTimeout, sendingController); } } @@ -144,8 +143,8 @@ public class PublishClusterStateAction extends AbstractComponent { if (publishTimeout.millis() > 0) { // only wait if the publish timeout is configured... try { - timedOutWaitingForNodes.set(!publishResponseHandler.awaitAllNodes(publishTimeout)); - if (timedOutWaitingForNodes.get()) { + sendingController.setPublishingTimedOut(!publishResponseHandler.awaitAllNodes(publishTimeout)); + if (sendingController.getPublishingTimedOut()) { DiscoveryNode[] pendingNodes = publishResponseHandler.pendingNodes(); // everyone may have just responded if (pendingNodes.length > 0) { @@ -159,13 +158,34 @@ public class PublishClusterStateAction extends AbstractComponent { } } - private void sendFullClusterState(ClusterState clusterState, @Nullable Map serializedStates, - DiscoveryNode node, AtomicBoolean timedOutWaitingForNodes, TimeValue publishTimeout, - SendingController sendingController) { - BytesReference bytes = null; - if (serializedStates != null) { - bytes = serializedStates.get(node.version()); + private void buildDiffAndSerializeStates(ClusterState clusterState, ClusterState previousState, Set nodesToPublishTo, + boolean sendFullVersion, Map serializedStates, Map serializedDiffs) { + Diff diff = null; + for (final DiscoveryNode node : nodesToPublishTo) { + try { + if (sendFullVersion || !previousState.nodes().nodeExists(node.id())) { + // will send a full reference + if (serializedStates.containsKey(node.version()) == false) { + serializedStates.put(node.version(), serializeFullClusterState(clusterState, node.version())); + } + } else { + // will send a diff + if (diff == null) { + diff = clusterState.diff(previousState); + } + if (serializedDiffs.containsKey(node.version()) == false) { + serializedDiffs.put(node.version(), serializeDiffClusterState(diff, node.version())); + } + } + } catch (IOException e) { + throw new ElasticsearchException("failed to serialize cluster_state for publishing to node {}", e, node); + } } + } + + private void sendFullClusterState(ClusterState clusterState, @Nullable Map serializedStates, + DiscoveryNode node, TimeValue publishTimeout, SendingController sendingController) { + BytesReference bytes = serializedStates.get(node.version()); if (bytes == null) { try { bytes = serializeFullClusterState(clusterState, node.version()); @@ -178,31 +198,22 @@ public class PublishClusterStateAction extends AbstractComponent { return; } } - sendClusterStateToNode(clusterState, bytes, node, timedOutWaitingForNodes, publishTimeout, sendingController, false); + sendClusterStateToNode(clusterState, bytes, node, publishTimeout, sendingController, false, serializedStates); } - private void sendClusterStateDiff(ClusterState clusterState, Diff diff, Map serializedDiffs, DiscoveryNode node, - AtomicBoolean timedOutWaitingForNodes, TimeValue publishTimeout, - SendingController sendingController) { + private void sendClusterStateDiff(ClusterState clusterState, + Map serializedDiffs, Map serializedStates, + DiscoveryNode node, TimeValue publishTimeout, SendingController sendingController) { BytesReference bytes = serializedDiffs.get(node.version()); - if (bytes == null) { - try { - bytes = serializeDiffClusterState(diff, node.version()); - serializedDiffs.put(node.version(), bytes); - } catch (Throwable e) { - logger.warn("failed to serialize diff of cluster_state before publishing it to node {}", e, node); - sendingController.onNodeSendFailed(node, e); - return; - } - } - sendClusterStateToNode(clusterState, bytes, node, timedOutWaitingForNodes, publishTimeout, sendingController, true); + assert bytes != null : "failed to find serialized diff for node " + node + " of version [" + node.version() + "]"; + sendClusterStateToNode(clusterState, bytes, node, publishTimeout, sendingController, true, serializedStates); } private void sendClusterStateToNode(final ClusterState clusterState, BytesReference bytes, - final DiscoveryNode node, final AtomicBoolean timedOutWaitingForNodes, + final DiscoveryNode node, final TimeValue publishTimeout, final SendingController sendingController, - final boolean sendDiffs) { + final boolean sendDiffs, final Map serializedStates) { try { TransportRequestOptions options = TransportRequestOptions.options().withType(TransportRequestOptions.Type.STATE).withCompress(false); // no need to put a timeout on the options here, because we want the response to eventually be received @@ -215,7 +226,7 @@ public class PublishClusterStateAction extends AbstractComponent { @Override public void handleResponse(TransportResponse.Empty response) { - if (timedOutWaitingForNodes.get()) { + if (sendingController.getPublishingTimedOut()) { logger.debug("node {} responded for cluster state [{}] (took longer than [{}])", node, clusterState.version(), publishTimeout); } sendingController.onNodeSendAck(node); @@ -225,7 +236,7 @@ public class PublishClusterStateAction extends AbstractComponent { public void handleException(TransportException exp) { if (sendDiffs && exp.unwrapCause() instanceof IncompatibleClusterStateVersionException) { logger.debug("resending full cluster state to node {} reason {}", node, exp.getDetailedMessage()); - sendFullClusterState(clusterState, null, node, timedOutWaitingForNodes, publishTimeout, sendingController); + sendFullClusterState(clusterState, serializedStates, node, publishTimeout, sendingController); } else { logger.debug("failed to send cluster state to {}", exp, node); sendingController.onNodeSendFailed(node, exp); @@ -238,7 +249,7 @@ public class PublishClusterStateAction extends AbstractComponent { } } - private void sendCommitToNode(final DiscoveryNode node, final ClusterState clusterState, final BlockingClusterStatePublishResponseHandler publishResponseHandler) { + private void sendCommitToNode(final DiscoveryNode node, final ClusterState clusterState, final SendingController sendingController) { try { logger.trace("sending commit for cluster state (uuid: [{}], version [{}]) to [{}]", clusterState.stateUUID(), clusterState.version(), node); TransportRequestOptions options = TransportRequestOptions.options().withType(TransportRequestOptions.Type.STATE).withCompress(false); @@ -252,21 +263,21 @@ public class PublishClusterStateAction extends AbstractComponent { @Override public void handleResponse(TransportResponse.Empty response) { -// if (timedOutWaitingForNodes.get()) { - logger.debug("node {} responded to cluster state commit [{}]", node, clusterState.version()); -// } - publishResponseHandler.onResponse(node); + if (sendingController.getPublishingTimedOut()) { + logger.debug("node {} responded to cluster state commit [{}]", node, clusterState.version()); + } + sendingController.getPublishResponseHandler().onResponse(node); } @Override public void handleException(TransportException exp) { logger.debug("failed to commit cluster state (uuid [{}], version [{}]) to {}", exp, clusterState.stateUUID(), clusterState.version(), node); - publishResponseHandler.onFailure(node, exp); + sendingController.getPublishResponseHandler().onFailure(node, exp); } }); } catch (Throwable t) { logger.warn("error sending cluster state commit (uuid [{}], version [{}]) to {}", t, clusterState.stateUUID(), clusterState.version(), node); - publishResponseHandler.onFailure(node, t); + sendingController.getPublishResponseHandler().onFailure(node, t); } } @@ -294,99 +305,116 @@ public class PublishClusterStateAction extends AbstractComponent { private Object lastSeenClusterStateMutex = new Object(); private ClusterState lastSeenClusterState; + protected void handleIncomingClusterStateRequest(BytesTransportRequest request, TransportChannel channel) throws IOException { + Compressor compressor = CompressorFactory.compressor(request.bytes()); + StreamInput in; + if (compressor != null) { + in = compressor.streamInput(request.bytes().streamInput()); + } else { + in = request.bytes().streamInput(); + } + in.setVersion(request.version()); + synchronized (lastSeenClusterStateMutex) { + final ClusterState incomingState; + // If true we received full cluster state - otherwise diffs + if (in.readBoolean()) { + incomingState = ClusterState.Builder.readFrom(in, nodesProvider.nodes().localNode()); + logger.debug("received full cluster state version [{}] with size [{}]", incomingState.version(), request.bytes().length()); + } else if (lastSeenClusterState != null) { + Diff diff = lastSeenClusterState.readDiffFrom(in); + incomingState = diff.apply(lastSeenClusterState); + logger.debug("received diff cluster state version [{}] with uuid [{}], diff size [{}]", incomingState.version(), incomingState.stateUUID(), request.bytes().length()); + } else { + logger.debug("received diff for but don't have any local cluster state - requesting full state"); + throw new IncompatibleClusterStateVersionException("have no local cluster state"); + } + // sanity check incoming state + validateIncomingState(incomingState); + + lastSeenClusterState = incomingState; + lastSeenClusterState.status(ClusterState.ClusterStateStatus.RECEIVED); + } + channel.sendResponse(TransportResponse.Empty.INSTANCE); + } + + // package private for testing + + /** + * does simple sanity check of the incoming cluster state. Throws an exception on rejections. + */ + void validateIncomingState(ClusterState state) { + final ClusterName incomingClusterName = state.getClusterName(); + if (!incomingClusterName.equals(PublishClusterStateAction.this.clusterName)) { + logger.warn("received cluster state from [{}] which is also master but with a different cluster name [{}]", state.nodes().masterNode(), incomingClusterName); + throw new IllegalStateException("received state from a node that is not part of the cluster"); + } + final DiscoveryNodes currentNodes = nodesProvider.nodes(); + + if (currentNodes.localNode().equals(state.nodes().localNode()) == false) { + logger.warn("received a cluster state from [{}] and not part of the cluster, should not happen", state.nodes().masterNode()); + throw new IllegalStateException("received state from a node that is not part of the cluster"); + } + // state from another master requires more subtle checks, so we let it pass for now (it will be checked in ZenDiscovery) + if (currentNodes.localNodeMaster() == false) { + ZenDiscovery.validateStateIsFromCurrentMaster(logger, currentNodes, state); + } + } + + protected void handleCommitRequest(CommitClusterStateRequest request, final TransportChannel channel) { + ClusterState committedClusterState; + synchronized (lastSeenClusterStateMutex) { + committedClusterState = lastSeenClusterState; + } + + // if this message somehow comes without a previous send, we won't have a cluster state + String lastSeenUUID = committedClusterState == null ? null : committedClusterState.stateUUID(); + if (request.stateUUID.equals(lastSeenUUID) == false) { + throw new IllegalStateException("tried to commit cluster state UUID [" + request.stateUUID + "], but last seen UUID is [" + lastSeenUUID + "]"); + } + + try { + listener.onNewClusterState(committedClusterState, new NewClusterStateListener.NewStateProcessed() { + @Override + public void onNewClusterStateProcessed() { + try { + channel.sendResponse(TransportResponse.Empty.INSTANCE); + } catch (Throwable e) { + logger.debug("failed to send response on cluster state processed", e); + onNewClusterStateFailed(e); + } + } + + @Override + public void onNewClusterStateFailed(Throwable t) { + try { + channel.sendResponse(t); + } catch (Throwable e) { + logger.debug("failed to send response on cluster state processed", e); + } + } + }); + } catch (Exception e) { + logger.warn("unexpected error while processing cluster state version [{}]", e, lastSeenClusterState.version()); + throw e; + } + } + private class SendClusterStateRequestHandler implements TransportRequestHandler { @Override public void messageReceived(BytesTransportRequest request, final TransportChannel channel) throws Exception { - Compressor compressor = CompressorFactory.compressor(request.bytes()); - StreamInput in; - if (compressor != null) { - in = compressor.streamInput(request.bytes().streamInput()); - } else { - in = request.bytes().streamInput(); - } - in.setVersion(request.version()); - synchronized (lastSeenClusterStateMutex) { - final ClusterState incomingState; - // If true we received full cluster state - otherwise diffs - if (in.readBoolean()) { - incomingState = ClusterState.Builder.readFrom(in, nodesProvider.nodes().localNode()); - logger.debug("received full cluster state version [{}] with size [{}]", incomingState.version(), request.bytes().length()); - } else if (lastSeenClusterState != null) { - Diff diff = lastSeenClusterState.readDiffFrom(in); - incomingState = diff.apply(lastSeenClusterState); - logger.debug("received diff cluster state version [{}] with uuid [{}], diff size [{}]", incomingState.version(), incomingState.stateUUID(), request.bytes().length()); - } else { - logger.debug("received diff for but don't have any local cluster state - requesting full state"); - throw new IncompatibleClusterStateVersionException("have no local cluster state"); - } - // sanity check incoming state - final ClusterName incomingClusterName = incomingState.getClusterName(); - if (!incomingClusterName.equals(PublishClusterStateAction.this.clusterName)) { - logger.warn("received cluster state from [{}] which is also master but with a different cluster name [{}]", incomingState.nodes().masterNode(), incomingClusterName); - throw new IllegalStateException("received state from a node that is not part of the cluster"); - } - if (incomingState.nodes().localNode() == null) { - logger.warn("received a cluster state from [{}] and not part of the cluster, should not happen", incomingState.nodes().masterNode()); - throw new IllegalStateException("received state from a node that is not part of the cluster"); - } - // state from another master requires more subtle checks, so we let it pass for now (it will be checked in ZenDiscovery) - if (nodesProvider.nodes().localNodeMaster() == false) { - ZenDiscovery.rejectNewClusterStateIfNeeded(logger, nodesProvider.nodes(), incomingState); - } - - lastSeenClusterState = incomingState; - lastSeenClusterState.status(ClusterState.ClusterStateStatus.RECEIVED); - } - channel.sendResponse(TransportResponse.Empty.INSTANCE); + handleIncomingClusterStateRequest(request, channel); } } private class CommitClusterStateRequestHandler implements TransportRequestHandler { @Override public void messageReceived(CommitClusterStateRequest request, final TransportChannel channel) throws Exception { - ClusterState committedClusterState; - synchronized (lastSeenClusterStateMutex) { - committedClusterState = lastSeenClusterState; - } - if (committedClusterState.stateUUID().equals(request.stateUUID) == false) { - // nocommit: we need something better here - channel.sendResponse(TransportResponse.Empty.INSTANCE); - return; - } - - try { - listener.onNewClusterState(committedClusterState, new NewClusterStateListener.NewStateProcessed() { - @Override - public void onNewClusterStateProcessed() { - try { - channel.sendResponse(TransportResponse.Empty.INSTANCE); - } catch (Throwable e) { - logger.debug("failed to send response on cluster state processed", e); - } - } - - @Override - public void onNewClusterStateFailed(Throwable t) { - try { - channel.sendResponse(t); - } catch (Throwable e) { - logger.debug("failed to send response on cluster state processed", e); - } - } - }); - } catch (Exception e) { - logger.warn("unexpected error while processing cluster state version [{}]", e, lastSeenClusterState.version()); - try { - channel.sendResponse(e); - } catch (Throwable e1) { - logger.debug("failed to send response on cluster state processed", e1); - } - } + handleCommitRequest(request, channel); } } - static class CommitClusterStateRequest extends TransportRequest { + protected static class CommitClusterStateRequest extends TransportRequest { String stateUUID; @@ -413,14 +441,19 @@ public class PublishClusterStateAction extends AbstractComponent { public class FailedToCommitException extends ElasticsearchException { - public FailedToCommitException(String msg) { - super(msg); + public FailedToCommitException(String msg, Object... args) { + super(msg, args); } } class SendingController { private final ClusterState clusterState; + + public BlockingClusterStatePublishResponseHandler getPublishResponseHandler() { + return publishResponseHandler; + } + private final BlockingClusterStatePublishResponseHandler publishResponseHandler; volatile int neededMastersToCommit; int pendingMasterNodes; @@ -428,23 +461,31 @@ public class PublishClusterStateAction extends AbstractComponent { final CountDownLatch comittedOrFailed; final AtomicBoolean committed; + // an external marker to note that the publishing process is timed out. This is usefull for proper logging. + final AtomicBoolean publishingTimedOut = new AtomicBoolean(); + private SendingController(ClusterState clusterState, int minMasterNodes, int totalMasterNodes, BlockingClusterStatePublishResponseHandler publishResponseHandler) { this.clusterState = clusterState; this.publishResponseHandler = publishResponseHandler; this.neededMastersToCommit = Math.max(0, minMasterNodes - 1); // we are one of the master nodes this.pendingMasterNodes = totalMasterNodes - 1; + if (this.neededMastersToCommit > this.pendingMasterNodes) { + throw new FailedToCommitException("not enough masters to ack sent cluster state. [{}] needed , have [{}]", neededMastersToCommit, pendingMasterNodes); + } this.committed = new AtomicBoolean(neededMastersToCommit == 0); this.comittedOrFailed = new CountDownLatch(committed.get() ? 0 : 1); } public void waitForCommit(TimeValue commitTimeout) { + boolean timedout = false; try { - comittedOrFailed.await(commitTimeout.millis(), TimeUnit.MILLISECONDS); + timedout = comittedOrFailed.await(commitTimeout.millis(), TimeUnit.MILLISECONDS) == false; } catch (InterruptedException e) { } if (committed.get() == false) { - throw new FailedToCommitException("failed to get enough masters to ack sent cluster state. [" + neededMastersToCommit + "] left"); + throw new FailedToCommitException("{} enough masters to ack sent cluster state. [{}] left", + timedout ? "timed out while waiting for" : "failed to get", neededMastersToCommit); } } @@ -456,17 +497,19 @@ public class PublishClusterStateAction extends AbstractComponent { } } else { assert sendAckedBeforeCommit.isEmpty(); - sendCommitToNode(node, clusterState, publishResponseHandler); + sendCommitToNode(node, clusterState, this); } } private void onMasterNodeSendAck(DiscoveryNode node) { + logger.trace("master node {} acked cluster state version [{}]. processing ... (current pending [{}], needed [{}])", + node, clusterState.version(), pendingMasterNodes, neededMastersToCommit); neededMastersToCommit--; if (neededMastersToCommit == 0) { logger.trace("committing version [{}]", clusterState.version()); for (DiscoveryNode nodeToCommit : sendAckedBeforeCommit) { - sendCommitToNode(nodeToCommit, clusterState, publishResponseHandler); + sendCommitToNode(nodeToCommit, clusterState, this); } sendAckedBeforeCommit.clear(); boolean success = committed.compareAndSet(false, true); @@ -478,17 +521,28 @@ public class PublishClusterStateAction extends AbstractComponent { private void onMasterNodeDone(DiscoveryNode node) { pendingMasterNodes--; - if (pendingMasterNodes == 0) { + if (pendingMasterNodes == 0 && neededMastersToCommit > 0) { + logger.trace("failed to commit version [{}]. All master nodes acked or failed but [{}] acks are still needed", + clusterState.version(), neededMastersToCommit); comittedOrFailed.countDown(); } } synchronized public void onNodeSendFailed(DiscoveryNode node, Throwable t) { if (node.isMasterNode()) { + logger.trace("master node {} failed to ack cluster state version [{}]. processing ... (current pending [{}], needed [{}])", + node, clusterState.version(), pendingMasterNodes, neededMastersToCommit); onMasterNodeDone(node); } publishResponseHandler.onFailure(node, t); } + public boolean getPublishingTimedOut() { + return publishingTimedOut.get(); + } + + public void setPublishingTimedOut(boolean isTimedOut) { + publishingTimedOut.set(isTimedOut); + } } } \ No newline at end of file diff --git a/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java b/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java index a00679d05be..93238b70477 100644 --- a/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java +++ b/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java @@ -23,24 +23,37 @@ import com.google.common.base.Predicate; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; import org.elasticsearch.client.Client; +import org.elasticsearch.cluster.metadata.MetaData; import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.discovery.DiscoverySettings; +import org.elasticsearch.discovery.zen.ZenDiscovery; import org.elasticsearch.discovery.zen.elect.ElectMasterService; +import org.elasticsearch.discovery.zen.fd.FaultDetection; +import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.ESIntegTestCase.ClusterScope; +import org.elasticsearch.test.disruption.NetworkDelaysPartition; +import org.elasticsearch.test.disruption.NetworkUnresponsivePartition; import org.elasticsearch.test.junit.annotations.TestLogging; import org.junit.Test; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; import static org.elasticsearch.common.settings.Settings.settingsBuilder; import static org.elasticsearch.test.ESIntegTestCase.Scope; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoTimeout; import static org.hamcrest.Matchers.*; @ClusterScope(scope = Scope.TEST, numDataNodes = 0) @@ -332,4 +345,69 @@ public class MinimumMasterNodesIT extends ESIntegTestCase { logger.info("--> verifying no node left and master is up"); assertFalse(client().admin().cluster().prepareHealth().setWaitForNodes(Integer.toString(nodeCount)).get().isTimedOut()); } + + public void testCanNotPublishWithoutMinMastNodes() throws Exception { + Settings settings = settingsBuilder() + .put("discovery.type", "zen") + .put(FaultDetection.SETTING_PING_TIMEOUT, "1h") // disable it + .put(ZenDiscovery.SETTING_PING_TIMEOUT, "200ms") + .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, 2) + .put(DiscoverySettings.COMMIT_TIMEOUT, "100ms") // speed things up + .build(); + internalCluster().startNodesAsync(3, settings).get(); + + final String master = internalCluster().getMasterName(); + Set otherNodes = new HashSet<>(Arrays.asList(internalCluster().getNodeNames())); + otherNodes.remove(master); + NetworkDelaysPartition partition = new NetworkDelaysPartition(Collections.singleton(master), otherNodes, 60000, random()); + internalCluster().setDisruptionScheme(partition); + partition.startDisrupting(); + + final CountDownLatch latch = new CountDownLatch(1); + final AtomicReference failure = new AtomicReference<>(); + logger.debug("--> submitting for cluster state to be rejected"); + final ClusterService masterClusterService = internalCluster().clusterService(master); + masterClusterService.submitStateUpdateTask("test", new ProcessedClusterStateUpdateTask() { + @Override + public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { + latch.countDown(); + } + + @Override + public ClusterState execute(ClusterState currentState) throws Exception { + MetaData.Builder metaData = MetaData.builder(currentState.metaData()).persistentSettings( + Settings.builder().put(currentState.metaData().persistentSettings()).put("_SHOULD_NOT_BE_THERE_", true).build() + ); + return ClusterState.builder(currentState).metaData(metaData).build(); + } + + @Override + public void onFailure(String source, Throwable t) { + failure.set(t); + latch.countDown(); + } + }); + + logger.debug("--> waiting for cluster state to be processed/rejected"); + latch.await(); + + assertThat(failure.get(), instanceOf(PublishClusterStateAction.FailedToCommitException.class)); + assertBusy(new Runnable() { + @Override + public void run() { + assertThat(masterClusterService.state().nodes().masterNode(), nullValue()); + } + }); + + partition.stopDisrupting(); + + logger.debug("--> waiting for cluster to heal"); + assertNoTimeout(client().admin().cluster().prepareHealth().setWaitForNodes("3").setWaitForEvents(Priority.LANGUID)); + + for (String node : internalCluster().getNodeNames()) { + Settings nodeSetting = internalCluster().clusterService(node).state().metaData().settings(); + assertThat(node + " processed the cluster state despite of a min master node violation", nodeSetting.get("_SHOULD_NOT_BE_THERE_"), nullValue()); + } + + } } diff --git a/core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java similarity index 53% rename from core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java rename to core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java index a395ebdd3d4..80224f055a1 100644 --- a/core/src/test/java/org/elasticsearch/cluster/PublishClusterStateActionTests.java +++ b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java @@ -17,16 +17,20 @@ * under the License. */ -package org.elasticsearch.cluster; +package org.elasticsearch.discovery.zen.publish; -import com.google.common.collect.ImmutableMap; +import com.carrotsearch.randomizedtesting.annotations.Repeat; +import com.google.common.collect.Maps; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; +import org.elasticsearch.cluster.*; import org.elasticsearch.cluster.block.ClusterBlocks; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.metadata.MetaData; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.common.Nullable; +import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.collect.ImmutableOpenMap; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; @@ -36,51 +40,51 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.discovery.Discovery; import org.elasticsearch.discovery.DiscoverySettings; import org.elasticsearch.discovery.zen.DiscoveryNodesProvider; -import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction; import org.elasticsearch.node.service.NodeService; import org.elasticsearch.node.settings.NodeSettingsService; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.junit.annotations.TestLogging; import org.elasticsearch.test.transport.MockTransportService; import org.elasticsearch.threadpool.ThreadPool; -import org.elasticsearch.transport.TransportConnectionListener; -import org.elasticsearch.transport.TransportService; +import org.elasticsearch.transport.*; import org.elasticsearch.transport.local.LocalTransport; import org.junit.After; import org.junit.Before; import org.junit.Test; import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; import static com.google.common.collect.Maps.newHashMap; import static org.hamcrest.Matchers.*; +@TestLogging("discovery.zen.publish:TRACE") public class PublishClusterStateActionTests extends ESTestCase { protected ThreadPool threadPool; protected Map nodes = newHashMap(); - public static class MockNode implements PublishClusterStateAction.NewClusterStateListener { + public static class MockNode implements PublishClusterStateAction.NewClusterStateListener, DiscoveryNodesProvider { public final DiscoveryNode discoveryNode; public final MockTransportService service; - public PublishClusterStateAction action; - public final MockDiscoveryNodesProvider nodesProvider; + public MockPublishAction action; public final ClusterStateListener listener; public volatile ClusterState clusterState; private final ESLogger logger; - public MockNode(DiscoveryNode discoveryNode, MockTransportService service, MockDiscoveryNodesProvider nodesProvider, @Nullable ClusterStateListener listener, ESLogger logger) { + public MockNode(DiscoveryNode discoveryNode, MockTransportService service, @Nullable ClusterStateListener listener, ESLogger logger) { this.discoveryNode = discoveryNode; this.service = service; - this.nodesProvider = nodesProvider; this.listener = listener; this.logger = logger; this.clusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(DiscoveryNodes.builder().put(discoveryNode).localNodeId(discoveryNode.id()).build()).build(); @@ -88,7 +92,6 @@ public class PublishClusterStateActionTests extends ESTestCase { public void connectTo(DiscoveryNode node) { service.connectToNode(node); - nodesProvider.addNode(node); } @Override @@ -101,6 +104,25 @@ public class PublishClusterStateActionTests extends ESTestCase { clusterState = newClusterState; newStateProcessed.onNewClusterStateProcessed(); } + + @Override + public DiscoveryNodes nodes() { + return clusterState.nodes(); + } + + @Override + public NodeService nodeService() { + assert false; + throw new UnsupportedOperationException("Shouldn't be here"); + } + } + + public MockNode createMockNode(final String name) throws Exception { + return createMockNode(name, Settings.EMPTY, Version.CURRENT); + } + + public MockNode createMockNode(String name, Settings settings) throws Exception { + return createMockNode(name, settings, Version.CURRENT); } public MockNode createMockNode(final String name, Settings settings, Version version) throws Exception { @@ -108,15 +130,17 @@ public class PublishClusterStateActionTests extends ESTestCase { } public MockNode createMockNode(String name, Settings settings, Version version, @Nullable ClusterStateListener listener) throws Exception { - MockTransportService service = buildTransportService( - Settings.builder().put(settings).put("name", name, TransportService.SETTING_TRACE_LOG_INCLUDE, "", TransportService.SETTING_TRACE_LOG_EXCLUDE, "NOTHING").build(), - version - ); - DiscoveryNode discoveryNode = new DiscoveryNode(name, name, service.boundAddress().publishAddress(), ImmutableMap.of(), version); - MockDiscoveryNodesProvider nodesProvider = new MockDiscoveryNodesProvider(discoveryNode); - MockNode node = new MockNode(discoveryNode, service, nodesProvider, listener, logger); - nodesProvider.addNode(discoveryNode); - node.action = buildPublishClusterStateAction(settings, service, nodesProvider, node); + settings = Settings.builder() + .put("name", name) + .put(TransportService.SETTING_TRACE_LOG_INCLUDE, "", TransportService.SETTING_TRACE_LOG_EXCLUDE, "NOTHING") + .put(settings) + .build(); + + MockTransportService service = buildTransportService(settings, version); + DiscoveryNode discoveryNode = new DiscoveryNode(name, name, service.boundAddress().publishAddress(), + Maps.newHashMap(settings.getByPrefix("node.").getAsMap()), version); + MockNode node = new MockNode(discoveryNode, service, listener, logger); + node.action = buildPublishClusterStateAction(settings, service, node, node); final CountDownLatch latch = new CountDownLatch(nodes.size() * 2 + 1); TransportConnectionListener waitForConnection = new TransportConnectionListener() { @Override @@ -187,40 +211,13 @@ public class PublishClusterStateActionTests extends ESTestCase { return transportService; } - protected PublishClusterStateAction buildPublishClusterStateAction(Settings settings, MockTransportService transportService, MockDiscoveryNodesProvider nodesProvider, - PublishClusterStateAction.NewClusterStateListener listener) { + protected MockPublishAction buildPublishClusterStateAction(Settings settings, MockTransportService transportService, DiscoveryNodesProvider nodesProvider, + PublishClusterStateAction.NewClusterStateListener listener) { DiscoverySettings discoverySettings = new DiscoverySettings(settings, new NodeSettingsService(settings)); - return new PublishClusterStateAction(settings, transportService, nodesProvider, listener, discoverySettings, ClusterName.DEFAULT); + return new MockPublishAction(settings, transportService, nodesProvider, listener, discoverySettings, ClusterName.DEFAULT); } - - static class MockDiscoveryNodesProvider implements DiscoveryNodesProvider { - - private DiscoveryNodes discoveryNodes = DiscoveryNodes.EMPTY_NODES; - - public MockDiscoveryNodesProvider(DiscoveryNode localNode) { - discoveryNodes = DiscoveryNodes.builder().put(localNode).localNodeId(localNode.id()).build(); - } - - public void addNode(DiscoveryNode node) { - discoveryNodes = DiscoveryNodes.builder(discoveryNodes).put(node).build(); - } - - @Override - public DiscoveryNodes nodes() { - return discoveryNodes; - } - - @Override - public NodeService nodeService() { - assert false; - throw new UnsupportedOperationException("Shouldn't be here"); - } - } - - @Test - @TestLogging("cluster:DEBUG,discovery.zen.publish:DEBUG") public void testSimpleClusterStatePublishing() throws Exception { MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT); MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT); @@ -233,20 +230,20 @@ public class PublishClusterStateActionTests extends ESTestCase { discoveryNodes = DiscoveryNodes.builder(discoveryNodes).put(nodeB.discoveryNode).build(); ClusterState previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).nodes(discoveryNodes).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); assertSameStateFromFull(nodeB.clusterState, clusterState); // cluster state update - add block previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).blocks(ClusterBlocks.builder().addGlobalBlock(MetaData.CLUSTER_READ_ONLY_BLOCK)).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); assertSameStateFromDiff(nodeB.clusterState, clusterState); assertThat(nodeB.clusterState.blocks().global().size(), equalTo(1)); // cluster state update - remove block previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).blocks(ClusterBlocks.EMPTY_CLUSTER_BLOCK).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); assertSameStateFromDiff(nodeB.clusterState, clusterState); assertTrue(nodeB.clusterState.wasReadFromDiff()); @@ -258,7 +255,7 @@ public class PublishClusterStateActionTests extends ESTestCase { previousClusterState = clusterState; discoveryNodes = DiscoveryNodes.builder(discoveryNodes).put(nodeC.discoveryNode).build(); clusterState = ClusterState.builder(clusterState).nodes(discoveryNodes).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); assertSameStateFromDiff(nodeB.clusterState, clusterState); // First state assertSameStateFromFull(nodeC.clusterState, clusterState); @@ -267,7 +264,7 @@ public class PublishClusterStateActionTests extends ESTestCase { previousClusterState = clusterState; MetaData metaData = MetaData.builder(clusterState.metaData()).transientSettings(Settings.settingsBuilder().put("foo", "bar").build()).build(); clusterState = ClusterState.builder(clusterState).metaData(metaData).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); assertSameStateFromDiff(nodeB.clusterState, clusterState); assertThat(nodeB.clusterState.blocks().global().size(), equalTo(0)); assertSameStateFromDiff(nodeC.clusterState, clusterState); @@ -276,7 +273,7 @@ public class PublishClusterStateActionTests extends ESTestCase { // cluster state update - skipping one version change - should request full cluster state previousClusterState = ClusterState.builder(clusterState).incrementVersion().build(); clusterState = ClusterState.builder(clusterState).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); assertSameStateFromFull(nodeB.clusterState, clusterState); assertSameStateFromFull(nodeC.clusterState, clusterState); assertFalse(nodeC.clusterState.wasReadFromDiff()); @@ -286,16 +283,17 @@ public class PublishClusterStateActionTests extends ESTestCase { .put(nodeA.discoveryNode) .put(nodeB.discoveryNode) .put(nodeC.discoveryNode) + .masterNodeId(nodeB.discoveryNode.id()) + .localNodeId(nodeB.discoveryNode.id()) .build(); previousClusterState = ClusterState.builder(new ClusterName("test")).nodes(discoveryNodes).build(); clusterState = ClusterState.builder(clusterState).nodes(discoveryNodes).incrementVersion().build(); - publishStateDiffAndWait(nodeB.action, clusterState, previousClusterState); + publishStateAndWait(nodeB.action, clusterState, previousClusterState); assertSameStateFromFull(nodeA.clusterState, clusterState); assertSameStateFromFull(nodeC.clusterState, clusterState); } @Test - @TestLogging("cluster:DEBUG,discovery.zen.publish:DEBUG") public void testUnexpectedDiffPublishing() throws Exception { MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT, new ClusterStateListener() { @@ -311,18 +309,17 @@ public class PublishClusterStateActionTests extends ESTestCase { DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).put(nodeB.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); ClusterState previousClusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); ClusterState clusterState = ClusterState.builder(previousClusterState).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); assertSameStateFromFull(nodeB.clusterState, clusterState); // cluster state update - add block previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).blocks(ClusterBlocks.builder().addGlobalBlock(MetaData.CLUSTER_READ_ONLY_BLOCK)).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); assertSameStateFromDiff(nodeB.clusterState, clusterState); } @Test - @TestLogging("cluster:DEBUG,discovery.zen.publish:DEBUG") public void testDisablingDiffPublishing() throws Exception { Settings noDiffPublishingSettings = Settings.builder().put(DiscoverySettings.PUBLISH_DIFF_ENABLE, false).build(); @@ -348,39 +345,41 @@ public class PublishClusterStateActionTests extends ESTestCase { discoveryNodes = DiscoveryNodes.builder(discoveryNodes).put(nodeB.discoveryNode).build(); ClusterState previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).nodes(discoveryNodes).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); // cluster state update - add block previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).blocks(ClusterBlocks.builder().addGlobalBlock(MetaData.CLUSTER_READ_ONLY_BLOCK)).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); } - @Test - @TestLogging("cluster:DEBUG,discovery.zen.publish:DEBUG") /** * Test concurrent publishing works correctly (although not strictly required, it's a good testamne */ + @Test public void testSimultaneousClusterStatePublishing() throws Exception { int numberOfNodes = randomIntBetween(2, 10); - int numberOfIterations = randomIntBetween(50, 200); + int numberOfIterations = randomIntBetween(10, 50); Settings settings = Settings.builder().put(DiscoverySettings.PUBLISH_DIFF_ENABLE, randomBoolean()).build(); - MockNode[] nodes = new MockNode[numberOfNodes]; DiscoveryNodes.Builder discoveryNodesBuilder = DiscoveryNodes.builder(); - for (int i = 0; i < nodes.length; i++) { + MockNode master = null; + for (int i = 0; i < numberOfNodes; i++) { final String name = "node" + i; - nodes[i] = createMockNode(name, settings, Version.CURRENT, new ClusterStateListener() { + final MockNode node = createMockNode(name, settings, Version.CURRENT, new ClusterStateListener() { @Override public void clusterChanged(ClusterChangedEvent event) { assertProperMetaDataForVersion(event.state().metaData(), event.state().version()); } }); - discoveryNodesBuilder.put(nodes[i].discoveryNode); + if (i == 0) { + master = node; + } + discoveryNodesBuilder.put(node.discoveryNode); } AssertingAckListener[] listeners = new AssertingAckListener[numberOfIterations]; - discoveryNodesBuilder.localNodeId(nodes[0].discoveryNode.id()); + discoveryNodesBuilder.localNodeId(master.discoveryNode.id()); DiscoveryNodes discoveryNodes = discoveryNodesBuilder.build(); MetaData metaData = MetaData.EMPTY_META_DATA; ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metaData(metaData).build(); @@ -389,17 +388,17 @@ public class PublishClusterStateActionTests extends ESTestCase { previousState = clusterState; metaData = buildMetaDataForVersion(metaData, i + 1); clusterState = ClusterState.builder(clusterState).incrementVersion().metaData(metaData).nodes(discoveryNodes).build(); - listeners[i] = publishStateDiff(nodes[0].action, clusterState, previousState); + listeners[i] = publishState(master.action, clusterState, previousState); } for (int i = 0; i < numberOfIterations; i++) { listeners[i].await(1, TimeUnit.SECONDS); } - // fake node[0] - it is the master - nodes[0].clusterState = clusterState; + // set the master cs + master.clusterState = clusterState; - for (MockNode node : nodes) { + for (MockNode node : nodes.values()) { assertThat(node.discoveryNode + " misses a cluster state", node.clusterState, notNullValue()); assertThat(node.discoveryNode + " unexpected cluster state: " + node.clusterState, node.clusterState.version(), equalTo(clusterState.version())); assertThat(node.clusterState.nodes().localNode(), equalTo(node.discoveryNode)); @@ -407,7 +406,6 @@ public class PublishClusterStateActionTests extends ESTestCase { } @Test - @TestLogging("cluster:DEBUG,discovery.zen.publish:DEBUG") public void testSerializationFailureDuringDiffPublishing() throws Exception { MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT, new ClusterStateListener() { @@ -423,7 +421,7 @@ public class PublishClusterStateActionTests extends ESTestCase { DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).put(nodeB.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); ClusterState previousClusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); ClusterState clusterState = ClusterState.builder(previousClusterState).incrementVersion().build(); - publishStateDiffAndWait(nodeA.action, clusterState, previousClusterState); + publishStateAndWait(nodeA.action, clusterState, previousClusterState); assertSameStateFromFull(nodeB.clusterState, clusterState); // cluster state update - add block @@ -447,11 +445,217 @@ public class PublishClusterStateActionTests extends ESTestCase { }; } }; - List> errors = publishStateDiff(nodeA.action, unserializableClusterState, previousClusterState).awaitErrors(1, TimeUnit.SECONDS); - assertThat(errors.size(), equalTo(1)); - assertThat(errors.get(0).v2().getMessage(), containsString("Simulated failure of diff serialization")); + try { + publishStateAndWait(nodeA.action, unserializableClusterState, previousClusterState); + fail("cluster state published despite of diff errors"); + } catch (ElasticsearchException e) { + assertThat(e.getCause(), notNullValue()); + assertThat(e.getCause().getMessage(), containsString("Simulated")); + } } + + public void testFailToPublishWithLessThanMinMasterNodes() throws Exception { + final int masterNodes = randomIntBetween(1, 10); + + MockNode master = createMockNode("master"); + DiscoveryNodes.Builder discoveryNodesBuilder = DiscoveryNodes.builder().put(master.discoveryNode); + for (int i = 1; i < masterNodes; i++) { + discoveryNodesBuilder.put(createMockNode("node" + i).discoveryNode); + } + final int dataNodes = randomIntBetween(0, 5); + final Settings dataSettings = Settings.builder().put("node.master", false).build(); + for (int i = 0; i < dataNodes; i++) { + discoveryNodesBuilder.put(createMockNode("data_" + i, dataSettings).discoveryNode); + } + discoveryNodesBuilder.localNodeId(master.discoveryNode.id()).masterNodeId(master.discoveryNode.id()); + DiscoveryNodes discoveryNodes = discoveryNodesBuilder.build(); + MetaData metaData = MetaData.EMPTY_META_DATA; + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metaData(metaData).nodes(discoveryNodes).build(); + ClusterState previousState = master.clusterState; + try { + publishState(master.action, clusterState, previousState, masterNodes + randomIntBetween(1, 5)); + fail("cluster state publishing didn't fail despite of not having enough nodes"); + } catch (PublishClusterStateAction.FailedToCommitException expected) { + logger.debug("failed to publish as expected", expected); + } + } + + public void testPublishingWithSendingErrors() throws Exception { + int goodNodes = randomIntBetween(2, 5); + int errorNodes = randomIntBetween(1, 5); + int timeOutNodes = randomBoolean() ? 0 : randomIntBetween(1, 5); // adding timeout nodes will force timeout errors + final int numberOfMasterNodes = goodNodes + errorNodes + timeOutNodes + 1; // master + final boolean expectingToCommit = randomBoolean(); + Settings.Builder settings = Settings.builder(); + // make sure we have a reasonable timeout if we expect to timeout, o.w. one that will make the test "hang" + settings.put(DiscoverySettings.COMMIT_TIMEOUT, expectingToCommit == false && timeOutNodes > 0 ? "100ms" : "1h") + .put(DiscoverySettings.PUBLISH_TIMEOUT, "5ms"); // test is about comitting + + MockNode master = createMockNode("master", settings.build()); + + // randomize things a bit + int[] nodeTypes = new int[goodNodes + errorNodes + timeOutNodes]; + for (int i = 0; i < goodNodes; i++) { + nodeTypes[i] = 0; + } + for (int i = goodNodes; i < goodNodes + errorNodes; i++) { + nodeTypes[i] = 1; + } + for (int i = goodNodes + errorNodes; i < nodeTypes.length; i++) { + nodeTypes[i] = 2; + } + Collections.shuffle(Arrays.asList(nodeTypes), random()); + + DiscoveryNodes.Builder discoveryNodesBuilder = DiscoveryNodes.builder().put(master.discoveryNode); + for (int i = 0; i < nodeTypes.length; i++) { + final MockNode mockNode = createMockNode("node" + i); + discoveryNodesBuilder.put(mockNode.discoveryNode); + switch (nodeTypes[i]) { + case 1: + mockNode.action.errorOnSend.set(true); + break; + case 2: + mockNode.action.timeoutOnSend.set(true); + break; + } + } + final int dataNodes = randomIntBetween(0, 3); // data nodes don't matter + for (int i = 0; i < dataNodes; i++) { + final MockNode mockNode = createMockNode("data_" + i, Settings.builder().put("node.master", false).build()); + discoveryNodesBuilder.put(mockNode.discoveryNode); + if (randomBoolean()) { + // we really don't care - just chaos monkey + mockNode.action.errorOnCommit.set(randomBoolean()); + mockNode.action.errorOnSend.set(randomBoolean()); + mockNode.action.timeoutOnCommit.set(randomBoolean()); + mockNode.action.timeoutOnSend.set(randomBoolean()); + } + } + + final int minMasterNodes; + final String expectedBehavior; + if (expectingToCommit) { + minMasterNodes = randomIntBetween(0, goodNodes + 1); // count master + expectedBehavior = "succeed"; + } else { + minMasterNodes = randomIntBetween(goodNodes + 2, numberOfMasterNodes); // +2 because of master + expectedBehavior = timeOutNodes > 0 ? "timeout" : "fail"; + } + logger.info("--> expecting commit to {}. good nodes [{}], errors [{}], timeouts [{}]. min_master_nodes [{}]", + expectedBehavior, goodNodes + 1, errorNodes, timeOutNodes, minMasterNodes); + + discoveryNodesBuilder.localNodeId(master.discoveryNode.id()).masterNodeId(master.discoveryNode.id()); + DiscoveryNodes discoveryNodes = discoveryNodesBuilder.build(); + MetaData metaData = MetaData.EMPTY_META_DATA; + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metaData(metaData).nodes(discoveryNodes).build(); + ClusterState previousState = master.clusterState; + try { + publishState(master.action, clusterState, previousState, minMasterNodes); + if (expectingToCommit == false) { + fail("cluster state publishing didn't fail despite of not have enough nodes"); + } + } catch (PublishClusterStateAction.FailedToCommitException exception) { + logger.debug("failed to publish as expected", exception); + if (expectingToCommit) { + throw exception; + } + assertThat(exception.getMessage(), containsString(timeOutNodes > 0 ? "timed out" : "failed")); + } + } + + public void testIncomingClusterStateVerification() throws Exception { + MockNode node = createMockNode("node"); + + logger.info("--> testing acceptances of any master when having no master"); + ClusterState state = ClusterState.builder(node.clusterState) + .nodes(DiscoveryNodes.builder(node.nodes()).masterNodeId(randomAsciiOfLength(10))).build(); + node.action.validateIncomingState(state); + + // now set a master node + node.clusterState = ClusterState.builder(node.clusterState).nodes(DiscoveryNodes.builder(node.nodes()).masterNodeId("master")).build(); + logger.info("--> testing rejection of another master"); + try { + node.action.validateIncomingState(state); + fail("node accepted state from another master"); + } catch (IllegalStateException OK) { + } + + logger.info("--> test state from the current master is accepted"); + node.action.validateIncomingState(ClusterState.builder(node.clusterState) + .nodes(DiscoveryNodes.builder(node.nodes()).masterNodeId("master")).build()); + + + logger.info("--> testing rejection of another cluster name"); + try { + node.action.validateIncomingState(ClusterState.builder(new ClusterName(randomAsciiOfLength(10))).nodes(node.nodes()).build()); + fail("node accepted state with another cluster name"); + } catch (IllegalStateException OK) { + } + + logger.info("--> testing rejection of a cluster state with wrong local node"); + try { + state = ClusterState.builder(node.clusterState).nodes(DiscoveryNodes.builder(node.nodes()).localNodeId("_non_existing_").build()).build(); + node.action.validateIncomingState(state); + fail("node accepted state with non-existence local node"); + } catch (IllegalStateException OK) { + } + + try { + MockNode otherNode = createMockNode("otherNode"); + state = ClusterState.builder(node.clusterState).nodes( + DiscoveryNodes.builder(node.nodes()).put(otherNode.discoveryNode).localNodeId(otherNode.discoveryNode.id()).build() + ).build(); + node.action.validateIncomingState(state); + fail("node accepted state with existent but wrong local node"); + } catch (IllegalStateException OK) { + } + } + + public void testInterleavedPublishCommit() throws Throwable { + MockNode node = createMockNode("node"); + final ClusterState state1 = ClusterState.builder(node.clusterState).incrementVersion().build(); + final ClusterState state2 = ClusterState.builder(state1).incrementVersion().build(); + final BytesReference state1Bytes = PublishClusterStateAction.serializeFullClusterState(state1, Version.CURRENT); + final BytesReference state2Bytes = PublishClusterStateAction.serializeFullClusterState(state2, Version.CURRENT); + final CapturingTransportChannel channel = new CapturingTransportChannel(); + + node.action.handleIncomingClusterStateRequest(new BytesTransportRequest(state1Bytes, Version.CURRENT), channel); + assertThat(channel.response.get(), equalTo((TransportResponse) TransportResponse.Empty.INSTANCE)); + assertThat(channel.error.get(), nullValue()); + channel.clear(); + + // another incoming state is OK. Should just override pending state + node.action.handleIncomingClusterStateRequest(new BytesTransportRequest(state2Bytes, Version.CURRENT), channel); + assertThat(channel.response.get(), equalTo((TransportResponse) TransportResponse.Empty.INSTANCE)); + assertThat(channel.error.get(), nullValue()); + channel.clear(); + + // committing previous state should fail + try { + node.action.handleCommitRequest(new PublishClusterStateAction.CommitClusterStateRequest(state1.stateUUID()), channel); + // sadly, there are ways to percolate errors + assertThat(channel.response.get(), nullValue()); + assertThat(channel.error.get(), notNullValue()); + if (channel.error.get() instanceof IllegalStateException == false) { + throw channel.error.get(); + } + } catch (IllegalStateException OK) { + + } + channel.clear(); + + // committing second state should succeed + node.action.handleCommitRequest(new PublishClusterStateAction.CommitClusterStateRequest(state2.stateUUID()), channel); + assertThat(channel.response.get(), equalTo((TransportResponse) TransportResponse.Empty.INSTANCE)); + assertThat(channel.error.get(), nullValue()); + channel.clear(); + + // now check it was really committed + assertSameState(node.clusterState, state2); + } + + private MetaData buildMetaDataForVersion(MetaData metaData, long version) { ImmutableOpenMap.Builder indices = ImmutableOpenMap.builder(metaData.indices()); indices.put("test" + version, IndexMetaData.builder("test" + version).settings(Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)) @@ -471,15 +675,19 @@ public class PublishClusterStateActionTests extends ESTestCase { assertThat(metaData.transientSettings().get("test"), equalTo(Long.toString(version))); } - public void publishStateDiffAndWait(PublishClusterStateAction action, ClusterState state, ClusterState previousState) throws InterruptedException { - publishStateDiff(action, state, previousState).await(1, TimeUnit.SECONDS); + public void publishStateAndWait(PublishClusterStateAction action, ClusterState state, ClusterState previousState) throws InterruptedException { + publishState(action, state, previousState).await(1, TimeUnit.SECONDS); } - public AssertingAckListener publishStateDiff(PublishClusterStateAction action, ClusterState state, ClusterState previousState) throws InterruptedException { + public AssertingAckListener publishState(PublishClusterStateAction action, ClusterState state, ClusterState previousState) throws InterruptedException { + final int minimumMasterNodes = randomIntBetween(-1, state.nodes().getMasterNodes().size()); + return publishState(action, state, previousState, minimumMasterNodes); + } + + public AssertingAckListener publishState(PublishClusterStateAction action, ClusterState state, ClusterState previousState, int minMasterNodes) throws InterruptedException { AssertingAckListener assertingAckListener = new AssertingAckListener(state.nodes().getSize() - 1); ClusterChangedEvent changedEvent = new ClusterChangedEvent("test update", state, previousState); - int requiredNodes = randomIntBetween(-1, state.nodes().getSize() - 1); - action.publish(changedEvent, requiredNodes, assertingAckListener); + action.publish(changedEvent, minMasterNodes, assertingAckListener); return assertingAckListener; } @@ -522,19 +730,11 @@ public class PublishClusterStateActionTests extends ESTestCase { } - public static class DelegatingClusterState extends ClusterState { - - public DelegatingClusterState(ClusterState clusterState) { - super(clusterState.version(), clusterState.stateUUID(), clusterState); - } - - - } - - void assertSameState(ClusterState actual, ClusterState expected) { assertThat(actual, notNullValue()); - assertThat("\n--> actual ClusterState: " + actual.prettyPrint() + "\n--> expected ClusterState:" + expected.prettyPrint(), actual.stateUUID(), equalTo(expected.stateUUID())); + final String reason = "\n--> actual ClusterState: " + actual.prettyPrint() + "\n--> expected ClusterState:" + expected.prettyPrint(); + assertThat("unequal UUIDs" + reason, actual.stateUUID(), equalTo(expected.stateUUID())); + assertThat("unequal versions" + reason, actual.version(), equalTo(expected.version())); } void assertSameStateFromDiff(ClusterState actual, ClusterState expected) { @@ -546,4 +746,77 @@ public class PublishClusterStateActionTests extends ESTestCase { assertSameState(actual, expected); assertFalse(actual.wasReadFromDiff()); } + + static class MockPublishAction extends PublishClusterStateAction { + + AtomicBoolean timeoutOnSend = new AtomicBoolean(); + AtomicBoolean errorOnSend = new AtomicBoolean(); + AtomicBoolean timeoutOnCommit = new AtomicBoolean(); + AtomicBoolean errorOnCommit = new AtomicBoolean(); + + public MockPublishAction(Settings settings, TransportService transportService, DiscoveryNodesProvider nodesProvider, NewClusterStateListener listener, DiscoverySettings discoverySettings, ClusterName clusterName) { + super(settings, transportService, nodesProvider, listener, discoverySettings, clusterName); + } + + @Override + protected void handleIncomingClusterStateRequest(BytesTransportRequest request, TransportChannel channel) throws IOException { + if (errorOnSend.get()) { + throw new ElasticsearchException("forced error on incoming cluster state"); + } + if (timeoutOnSend.get()) { + return; + } + super.handleIncomingClusterStateRequest(request, channel); + } + + @Override + protected void handleCommitRequest(PublishClusterStateAction.CommitClusterStateRequest request, TransportChannel channel) { + if (errorOnCommit.get()) { + throw new ElasticsearchException("forced error on incoming commit"); + } + if (timeoutOnCommit.get()) { + return; + } + super.handleCommitRequest(request, channel); + } + } + + static class CapturingTransportChannel implements TransportChannel { + + AtomicReference response = new AtomicReference<>(); + AtomicReference error = new AtomicReference<>(); + + public void clear() { + response.set(null); + error.set(null); + } + + @Override + public String action() { + return "_noop_"; + } + + @Override + public String getProfileName() { + return "_noop_"; + } + + @Override + public void sendResponse(TransportResponse response) throws IOException { + this.response.set(response); + assertThat(error.get(), nullValue()); + } + + @Override + public void sendResponse(TransportResponse response, TransportResponseOptions options) throws IOException { + this.response.set(response); + assertThat(error.get(), nullValue()); + } + + @Override + public void sendResponse(Throwable error) throws IOException { + this.error.set(error); + assertThat(response.get(), nullValue()); + } + } } diff --git a/core/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java b/core/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java index 9eb99302e46..8439f6e8f76 100644 --- a/core/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java +++ b/core/src/test/java/org/elasticsearch/test/disruption/NetworkDelaysPartition.java @@ -60,6 +60,10 @@ public class NetworkDelaysPartition extends NetworkPartition { this(nodesSideOne, nodesSideTwo, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX, random); } + public NetworkDelaysPartition(Set nodesSideOne, Set nodesSideTwo, long delay, Random random) { + this(nodesSideOne, nodesSideTwo, delay, delay, random); + } + public NetworkDelaysPartition(Set nodesSideOne, Set nodesSideTwo, long delayMin, long delayMax, Random random) { super(nodesSideOne, nodesSideTwo, random); this.delayMin = delayMin; @@ -69,7 +73,7 @@ public class NetworkDelaysPartition extends NetworkPartition { @Override public synchronized void startDisrupting() { - duration = new TimeValue(delayMin + random.nextInt((int) (delayMax - delayMin))); + duration = new TimeValue(delayMin == delayMax ? delayMin : delayMin + random.nextInt((int) (delayMax - delayMin))); super.startDisrupting(); } From 7390bcf833cda4f13dc5eefc6b5a0c033718e6da Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Sun, 23 Aug 2015 18:24:53 +0200 Subject: [PATCH 04/21] add FailedToCommitException to registration --- .../main/java/org/elasticsearch/ElasticsearchException.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/elasticsearch/ElasticsearchException.java b/core/src/main/java/org/elasticsearch/ElasticsearchException.java index 4f478057fa0..3e54d499db3 100644 --- a/core/src/main/java/org/elasticsearch/ElasticsearchException.java +++ b/core/src/main/java/org/elasticsearch/ElasticsearchException.java @@ -596,7 +596,8 @@ public class ElasticsearchException extends RuntimeException implements ToXConte ResourceNotFoundException.class, IndexNotFoundException.class, ShardNotFoundException.class, - NotSerializableExceptionWrapper.class + NotSerializableExceptionWrapper.class, + org.elasticsearch.discovery.zen.publish.PublishClusterStateAction.FailedToCommitException.class }; Map> mapping = new HashMap<>(exceptions.length); for (Class e : exceptions) { From 7d3a36b20f47572eb6d5a806e1c90f4b2c7b43fd Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Sun, 23 Aug 2015 18:34:52 +0200 Subject: [PATCH 05/21] fix ZenDiscoveryUnitTest.testShouldIgnoreNewClusterState --- .../elasticsearch/discovery/zen/ZenDiscovery.java | 4 ++-- .../discovery/zen/ZenDiscoveryUnitTest.java | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java index 6284aa9987e..1ad2bd96514 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java @@ -876,8 +876,8 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen return; } if (!currentNodes.masterNodeId().equals(newClusterState.nodes().masterNodeId())) { - logger.warn("received a cluster state from a different master then the current one, rejecting (received {}, current {})", newClusterState.nodes().masterNode(), currentNodes.masterNode()); - throw new IllegalStateException("cluster state from a different master then the current one, rejecting (received " + newClusterState.nodes().masterNode() + ", current " + currentNodes.masterNode() + ")"); + logger.warn("received a cluster state from a different master than the current one, rejecting (received {}, current {})", newClusterState.nodes().masterNode(), currentNodes.masterNode()); + throw new IllegalStateException("cluster state from a different master than the current one, rejecting (received " + newClusterState.nodes().masterNode() + ", current " + currentNodes.masterNode() + ")"); } } diff --git a/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java b/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java index cc6bc2b235c..0ce3fd45a36 100644 --- a/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java +++ b/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java @@ -19,9 +19,13 @@ package org.elasticsearch.discovery.zen; +import org.elasticsearch.Version; import org.elasticsearch.cluster.ClusterName; import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.common.transport.DummyTransportAddress; +import org.elasticsearch.common.transport.TransportAddress; import org.elasticsearch.test.ESTestCase; import java.util.Collections; @@ -41,9 +45,10 @@ public class ZenDiscoveryUnitTest extends ESTestCase { ClusterName clusterName = new ClusterName("abc"); DiscoveryNodes.Builder currentNodes = DiscoveryNodes.builder(); - currentNodes.masterNodeId("a"); + currentNodes.masterNodeId("a").put(new DiscoveryNode("a", DummyTransportAddress.INSTANCE, Version.CURRENT)); + ; DiscoveryNodes.Builder newNodes = DiscoveryNodes.builder(); - newNodes.masterNodeId("a"); + newNodes.masterNodeId("a").put(new DiscoveryNode("a", DummyTransportAddress.INSTANCE, Version.CURRENT)); ClusterState.Builder currentState = ClusterState.builder(clusterName); currentState.nodes(currentNodes); @@ -61,7 +66,8 @@ public class ZenDiscoveryUnitTest extends ESTestCase { assertFalse("should not ignore, because new state's version is higher to current state's version", shouldIgnoreOrRejectNewClusterState(logger, currentState.build(), newState.build())); currentNodes = DiscoveryNodes.builder(); - currentNodes.masterNodeId("b"); + currentNodes.masterNodeId("b").put(new DiscoveryNode("b", DummyTransportAddress.INSTANCE, Version.CURRENT)); + ; // version isn't taken into account, so randomize it to ensure this. if (randomBoolean()) { currentState.version(2); From 4d31681057a26b6c52a445d8fc8c5fe9a95831e4 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Sun, 23 Aug 2015 18:40:17 +0200 Subject: [PATCH 06/21] added constructor to FailedToCommitException --- .../discovery/zen/publish/PublishClusterStateAction.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index 1100eeb7680..5083dc98af3 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -439,7 +439,11 @@ public class PublishClusterStateAction extends AbstractComponent { } - public class FailedToCommitException extends ElasticsearchException { + public static class FailedToCommitException extends ElasticsearchException { + + public FailedToCommitException(StreamInput in) throws IOException { + super(in); + } public FailedToCommitException(String msg, Object... args) { super(msg, args); From 234a3794e5e6cb85ef8d43619884209bb93c6399 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Sun, 23 Aug 2015 22:41:21 +0200 Subject: [PATCH 07/21] improved timeout handling --- .../discovery/DiscoverySettings.java | 11 +++++++- .../publish/PublishClusterStateAction.java | 27 ++++++++++--------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java index 8807bfcbf3d..cbf4cf10477 100644 --- a/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java +++ b/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java @@ -35,13 +35,22 @@ import java.util.EnumSet; */ public class DiscoverySettings extends AbstractComponent { + /** + * sets the timeout for a complete publishing cycle, including both sending and committing. the master + * will continute to process the next cluster state update after this time has elapsed + **/ public static final String PUBLISH_TIMEOUT = "discovery.zen.publish_timeout"; + + /** + * sets the timeout for receiving enough acks for a specific cluster state and committing it. failing + * to receive responses within this window will cause the cluster state change to be rejected. + */ public static final String COMMIT_TIMEOUT = "discovery.zen.commit_timeout"; public static final String NO_MASTER_BLOCK = "discovery.zen.no_master_block"; public static final String PUBLISH_DIFF_ENABLE = "discovery.zen.publish_diff.enable"; public static final TimeValue DEFAULT_PUBLISH_TIMEOUT = TimeValue.timeValueSeconds(30); - public static final TimeValue DEFAULT_COMMIT_TIMEOUT = TimeValue.timeValueSeconds(1); + public static final TimeValue DEFAULT_COMMIT_TIMEOUT = TimeValue.timeValueSeconds(30); public static final String DEFAULT_NO_MASTER_BLOCK = "write"; public final static int NO_MASTER_BLOCK_ID = 2; public final static boolean DEFAULT_PUBLISH_DIFF_ENABLE = true; diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index 5083dc98af3..7722511b6ab 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -121,6 +121,8 @@ public class PublishClusterStateAction extends AbstractComponent { final boolean sendFullVersion = !discoverySettings.getPublishDiff() || previousState == null; final SendingController sendingController = new SendingController(clusterChangedEvent.state(), minMasterNodes, totalMasterNodes, publishResponseHandler); + final long publishingStartInNanos = System.nanoTime(); + // we build these early as a best effort not to commit in the case of error. // sadly this is not water tight as it may that a failed diff based publishing to a node // will cause a full serialization based on an older version, which may fail after the @@ -140,21 +142,19 @@ public class PublishClusterStateAction extends AbstractComponent { sendingController.waitForCommit(discoverySettings.getCommitTimeout()); - if (publishTimeout.millis() > 0) { - // only wait if the publish timeout is configured... - try { - sendingController.setPublishingTimedOut(!publishResponseHandler.awaitAllNodes(publishTimeout)); - if (sendingController.getPublishingTimedOut()) { - DiscoveryNode[] pendingNodes = publishResponseHandler.pendingNodes(); - // everyone may have just responded - if (pendingNodes.length > 0) { - logger.warn("timed out waiting for all nodes to process published state [{}] (timeout [{}], pending nodes: {})", clusterState.version(), publishTimeout, pendingNodes); - } + try { + long timeLeftInNanos = Math.max(0, publishTimeout.nanos() - (System.nanoTime() - publishingStartInNanos)); + sendingController.setPublishingTimedOut(!publishResponseHandler.awaitAllNodes(TimeValue.timeValueNanos(timeLeftInNanos))); + if (sendingController.getPublishingTimedOut()) { + DiscoveryNode[] pendingNodes = publishResponseHandler.pendingNodes(); + // everyone may have just responded + if (pendingNodes.length > 0) { + logger.warn("timed out waiting for all nodes to process published state [{}] (timeout [{}], pending nodes: {})", clusterState.version(), publishTimeout, pendingNodes); } - } catch (InterruptedException e) { - // ignore & restore interrupt - Thread.currentThread().interrupt(); } + } catch (InterruptedException e) { + // ignore & restore interrupt + Thread.currentThread().interrupt(); } } @@ -487,6 +487,7 @@ public class PublishClusterStateAction extends AbstractComponent { } catch (InterruptedException e) { } + //nocommit: make sure we prevent publishing successfully! if (committed.get() == false) { throw new FailedToCommitException("{} enough masters to ack sent cluster state. [{}] left", timedout ? "timed out while waiting for" : "failed to get", neededMastersToCommit); From e3e0aa5049b41db0081743960a91e7e2400116b3 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Mon, 24 Aug 2015 17:13:15 +0200 Subject: [PATCH 08/21] Improved concurrency controls In SendingController to make sure that a CS is never committed after publishing is marked out as timed out --- .../publish/PublishClusterStateAction.java | 95 +++++++++++++------ .../PublishClusterStateActionTests.java | 43 ++++++++- 2 files changed, 106 insertions(+), 32 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index 7722511b6ab..873e45d4a17 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -459,13 +459,16 @@ public class PublishClusterStateAction extends AbstractComponent { } private final BlockingClusterStatePublishResponseHandler publishResponseHandler; - volatile int neededMastersToCommit; - int pendingMasterNodes; final ArrayList sendAckedBeforeCommit = new ArrayList<>(); - final CountDownLatch comittedOrFailed; - final AtomicBoolean committed; + final CountDownLatch committedOrFailedLatch; - // an external marker to note that the publishing process is timed out. This is usefull for proper logging. + // writes and reads of these are protected under synchronization + boolean committedOrFailed; // true if a decision was made w.r.t committing or failing + boolean committed; // true if cluster state was committed + int neededMastersToCommit; // number of master nodes acks still needed before committing + int pendingMasterNodes; // how many master node still need to respond + + // an external marker to note that the publishing process is timed out. This is useful for proper logging. final AtomicBoolean publishingTimedOut = new AtomicBoolean(); private SendingController(ClusterState clusterState, int minMasterNodes, int totalMasterNodes, BlockingClusterStatePublishResponseHandler publishResponseHandler) { @@ -476,60 +479,66 @@ public class PublishClusterStateAction extends AbstractComponent { if (this.neededMastersToCommit > this.pendingMasterNodes) { throw new FailedToCommitException("not enough masters to ack sent cluster state. [{}] needed , have [{}]", neededMastersToCommit, pendingMasterNodes); } - this.committed = new AtomicBoolean(neededMastersToCommit == 0); - this.comittedOrFailed = new CountDownLatch(committed.get() ? 0 : 1); + this.committed = neededMastersToCommit == 0; + this.committedOrFailed = committed; + this.committedOrFailedLatch = new CountDownLatch(committed ? 0 : 1); } public void waitForCommit(TimeValue commitTimeout) { boolean timedout = false; try { - timedout = comittedOrFailed.await(commitTimeout.millis(), TimeUnit.MILLISECONDS) == false; + timedout = committedOrFailedLatch.await(commitTimeout.millis(), TimeUnit.MILLISECONDS) == false; } catch (InterruptedException e) { } - //nocommit: make sure we prevent publishing successfully! - if (committed.get() == false) { + + if (timedout) { + markAsFailed("timed out waiting for commit (commit timeout [" + commitTimeout + "]"); + } + if (isCommitted() == false) { throw new FailedToCommitException("{} enough masters to ack sent cluster state. [{}] left", timedout ? "timed out while waiting for" : "failed to get", neededMastersToCommit); } } + synchronized public boolean isCommitted() { + return committed; + } + synchronized public void onNodeSendAck(DiscoveryNode node) { - if (committed.get() == false) { + if (committed) { + assert sendAckedBeforeCommit.isEmpty(); + sendCommitToNode(node, clusterState, this); + } else if (committedOrFailed) { + logger.trace("ignoring ack from [{}] for cluster state version [{}]. already failed", node, clusterState.version()); + } else { + // we're still waiting sendAckedBeforeCommit.add(node); if (node.isMasterNode()) { onMasterNodeSendAck(node); } - } else { - assert sendAckedBeforeCommit.isEmpty(); - sendCommitToNode(node, clusterState, this); } - } - private void onMasterNodeSendAck(DiscoveryNode node) { + synchronized private void onMasterNodeSendAck(DiscoveryNode node) { logger.trace("master node {} acked cluster state version [{}]. processing ... (current pending [{}], needed [{}])", node, clusterState.version(), pendingMasterNodes, neededMastersToCommit); neededMastersToCommit--; if (neededMastersToCommit == 0) { - logger.trace("committing version [{}]", clusterState.version()); - for (DiscoveryNode nodeToCommit : sendAckedBeforeCommit) { - sendCommitToNode(nodeToCommit, clusterState, this); + if (markAsCommitted()) { + for (DiscoveryNode nodeToCommit : sendAckedBeforeCommit) { + sendCommitToNode(nodeToCommit, clusterState, this); + } + sendAckedBeforeCommit.clear(); } - sendAckedBeforeCommit.clear(); - boolean success = committed.compareAndSet(false, true); - assert success; - comittedOrFailed.countDown(); } onMasterNodeDone(node); } - private void onMasterNodeDone(DiscoveryNode node) { + synchronized private void onMasterNodeDone(DiscoveryNode node) { pendingMasterNodes--; if (pendingMasterNodes == 0 && neededMastersToCommit > 0) { - logger.trace("failed to commit version [{}]. All master nodes acked or failed but [{}] acks are still needed", - clusterState.version(), neededMastersToCommit); - comittedOrFailed.countDown(); + markAsFailed("All master nodes acked or failed but [" + neededMastersToCommit + "] acks are still needed"); } } @@ -542,6 +551,38 @@ public class PublishClusterStateAction extends AbstractComponent { publishResponseHandler.onFailure(node, t); } + /** + * tries and commit the current state, if a decision wasn't made yet + * + * @return true if successful + */ + synchronized private boolean markAsCommitted() { + if (committedOrFailed) { + return committed; + } + logger.trace("committing version [{}]", clusterState.version()); + committed = true; + committedOrFailed = true; + committedOrFailedLatch.countDown(); + return true; + } + + /** + * tries marking the publishing as failed, if a decision wasn't made yet + * + * @return true if the publishing was failed and the cluster state is *not* committed + **/ + synchronized private boolean markAsFailed(String reason) { + if (committedOrFailed) { + return committed == false; + } + logger.trace("failed to commit version [{}]. {}", clusterState.version(), reason); + committedOrFailed = true; + committed = false; + committedOrFailedLatch.countDown(); + return true; + } + public boolean getPublishingTimedOut() { return publishingTimedOut.get(); } diff --git a/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java index 80224f055a1..994805ab3f4 100644 --- a/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java +++ b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java @@ -360,7 +360,7 @@ public class PublishClusterStateActionTests extends ESTestCase { @Test public void testSimultaneousClusterStatePublishing() throws Exception { int numberOfNodes = randomIntBetween(2, 10); - int numberOfIterations = randomIntBetween(10, 50); + int numberOfIterations = scaledRandomIntBetween(5, 50); Settings settings = Settings.builder().put(DiscoverySettings.PUBLISH_DIFF_ENABLE, randomBoolean()).build(); DiscoveryNodes.Builder discoveryNodesBuilder = DiscoveryNodes.builder(); MockNode master = null; @@ -490,7 +490,7 @@ public class PublishClusterStateActionTests extends ESTestCase { Settings.Builder settings = Settings.builder(); // make sure we have a reasonable timeout if we expect to timeout, o.w. one that will make the test "hang" settings.put(DiscoverySettings.COMMIT_TIMEOUT, expectingToCommit == false && timeOutNodes > 0 ? "100ms" : "1h") - .put(DiscoverySettings.PUBLISH_TIMEOUT, "5ms"); // test is about comitting + .put(DiscoverySettings.PUBLISH_TIMEOUT, "5ms"); // test is about committing MockNode master = createMockNode("master", settings.build()); @@ -655,6 +655,39 @@ public class PublishClusterStateActionTests extends ESTestCase { assertSameState(node.clusterState, state2); } + /** + * Tests that cluster is committed or times out. It should never be the case that we fail + * an update due to a commit timeout, but it ends up being committed anyway + */ + public void testTimeoutOrCommit() throws Exception { + Settings settings = Settings.builder() + .put(DiscoverySettings.COMMIT_TIMEOUT, "1ms").build(); // short but so we will sometime commit sometime timeout + + MockNode master = createMockNode("master", settings); + MockNode node = createMockNode("node", settings); + ClusterState state = ClusterState.builder(master.clusterState) + .nodes(DiscoveryNodes.builder(master.clusterState.nodes()).put(node.discoveryNode).masterNodeId(master.discoveryNode.id())).build(); + + for (int i = 0; i < 10; i++) { + state = ClusterState.builder(state).incrementVersion().build(); + logger.debug("--> publishing version [{}], UUID [{}]", state.version(), state.stateUUID()); + boolean success; + try { + publishState(master.action, state, master.clusterState, 2).await(1, TimeUnit.HOURS); + success = true; + } catch (PublishClusterStateAction.FailedToCommitException OK) { + success = false; + } + logger.debug("--> publishing [{}], verifying...", success ? "succeeded" : "failed"); + + if (success) { + assertSameState(node.clusterState, state); + } else { + assertThat(node.clusterState.stateUUID(), not(equalTo(state.stateUUID()))); + } + } + } + private MetaData buildMetaDataForVersion(MetaData metaData, long version) { ImmutableOpenMap.Builder indices = ImmutableOpenMap.builder(metaData.indices()); @@ -693,7 +726,7 @@ public class PublishClusterStateActionTests extends ESTestCase { public static class AssertingAckListener implements Discovery.AckListener { private final List> errors = new CopyOnWriteArrayList<>(); - private final AtomicBoolean timeoutOccured = new AtomicBoolean(); + private final AtomicBoolean timeoutOccurred = new AtomicBoolean(); private final CountDownLatch countDown; public AssertingAckListener(int nodeCount) { @@ -710,7 +743,7 @@ public class PublishClusterStateActionTests extends ESTestCase { @Override public void onTimeout() { - timeoutOccured.set(true); + timeoutOccurred.set(true); // Fast forward the counter - no reason to wait here long currentCount = countDown.getCount(); for (long i = 0; i < currentCount; i++) { @@ -724,7 +757,7 @@ public class PublishClusterStateActionTests extends ESTestCase { public List> awaitErrors(long timeout, TimeUnit unit) throws InterruptedException { countDown.await(timeout, unit); - assertFalse(timeoutOccured.get()); + assertFalse(timeoutOccurred.get()); return errors; } From a56d67d8d73a5f2137a4486a20bc390f47d50be1 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Mon, 24 Aug 2015 17:22:38 +0200 Subject: [PATCH 09/21] force mock transport in testCanNotPublishWithoutMinMastNodes --- .../java/org/elasticsearch/cluster/MinimumMasterNodesIT.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java b/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java index 93238b70477..00bf606d061 100644 --- a/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java +++ b/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java @@ -38,6 +38,7 @@ import org.elasticsearch.test.ESIntegTestCase.ClusterScope; import org.elasticsearch.test.disruption.NetworkDelaysPartition; import org.elasticsearch.test.disruption.NetworkUnresponsivePartition; import org.elasticsearch.test.junit.annotations.TestLogging; +import org.elasticsearch.test.transport.MockTransportService; import org.junit.Test; import java.util.Arrays; @@ -353,6 +354,7 @@ public class MinimumMasterNodesIT extends ESIntegTestCase { .put(ZenDiscovery.SETTING_PING_TIMEOUT, "200ms") .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, 2) .put(DiscoverySettings.COMMIT_TIMEOUT, "100ms") // speed things up + .put("plugin.types", MockTransportService.TestPlugin.class.getName()) .build(); internalCluster().startNodesAsync(3, settings).get(); From 91dee8b3110e2675fdcde98d4b366cd4de4fcd8d Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Tue, 25 Aug 2015 11:45:19 +0200 Subject: [PATCH 10/21] reject older cluster state from the same master --- .../publish/PublishClusterStateAction.java | 28 +++++++----- .../PublishClusterStateActionTests.java | 44 ++++++++++++++----- 2 files changed, 49 insertions(+), 23 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index 873e45d4a17..b224c777109 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -45,10 +45,7 @@ import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.*; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.*; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; @@ -329,7 +326,7 @@ public class PublishClusterStateAction extends AbstractComponent { throw new IncompatibleClusterStateVersionException("have no local cluster state"); } // sanity check incoming state - validateIncomingState(incomingState); + validateIncomingState(incomingState, lastSeenClusterState); lastSeenClusterState = incomingState; lastSeenClusterState.status(ClusterState.ClusterStateStatus.RECEIVED); @@ -338,25 +335,32 @@ public class PublishClusterStateAction extends AbstractComponent { } // package private for testing - /** * does simple sanity check of the incoming cluster state. Throws an exception on rejections. */ - void validateIncomingState(ClusterState state) { - final ClusterName incomingClusterName = state.getClusterName(); + void validateIncomingState(ClusterState incomingState, ClusterState lastSeenClusterState) { + final ClusterName incomingClusterName = incomingState.getClusterName(); if (!incomingClusterName.equals(PublishClusterStateAction.this.clusterName)) { - logger.warn("received cluster state from [{}] which is also master but with a different cluster name [{}]", state.nodes().masterNode(), incomingClusterName); + logger.warn("received cluster state from [{}] which is also master but with a different cluster name [{}]", incomingState.nodes().masterNode(), incomingClusterName); throw new IllegalStateException("received state from a node that is not part of the cluster"); } final DiscoveryNodes currentNodes = nodesProvider.nodes(); - if (currentNodes.localNode().equals(state.nodes().localNode()) == false) { - logger.warn("received a cluster state from [{}] and not part of the cluster, should not happen", state.nodes().masterNode()); + if (currentNodes.localNode().equals(incomingState.nodes().localNode()) == false) { + logger.warn("received a cluster state from [{}] and not part of the cluster, should not happen", incomingState.nodes().masterNode()); throw new IllegalStateException("received state from a node that is not part of the cluster"); } // state from another master requires more subtle checks, so we let it pass for now (it will be checked in ZenDiscovery) if (currentNodes.localNodeMaster() == false) { - ZenDiscovery.validateStateIsFromCurrentMaster(logger, currentNodes, state); + ZenDiscovery.validateStateIsFromCurrentMaster(logger, currentNodes, incomingState); + } + + if (lastSeenClusterState != null + && Objects.equals(lastSeenClusterState.nodes().masterNodeId(), incomingState.nodes().masterNodeId()) + && lastSeenClusterState.version() > incomingState.version()) { + logger.debug("received an older cluster state from master, rejecting (received version [{}], last version is [{}])", + incomingState.version(), lastSeenClusterState.version()); + throw new IllegalStateException("cluster state version [" + incomingState.version() + "] is old (last seen version [" + lastSeenClusterState.version() + "])"); } } diff --git a/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java index 994805ab3f4..d2b69327bd5 100644 --- a/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java +++ b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java @@ -19,7 +19,6 @@ package org.elasticsearch.discovery.zen.publish; -import com.carrotsearch.randomizedtesting.annotations.Repeat; import com.google.common.collect.Maps; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; @@ -564,39 +563,41 @@ public class PublishClusterStateActionTests extends ESTestCase { } } - public void testIncomingClusterStateVerification() throws Exception { + public void testIncomingClusterStateValidation() throws Exception { MockNode node = createMockNode("node"); logger.info("--> testing acceptances of any master when having no master"); ClusterState state = ClusterState.builder(node.clusterState) - .nodes(DiscoveryNodes.builder(node.nodes()).masterNodeId(randomAsciiOfLength(10))).build(); - node.action.validateIncomingState(state); + .nodes(DiscoveryNodes.builder(node.nodes()).masterNodeId(randomAsciiOfLength(10))).incrementVersion().build(); + node.action.validateIncomingState(state, null); // now set a master node node.clusterState = ClusterState.builder(node.clusterState).nodes(DiscoveryNodes.builder(node.nodes()).masterNodeId("master")).build(); logger.info("--> testing rejection of another master"); try { - node.action.validateIncomingState(state); + node.action.validateIncomingState(state, node.clusterState); fail("node accepted state from another master"); } catch (IllegalStateException OK) { } logger.info("--> test state from the current master is accepted"); node.action.validateIncomingState(ClusterState.builder(node.clusterState) - .nodes(DiscoveryNodes.builder(node.nodes()).masterNodeId("master")).build()); + .nodes(DiscoveryNodes.builder(node.nodes()).masterNodeId("master")).build(), node.clusterState); logger.info("--> testing rejection of another cluster name"); try { - node.action.validateIncomingState(ClusterState.builder(new ClusterName(randomAsciiOfLength(10))).nodes(node.nodes()).build()); + node.action.validateIncomingState(ClusterState.builder(new ClusterName(randomAsciiOfLength(10))).nodes(node.nodes()).build(), node.clusterState); fail("node accepted state with another cluster name"); } catch (IllegalStateException OK) { } logger.info("--> testing rejection of a cluster state with wrong local node"); try { - state = ClusterState.builder(node.clusterState).nodes(DiscoveryNodes.builder(node.nodes()).localNodeId("_non_existing_").build()).build(); - node.action.validateIncomingState(state); + state = ClusterState.builder(node.clusterState) + .nodes(DiscoveryNodes.builder(node.nodes()).localNodeId("_non_existing_").build()) + .incrementVersion().build(); + node.action.validateIncomingState(state, node.clusterState); fail("node accepted state with non-existence local node"); } catch (IllegalStateException OK) { } @@ -605,11 +606,32 @@ public class PublishClusterStateActionTests extends ESTestCase { MockNode otherNode = createMockNode("otherNode"); state = ClusterState.builder(node.clusterState).nodes( DiscoveryNodes.builder(node.nodes()).put(otherNode.discoveryNode).localNodeId(otherNode.discoveryNode.id()).build() - ).build(); - node.action.validateIncomingState(state); + ).incrementVersion().build(); + node.action.validateIncomingState(state, node.clusterState); fail("node accepted state with existent but wrong local node"); } catch (IllegalStateException OK) { } + + logger.info("--> testing rejection of an old cluster state"); + state = node.clusterState; + node.clusterState = ClusterState.builder(node.clusterState).incrementVersion().build(); + try { + node.action.validateIncomingState(state, node.clusterState); + fail("node accepted state with an older version"); + } catch (IllegalStateException OK) { + } + + // an older version from a *new* master is OK! + ClusterState previousState = ClusterState.builder(node.clusterState).incrementVersion().build(); + state = ClusterState.builder(node.clusterState) + .nodes(DiscoveryNodes.builder(node.clusterState.nodes()).masterNodeId("_new_master_").build()) + .build(); + // remove the master of the node (but still have a previous cluster state with it)! + node.clusterState = ClusterState.builder(node.clusterState) + .nodes(DiscoveryNodes.builder(node.clusterState.nodes()).masterNodeId(null).build()) + .build(); + + node.action.validateIncomingState(state, previousState); } public void testInterleavedPublishCommit() throws Throwable { From 620824821557ad4d5a4d0406d11a173a9dcb18af Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Tue, 25 Aug 2015 11:55:53 +0200 Subject: [PATCH 11/21] fix defaults in DiscoverySettings --- .../org/elasticsearch/discovery/DiscoverySettings.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java index cbf4cf10477..688de8e13d2 100644 --- a/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java +++ b/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java @@ -59,17 +59,17 @@ public class DiscoverySettings extends AbstractComponent { public final static ClusterBlock NO_MASTER_BLOCK_WRITES = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, EnumSet.of(ClusterBlockLevel.WRITE, ClusterBlockLevel.METADATA_WRITE)); private volatile ClusterBlock noMasterBlock; - private volatile TimeValue publishTimeout = DEFAULT_PUBLISH_TIMEOUT; - private volatile TimeValue commitTimeout = DEFAULT_COMMIT_TIMEOUT; - private volatile boolean publishDiff = DEFAULT_PUBLISH_DIFF_ENABLE; + private volatile TimeValue publishTimeout; + private volatile TimeValue commitTimeout; + private volatile boolean publishDiff; @Inject public DiscoverySettings(Settings settings, NodeSettingsService nodeSettingsService) { super(settings); nodeSettingsService.addListener(new ApplySettings()); this.noMasterBlock = parseNoMasterBlock(settings.get(NO_MASTER_BLOCK, DEFAULT_NO_MASTER_BLOCK)); - this.publishTimeout = settings.getAsTime(PUBLISH_TIMEOUT, publishTimeout); - this.commitTimeout = settings.getAsTime(COMMIT_TIMEOUT, publishTimeout); + this.publishTimeout = settings.getAsTime(PUBLISH_TIMEOUT, DEFAULT_PUBLISH_TIMEOUT); + this.commitTimeout = settings.getAsTime(COMMIT_TIMEOUT, DEFAULT_COMMIT_TIMEOUT); this.publishDiff = settings.getAsBoolean(PUBLISH_DIFF_ENABLE, DEFAULT_PUBLISH_DIFF_ENABLE); } From c7c65b626ff7cf53d63bc81b9441a208e730a6bc Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Tue, 25 Aug 2015 14:02:23 +0200 Subject: [PATCH 12/21] commit timeout default should never be larger than publishing timeout --- .../java/org/elasticsearch/discovery/DiscoverySettings.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java b/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java index 688de8e13d2..20f2c96b120 100644 --- a/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java +++ b/core/src/main/java/org/elasticsearch/discovery/DiscoverySettings.java @@ -69,7 +69,7 @@ public class DiscoverySettings extends AbstractComponent { nodeSettingsService.addListener(new ApplySettings()); this.noMasterBlock = parseNoMasterBlock(settings.get(NO_MASTER_BLOCK, DEFAULT_NO_MASTER_BLOCK)); this.publishTimeout = settings.getAsTime(PUBLISH_TIMEOUT, DEFAULT_PUBLISH_TIMEOUT); - this.commitTimeout = settings.getAsTime(COMMIT_TIMEOUT, DEFAULT_COMMIT_TIMEOUT); + this.commitTimeout = settings.getAsTime(COMMIT_TIMEOUT, new TimeValue(Math.min(DEFAULT_COMMIT_TIMEOUT.millis(), publishTimeout.millis()))); this.publishDiff = settings.getAsBoolean(PUBLISH_DIFF_ENABLE, DEFAULT_PUBLISH_DIFF_ENABLE); } @@ -98,6 +98,10 @@ public class DiscoverySettings extends AbstractComponent { if (newPublishTimeout.millis() != publishTimeout.millis()) { logger.info("updating [{}] from [{}] to [{}]", PUBLISH_TIMEOUT, publishTimeout, newPublishTimeout); publishTimeout = newPublishTimeout; + if (settings.getAsTime(COMMIT_TIMEOUT, null) == null && commitTimeout.millis() > publishTimeout.millis()) { + logger.info("reducing default [{}] to [{}] due to publish timeout change", COMMIT_TIMEOUT, publishTimeout); + commitTimeout = publishTimeout; + } } } TimeValue newCommitTimeout = settings.getAsTime(COMMIT_TIMEOUT, null); From f70ed876d693a61f186ec4ec56add63fb93836e1 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Tue, 25 Aug 2015 15:03:17 +0200 Subject: [PATCH 13/21] added docs --- docs/reference/modules/discovery/zen.asciidoc | 18 ++++++++++++------ docs/resiliency/index.asciidoc | 15 +++++++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/docs/reference/modules/discovery/zen.asciidoc b/docs/reference/modules/discovery/zen.asciidoc index fa5ad6ac282..5e1d4651e04 100644 --- a/docs/reference/modules/discovery/zen.asciidoc +++ b/docs/reference/modules/discovery/zen.asciidoc @@ -108,12 +108,18 @@ considered failed. Defaults to `3`. The master node is the only node in a cluster that can make changes to the cluster state. The master node processes one cluster state update at a time, applies the required changes and publishes the updated cluster state to all -the other nodes in the cluster. Each node receives the publish message, -updates its own cluster state and replies to the master node, which waits for -all nodes to respond, up to a timeout, before going ahead processing the next -updates in the queue. The `discovery.zen.publish_timeout` is set by default -to 30 seconds and can be changed dynamically through the -<> +the other nodes in the cluster. Each node receives the publish message, acknowledges +it but do *not* yet apply it. If the master does not receive acknowledgement from +at least `discovery.zen.minimum_master_nodes` nodes within a certain time (controlled by +the `discovery.zen.commit_timeout` setting and defaults to 30 seconds) the cluster state +change is rejected. + +Once enough nodes have responded, the cluster state is committed and a message will +be sent to all the nodes. The nodes then proceed and apply the new cluster state to their +internal state. The master node waits for all nodes to respond, up to a timeout, before +going ahead processing the next updates in the queue. The `discovery.zen.publish_timeout` is +set by default to 30 seconds and is measured from the moment the publishing started. Both +timeout settings can be changed dynamically through the <> [float] [[no-master-block]] diff --git a/docs/resiliency/index.asciidoc b/docs/resiliency/index.asciidoc index 2a055611935..2783447c51f 100644 --- a/docs/resiliency/index.asciidoc +++ b/docs/resiliency/index.asciidoc @@ -55,6 +55,21 @@ If you encounter an issue, https://github.com/elasticsearch/elasticsearch/issues We are committed to tracking down and fixing all the issues that are posted. +[float] +=== Use two phase commit for Cluster State publishing (STATUS: ONGOING) + +A master node in Elasticsearch continuously https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-discovery-zen.html#fault-detection[monitors the cluster nodes] +and removes any node from the cluster that doesn't respond to it's pings in a timely +fashion. If the master is left with less nodes than the `discovery.zen.minimum_master_nodes` +settings, it will step down and a new master election will start. + +When a network partition occurs causing a master to loose many followers, there is a +short window of time until detects it and master steps down. During that window, the +master may erroneously accept and ack cluster state changes. To avoid this, we introduce +a new phase to cluster state publishing where the proposed cluster state is sent to all nodes +but is not yet committed. Only once enough nodes (`minimum_master_nodes`) actively acknowledge +the change, it is committed and commit messages are sent to the nodes. See See {GIT}13062[#13062]. + [float] === Make index creation more user friendly (STATUS: ONGOING) From d9f6e302b53e24c1a0affc0a66b7e6fa11f3e28b Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Tue, 25 Aug 2015 16:14:08 +0200 Subject: [PATCH 14/21] doc feedback --- docs/reference/modules/discovery/zen.asciidoc | 4 ++-- docs/resiliency/index.asciidoc | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/reference/modules/discovery/zen.asciidoc b/docs/reference/modules/discovery/zen.asciidoc index 5e1d4651e04..2ad71334f17 100644 --- a/docs/reference/modules/discovery/zen.asciidoc +++ b/docs/reference/modules/discovery/zen.asciidoc @@ -109,13 +109,13 @@ The master node is the only node in a cluster that can make changes to the cluster state. The master node processes one cluster state update at a time, applies the required changes and publishes the updated cluster state to all the other nodes in the cluster. Each node receives the publish message, acknowledges -it but do *not* yet apply it. If the master does not receive acknowledgement from +it, but does *not* yet apply it. If the master does not receive acknowledgement from at least `discovery.zen.minimum_master_nodes` nodes within a certain time (controlled by the `discovery.zen.commit_timeout` setting and defaults to 30 seconds) the cluster state change is rejected. Once enough nodes have responded, the cluster state is committed and a message will -be sent to all the nodes. The nodes then proceed and apply the new cluster state to their +be sent to all the nodes. The nodes then proceed to apply the new cluster state to their internal state. The master node waits for all nodes to respond, up to a timeout, before going ahead processing the next updates in the queue. The `discovery.zen.publish_timeout` is set by default to 30 seconds and is measured from the moment the publishing started. Both diff --git a/docs/resiliency/index.asciidoc b/docs/resiliency/index.asciidoc index 2783447c51f..7ca7cf943ec 100644 --- a/docs/resiliency/index.asciidoc +++ b/docs/resiliency/index.asciidoc @@ -59,16 +59,16 @@ We are committed to tracking down and fixing all the issues that are posted. === Use two phase commit for Cluster State publishing (STATUS: ONGOING) A master node in Elasticsearch continuously https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-discovery-zen.html#fault-detection[monitors the cluster nodes] -and removes any node from the cluster that doesn't respond to it's pings in a timely -fashion. If the master is left with less nodes than the `discovery.zen.minimum_master_nodes` +and removes any node from the cluster that doesn't respond to its pings in a timely +fashion. If the master is left with fewer nodes than the `discovery.zen.minimum_master_nodes` settings, it will step down and a new master election will start. -When a network partition occurs causing a master to loose many followers, there is a -short window of time until detects it and master steps down. During that window, the -master may erroneously accept and ack cluster state changes. To avoid this, we introduce +When a network partition causes a master node to lose many followers, there is a short window +in time until the node loss is detected and the master steps down. During that window, the +master may erroneously accept and acknowledge cluster state changes. To avoid this, we introduce a new phase to cluster state publishing where the proposed cluster state is sent to all nodes -but is not yet committed. Only once enough nodes (`minimum_master_nodes`) actively acknowledge -the change, it is committed and commit messages are sent to the nodes. See See {GIT}13062[#13062]. +but is not yet committed. Only once enough nodes (`discovery.zen.minimum_master_nodes`) actively acknowledge +the change, it is committed and commit messages are sent to the nodes. See {GIT}13062[#13062]. [float] === Make index creation more user friendly (STATUS: ONGOING) From 98ed133dd73cabec1f094e4b03a35bb5aacd5109 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Wed, 26 Aug 2015 10:33:17 +0200 Subject: [PATCH 15/21] reduce log chatter --- .../org/elasticsearch/discovery/local/LocalDiscovery.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java b/core/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java index 482ce196672..a8d82ccb84b 100644 --- a/core/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java +++ b/core/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java @@ -333,9 +333,9 @@ public class LocalDiscovery extends AbstractLifecycleComponent implem } try { newNodeSpecificClusterState = discovery.lastProcessedClusterState.readDiffFrom(StreamInput.wrap(clusterStateDiffBytes)).apply(discovery.lastProcessedClusterState); - logger.debug("sending diff cluster state version with size {} to [{}]", clusterStateDiffBytes.length, discovery.localNode.getName()); + logger.trace("sending diff cluster state version [{}] with size {} to [{}]", clusterState.version(), clusterStateDiffBytes.length, discovery.localNode.getName()); } catch (IncompatibleClusterStateVersionException ex) { - logger.warn("incompatible cluster state version - resending complete cluster state", ex); + logger.warn("incompatible cluster state version [{}] - resending complete cluster state", ex, clusterState.version()); } } if (newNodeSpecificClusterState == null) { From c9ee8dbd163d8d7f92b470857a47bb633dba5e34 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Wed, 26 Aug 2015 15:55:31 +0200 Subject: [PATCH 16/21] tighten up FailedToCommitClusterStateException semantics and other feedback --- .../elasticsearch/ElasticsearchException.java | 3 +- .../service/InternalClusterService.java | 5 +- .../elasticsearch/discovery/Discovery.java | 26 ++++- .../discovery/zen/ZenDiscovery.java | 2 +- .../publish/PublishClusterStateAction.java | 101 +++++++++++------- .../cluster/MinimumMasterNodesIT.java | 5 +- .../PublishClusterStateActionTests.java | 12 +-- 7 files changed, 99 insertions(+), 55 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/ElasticsearchException.java b/core/src/main/java/org/elasticsearch/ElasticsearchException.java index 3e54d499db3..19376c48eb5 100644 --- a/core/src/main/java/org/elasticsearch/ElasticsearchException.java +++ b/core/src/main/java/org/elasticsearch/ElasticsearchException.java @@ -26,6 +26,7 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.logging.support.LoggerMessageFormat; import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.discovery.Discovery; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexNotFoundException; import org.elasticsearch.index.shard.ShardId; @@ -597,7 +598,7 @@ public class ElasticsearchException extends RuntimeException implements ToXConte IndexNotFoundException.class, ShardNotFoundException.class, NotSerializableExceptionWrapper.class, - org.elasticsearch.discovery.zen.publish.PublishClusterStateAction.FailedToCommitException.class + Discovery.FailedToCommitClusterStateException.class }; Map> mapping = new HashMap<>(exceptions.length); for (Class e : exceptions) { diff --git a/core/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java b/core/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java index 76262a381b4..08cdbbb863b 100644 --- a/core/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java +++ b/core/src/main/java/org/elasticsearch/cluster/service/InternalClusterService.java @@ -40,7 +40,6 @@ import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.text.StringText; -import org.elasticsearch.common.transport.BoundTransportAddress; import org.elasticsearch.common.transport.TransportAddress; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.concurrent.*; @@ -485,8 +484,8 @@ public class InternalClusterService extends AbstractLifecycleComponent { * * The {@link AckListener} allows to keep track of the ack received from nodes, and verify whether * they updated their own cluster state or not. + * + * The method is guaranteed to throw a {@link FailedToCommitClusterStateException} if the change is not committed and should be rejected. + * Any other exception signals the something wrong happened but the change is committed. */ void publish(ClusterChangedEvent clusterChangedEvent, AckListener ackListener); - public static interface AckListener { + interface AckListener { void onNodeAck(DiscoveryNode node, @Nullable Throwable t); void onTimeout(); } + + class FailedToCommitClusterStateException extends ElasticsearchException { + + public FailedToCommitClusterStateException(StreamInput in) throws IOException { + super(in); + } + + public FailedToCommitClusterStateException(String msg, Object... args) { + super(msg, args); + } + + public FailedToCommitClusterStateException(String msg, Throwable cause, Object... args) { + super(msg, cause, args); + } + } } diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java index 1ad2bd96514..a38fef4cc71 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java @@ -331,7 +331,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen nodesFD.updateNodesAndPing(clusterChangedEvent.state()); try { publishClusterState.publish(clusterChangedEvent, electMaster.minimumMasterNodes(), ackListener); - } catch (PublishClusterStateAction.FailedToCommitException t) { + } catch (FailedToCommitClusterStateException t) { // cluster service logs a WARN message logger.debug("failed to publish cluster state version [{}] (not enough nodes acknowledged, min master nodes [{}])", clusterChangedEvent.state().version(), electMaster.minimumMasterNodes()); clusterService.submitStateUpdateTask("zen-disco-failed-to-publish", Priority.IMMEDIATE, new ClusterStateUpdateTask() { diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index b224c777109..32867fae78d 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -93,39 +93,73 @@ public class PublishClusterStateAction extends AbstractComponent { transportService.removeHandler(COMMIT_ACTION_NAME); } - public void publish(ClusterChangedEvent clusterChangedEvent, int minMasterNodes, final Discovery.AckListener ackListener) { - final DiscoveryNodes nodes = clusterChangedEvent.state().nodes(); - Set nodesToPublishTo = new HashSet<>(nodes.size()); - DiscoveryNode localNode = nodes.localNode(); - final int totalMasterNodes = nodes.masterNodes().size(); - for (final DiscoveryNode node : nodes) { - if (node.equals(localNode) == false) { - nodesToPublishTo.add(node); + /** + * publishes a cluster change event to other nodes. if at least minMasterNodes acknowledge the change it is committed and will + * be processed by the master and the other nodes. + *

+ * The method is guaranteed to throw a {@link Discovery.FailedToCommitClusterStateException} if the change is not committed and should be rejected. + * Any other exception signals the something wrong happened but the change is committed. + */ + public void publish(final ClusterChangedEvent clusterChangedEvent, final int minMasterNodes, final Discovery.AckListener ackListener) throws Discovery.FailedToCommitClusterStateException { + final DiscoveryNodes nodes; + final SendingController sendingController; + final Set nodesToPublishTo; + final Map serializedStates; + final Map serializedDiffs; + final boolean sendFullVersion; + try { + nodes = clusterChangedEvent.state().nodes(); + nodesToPublishTo = new HashSet<>(nodes.size()); + DiscoveryNode localNode = nodes.localNode(); + final int totalMasterNodes = nodes.masterNodes().size(); + for (final DiscoveryNode node : nodes) { + if (node.equals(localNode) == false) { + nodesToPublishTo.add(node); + } + } + sendFullVersion = !discoverySettings.getPublishDiff() || clusterChangedEvent.previousState() == null; + serializedStates = Maps.newHashMap(); + serializedDiffs = Maps.newHashMap(); + + // we build these early as a best effort not to commit in the case of error. + // sadly this is not water tight as it may that a failed diff based publishing to a node + // will cause a full serialization based on an older version, which may fail after the + // change has been committed. + buildDiffAndSerializeStates(clusterChangedEvent.state(), clusterChangedEvent.previousState(), + nodesToPublishTo, sendFullVersion, serializedStates, serializedDiffs); + + final BlockingClusterStatePublishResponseHandler publishResponseHandler = new AckClusterStatePublishResponseHandler(nodesToPublishTo, ackListener); + sendingController = new SendingController(clusterChangedEvent.state(), minMasterNodes, totalMasterNodes, publishResponseHandler); + } catch (Throwable t) { + throw new Discovery.FailedToCommitClusterStateException("unexpected error while preparing to publish", t); + } + + try { + innerPublish(clusterChangedEvent, nodesToPublishTo, sendingController, sendFullVersion, serializedStates, serializedDiffs); + } catch (Discovery.FailedToCommitClusterStateException t) { + throw t; + } catch (Throwable t) { + // try to fail committing, in cause it's still on going + sendingController.markAsFailed("unexpected error [" + t.getMessage() + "]"); + if (sendingController.isCommitted() == false) { + // signal the change should be rejected + throw new Discovery.FailedToCommitClusterStateException("unexpected error [{}]", t, t.getMessage()); + } else { + throw t; } } - publish(clusterChangedEvent, minMasterNodes, totalMasterNodes, nodesToPublishTo, new AckClusterStatePublishResponseHandler(nodesToPublishTo, ackListener)); } - private void publish(final ClusterChangedEvent clusterChangedEvent, int minMasterNodes, int totalMasterNodes, final Set nodesToPublishTo, - final BlockingClusterStatePublishResponseHandler publishResponseHandler) { - - Map serializedStates = Maps.newHashMap(); - Map serializedDiffs = Maps.newHashMap(); + private void innerPublish(final ClusterChangedEvent clusterChangedEvent, final Set nodesToPublishTo, + final SendingController sendingController, final boolean sendFullVersion, + final Map serializedStates, final Map serializedDiffs) { final ClusterState clusterState = clusterChangedEvent.state(); final ClusterState previousState = clusterChangedEvent.previousState(); final TimeValue publishTimeout = discoverySettings.getPublishTimeout(); - final boolean sendFullVersion = !discoverySettings.getPublishDiff() || previousState == null; - final SendingController sendingController = new SendingController(clusterChangedEvent.state(), minMasterNodes, totalMasterNodes, publishResponseHandler); final long publishingStartInNanos = System.nanoTime(); - // we build these early as a best effort not to commit in the case of error. - // sadly this is not water tight as it may that a failed diff based publishing to a node - // will cause a full serialization based on an older version, which may fail after the - // change has been committed. - buildDiffAndSerializeStates(clusterState, previousState, nodesToPublishTo, sendFullVersion, serializedStates, serializedDiffs); - for (final DiscoveryNode node : nodesToPublishTo) { // try and serialize the cluster state once (or per version), so we don't serialize it // per node when we send it over the wire, compress it while we are at it... @@ -141,6 +175,7 @@ public class PublishClusterStateAction extends AbstractComponent { try { long timeLeftInNanos = Math.max(0, publishTimeout.nanos() - (System.nanoTime() - publishingStartInNanos)); + final BlockingClusterStatePublishResponseHandler publishResponseHandler = sendingController.getPublishResponseHandler(); sendingController.setPublishingTimedOut(!publishResponseHandler.awaitAllNodes(TimeValue.timeValueNanos(timeLeftInNanos))); if (sendingController.getPublishingTimedOut()) { DiscoveryNode[] pendingNodes = publishResponseHandler.pendingNodes(); @@ -335,12 +370,13 @@ public class PublishClusterStateAction extends AbstractComponent { } // package private for testing + /** * does simple sanity check of the incoming cluster state. Throws an exception on rejections. */ void validateIncomingState(ClusterState incomingState, ClusterState lastSeenClusterState) { final ClusterName incomingClusterName = incomingState.getClusterName(); - if (!incomingClusterName.equals(PublishClusterStateAction.this.clusterName)) { + if (!incomingClusterName.equals(this.clusterName)) { logger.warn("received cluster state from [{}] which is also master but with a different cluster name [{}]", incomingState.nodes().masterNode(), incomingClusterName); throw new IllegalStateException("received state from a node that is not part of the cluster"); } @@ -443,17 +479,6 @@ public class PublishClusterStateAction extends AbstractComponent { } - public static class FailedToCommitException extends ElasticsearchException { - - public FailedToCommitException(StreamInput in) throws IOException { - super(in); - } - - public FailedToCommitException(String msg, Object... args) { - super(msg, args); - } - } - class SendingController { private final ClusterState clusterState; @@ -481,7 +506,7 @@ public class PublishClusterStateAction extends AbstractComponent { this.neededMastersToCommit = Math.max(0, minMasterNodes - 1); // we are one of the master nodes this.pendingMasterNodes = totalMasterNodes - 1; if (this.neededMastersToCommit > this.pendingMasterNodes) { - throw new FailedToCommitException("not enough masters to ack sent cluster state. [{}] needed , have [{}]", neededMastersToCommit, pendingMasterNodes); + throw new Discovery.FailedToCommitClusterStateException("not enough masters to ack sent cluster state. [{}] needed , have [{}]", neededMastersToCommit, pendingMasterNodes); } this.committed = neededMastersToCommit == 0; this.committedOrFailed = committed; @@ -493,14 +518,14 @@ public class PublishClusterStateAction extends AbstractComponent { try { timedout = committedOrFailedLatch.await(commitTimeout.millis(), TimeUnit.MILLISECONDS) == false; } catch (InterruptedException e) { - + // the commit check bellow will either translate to an exception or we are committed and we can safely continue } if (timedout) { markAsFailed("timed out waiting for commit (commit timeout [" + commitTimeout + "]"); } if (isCommitted() == false) { - throw new FailedToCommitException("{} enough masters to ack sent cluster state. [{}] left", + throw new Discovery.FailedToCommitClusterStateException("{} enough masters to ack sent cluster state. [{}] left", timedout ? "timed out while waiting for" : "failed to get", neededMastersToCommit); } } @@ -542,7 +567,7 @@ public class PublishClusterStateAction extends AbstractComponent { synchronized private void onMasterNodeDone(DiscoveryNode node) { pendingMasterNodes--; if (pendingMasterNodes == 0 && neededMastersToCommit > 0) { - markAsFailed("All master nodes acked or failed but [" + neededMastersToCommit + "] acks are still needed"); + markAsFailed("no more pending master nodes, but [" + neededMastersToCommit + "] acks are still needed"); } } diff --git a/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java b/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java index 00bf606d061..6817aa1b38e 100644 --- a/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java +++ b/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java @@ -27,16 +27,15 @@ import org.elasticsearch.cluster.metadata.MetaData; import org.elasticsearch.common.Priority; import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.discovery.Discovery; import org.elasticsearch.discovery.DiscoverySettings; import org.elasticsearch.discovery.zen.ZenDiscovery; import org.elasticsearch.discovery.zen.elect.ElectMasterService; import org.elasticsearch.discovery.zen.fd.FaultDetection; -import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.ESIntegTestCase.ClusterScope; import org.elasticsearch.test.disruption.NetworkDelaysPartition; -import org.elasticsearch.test.disruption.NetworkUnresponsivePartition; import org.elasticsearch.test.junit.annotations.TestLogging; import org.elasticsearch.test.transport.MockTransportService; import org.junit.Test; @@ -393,7 +392,7 @@ public class MinimumMasterNodesIT extends ESIntegTestCase { logger.debug("--> waiting for cluster state to be processed/rejected"); latch.await(); - assertThat(failure.get(), instanceOf(PublishClusterStateAction.FailedToCommitException.class)); + assertThat(failure.get(), instanceOf(Discovery.FailedToCommitClusterStateException.class)); assertBusy(new Runnable() { @Override public void run() { diff --git a/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java index d2b69327bd5..1a078171fdd 100644 --- a/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java +++ b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java @@ -354,7 +354,7 @@ public class PublishClusterStateActionTests extends ESTestCase { /** - * Test concurrent publishing works correctly (although not strictly required, it's a good testamne + * Test not waiting publishing works correctly (i.e., publishing times out) */ @Test public void testSimultaneousClusterStatePublishing() throws Exception { @@ -447,9 +447,9 @@ public class PublishClusterStateActionTests extends ESTestCase { try { publishStateAndWait(nodeA.action, unserializableClusterState, previousClusterState); fail("cluster state published despite of diff errors"); - } catch (ElasticsearchException e) { + } catch (Discovery.FailedToCommitClusterStateException e) { assertThat(e.getCause(), notNullValue()); - assertThat(e.getCause().getMessage(), containsString("Simulated")); + assertThat(e.getCause().getMessage(), containsString("failed to serialize")); } } @@ -475,7 +475,7 @@ public class PublishClusterStateActionTests extends ESTestCase { try { publishState(master.action, clusterState, previousState, masterNodes + randomIntBetween(1, 5)); fail("cluster state publishing didn't fail despite of not having enough nodes"); - } catch (PublishClusterStateAction.FailedToCommitException expected) { + } catch (Discovery.FailedToCommitClusterStateException expected) { logger.debug("failed to publish as expected", expected); } } @@ -554,7 +554,7 @@ public class PublishClusterStateActionTests extends ESTestCase { if (expectingToCommit == false) { fail("cluster state publishing didn't fail despite of not have enough nodes"); } - } catch (PublishClusterStateAction.FailedToCommitException exception) { + } catch (Discovery.FailedToCommitClusterStateException exception) { logger.debug("failed to publish as expected", exception); if (expectingToCommit) { throw exception; @@ -697,7 +697,7 @@ public class PublishClusterStateActionTests extends ESTestCase { try { publishState(master.action, state, master.clusterState, 2).await(1, TimeUnit.HOURS); success = true; - } catch (PublishClusterStateAction.FailedToCommitException OK) { + } catch (Discovery.FailedToCommitClusterStateException OK) { success = false; } logger.debug("--> publishing [{}], verifying...", success ? "succeeded" : "failed"); From 0668e0d6237c4fd8ec0f60a972d59859956b1622 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Thu, 27 Aug 2015 11:29:00 +0200 Subject: [PATCH 17/21] more feedback --- .../publish/PublishClusterStateAction.java | 19 +++++++++---------- .../discovery/zen/ZenDiscoveryUnitTest.java | 2 -- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index 32867fae78d..adddfb1a376 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -140,8 +140,7 @@ public class PublishClusterStateAction extends AbstractComponent { throw t; } catch (Throwable t) { // try to fail committing, in cause it's still on going - sendingController.markAsFailed("unexpected error [" + t.getMessage() + "]"); - if (sendingController.isCommitted() == false) { + if (sendingController.markAsFailed("unexpected error [" + t.getMessage() + "]")) { // signal the change should be rejected throw new Discovery.FailedToCommitClusterStateException("unexpected error [{}]", t, t.getMessage()); } else { @@ -215,7 +214,7 @@ public class PublishClusterStateAction extends AbstractComponent { } } - private void sendFullClusterState(ClusterState clusterState, @Nullable Map serializedStates, + private void sendFullClusterState(ClusterState clusterState, Map serializedStates, DiscoveryNode node, TimeValue publishTimeout, SendingController sendingController) { BytesReference bytes = serializedStates.get(node.version()); if (bytes == null) { @@ -247,13 +246,14 @@ public class PublishClusterStateAction extends AbstractComponent { final SendingController sendingController, final boolean sendDiffs, final Map serializedStates) { try { + + // -> no need to put a timeout on the options here, because we want the response to eventually be received + // and not log an error if it arrives after the timeout + // -> no need to compress, we already compressed the bytes TransportRequestOptions options = TransportRequestOptions.options().withType(TransportRequestOptions.Type.STATE).withCompress(false); - // no need to put a timeout on the options here, because we want the response to eventually be received - // and not log an error if it arrives after the timeout transportService.sendRequest(node, SEND_ACTION_NAME, new BytesTransportRequest(bytes, node.version()), - options, // no need to compress, we already compressed the bytes - + options, new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { @Override @@ -284,13 +284,12 @@ public class PublishClusterStateAction extends AbstractComponent { private void sendCommitToNode(final DiscoveryNode node, final ClusterState clusterState, final SendingController sendingController) { try { logger.trace("sending commit for cluster state (uuid: [{}], version [{}]) to [{}]", clusterState.stateUUID(), clusterState.version(), node); - TransportRequestOptions options = TransportRequestOptions.options().withType(TransportRequestOptions.Type.STATE).withCompress(false); + TransportRequestOptions options = TransportRequestOptions.options().withType(TransportRequestOptions.Type.STATE); // no need to put a timeout on the options here, because we want the response to eventually be received // and not log an error if it arrives after the timeout transportService.sendRequest(node, COMMIT_ACTION_NAME, new CommitClusterStateRequest(clusterState.stateUUID()), - options, // no need to compress, we already compressed the bytes - + options, new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { @Override diff --git a/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java b/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java index 0ce3fd45a36..61b1062dbed 100644 --- a/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java +++ b/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java @@ -25,7 +25,6 @@ import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.common.transport.DummyTransportAddress; -import org.elasticsearch.common.transport.TransportAddress; import org.elasticsearch.test.ESTestCase; import java.util.Collections; @@ -46,7 +45,6 @@ public class ZenDiscoveryUnitTest extends ESTestCase { DiscoveryNodes.Builder currentNodes = DiscoveryNodes.builder(); currentNodes.masterNodeId("a").put(new DiscoveryNode("a", DummyTransportAddress.INSTANCE, Version.CURRENT)); - ; DiscoveryNodes.Builder newNodes = DiscoveryNodes.builder(); newNodes.masterNodeId("a").put(new DiscoveryNode("a", DummyTransportAddress.INSTANCE, Version.CURRENT)); From 10e8c410ea853654868df52dc06883826562b839 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Thu, 27 Aug 2015 17:15:18 +0200 Subject: [PATCH 18/21] more feedback --- .../publish/PublishClusterStateAction.java | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index adddfb1a376..a9b9010d910 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -25,7 +25,6 @@ import org.elasticsearch.Version; import org.elasticsearch.cluster.*; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodes; -import org.elasticsearch.common.Nullable; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.component.AbstractComponent; import org.elasticsearch.common.compress.Compressor; @@ -220,9 +219,7 @@ public class PublishClusterStateAction extends AbstractComponent { if (bytes == null) { try { bytes = serializeFullClusterState(clusterState, node.version()); - if (serializedStates != null) { - serializedStates.put(node.version(), bytes); - } + serializedStates.put(node.version(), bytes); } catch (Throwable e) { logger.warn("failed to serialize cluster_state before publishing it to node {}", e, node); sendingController.onNodeSendFailed(node, e); @@ -478,6 +475,12 @@ public class PublishClusterStateAction extends AbstractComponent { } + /** + * Coordinates acknowledgments of the sent cluster state from the different nodes. Commits the change + * after `minimum_master_nodes` have successfully responded or fails the entire change. After committing + * the cluster state, will trigger a commit message to all nodes that responded previously and responds immediately + * to all future acknowledgments. + */ class SendingController { private final ClusterState clusterState; @@ -543,14 +546,18 @@ public class PublishClusterStateAction extends AbstractComponent { // we're still waiting sendAckedBeforeCommit.add(node); if (node.isMasterNode()) { - onMasterNodeSendAck(node); + checkForCommitOrFailIfNoPending(node); } } } - synchronized private void onMasterNodeSendAck(DiscoveryNode node) { + /** + * check if enough master node responded to commit the change. fails the commit + * if there are no more pending master nodes but not enough acks to commit. + */ + synchronized private void checkForCommitOrFailIfNoPending(DiscoveryNode masterNode) { logger.trace("master node {} acked cluster state version [{}]. processing ... (current pending [{}], needed [{}])", - node, clusterState.version(), pendingMasterNodes, neededMastersToCommit); + masterNode, clusterState.version(), pendingMasterNodes, neededMastersToCommit); neededMastersToCommit--; if (neededMastersToCommit == 0) { if (markAsCommitted()) { @@ -560,13 +567,13 @@ public class PublishClusterStateAction extends AbstractComponent { sendAckedBeforeCommit.clear(); } } - onMasterNodeDone(node); + decrementPendingMasterAcksAndChangeForFailure(); } - synchronized private void onMasterNodeDone(DiscoveryNode node) { + synchronized private void decrementPendingMasterAcksAndChangeForFailure() { pendingMasterNodes--; if (pendingMasterNodes == 0 && neededMastersToCommit > 0) { - markAsFailed("no more pending master nodes, but [" + neededMastersToCommit + "] acks are still needed"); + markAsFailed("no more pending master nodes, but failed to reach needed acks ([" + neededMastersToCommit + "] left)"); } } @@ -574,7 +581,7 @@ public class PublishClusterStateAction extends AbstractComponent { if (node.isMasterNode()) { logger.trace("master node {} failed to ack cluster state version [{}]. processing ... (current pending [{}], needed [{}])", node, clusterState.version(), pendingMasterNodes, neededMastersToCommit); - onMasterNodeDone(node); + decrementPendingMasterAcksAndChangeForFailure(); } publishResponseHandler.onFailure(node, t); } From 218979da1b6026f0ad7d6f4616589c30d6eb78a2 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Fri, 28 Aug 2015 12:31:37 +0200 Subject: [PATCH 19/21] remove committedOrFailed and use committedOrFailedLatch for state --- .../zen/publish/PublishClusterStateAction.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index a9b9010d910..c9fe8d688ff 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -491,10 +491,9 @@ public class PublishClusterStateAction extends AbstractComponent { private final BlockingClusterStatePublishResponseHandler publishResponseHandler; final ArrayList sendAckedBeforeCommit = new ArrayList<>(); - final CountDownLatch committedOrFailedLatch; // writes and reads of these are protected under synchronization - boolean committedOrFailed; // true if a decision was made w.r.t committing or failing + final CountDownLatch committedOrFailedLatch; // 0 count indicates that a decision was made w.r.t committing or failing boolean committed; // true if cluster state was committed int neededMastersToCommit; // number of master nodes acks still needed before committing int pendingMasterNodes; // how many master node still need to respond @@ -511,7 +510,6 @@ public class PublishClusterStateAction extends AbstractComponent { throw new Discovery.FailedToCommitClusterStateException("not enough masters to ack sent cluster state. [{}] needed , have [{}]", neededMastersToCommit, pendingMasterNodes); } this.committed = neededMastersToCommit == 0; - this.committedOrFailed = committed; this.committedOrFailedLatch = new CountDownLatch(committed ? 0 : 1); } @@ -540,7 +538,7 @@ public class PublishClusterStateAction extends AbstractComponent { if (committed) { assert sendAckedBeforeCommit.isEmpty(); sendCommitToNode(node, clusterState, this); - } else if (committedOrFailed) { + } else if (committedOrFailed()) { logger.trace("ignoring ack from [{}] for cluster state version [{}]. already failed", node, clusterState.version()); } else { // we're still waiting @@ -551,6 +549,10 @@ public class PublishClusterStateAction extends AbstractComponent { } } + private synchronized boolean committedOrFailed() { + return committedOrFailedLatch.getCount() == 0; + } + /** * check if enough master node responded to commit the change. fails the commit * if there are no more pending master nodes but not enough acks to commit. @@ -592,12 +594,11 @@ public class PublishClusterStateAction extends AbstractComponent { * @return true if successful */ synchronized private boolean markAsCommitted() { - if (committedOrFailed) { + if (committedOrFailed()) { return committed; } logger.trace("committing version [{}]", clusterState.version()); committed = true; - committedOrFailed = true; committedOrFailedLatch.countDown(); return true; } @@ -608,11 +609,10 @@ public class PublishClusterStateAction extends AbstractComponent { * @return true if the publishing was failed and the cluster state is *not* committed **/ synchronized private boolean markAsFailed(String reason) { - if (committedOrFailed) { + if (committedOrFailed()) { return committed == false; } logger.trace("failed to commit version [{}]. {}", clusterState.version(), reason); - committedOrFailed = true; committed = false; committedOrFailedLatch.countDown(); return true; From 80b59e0d66a91c4fdfccd2cfd2432060f5600bd4 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Tue, 1 Sep 2015 15:39:00 +0200 Subject: [PATCH 20/21] Discovery: Add a dedicate queue for incoming ClusterStates The initial implementation of two phase commit based cluster state publishing (#13062) relied on a single in memory "pending" cluster state that is only processed by ZenDiscovery once committed by the master. While this is fine on it's own, it resulted in an issue with acknowledged APIs, such as the open index API, in the extreme case where a node falls behind and receives a commit message after a new cluster state has been published. Specifically: 1) Master receives and acked-API call and publishes cluster state CS1 2) Master waits for a min-master nodes to receives CS1 and commits it. 3) All nodes that have responded to CS1 are sent a commit message, however, node N didn't respond yet 4) Master waits for publish timeout (defaults to 30s) for all nodes to process the commit. Node N fails to do so. 5) Master publishes a cluster state CS2. Node N responds to cluster state CS1's publishing but receives cluster state CS2 before the commit for CS1 arrives. 6) The commit message for cluster CS1 is processed on node N, but fails because CS2 is pending. This caused the acked API in step 1 to return (but CS2 , is not yet processed). In this case, the action indicated by CS1 is not yet executed on node N and therefore the acked API calls return pre-maturely. Note that once CS2 is processed but the change in CS1 takes effect (cluster state operations are safe to batch and we do so all the time). An example failure can be found on: http://build-us-00.elastic.co/job/es_feature_two_phase_pub/314/ This commit extracts the already existing pending cluster state queue (processNewClusterStates) from ZenDiscovery into it's own class, which serves as a temporary container for in-flight cluster states. Once committed the cluster states are transferred to ZenDiscovery as they used to before. This allows "lagging" cluster states to still be successfully committed and processed (and likely to be ignored as a newer cluster state has already been processed). As a side effect, all batching logic is now extracted from ZenDiscovery and is unit tested. --- .../elasticsearch/cluster/ClusterState.java | 16 +- .../discovery/local/LocalDiscovery.java | 2 +- .../discovery/zen/ZenDiscovery.java | 268 ++++++---------- .../publish/PendingClusterStatesQueue.java | 286 ++++++++++++++++++ .../publish/PublishClusterStateAction.java | 89 +++--- .../cluster/ClusterStateTests.java | 54 ++++ .../org/elasticsearch/cluster/ack/AckIT.java | 16 +- .../DiscoveryWithServiceDisruptionsIT.java | 16 +- .../discovery/zen/ZenDiscoveryUnitTest.java | 57 +--- .../PendingClusterStatesQueueTests.java | 224 ++++++++++++++ .../PublishClusterStateActionTests.java | 146 ++++----- .../elasticsearch/test/ESIntegTestCase.java | 49 ++- 12 files changed, 820 insertions(+), 403 deletions(-) create mode 100644 core/src/main/java/org/elasticsearch/discovery/zen/publish/PendingClusterStatesQueue.java create mode 100644 core/src/test/java/org/elasticsearch/cluster/ClusterStateTests.java create mode 100644 core/src/test/java/org/elasticsearch/discovery/zen/publish/PendingClusterStatesQueueTests.java diff --git a/core/src/main/java/org/elasticsearch/cluster/ClusterState.java b/core/src/main/java/org/elasticsearch/cluster/ClusterState.java index 5b05f199dfa..21962fb421b 100644 --- a/core/src/main/java/org/elasticsearch/cluster/ClusterState.java +++ b/core/src/main/java/org/elasticsearch/cluster/ClusterState.java @@ -52,10 +52,7 @@ import org.elasticsearch.discovery.local.LocalDiscovery; import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction; import java.io.IOException; -import java.util.EnumSet; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; +import java.util.*; /** * Represents the current state of the cluster. @@ -296,6 +293,16 @@ public class ClusterState implements ToXContent, Diffable { } } + /** + * a cluster state supersedes another state iff they are from the same master and the version this state is higher thant the other state. + *

+ * In essence that means that all the changes from the other cluster state are also reflected by the current one + */ + public boolean supersedes(ClusterState other) { + return this.nodes().masterNodeId() != null && this.nodes().masterNodeId().equals(other.nodes().masterNodeId()) && this.version() > other.version(); + + } + public enum Metric { VERSION("version"), MASTER_NODE("master_node"), @@ -814,6 +821,7 @@ public class ClusterState implements ToXContent, Diffable { builder.fromDiff(true); return builder.build(); } + } } diff --git a/core/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java b/core/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java index a8d82ccb84b..91c2bf22c1e 100644 --- a/core/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java +++ b/core/src/main/java/org/elasticsearch/discovery/local/LocalDiscovery.java @@ -357,7 +357,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent implem discovery.clusterService.submitStateUpdateTask("local-disco-receive(from master)", new ProcessedClusterStateNonMasterUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { - if (nodeSpecificClusterState.version() < currentState.version() && Objects.equal(nodeSpecificClusterState.nodes().masterNodeId(), currentState.nodes().masterNodeId())) { + if (currentState.supersedes(nodeSpecificClusterState)) { return currentState; } diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java index a38fef4cc71..3887ee4256a 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java @@ -19,18 +19,11 @@ package org.elasticsearch.discovery.zen; -import com.google.common.base.Objects; import com.google.common.collect.Sets; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ExceptionsHelper; import org.elasticsearch.Version; -import org.elasticsearch.cluster.ClusterChangedEvent; -import org.elasticsearch.cluster.ClusterName; -import org.elasticsearch.cluster.ClusterService; -import org.elasticsearch.cluster.ClusterState; -import org.elasticsearch.cluster.ClusterStateNonMasterUpdateTask; -import org.elasticsearch.cluster.ClusterStateUpdateTask; -import org.elasticsearch.cluster.ProcessedClusterStateNonMasterUpdateTask; -import org.elasticsearch.cluster.ProcessedClusterStateUpdateTask; +import org.elasticsearch.cluster.*; import org.elasticsearch.cluster.block.ClusterBlocks; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.metadata.MetaData; @@ -49,7 +42,6 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.discovery.Discovery; import org.elasticsearch.discovery.DiscoverySettings; import org.elasticsearch.discovery.InitialStateDiscoveryListener; @@ -64,20 +56,12 @@ import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction; import org.elasticsearch.node.service.NodeService; import org.elasticsearch.node.settings.NodeSettingsService; import org.elasticsearch.threadpool.ThreadPool; -import org.elasticsearch.transport.EmptyTransportResponseHandler; -import org.elasticsearch.transport.TransportChannel; -import org.elasticsearch.transport.TransportException; -import org.elasticsearch.transport.TransportRequest; -import org.elasticsearch.transport.TransportRequestHandler; -import org.elasticsearch.transport.TransportResponse; -import org.elasticsearch.transport.TransportService; +import org.elasticsearch.transport.*; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.Queue; import java.util.Set; -import java.util.concurrent.BlockingQueue; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; @@ -199,7 +183,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService, clusterName); this.nodesFD.addListener(new NodeFaultDetectionListener()); - this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings, clusterName); + this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewPendingClusterStateListener(), discoverySettings, clusterName); this.pingService.setPingContextProvider(this); this.membership = new MembershipAction(settings, clusterService, transportService, this, new MembershipListener()); @@ -358,6 +342,12 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen return joinThreadControl.joinThreadActive(); } + + // used for testing + public ClusterState[] pendingClusterStates() { + return publishClusterState.pendingStatesQueue().pendingClusterStates(); + } + /** * the main function of a join thread. This function is guaranteed to join the cluster * or spawn a new join thread upon failure to do so. @@ -428,7 +418,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen return joinThreadControl.stopRunningThreadAndRejoin(currentState, "master_switched_while_finalizing_join"); } - // Note: we do not have to start master fault detection here because it's set at {@link #handleNewClusterStateFromMaster } + // Note: we do not have to start master fault detection here because it's set at {@link #processNextPendingClusterState } // when the first cluster state arrives. joinThreadControl.markThreadAsDone(currentThread); return currentState; @@ -634,9 +624,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen .masterNodeId(null).build(); // flush any pending cluster states from old master, so it will not be set as master again - ArrayList pendingNewClusterStates = new ArrayList<>(); - processNewClusterStates.drainTo(pendingNewClusterStates); - logger.trace("removed [{}] pending cluster states", pendingNewClusterStates.size()); + publishClusterState.pendingStatesQueue().failAllStatesAndClear(new ElasticsearchException("master left [{}]", reason)); if (rejoinOnMasterGone) { return rejoin(ClusterState.builder(currentState).nodes(discoveryNodes).build(), "master left (reason = " + reason + ")"); @@ -682,171 +670,98 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen }); } - static class ProcessClusterState { - final ClusterState clusterState; - volatile boolean processed; + void processNextPendingClusterState(String reason) { + clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + reason + "])", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() { + ClusterState newClusterState = null; - ProcessClusterState(ClusterState clusterState) { - this.clusterState = clusterState; - } - } + @Override + public ClusterState execute(ClusterState currentState) { + newClusterState = publishClusterState.pendingStatesQueue().getNextClusterStateToProcess(); - private final BlockingQueue processNewClusterStates = ConcurrentCollections.newBlockingQueue(); - - void handleNewClusterStateFromMaster(ClusterState newClusterState, final PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { - final ClusterName incomingClusterName = newClusterState.getClusterName(); - if (localNodeMaster()) { - logger.debug("received cluster state from [{}] which is also master with cluster name [{}]", newClusterState.nodes().masterNode(), incomingClusterName); - final ClusterState newState = newClusterState; - clusterService.submitStateUpdateTask("zen-disco-master_receive_cluster_state_from_another_master [" + newState.nodes().masterNode() + "]", Priority.URGENT, new ProcessedClusterStateUpdateTask() { - @Override - public ClusterState execute(ClusterState currentState) { - return handleAnotherMaster(currentState, newState.nodes().masterNode(), newState.version(), "via a new cluster state"); + // all pending states have been processed + if (newClusterState == null) { + return currentState; } - @Override - public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { - newStateProcessed.onNewClusterStateProcessed(); + assert newClusterState.nodes().masterNode() != null : "received a cluster state without a master"; + assert !newClusterState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock()) : "received a cluster state with a master block"; + + if (currentState.nodes().localNodeMaster()) { + return handleAnotherMaster(currentState, newClusterState.nodes().masterNode(), newClusterState.version(), "via a new cluster state"); } - @Override - public void onFailure(String source, Throwable t) { - logger.error("unexpected failure during [{}]", t, source); - newStateProcessed.onNewClusterStateFailed(t); + if (shouldIgnoreOrRejectNewClusterState(logger, currentState, newClusterState)) { + return currentState; } - }); - } else { + // check to see that we monitor the correct master of the cluster + if (masterFD.masterNode() == null || !masterFD.masterNode().equals(newClusterState.nodes().masterNode())) { + masterFD.restart(newClusterState.nodes().masterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]"); + } + + if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) { + // its a fresh update from the master as we transition from a start of not having a master to having one + logger.debug("got first state from fresh master [{}]", newClusterState.nodes().masterNodeId()); + long count = clusterJoinsCounter.incrementAndGet(); + logger.trace("updated cluster join cluster to [{}]", count); + + return newClusterState; + } - final ProcessClusterState processClusterState = new ProcessClusterState(newClusterState); - processNewClusterStates.add(processClusterState); + // some optimizations to make sure we keep old objects where possible + ClusterState.Builder builder = ClusterState.builder(newClusterState); - assert newClusterState.nodes().masterNode() != null : "received a cluster state without a master"; - assert !newClusterState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock()) : "received a cluster state with a master block"; - - clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() { - @Override - public ClusterState execute(ClusterState currentState) { - // we already processed it in a previous event - if (processClusterState.processed) { - return currentState; - } - - // TODO: once improvement that we can do is change the message structure to include version and masterNodeId - // at the start, this will allow us to keep the "compressed bytes" around, and only parse the first page - // to figure out if we need to use it or not, and only once we picked the latest one, parse the whole state - - - ClusterState updatedState = selectNextStateToProcess(processNewClusterStates); - if (updatedState == null) { - updatedState = currentState; - } - if (shouldIgnoreOrRejectNewClusterState(logger, currentState, updatedState)) { - return currentState; - } - - // we don't need to do this, since we ping the master, and get notified when it has moved from being a master - // because it doesn't have enough master nodes... - //if (!electMaster.hasEnoughMasterNodes(newState.nodes())) { - // return disconnectFromCluster(newState, "not enough master nodes on new cluster state wreceived from [" + newState.nodes().masterNode() + "]"); - //} - - // check to see that we monitor the correct master of the cluster - if (masterFD.masterNode() == null || !masterFD.masterNode().equals(updatedState.nodes().masterNode())) { - masterFD.restart(updatedState.nodes().masterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]"); - } - - if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) { - // its a fresh update from the master as we transition from a start of not having a master to having one - logger.debug("got first state from fresh master [{}]", updatedState.nodes().masterNodeId()); - long count = clusterJoinsCounter.incrementAndGet(); - logger.trace("updated cluster join cluster to [{}]", count); - - return updatedState; - } - - - // some optimizations to make sure we keep old objects where possible - ClusterState.Builder builder = ClusterState.builder(updatedState); - - // if the routing table did not change, use the original one - if (updatedState.routingTable().version() == currentState.routingTable().version()) { - builder.routingTable(currentState.routingTable()); - } - // same for metadata - if (updatedState.metaData().version() == currentState.metaData().version()) { - builder.metaData(currentState.metaData()); - } else { - // if its not the same version, only copy over new indices or ones that changed the version - MetaData.Builder metaDataBuilder = MetaData.builder(updatedState.metaData()).removeAllIndices(); - for (IndexMetaData indexMetaData : updatedState.metaData()) { - IndexMetaData currentIndexMetaData = currentState.metaData().index(indexMetaData.index()); - if (currentIndexMetaData != null && currentIndexMetaData.isSameUUID(indexMetaData.indexUUID()) && - currentIndexMetaData.version() == indexMetaData.version()) { - // safe to reuse - metaDataBuilder.put(currentIndexMetaData, false); - } else { - metaDataBuilder.put(indexMetaData, false); - } + // if the routing table did not change, use the original one + if (newClusterState.routingTable().version() == currentState.routingTable().version()) { + builder.routingTable(currentState.routingTable()); + } + // same for metadata + if (newClusterState.metaData().version() == currentState.metaData().version()) { + builder.metaData(currentState.metaData()); + } else { + // if its not the same version, only copy over new indices or ones that changed the version + MetaData.Builder metaDataBuilder = MetaData.builder(newClusterState.metaData()).removeAllIndices(); + for (IndexMetaData indexMetaData : newClusterState.metaData()) { + IndexMetaData currentIndexMetaData = currentState.metaData().index(indexMetaData.index()); + if (currentIndexMetaData != null && currentIndexMetaData.isSameUUID(indexMetaData.indexUUID()) && + currentIndexMetaData.version() == indexMetaData.version()) { + // safe to reuse + metaDataBuilder.put(currentIndexMetaData, false); + } else { + metaDataBuilder.put(indexMetaData, false); } - builder.metaData(metaDataBuilder); } - - return builder.build(); + builder.metaData(metaDataBuilder); } - @Override - public void onFailure(String source, Throwable t) { - logger.error("unexpected failure during [{}]", t, source); - newStateProcessed.onNewClusterStateFailed(t); - } + return builder.build(); + } - @Override - public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { + @Override + public void onFailure(String source, Throwable t) { + logger.error("unexpected failure during [{}]", t, source); + if (newClusterState != null) { + try { + publishClusterState.pendingStatesQueue().markAsFailed(newClusterState, t); + } catch (Throwable unexpected) { + logger.error("unexpected exception while failing [{}]", unexpected, source); + } + } + } + + @Override + public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { + try { sendInitialStateEventIfNeeded(); - newStateProcessed.onNewClusterStateProcessed(); + if (newClusterState != null) { + publishClusterState.pendingStatesQueue().markAsProcessed(newClusterState); + } + } catch (Throwable t) { + onFailure(source, t); } - }); - } - } - - /** - * Picks the cluster state with highest version with the same master from the queue. All cluster states with - * lower versions are ignored. If a cluster state with a different master is seen the processing logic stops and the - * last processed state is returned. - */ - static ClusterState selectNextStateToProcess(Queue processNewClusterStates) { - // try and get the state with the highest version out of all the ones with the same master node id - ProcessClusterState stateToProcess = processNewClusterStates.poll(); - if (stateToProcess == null) { - return null; - } - stateToProcess.processed = true; - while (true) { - ProcessClusterState potentialState = processNewClusterStates.peek(); - // nothing else in the queue, bail - if (potentialState == null) { - break; } - // if its not from the same master, then bail - if (!Objects.equal(stateToProcess.clusterState.nodes().masterNodeId(), potentialState.clusterState.nodes().masterNodeId())) { - break; - } - // we are going to use it for sure, poll (remove) it - potentialState = processNewClusterStates.poll(); - if (potentialState == null) { - // might happen if the queue is drained - break; - } - potentialState.processed = true; - - if (potentialState.clusterState.version() > stateToProcess.clusterState.version()) { - // we found a new one - stateToProcess = potentialState; - } - } - return stateToProcess.clusterState; + }); } /** @@ -857,7 +772,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen */ static boolean shouldIgnoreOrRejectNewClusterState(ESLogger logger, ClusterState currentState, ClusterState newClusterState) { validateStateIsFromCurrentMaster(logger, currentState.nodes(), newClusterState); - if (currentState.nodes().masterNodeId() != null && newClusterState.version() < currentState.version()) { + if (currentState.supersedes(newClusterState)) { // if the new state has a smaller version, and it has the same master node, then no need to process it logger.debug("received a cluster state that has a lower version than the current one, ignoring (received {}, current {})", newClusterState.version(), currentState.version()); return true; @@ -1073,11 +988,11 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen } } - private class NewClusterStateListener implements PublishClusterStateAction.NewClusterStateListener { + private class NewPendingClusterStateListener implements PublishClusterStateAction.NewPendingClusterStateListener { @Override - public void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) { - handleNewClusterStateFromMaster(clusterState, newStateProcessed); + public void onNewClusterState(String reason) { + processNextPendingClusterState(reason); } } @@ -1111,11 +1026,6 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen return; } - // nodes pre 1.4.0 do not send this information - if (pingRequest.masterNode() == null) { - return; - } - if (pingsWhileMaster.incrementAndGet() < maxPingsFromAnotherMaster) { logger.trace("got a ping from another master {}. current ping count: [{}]", pingRequest.masterNode(), pingsWhileMaster.get()); return; diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PendingClusterStatesQueue.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PendingClusterStatesQueue.java new file mode 100644 index 00000000000..fc894f3d07e --- /dev/null +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PendingClusterStatesQueue.java @@ -0,0 +1,286 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.discovery.zen.publish; + +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.common.logging.ESLogger; + +import java.util.ArrayList; +import java.util.Locale; +import java.util.Objects; + +/** + * A queue that holds all "in-flight" incoming cluster states from the master. Once a master commits a cluster + * state, it is made available via {@link #getNextClusterStateToProcess()}. The class also takes care of batching + * cluster states for processing and failures. + *

+ * The queue is bound by {@link #maxQueueSize}. When the queue is at capacity and a new cluster state is inserted + * the oldest cluster state will be dropped. This is safe because: + * 1) Under normal operations, master will publish & commit a cluster state before processing another change (i.e., the queue length is 1) + * 2) If the master fails to commit a change, it will step down, causing a master election, which will flush the queue. + * 3) In general it's safe to process the incoming cluster state as a replacement to the cluster state that's dropped. + * a) If the dropped cluster is from the same master as the incoming one is, it is likely to be superseded by the incoming state (or another state in the queue). + * This is only not true in very extreme cases of out of order delivery. + * b) If the dropping cluster state is not from the same master, it means that: + * i) we are no longer following the master of the dropped cluster state but follow the incoming one + * ii) we are no longer following any master, in which case it doesn't matter which cluster state will be processed first. + *

+ * The class is fully thread safe and can be used concurrently. + */ +public class PendingClusterStatesQueue { + + interface StateProcessedListener { + + void onNewClusterStateProcessed(); + + void onNewClusterStateFailed(Throwable t); + } + + final ArrayList pendingStates = new ArrayList<>(); + final ESLogger logger; + final int maxQueueSize; + + public PendingClusterStatesQueue(ESLogger logger, int maxQueueSize) { + this.logger = logger; + this.maxQueueSize = maxQueueSize; + } + + /** Add an incoming, not yet committed cluster state */ + public synchronized void addPending(ClusterState state) { + pendingStates.add(new ClusterStateContext(state)); + if (pendingStates.size() > maxQueueSize) { + ClusterStateContext context = pendingStates.remove(0); + logger.warn("dropping pending state [{}]. more than [{}] pending states.", context, maxQueueSize); + if (context.committed()) { + context.listener.onNewClusterStateFailed(new ElasticsearchException("too many pending states ([{}] pending)", maxQueueSize)); + } + } + } + + /** + * Mark a previously added cluster state as committed. This will make it available via {@link #getNextClusterStateToProcess()} + * When the cluster state is processed (or failed), the supplied listener will be called + **/ + public synchronized ClusterState markAsCommitted(String stateUUID, StateProcessedListener listener) { + final ClusterStateContext context = findState(stateUUID); + if (context == null) { + listener.onNewClusterStateFailed(new IllegalStateException("can't resolve cluster state with uuid [" + stateUUID + "] to commit")); + return null; + } + if (context.committed()) { + listener.onNewClusterStateFailed(new IllegalStateException("cluster state with uuid [" + stateUUID + "] is already committed")); + return null; + } + context.markAsCommitted(listener); + return context.state; + } + + /** + * mark that the processing of the given state has failed. All committed states that are {@link ClusterState#supersedes(ClusterState)}-ed + * by this failed state, will be failed as well + */ + public synchronized void markAsFailed(ClusterState state, Throwable reason) { + final ClusterStateContext failedContext = findState(state.stateUUID()); + if (failedContext == null) { + throw new IllegalArgumentException("can't resolve failed cluster state with uuid [" + state.stateUUID() + "], version [" + state.version() + "]"); + } + if (failedContext.committed() == false) { + throw new IllegalArgumentException("failed cluster state is not committed " + state); + } + + // fail all committed states which are batch together with the failed state + ArrayList statesToRemove = new ArrayList<>(); + for (int index = 0; index < pendingStates.size(); index++) { + final ClusterStateContext pendingContext = pendingStates.get(index); + if (pendingContext.committed() == false) { + continue; + } + final ClusterState pendingState = pendingContext.state; + if (pendingContext.equals(failedContext)) { + statesToRemove.add(pendingContext); + pendingContext.listener.onNewClusterStateFailed(reason); + } else if (state.supersedes(pendingState)) { + statesToRemove.add(pendingContext); + logger.debug("failing committed state {} together with state {}", pendingContext, failedContext); + pendingContext.listener.onNewClusterStateFailed(reason); + } + } + pendingStates.removeAll(statesToRemove); + assert findState(state.stateUUID()) == null : "state was marked as processed but can still be found in pending list " + state; + } + + /** + * indicates that a cluster state was successfully processed. Any committed state that is {@link ClusterState#supersedes(ClusterState)}-ed + * by the processed state will be marked as processed as well. + *

+ * NOTE: successfully processing a state indicates we are following the master it came from. Any committed state from another master will + * be failed by this method + */ + public synchronized void markAsProcessed(ClusterState state) { + if (findState(state.stateUUID()) == null) { + throw new IllegalStateException("can't resolve processed cluster state with uuid [" + state.stateUUID() + "], version [" + state.version() + "]"); + } + final DiscoveryNode currentMaster = state.nodes().masterNode(); + assert currentMaster != null : "processed cluster state mast have a master. " + state; + + // fail or remove any incoming state from a different master + // respond to any committed state from the same master with same or lower version (we processed a higher version) + ArrayList contextsToRemove = new ArrayList<>(); + for (int index = 0; index < pendingStates.size(); index++) { + final ClusterStateContext pendingContext = pendingStates.get(index); + final ClusterState pendingState = pendingContext.state; + final DiscoveryNode pendingMasterNode = pendingState.nodes().masterNode(); + if (Objects.equals(currentMaster, pendingMasterNode) == false) { + contextsToRemove.add(pendingContext); + if (pendingContext.committed()) { + // this is a committed state , warn + logger.warn("received a cluster state (uuid[{}]/v[{}]) from a different master than the current one, rejecting (received {}, current {})", + pendingState.stateUUID(), pendingState.version(), + pendingMasterNode, currentMaster); + pendingContext.listener.onNewClusterStateFailed( + new IllegalStateException("cluster state from a different master than the current one, rejecting (received " + pendingMasterNode + ", current " + currentMaster + ")") + ); + } else { + logger.trace("removing non-committed state with uuid[{}]/v[{}] from [{}] - a state from [{}] was successfully processed", + pendingState.stateUUID(), pendingState.version(), pendingMasterNode, + currentMaster + ); + } + } else if (state.supersedes(pendingState) && pendingContext.committed()) { + logger.trace("processing pending state uuid[{}]/v[{}] together with state uuid[{}]/v[{}]", + pendingState.stateUUID(), pendingState.version(), state.stateUUID(), state.version() + ); + contextsToRemove.add(pendingContext); + pendingContext.listener.onNewClusterStateProcessed(); + } else if (pendingState.stateUUID().equals(state.stateUUID())) { + assert pendingContext.committed() : "processed cluster state is not committed " + state; + contextsToRemove.add(pendingContext); + pendingContext.listener.onNewClusterStateProcessed(); + } + } + // now ack the processed state + pendingStates.removeAll(contextsToRemove); + assert findState(state.stateUUID()) == null : "state was marked as processed but can still be found in pending list " + state; + + } + + ClusterStateContext findState(String stateUUID) { + for (int i = 0; i < pendingStates.size(); i++) { + final ClusterStateContext context = pendingStates.get(i); + if (context.stateUUID().equals(stateUUID)) { + return context; + } + } + return null; + } + + /** clear the incoming queue. any committed state will be failed */ + public synchronized void failAllStatesAndClear(Throwable reason) { + for (ClusterStateContext pendingState : pendingStates) { + if (pendingState.committed()) { + pendingState.listener.onNewClusterStateFailed(reason); + } + } + pendingStates.clear(); + } + + /** + * Gets the next committed state to process. + *

+ * The method tries to batch operation by getting the cluster state the highest possible committed states + * which succeeds the first committed state in queue (i.e., it comes from the same master). + */ + public synchronized ClusterState getNextClusterStateToProcess() { + if (pendingStates.isEmpty()) { + return null; + } + + ClusterStateContext stateToProcess = null; + int index = 0; + for (; index < pendingStates.size(); index++) { + ClusterStateContext potentialState = pendingStates.get(index); + if (potentialState.committed()) { + stateToProcess = potentialState; + break; + } + } + if (stateToProcess == null) { + return null; + } + + // now try to find the highest committed state from the same master + for (; index < pendingStates.size(); index++) { + ClusterStateContext potentialState = pendingStates.get(index); + + if (potentialState.state.supersedes(stateToProcess.state) && potentialState.committed()) { + // we found a new one + stateToProcess = potentialState; + } + } + assert stateToProcess.committed() : "should only return committed cluster state. found " + stateToProcess.state; + return stateToProcess.state; + } + + /** returns all pending states, committed or not */ + public synchronized ClusterState[] pendingClusterStates() { + ArrayList states = new ArrayList<>(); + for (ClusterStateContext context : pendingStates) { + states.add(context.state); + } + return states.toArray(new ClusterState[states.size()]); + } + + static class ClusterStateContext { + final ClusterState state; + StateProcessedListener listener; + + ClusterStateContext(ClusterState clusterState) { + this.state = clusterState; + } + + void markAsCommitted(StateProcessedListener listener) { + if (this.listener != null) { + throw new IllegalStateException(toString() + "is already committed"); + } + this.listener = listener; + } + + boolean committed() { + return listener != null; + } + + public String stateUUID() { + return state.stateUUID(); + } + + @Override + public String toString() { + return String.format( + Locale.ROOT, + "[uuid[%s], v[%d], m[%s]]", + stateUUID(), + state.version(), + state.nodes().masterNodeId() + ); + } + } + +} diff --git a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java index c9fe8d688ff..4da6e42601b 100644 --- a/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java +++ b/core/src/main/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateAction.java @@ -57,32 +57,30 @@ public class PublishClusterStateAction extends AbstractComponent { public static final String SEND_ACTION_NAME = "internal:discovery/zen/publish/send"; public static final String COMMIT_ACTION_NAME = "internal:discovery/zen/publish/commit"; - public interface NewClusterStateListener { + public static final String SETTINGS_MAX_PENDING_CLUSTER_STATES = "discovery.zen.publish.max_pending_cluster_states"; - interface NewStateProcessed { + public interface NewPendingClusterStateListener { - void onNewClusterStateProcessed(); - - void onNewClusterStateFailed(Throwable t); - } - - void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed); + /** a new cluster state has been committed and is ready to process via {@link #pendingStatesQueue()} */ + void onNewClusterState(String reason); } private final TransportService transportService; private final DiscoveryNodesProvider nodesProvider; - private final NewClusterStateListener listener; + private final NewPendingClusterStateListener newPendingClusterStatelistener; private final DiscoverySettings discoverySettings; private final ClusterName clusterName; + private final PendingClusterStatesQueue pendingStatesQueue; public PublishClusterStateAction(Settings settings, TransportService transportService, DiscoveryNodesProvider nodesProvider, - NewClusterStateListener listener, DiscoverySettings discoverySettings, ClusterName clusterName) { + NewPendingClusterStateListener listener, DiscoverySettings discoverySettings, ClusterName clusterName) { super(settings); this.transportService = transportService; this.nodesProvider = nodesProvider; - this.listener = listener; + this.newPendingClusterStatelistener = listener; this.discoverySettings = discoverySettings; this.clusterName = clusterName; + this.pendingStatesQueue = new PendingClusterStatesQueue(logger, settings.getAsInt(SETTINGS_MAX_PENDING_CLUSTER_STATES, 25)); transportService.registerRequestHandler(SEND_ACTION_NAME, BytesTransportRequest.class, ThreadPool.Names.SAME, new SendClusterStateRequestHandler()); transportService.registerRequestHandler(COMMIT_ACTION_NAME, CommitClusterStateRequest.class, ThreadPool.Names.SAME, new CommitClusterStateRequestHandler()); } @@ -92,6 +90,10 @@ public class PublishClusterStateAction extends AbstractComponent { transportService.removeHandler(COMMIT_ACTION_NAME); } + public PendingClusterStatesQueue pendingStatesQueue() { + return pendingStatesQueue; + } + /** * publishes a cluster change event to other nodes. if at least minMasterNodes acknowledge the change it is committed and will * be processed by the master and the other nodes. @@ -359,6 +361,7 @@ public class PublishClusterStateAction extends AbstractComponent { // sanity check incoming state validateIncomingState(incomingState, lastSeenClusterState); + pendingStatesQueue.addPending(incomingState); lastSeenClusterState = incomingState; lastSeenClusterState.status(ClusterState.ClusterStateStatus.RECEIVED); } @@ -382,56 +385,34 @@ public class PublishClusterStateAction extends AbstractComponent { logger.warn("received a cluster state from [{}] and not part of the cluster, should not happen", incomingState.nodes().masterNode()); throw new IllegalStateException("received state from a node that is not part of the cluster"); } - // state from another master requires more subtle checks, so we let it pass for now (it will be checked in ZenDiscovery) - if (currentNodes.localNodeMaster() == false) { - ZenDiscovery.validateStateIsFromCurrentMaster(logger, currentNodes, incomingState); - } - if (lastSeenClusterState != null - && Objects.equals(lastSeenClusterState.nodes().masterNodeId(), incomingState.nodes().masterNodeId()) - && lastSeenClusterState.version() > incomingState.version()) { - logger.debug("received an older cluster state from master, rejecting (received version [{}], last version is [{}])", - incomingState.version(), lastSeenClusterState.version()); - throw new IllegalStateException("cluster state version [" + incomingState.version() + "] is old (last seen version [" + lastSeenClusterState.version() + "])"); - } + ZenDiscovery.validateStateIsFromCurrentMaster(logger, currentNodes, incomingState); } protected void handleCommitRequest(CommitClusterStateRequest request, final TransportChannel channel) { - ClusterState committedClusterState; - synchronized (lastSeenClusterStateMutex) { - committedClusterState = lastSeenClusterState; - } - - // if this message somehow comes without a previous send, we won't have a cluster state - String lastSeenUUID = committedClusterState == null ? null : committedClusterState.stateUUID(); - if (request.stateUUID.equals(lastSeenUUID) == false) { - throw new IllegalStateException("tried to commit cluster state UUID [" + request.stateUUID + "], but last seen UUID is [" + lastSeenUUID + "]"); - } - - try { - listener.onNewClusterState(committedClusterState, new NewClusterStateListener.NewStateProcessed() { - @Override - public void onNewClusterStateProcessed() { - try { - channel.sendResponse(TransportResponse.Empty.INSTANCE); - } catch (Throwable e) { - logger.debug("failed to send response on cluster state processed", e); - onNewClusterStateFailed(e); - } + final ClusterState state = pendingStatesQueue.markAsCommitted(request.stateUUID, new PendingClusterStatesQueue.StateProcessedListener() { + @Override + public void onNewClusterStateProcessed() { + try { + // send a response to the master to indicate that this cluster state has been processed post committing it. + channel.sendResponse(TransportResponse.Empty.INSTANCE); + } catch (Throwable e) { + logger.debug("failed to send response on cluster state processed", e); + onNewClusterStateFailed(e); } + } - @Override - public void onNewClusterStateFailed(Throwable t) { - try { - channel.sendResponse(t); - } catch (Throwable e) { - logger.debug("failed to send response on cluster state processed", e); - } + @Override + public void onNewClusterStateFailed(Throwable t) { + try { + channel.sendResponse(t); + } catch (Throwable e) { + logger.debug("failed to send response on cluster state processed", e); } - }); - } catch (Exception e) { - logger.warn("unexpected error while processing cluster state version [{}]", e, lastSeenClusterState.version()); - throw e; + } + }); + if (state != null) { + newPendingClusterStatelistener.onNewClusterState("master " + state.nodes().masterNode() + " committed version [" + state.version() + "]"); } } diff --git a/core/src/test/java/org/elasticsearch/cluster/ClusterStateTests.java b/core/src/test/java/org/elasticsearch/cluster/ClusterStateTests.java new file mode 100644 index 00000000000..19f90f2962c --- /dev/null +++ b/core/src/test/java/org/elasticsearch/cluster/ClusterStateTests.java @@ -0,0 +1,54 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.cluster; + +import com.carrotsearch.randomizedtesting.annotations.Repeat; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.common.transport.DummyTransportAddress; +import org.elasticsearch.test.ESTestCase; + +import static org.hamcrest.Matchers.equalTo; + +public class ClusterStateTests extends ESTestCase { + + public void testSupersedes() { + final DiscoveryNode node1 = new DiscoveryNode("node1", DummyTransportAddress.INSTANCE, Version.CURRENT); + final DiscoveryNode node2 = new DiscoveryNode("node2", DummyTransportAddress.INSTANCE, Version.CURRENT); + final DiscoveryNodes nodes = DiscoveryNodes.builder().put(node1).put(node2).build(); + ClusterState noMaster1 = ClusterState.builder(ClusterName.DEFAULT).version(randomInt(5)).nodes(nodes).build(); + ClusterState noMaster2 = ClusterState.builder(ClusterName.DEFAULT).version(randomInt(5)).nodes(nodes).build(); + ClusterState withMaster1a = ClusterState.builder(ClusterName.DEFAULT).version(randomInt(5)).nodes(DiscoveryNodes.builder(nodes).masterNodeId(node1.id())).build(); + ClusterState withMaster1b = ClusterState.builder(ClusterName.DEFAULT).version(randomInt(5)).nodes(DiscoveryNodes.builder(nodes).masterNodeId(node1.id())).build(); + ClusterState withMaster2 = ClusterState.builder(ClusterName.DEFAULT).version(randomInt(5)).nodes(DiscoveryNodes.builder(nodes).masterNodeId(node2.id())).build(); + + // states with no master should never supersede anything + assertFalse(noMaster1.supersedes(noMaster2)); + assertFalse(noMaster1.supersedes(withMaster1a)); + + // states should never supersede states from another master + assertFalse(withMaster1a.supersedes(withMaster2)); + assertFalse(withMaster1a.supersedes(noMaster1)); + + // state from the same master compare by version + assertThat(withMaster1a.supersedes(withMaster1b), equalTo(withMaster1a.version() > withMaster1b.version())); + + } +} diff --git a/core/src/test/java/org/elasticsearch/cluster/ack/AckIT.java b/core/src/test/java/org/elasticsearch/cluster/ack/AckIT.java index 84b31e3be30..987b8295310 100644 --- a/core/src/test/java/org/elasticsearch/cluster/ack/AckIT.java +++ b/core/src/test/java/org/elasticsearch/cluster/ack/AckIT.java @@ -19,6 +19,9 @@ package org.elasticsearch.cluster.ack; +import com.carrotsearch.hppc.cursors.ObjectObjectCursor; +import com.google.common.base.Predicate; +import com.google.common.collect.ImmutableList; import org.elasticsearch.action.admin.cluster.reroute.ClusterRerouteResponse; import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse; import org.elasticsearch.action.admin.indices.alias.IndicesAliasesResponse; @@ -34,8 +37,8 @@ import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.metadata.AliasMetaData; import org.elasticsearch.cluster.metadata.AliasOrIndex; import org.elasticsearch.cluster.metadata.IndexMetaData; -import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.RoutingNode; +import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand; import org.elasticsearch.common.settings.Settings; @@ -44,9 +47,6 @@ import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.search.warmer.IndexWarmersMetaData; import org.elasticsearch.test.ESIntegTestCase; import org.junit.Test; -import com.carrotsearch.hppc.cursors.ObjectObjectCursor; -import com.google.common.base.Predicate; -import com.google.common.collect.ImmutableList; import java.util.concurrent.TimeUnit; @@ -72,7 +72,7 @@ public class AckIT extends ESIntegTestCase { createIndex("test"); assertAcked(client().admin().indices().prepareUpdateSettings("test") - .setSettings(Settings.builder().put("refresh_interval", 9999, TimeUnit.MILLISECONDS))); + .setSettings(Settings.builder().put("refresh_interval", 9999, TimeUnit.MILLISECONDS))); for (Client client : clients()) { String refreshInterval = getLocalClusterState(client).metaData().index("test").settings().get("index.refresh_interval"); @@ -178,9 +178,9 @@ public class AckIT extends ESIntegTestCase { @Test public void testClusterRerouteAcknowledgement() throws InterruptedException { assertAcked(prepareCreate("test").setSettings(Settings.builder() - .put(indexSettings()) - .put(SETTING_NUMBER_OF_SHARDS, between(cluster().numDataNodes(), DEFAULT_MAX_NUM_SHARDS)) - .put(SETTING_NUMBER_OF_REPLICAS, 0) + .put(indexSettings()) + .put(SETTING_NUMBER_OF_SHARDS, between(cluster().numDataNodes(), DEFAULT_MAX_NUM_SHARDS)) + .put(SETTING_NUMBER_OF_REPLICAS, 0) )); ensureGreen(); diff --git a/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java b/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java index 6d4b0a9ee45..e3c97a74f67 100644 --- a/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java +++ b/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java @@ -22,8 +22,6 @@ package org.elasticsearch.discovery; import com.google.common.base.Predicate; import org.apache.lucene.util.LuceneTestCase; import org.elasticsearch.ElasticsearchException; -import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; -import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse; import org.elasticsearch.action.get.GetResponse; import org.elasticsearch.action.index.IndexResponse; import org.elasticsearch.client.Client; @@ -54,7 +52,11 @@ import org.elasticsearch.test.discovery.ClusterDiscoveryConfiguration; import org.elasticsearch.test.disruption.*; import org.elasticsearch.test.junit.annotations.TestLogging; import org.elasticsearch.test.transport.MockTransportService; -import org.elasticsearch.transport.*; +import org.elasticsearch.transport.TransportException; +import org.elasticsearch.transport.TransportRequest; +import org.elasticsearch.transport.TransportRequestOptions; +import org.elasticsearch.transport.TransportService; +import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -734,7 +736,7 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase { */ @Test public void unicastSinglePingResponseContainsMaster() throws Exception { - List nodes = startCluster(4, -1, new int[] {0}); + List nodes = startCluster(4, -1, new int[]{0}); // Figure out what is the elected master node final String masterNode = internalCluster().getMasterName(); logger.info("---> legit elected master node=" + masterNode); @@ -853,6 +855,10 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase { nonMasterTransportService.clearRule(discoveryNodes.masterNode()); ensureStableCluster(2); + + // shutting down the nodes, to avoid the leakage check tripping + // on the states associated with the commit requests we may have dropped + internalCluster().stopRandomNonMasterNode(); } @@ -943,7 +949,7 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase { @Test public void testIndicesDeleted() throws Exception { configureUnicastCluster(3, null, 2); - Future> masterNodes= internalCluster().startMasterOnlyNodesAsync(2); + Future> masterNodes = internalCluster().startMasterOnlyNodesAsync(2); Future dataNode = internalCluster().startDataOnlyNodeAsync(); dataNode.get(); masterNodes.get(); diff --git a/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java b/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java index 61b1062dbed..707ff87960e 100644 --- a/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java +++ b/core/src/test/java/org/elasticsearch/discovery/zen/ZenDiscoveryUnitTest.java @@ -27,14 +27,8 @@ import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.common.transport.DummyTransportAddress; import org.elasticsearch.test.ESTestCase; -import java.util.Collections; -import java.util.LinkedList; -import java.util.Queue; - -import static org.elasticsearch.discovery.zen.ZenDiscovery.ProcessClusterState; import static org.elasticsearch.discovery.zen.ZenDiscovery.shouldIgnoreOrRejectNewClusterState; -import static org.hamcrest.Matchers.*; -import static org.hamcrest.core.IsNull.nullValue; +import static org.hamcrest.Matchers.containsString; /** */ @@ -95,53 +89,4 @@ public class ZenDiscoveryUnitTest extends ESTestCase { } assertFalse("should not ignore, because current state doesn't have a master", shouldIgnoreOrRejectNewClusterState(logger, currentState.build(), newState.build())); } - - public void testSelectNextStateToProcess_empty() { - Queue queue = new LinkedList<>(); - assertThat(ZenDiscovery.selectNextStateToProcess(queue), nullValue()); - } - - public void testSelectNextStateToProcess() { - ClusterName clusterName = new ClusterName("abc"); - DiscoveryNodes nodes = DiscoveryNodes.builder().masterNodeId("a").build(); - - int numUpdates = scaledRandomIntBetween(50, 100); - LinkedList queue = new LinkedList<>(); - for (int i = 0; i < numUpdates; i++) { - queue.add(new ProcessClusterState(ClusterState.builder(clusterName).version(i).nodes(nodes).build())); - } - ProcessClusterState mostRecent = queue.get(numUpdates - 1); - Collections.shuffle(queue, getRandom()); - - assertThat(ZenDiscovery.selectNextStateToProcess(queue), sameInstance(mostRecent.clusterState)); - assertThat(mostRecent.processed, is(true)); - assertThat(queue.size(), equalTo(0)); - } - - public void testSelectNextStateToProcess_differentMasters() { - ClusterName clusterName = new ClusterName("abc"); - DiscoveryNodes nodes1 = DiscoveryNodes.builder().masterNodeId("a").build(); - DiscoveryNodes nodes2 = DiscoveryNodes.builder().masterNodeId("b").build(); - - LinkedList queue = new LinkedList<>(); - ProcessClusterState thirdMostRecent = new ProcessClusterState(ClusterState.builder(clusterName).version(1).nodes(nodes1).build()); - queue.offer(thirdMostRecent); - ProcessClusterState secondMostRecent = new ProcessClusterState(ClusterState.builder(clusterName).version(2).nodes(nodes1).build()); - queue.offer(secondMostRecent); - ProcessClusterState mostRecent = new ProcessClusterState(ClusterState.builder(clusterName).version(3).nodes(nodes1).build()); - queue.offer(mostRecent); - Collections.shuffle(queue, getRandom()); - queue.offer(new ProcessClusterState(ClusterState.builder(clusterName).version(4).nodes(nodes2).build())); - queue.offer(new ProcessClusterState(ClusterState.builder(clusterName).version(5).nodes(nodes1).build())); - - - assertThat(ZenDiscovery.selectNextStateToProcess(queue), sameInstance(mostRecent.clusterState)); - assertThat(thirdMostRecent.processed, is(true)); - assertThat(secondMostRecent.processed, is(true)); - assertThat(mostRecent.processed, is(true)); - assertThat(queue.size(), equalTo(2)); - assertThat(queue.get(0).processed, is(false)); - assertThat(queue.get(1).processed, is(false)); - } - } diff --git a/core/src/test/java/org/elasticsearch/discovery/zen/publish/PendingClusterStatesQueueTests.java b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PendingClusterStatesQueueTests.java new file mode 100644 index 00000000000..a8e9f00eb7f --- /dev/null +++ b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PendingClusterStatesQueueTests.java @@ -0,0 +1,224 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.discovery.zen.publish; + +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.ClusterName; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.common.transport.DummyTransportAddress; +import org.elasticsearch.discovery.zen.publish.PendingClusterStatesQueue.ClusterStateContext; +import org.elasticsearch.test.ESTestCase; + +import java.util.*; + +import static org.hamcrest.Matchers.*; + +public class PendingClusterStatesQueueTests extends ESTestCase { + + public void testSelectNextStateToProcess_empty() { + PendingClusterStatesQueue queue = new PendingClusterStatesQueue(logger, randomIntBetween(1, 200)); + assertThat(queue.getNextClusterStateToProcess(), nullValue()); + } + + public void testDroppingStatesAtCapacity() { + List states = randomStates(scaledRandomIntBetween(10, 300), "master1", "master2", "master3", "master4"); + Collections.shuffle(states, random()); + // insert half of the states + final int numberOfStateToDrop = states.size() / 2; + List stateToDrop = states.subList(0, numberOfStateToDrop); + final int queueSize = states.size() - numberOfStateToDrop; + PendingClusterStatesQueue queue = createQueueWithStates(stateToDrop, queueSize); + List committedContexts = randomCommitStates(queue); + for (ClusterState state : states.subList(numberOfStateToDrop, states.size())) { + queue.addPending(state); + } + + assertThat(queue.pendingClusterStates().length, equalTo(queueSize)); + // check all committed states got a failure due to the drop + for (ClusterStateContext context : committedContexts) { + assertThat(((MockListener) context.listener).failure, notNullValue()); + } + + // all states that should have dropped are indeed dropped. + for (ClusterState state : stateToDrop) { + assertThat(queue.findState(state.stateUUID()), nullValue()); + } + + } + + public void testSimpleQueueSameMaster() { + final int numUpdates = scaledRandomIntBetween(50, 100); + List states = randomStates(numUpdates, "master"); + Collections.shuffle(states, random()); + PendingClusterStatesQueue queue; + queue = createQueueWithStates(states); + + // no state is committed yet + assertThat(queue.getNextClusterStateToProcess(), nullValue()); + + ClusterState highestCommitted = null; + for (ClusterStateContext context : randomCommitStates(queue)) { + if (highestCommitted == null || context.state.supersedes(highestCommitted)) { + highestCommitted = context.state; + } + } + + assertThat(queue.getNextClusterStateToProcess(), sameInstance(highestCommitted)); + + queue.markAsProcessed(highestCommitted); + + // now there is nothing more to process + assertThat(queue.getNextClusterStateToProcess(), nullValue()); + } + + public void testProcessedStateCleansStatesFromOtherMasters() { + List states = randomStates(scaledRandomIntBetween(10, 300), "master1", "master2", "master3", "master4"); + PendingClusterStatesQueue queue = createQueueWithStates(states); + List committedContexts = randomCommitStates(queue); + ClusterState randomCommitted = randomFrom(committedContexts).state; + queue.markAsProcessed(randomCommitted); + final String processedMaster = randomCommitted.nodes().masterNodeId(); + + // now check that queue doesn't contain anything pending from another master + for (ClusterStateContext context : queue.pendingStates) { + final String pendingMaster = context.state.nodes().masterNodeId(); + assertThat("found a cluster state from [" + pendingMaster + + "], after a state from [" + processedMaster + "] was proccessed", + pendingMaster, equalTo(processedMaster)); + } + // and check all committed contexts from another master were failed + for (ClusterStateContext context : committedContexts) { + if (context.state.nodes().masterNodeId().equals(processedMaster) == false) { + assertThat(((MockListener) context.listener).failure, notNullValue()); + } + } + } + + public void testFailedStateCleansSupersededStatesOnly() { + List states = randomStates(scaledRandomIntBetween(10, 50), "master1", "master2", "master3", "master4"); + PendingClusterStatesQueue queue = createQueueWithStates(states); + List committedContexts = randomCommitStates(queue); + ClusterState toFail = randomFrom(committedContexts).state; + queue.markAsFailed(toFail, new ElasticsearchException("boo!")); + final Map committedContextsById = new HashMap<>(); + for (ClusterStateContext context : committedContexts) { + committedContextsById.put(context.stateUUID(), context); + } + + // now check that queue doesn't contain superseded states + for (ClusterStateContext context : queue.pendingStates) { + if (context.committed()) { + assertFalse("found a committed cluster state, which is superseded by a failed state.\nFound:" + context.state + "\nfailed:" + toFail, + toFail.supersedes(context.state)); + } + } + // check no state has been erroneously removed + for (ClusterState state : states) { + ClusterStateContext pendingContext = queue.findState(state.stateUUID()); + if (pendingContext != null) { + continue; + } + if (state.equals(toFail)) { + continue; + } + assertThat("non-committed states should never be removed", committedContextsById, hasKey(state.stateUUID())); + final ClusterStateContext context = committedContextsById.get(state.stateUUID()); + assertThat("removed state is not superseded by failed state. \nRemoved state:" + context + "\nfailed: " + toFail, + toFail.supersedes(context.state), equalTo(true)); + assertThat("removed state was failed with wrong exception", ((MockListener) context.listener).failure, notNullValue()); + assertThat("removed state was failed with wrong exception", ((MockListener) context.listener).failure.getMessage(), containsString("boo")); + } + } + + public void testFailAllAndClear() { + List states = randomStates(scaledRandomIntBetween(10, 50), "master1", "master2", "master3", "master4"); + PendingClusterStatesQueue queue = createQueueWithStates(states); + List committedContexts = randomCommitStates(queue); + queue.failAllStatesAndClear(new ElasticsearchException("boo!")); + assertThat(queue.pendingStates, empty()); + assertThat(queue.getNextClusterStateToProcess(), nullValue()); + for (ClusterStateContext context : committedContexts) { + assertThat("state was failed with wrong exception", ((MockListener) context.listener).failure, notNullValue()); + assertThat("state was failed with wrong exception", ((MockListener) context.listener).failure.getMessage(), containsString("boo")); + } + } + + protected List randomCommitStates(PendingClusterStatesQueue queue) { + List committedContexts = new ArrayList<>(); + for (int iter = randomInt(queue.pendingStates.size() - 1); iter >= 0; iter--) { + ClusterState state = queue.markAsCommitted(randomFrom(queue.pendingStates).stateUUID(), new MockListener()); + if (state != null) { + // null cluster state means we committed twice + committedContexts.add(queue.findState(state.stateUUID())); + } + } + return committedContexts; + } + + PendingClusterStatesQueue createQueueWithStates(List states) { + return createQueueWithStates(states, states.size() * 2); // we don't care about limits (there are dedicated tests for that) + } + + PendingClusterStatesQueue createQueueWithStates(List states, int maxQueueSize) { + PendingClusterStatesQueue queue; + queue = new PendingClusterStatesQueue(logger, maxQueueSize); + for (ClusterState state : states) { + queue.addPending(state); + } + return queue; + } + + List randomStates(int count, String... masters) { + ArrayList states = new ArrayList<>(count); + ClusterState[] lastClusterStatePerMaster = new ClusterState[masters.length]; + for (; count > 0; count--) { + int masterIndex = randomInt(masters.length - 1); + ClusterState state = lastClusterStatePerMaster[masterIndex]; + if (state == null) { + state = ClusterState.builder(ClusterName.DEFAULT).nodes(DiscoveryNodes.builder() + .put(new DiscoveryNode(masters[masterIndex], DummyTransportAddress.INSTANCE, Version.CURRENT)).masterNodeId(masters[masterIndex]).build() + ).build(); + } else { + state = ClusterState.builder(state).incrementVersion().build(); + } + states.add(state); + lastClusterStatePerMaster[masterIndex] = state; + } + return states; + } + + static class MockListener implements PendingClusterStatesQueue.StateProcessedListener { + volatile boolean processed; + volatile Throwable failure; + + @Override + public void onNewClusterStateProcessed() { + processed = true; + } + + @Override + public void onNewClusterStateFailed(Throwable t) { + failure = t; + } + } + +} diff --git a/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java index 1a078171fdd..b9b41098877 100644 --- a/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java +++ b/core/src/test/java/org/elasticsearch/discovery/zen/publish/PublishClusterStateActionTests.java @@ -29,7 +29,6 @@ import org.elasticsearch.cluster.metadata.MetaData; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.common.Nullable; -import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.collect.ImmutableOpenMap; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.io.stream.NamedWriteableRegistry; @@ -52,10 +51,7 @@ import org.junit.Before; import org.junit.Test; import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; @@ -71,7 +67,7 @@ public class PublishClusterStateActionTests extends ESTestCase { protected ThreadPool threadPool; protected Map nodes = newHashMap(); - public static class MockNode implements PublishClusterStateAction.NewClusterStateListener, DiscoveryNodesProvider { + public static class MockNode implements PublishClusterStateAction.NewPendingClusterStateListener, DiscoveryNodesProvider { public final DiscoveryNode discoveryNode; public final MockTransportService service; public MockPublishAction action; @@ -89,19 +85,33 @@ public class PublishClusterStateActionTests extends ESTestCase { this.clusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(DiscoveryNodes.builder().put(discoveryNode).localNodeId(discoveryNode.id()).build()).build(); } + public MockNode setAsMaster() { + this.clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).masterNodeId(discoveryNode.id())).build(); + return this; + } + + public MockNode resetMasterId() { + this.clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).masterNodeId(null)).build(); + return this; + } + + public void connectTo(DiscoveryNode node) { service.connectToNode(node); } @Override - public void onNewClusterState(ClusterState newClusterState, NewStateProcessed newStateProcessed) { + public void onNewClusterState(String reason) { + ClusterState newClusterState = action.pendingStatesQueue().getNextClusterStateToProcess(); logger.debug("[{}] received version [{}], uuid [{}]", discoveryNode.name(), newClusterState.version(), newClusterState.stateUUID()); if (listener != null) { ClusterChangedEvent event = new ClusterChangedEvent("", newClusterState, clusterState); listener.clusterChanged(event); } - clusterState = newClusterState; - newStateProcessed.onNewClusterStateProcessed(); + if (clusterState.nodes().masterNode() == null || newClusterState.supersedes(clusterState)) { + clusterState = newClusterState; + } + action.pendingStatesQueue().markAsProcessed(newClusterState); } @Override @@ -211,22 +221,21 @@ public class PublishClusterStateActionTests extends ESTestCase { } protected MockPublishAction buildPublishClusterStateAction(Settings settings, MockTransportService transportService, DiscoveryNodesProvider nodesProvider, - PublishClusterStateAction.NewClusterStateListener listener) { + PublishClusterStateAction.NewPendingClusterStateListener listener) { DiscoverySettings discoverySettings = new DiscoverySettings(settings, new NodeSettingsService(settings)); return new MockPublishAction(settings, transportService, nodesProvider, listener, discoverySettings, ClusterName.DEFAULT); } @Test public void testSimpleClusterStatePublishing() throws Exception { - MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT); + MockNode nodeA = createMockNode("nodeA", Settings.EMPTY, Version.CURRENT).setAsMaster(); MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT); // Initial cluster state - DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); - ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); + ClusterState clusterState = nodeA.clusterState; // cluster state update - add nodeB - discoveryNodes = DiscoveryNodes.builder(discoveryNodes).put(nodeB.discoveryNode).build(); + DiscoveryNodes discoveryNodes = DiscoveryNodes.builder(clusterState.nodes()).put(nodeB.discoveryNode).build(); ClusterState previousClusterState = clusterState; clusterState = ClusterState.builder(clusterState).nodes(discoveryNodes).incrementVersion().build(); publishStateAndWait(nodeA.action, clusterState, previousClusterState); @@ -277,6 +286,11 @@ public class PublishClusterStateActionTests extends ESTestCase { assertSameStateFromFull(nodeC.clusterState, clusterState); assertFalse(nodeC.clusterState.wasReadFromDiff()); + // node A steps down from being master + nodeA.resetMasterId(); + nodeB.resetMasterId(); + nodeC.resetMasterId(); + // node B becomes the master and sends a version of the cluster state that goes back discoveryNodes = DiscoveryNodes.builder(discoveryNodes) .put(nodeA.discoveryNode) @@ -300,12 +314,12 @@ public class PublishClusterStateActionTests extends ESTestCase { public void clusterChanged(ClusterChangedEvent event) { fail("Shouldn't send cluster state to myself"); } - }); + }).setAsMaster(); MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT); // Initial cluster state with both states - the second node still shouldn't get diff even though it's present in the previous cluster state - DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).put(nodeB.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); + DiscoveryNodes discoveryNodes = DiscoveryNodes.builder(nodeA.nodes()).put(nodeB.discoveryNode).build(); ClusterState previousClusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); ClusterState clusterState = ClusterState.builder(previousClusterState).incrementVersion().build(); publishStateAndWait(nodeA.action, clusterState, previousClusterState); @@ -337,7 +351,7 @@ public class PublishClusterStateActionTests extends ESTestCase { }); // Initial cluster state - DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); + DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).localNodeId(nodeA.discoveryNode.id()).masterNodeId(nodeA.discoveryNode.id()).build(); ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); // cluster state update - add nodeB @@ -354,16 +368,21 @@ public class PublishClusterStateActionTests extends ESTestCase { /** - * Test not waiting publishing works correctly (i.e., publishing times out) + * Test not waiting on publishing works correctly (i.e., publishing times out) */ @Test public void testSimultaneousClusterStatePublishing() throws Exception { int numberOfNodes = randomIntBetween(2, 10); int numberOfIterations = scaledRandomIntBetween(5, 50); Settings settings = Settings.builder().put(DiscoverySettings.PUBLISH_DIFF_ENABLE, randomBoolean()).build(); - DiscoveryNodes.Builder discoveryNodesBuilder = DiscoveryNodes.builder(); - MockNode master = null; - for (int i = 0; i < numberOfNodes; i++) { + MockNode master = createMockNode("node0", settings, Version.CURRENT, new ClusterStateListener() { + @Override + public void clusterChanged(ClusterChangedEvent event) { + assertProperMetaDataForVersion(event.state().metaData(), event.state().version()); + } + }).setAsMaster(); + DiscoveryNodes.Builder discoveryNodesBuilder = DiscoveryNodes.builder(master.nodes()); + for (int i = 1; i < numberOfNodes; i++) { final String name = "node" + i; final MockNode node = createMockNode(name, settings, Version.CURRENT, new ClusterStateListener() { @Override @@ -371,14 +390,10 @@ public class PublishClusterStateActionTests extends ESTestCase { assertProperMetaDataForVersion(event.state().metaData(), event.state().version()); } }); - if (i == 0) { - master = node; - } discoveryNodesBuilder.put(node.discoveryNode); } AssertingAckListener[] listeners = new AssertingAckListener[numberOfIterations]; - discoveryNodesBuilder.localNodeId(master.discoveryNode.id()); DiscoveryNodes discoveryNodes = discoveryNodesBuilder.build(); MetaData metaData = MetaData.EMPTY_META_DATA; ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).metaData(metaData).build(); @@ -398,8 +413,7 @@ public class PublishClusterStateActionTests extends ESTestCase { master.clusterState = clusterState; for (MockNode node : nodes.values()) { - assertThat(node.discoveryNode + " misses a cluster state", node.clusterState, notNullValue()); - assertThat(node.discoveryNode + " unexpected cluster state: " + node.clusterState, node.clusterState.version(), equalTo(clusterState.version())); + assertSameState(node.clusterState, clusterState); assertThat(node.clusterState.nodes().localNode(), equalTo(node.discoveryNode)); } } @@ -412,12 +426,12 @@ public class PublishClusterStateActionTests extends ESTestCase { public void clusterChanged(ClusterChangedEvent event) { fail("Shouldn't send cluster state to myself"); } - }); + }).setAsMaster(); MockNode nodeB = createMockNode("nodeB", Settings.EMPTY, Version.CURRENT); // Initial cluster state with both states - the second node still shouldn't get diff even though it's present in the previous cluster state - DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().put(nodeA.discoveryNode).put(nodeB.discoveryNode).localNodeId(nodeA.discoveryNode.id()).build(); + DiscoveryNodes discoveryNodes = DiscoveryNodes.builder(nodeA.nodes()).put(nodeB.discoveryNode).build(); ClusterState previousClusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(discoveryNodes).build(); ClusterState clusterState = ClusterState.builder(previousClusterState).incrementVersion().build(); publishStateAndWait(nodeA.action, clusterState, previousClusterState); @@ -612,69 +626,59 @@ public class PublishClusterStateActionTests extends ESTestCase { } catch (IllegalStateException OK) { } - logger.info("--> testing rejection of an old cluster state"); + logger.info("--> testing acceptance of an old cluster state"); state = node.clusterState; node.clusterState = ClusterState.builder(node.clusterState).incrementVersion().build(); - try { - node.action.validateIncomingState(state, node.clusterState); - fail("node accepted state with an older version"); - } catch (IllegalStateException OK) { - } + node.action.validateIncomingState(state, node.clusterState); - // an older version from a *new* master is OK! + // an older version from a *new* master is also OK! ClusterState previousState = ClusterState.builder(node.clusterState).incrementVersion().build(); state = ClusterState.builder(node.clusterState) .nodes(DiscoveryNodes.builder(node.clusterState.nodes()).masterNodeId("_new_master_").build()) .build(); // remove the master of the node (but still have a previous cluster state with it)! - node.clusterState = ClusterState.builder(node.clusterState) - .nodes(DiscoveryNodes.builder(node.clusterState.nodes()).masterNodeId(null).build()) - .build(); + node.resetMasterId(); node.action.validateIncomingState(state, previousState); } public void testInterleavedPublishCommit() throws Throwable { - MockNode node = createMockNode("node"); - final ClusterState state1 = ClusterState.builder(node.clusterState).incrementVersion().build(); - final ClusterState state2 = ClusterState.builder(state1).incrementVersion().build(); - final BytesReference state1Bytes = PublishClusterStateAction.serializeFullClusterState(state1, Version.CURRENT); - final BytesReference state2Bytes = PublishClusterStateAction.serializeFullClusterState(state2, Version.CURRENT); + MockNode node = createMockNode("node").setAsMaster(); final CapturingTransportChannel channel = new CapturingTransportChannel(); - node.action.handleIncomingClusterStateRequest(new BytesTransportRequest(state1Bytes, Version.CURRENT), channel); - assertThat(channel.response.get(), equalTo((TransportResponse) TransportResponse.Empty.INSTANCE)); - assertThat(channel.error.get(), nullValue()); - channel.clear(); + List states = new ArrayList<>(); + final int numOfStates = scaledRandomIntBetween(3, 10); + for (int i = 1; i <= numOfStates; i++) { + states.add(ClusterState.builder(node.clusterState).version(i).stateUUID(ClusterState.UNKNOWN_UUID).build()); + } - // another incoming state is OK. Should just override pending state - node.action.handleIncomingClusterStateRequest(new BytesTransportRequest(state2Bytes, Version.CURRENT), channel); - assertThat(channel.response.get(), equalTo((TransportResponse) TransportResponse.Empty.INSTANCE)); - assertThat(channel.error.get(), nullValue()); - channel.clear(); + final ClusterState finalState = states.get(numOfStates - 1); + Collections.shuffle(states, random()); - // committing previous state should fail - try { - node.action.handleCommitRequest(new PublishClusterStateAction.CommitClusterStateRequest(state1.stateUUID()), channel); - // sadly, there are ways to percolate errors - assertThat(channel.response.get(), nullValue()); - assertThat(channel.error.get(), notNullValue()); - if (channel.error.get() instanceof IllegalStateException == false) { + logger.info("--> publishing states"); + for (ClusterState state : states) { + node.action.handleIncomingClusterStateRequest( + new BytesTransportRequest(PublishClusterStateAction.serializeFullClusterState(state, Version.CURRENT), Version.CURRENT), + channel); + assertThat(channel.response.get(), equalTo((TransportResponse) TransportResponse.Empty.INSTANCE)); + assertThat(channel.error.get(), nullValue()); + channel.clear(); + } + + logger.info("--> committing states"); + + Collections.shuffle(states, random()); + for (ClusterState state : states) { + node.action.handleCommitRequest(new PublishClusterStateAction.CommitClusterStateRequest(state.stateUUID()), channel); + assertThat(channel.response.get(), equalTo((TransportResponse) TransportResponse.Empty.INSTANCE)); + if (channel.error.get() != null) { throw channel.error.get(); } - } catch (IllegalStateException OK) { - } channel.clear(); - // committing second state should succeed - node.action.handleCommitRequest(new PublishClusterStateAction.CommitClusterStateRequest(state2.stateUUID()), channel); - assertThat(channel.response.get(), equalTo((TransportResponse) TransportResponse.Empty.INSTANCE)); - assertThat(channel.error.get(), nullValue()); - channel.clear(); - - // now check it was really committed - assertSameState(node.clusterState, state2); + //now check the last state held + assertSameState(node.clusterState, finalState); } /** @@ -809,7 +813,7 @@ public class PublishClusterStateActionTests extends ESTestCase { AtomicBoolean timeoutOnCommit = new AtomicBoolean(); AtomicBoolean errorOnCommit = new AtomicBoolean(); - public MockPublishAction(Settings settings, TransportService transportService, DiscoveryNodesProvider nodesProvider, NewClusterStateListener listener, DiscoverySettings discoverySettings, ClusterName clusterName) { + public MockPublishAction(Settings settings, TransportService transportService, DiscoveryNodesProvider nodesProvider, NewPendingClusterStateListener listener, DiscoverySettings discoverySettings, ClusterName clusterName) { super(settings, transportService, nodesProvider, listener, discoverySettings, clusterName); } diff --git a/core/src/test/java/org/elasticsearch/test/ESIntegTestCase.java b/core/src/test/java/org/elasticsearch/test/ESIntegTestCase.java index 55fae7fc3fd..0398aae4493 100644 --- a/core/src/test/java/org/elasticsearch/test/ESIntegTestCase.java +++ b/core/src/test/java/org/elasticsearch/test/ESIntegTestCase.java @@ -91,6 +91,8 @@ import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentHelper; import org.elasticsearch.common.xcontent.json.JsonXContent; import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.discovery.Discovery; +import org.elasticsearch.discovery.zen.ZenDiscovery; import org.elasticsearch.discovery.zen.elect.ElectMasterService; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexService; @@ -126,33 +128,15 @@ import org.junit.BeforeClass; import java.io.IOException; import java.io.InputStream; -import java.lang.annotation.Annotation; -import java.lang.annotation.ElementType; -import java.lang.annotation.Inherited; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; +import java.lang.annotation.*; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.UnknownHostException; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; -import java.util.IdentityHashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; -import java.util.Set; -import java.util.concurrent.Callable; -import java.util.concurrent.CopyOnWriteArrayList; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.TimeUnit; +import java.util.*; +import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @@ -588,6 +572,20 @@ public abstract class ESIntegTestCase extends ESTestCase { } ensureClusterSizeConsistency(); ensureClusterStateConsistency(); + if (isInternalCluster()) { + // check no pending cluster states are leaked + for (Discovery discovery : internalCluster().getInstances(Discovery.class)) { + if (discovery instanceof ZenDiscovery) { + final ZenDiscovery zenDiscovery = (ZenDiscovery) discovery; + assertBusy(new Runnable() { + @Override + public void run() { + assertThat(zenDiscovery.pendingClusterStates(), emptyArray()); + } + }); + } + } + } beforeIndexDeletion(); cluster().wipe(); // wipe after to make sure we fail in the test that didn't ack the delete if (afterClass || currentClusterScope == Scope.TEST) { @@ -1615,7 +1613,6 @@ public abstract class ESIntegTestCase extends ESTestCase { } - private Scope getCurrentClusterScope() { return getCurrentClusterScope(this.getClass()); } @@ -1750,7 +1747,7 @@ public abstract class ESIntegTestCase extends ESTestCase { String nodeMode = InternalTestCluster.configuredNodeMode(); if (noLocal != null && noNetwork != null) { throw new IllegalStateException("Can't suppress both network and local mode"); - } else if (noLocal != null){ + } else if (noLocal != null) { nodeMode = "network"; } else if (noNetwork != null) { nodeMode = "local"; @@ -2042,13 +2039,15 @@ public abstract class ESIntegTestCase extends ESTestCase { */ @Retention(RetentionPolicy.RUNTIME) @Inherited - public @interface SuppressLocalMode {} + public @interface SuppressLocalMode { + } /** * If used the test will never run in network mode */ @Retention(RetentionPolicy.RUNTIME) @Inherited - public @interface SuppressNetworkMode {} + public @interface SuppressNetworkMode { + } } From e370b836851357c4a433fc94ab1433d8caaf4dca Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Fri, 11 Sep 2015 17:10:41 +0200 Subject: [PATCH 21/21] fix old style plugin injection --- .../cluster/MinimumMasterNodesIT.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java b/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java index 66b4072c66d..ab01ea16ea7 100644 --- a/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java +++ b/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java @@ -31,6 +31,7 @@ import org.elasticsearch.discovery.zen.ZenDiscovery; import org.elasticsearch.discovery.zen.elect.ElectMasterService; import org.elasticsearch.discovery.zen.fd.FaultDetection; import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.ESIntegTestCase.ClusterScope; import org.elasticsearch.test.disruption.NetworkDelaysPartition; @@ -38,10 +39,7 @@ import org.elasticsearch.test.junit.annotations.TestLogging; import org.elasticsearch.test.transport.MockTransportService; import org.junit.Test; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; +import java.util.*; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; @@ -57,6 +55,13 @@ import static org.hamcrest.Matchers.*; @ESIntegTestCase.SuppressLocalMode public class MinimumMasterNodesIT extends ESIntegTestCase { + @Override + protected Collection> nodePlugins() { + final HashSet> classes = new HashSet<>(super.nodePlugins()); + classes.add(MockTransportService.TestPlugin.class); + return classes; + } + @Test @TestLogging("cluster.service:TRACE,discovery.zen:TRACE,gateway:TRACE,transport.tracer:TRACE") public void simpleMinimumMasterNodes() throws Exception { @@ -353,7 +358,6 @@ public class MinimumMasterNodesIT extends ESIntegTestCase { .put(ZenDiscovery.SETTING_PING_TIMEOUT, "200ms") .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, 2) .put(DiscoverySettings.COMMIT_TIMEOUT, "100ms") // speed things up - .put("plugin.types", MockTransportService.TestPlugin.class.getName()) .build(); internalCluster().startNodesAsync(3, settings).get();