[Discovery] accumulated improvements to ZenDiscovery

Merging the accumulated work from the feature/improve_zen branch. Here are the highlights of the changes:

__Testing infra__
- Networking:
    - all symmetric partitioning
    - dropping packets
    - hard disconnects
    - Jepsen Tests
- Single node service disruptions:
    - Long GC / Halt
    - Slow cluster state updates
- Discovery settings
    - Easy to set up unicast with a partial host list

__Zen Discovery__
- Pinging after master loss (no local elects)
- Fixes the split brain issue: #2488
- Batching join requests
- More resilient joining process (wait on a publish from master)

Closes #7493
This commit is contained in:
Boaz Leskes 2014-09-01 16:13:57 +02:00
commit 598854dd72
63 changed files with 3890 additions and 802 deletions

468
pom.xml
View File

@ -184,7 +184,7 @@
<version>0.8.13</version>
<optional>true</optional>
</dependency>
<!-- Lucene spatial -->
<!-- Lucene spatial -->
<!-- START: dependencies that are shaded -->
@ -485,7 +485,8 @@
<haltOnFailure>${tests.failfast}</haltOnFailure>
<uniqueSuiteNames>false</uniqueSuiteNames>
<systemProperties>
<java.io.tmpdir>.</java.io.tmpdir> <!-- we use '.' since this is different per JVM-->
<java.io.tmpdir>.</java.io.tmpdir>
<!-- we use '.' since this is different per JVM-->
<!-- RandomizedTesting library system properties -->
<tests.bwc>${tests.bwc}</tests.bwc>
<tests.bwc.path>${tests.bwc.path}</tests.bwc.path>
@ -539,15 +540,15 @@
<version>1.7</version>
<executions>
<execution>
<phase>validate</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<target>
<echo>Using ${java.runtime.name} ${java.runtime.version} ${java.vendor}</echo>
</target>
</configuration>
<phase>validate</phase>
<goals>
<goal>run</goal>
</goals>
<configuration>
<target>
<echo>Using ${java.runtime.name} ${java.runtime.version} ${java.vendor}</echo>
</target>
</configuration>
</execution>
<execution>
<id>invalid-patterns</id>
@ -575,7 +576,9 @@
</fileset>
<map from="${basedir}${file.separator}" to="* "/>
</pathconvert>
<fail if="validate.patternsFound">The following files contain tabs or nocommits:${line.separator}${validate.patternsFound}</fail>
<fail if="validate.patternsFound">The following files contain tabs or
nocommits:${line.separator}${validate.patternsFound}
</fail>
</target>
</configuration>
</execution>
@ -583,7 +586,8 @@
<id>tests</id>
<phase>test</phase>
<configuration>
<skip>${skipTests}</skip> <!-- don't run if we skip the tests -->
<skip>${skipTests}</skip>
<!-- don't run if we skip the tests -->
<failOnError>false</failOnError>
<target>
<property name="runtime_classpath" refid="maven.runtime.classpath"/>
@ -597,7 +601,7 @@
</classpath>
</taskdef>
<tophints max="${tests.topn}">
<file file="${basedir}/${execution.hint.file}" />
<file file="${basedir}/${execution.hint.file}"/>
</tophints>
</target>
</configuration>
@ -710,7 +714,7 @@
<shadedPattern>org.elasticsearch.common.compress</shadedPattern>
</relocation>
<relocation>
<pattern>com.github.mustachejava</pattern>
<pattern>com.github.mustachejava</pattern>
<shadedPattern>org.elasticsearch.common.mustache</shadedPattern>
</relocation>
<relocation>
@ -1221,6 +1225,11 @@
<bundledSignature>jdk-unsafe</bundledSignature>
<bundledSignature>jdk-deprecated</bundledSignature>
</bundledSignatures>
<excludes>
<!-- start exclude for test GC simulation using Thread.suspend -->
<exclude>org/elasticsearch/test/disruption/LongGCDisruption.class</exclude>
<!-- end exclude for GC simulation -->
</excludes>
<signaturesFiles>
<signaturesFile>test-signatures.txt</signaturesFile>
<signaturesFile>all-signatures.txt</signaturesFile>
@ -1345,219 +1354,220 @@
</pluginManagement>
</build>
<profiles>
<!-- default profile, with randomization setting kicks in -->
<profile>
<id>default</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>com.carrotsearch.randomizedtesting</groupId>
<artifactId>junit4-maven-plugin</artifactId>
<configuration>
<argLine>${tests.jvm.argline}</argLine>
</configuration>
</plugin>
<plugin>
<groupId>com.mycila</groupId>
<artifactId>license-maven-plugin</artifactId>
<version>2.5</version>
<configuration>
<header>dev-tools/elasticsearch_license_header.txt</header>
<headerDefinitions>
<headerDefinition>dev-tools/license_header_definition.xml</headerDefinition>
</headerDefinitions>
<includes>
<include>src/main/java/org/elasticsearch/**/*.java</include>
<include>src/test/java/org/elasticsearch/**/*.java</include>
</includes>
<excludes>
<exclude>src/main/java/org/elasticsearch/common/inject/**</exclude>
<!-- Guice -->
<exclude>src/main/java/org/elasticsearch/common/geo/GeoHashUtils.java</exclude>
<exclude>src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java</exclude>
<exclude>src/main/java/org/elasticsearch/common/lucene/search/XFilteredQuery.java</exclude>
<exclude>src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java</exclude>
<exclude>src/main/java/org/apache/lucene/**/X*.java</exclude>
<!-- t-digest -->
<exclude>src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigestState.java</exclude>
<exclude>src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTree.java</exclude>
</excludes>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<!-- profile for development that doesn't check forbidden-apis, no-commit validation or license headers run with mvn -Pdev -->
<profile>
<id>dev</id>
<properties>
<validate.skip>true</validate.skip>
</properties>
<build>
<plugins>
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<version>1.5.1</version>
<executions>
<execution>
<id>check-forbidden-apis</id>
<phase>none</phase>
</execution>
<execution>
<id>check-forbidden-test-apis</id>
<phase>none</phase>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<!-- license profile, to generate third party license file -->
<profile>
<id>license</id>
<activation>
<property>
<name>license.generation</name>
<value>true</value>
</property>
</activation>
<!-- not including license-maven-plugin is sufficient to expose default license -->
</profile>
<!-- jacoco coverage profile. This will insert -javaagent -->
<profile>
<id>coverage</id>
<activation>
<property>
<name>tests.coverage</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
<!-- must be on the classpath -->
<groupId>org.jacoco</groupId>
<artifactId>org.jacoco.agent</artifactId>
<classifier>runtime</classifier>
<version>0.6.4.201312101107</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.6.4.201312101107</version>
<executions>
<execution>
<id>default-prepare-agent</id>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>default-report</id>
<phase>prepare-package</phase>
<goals>
<goal>report</goal>
</goals>
</execution>
<execution>
<id>default-check</id>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
<configuration>
<excludes>
<exclude>jsr166e/**</exclude>
<exclude>org/apache/lucene/**</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>static</id>
<activation>
<property>
<name>tests.static</name>
<value>true</value>
</property>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>findbugs-maven-plugin</artifactId>
<version>2.5.3</version>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jxr-plugin</artifactId>
<version>2.3</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
<version>3.0.1</version>
<configuration>
<rulesets>
<ruleset>${basedir}/dev-tools/pmd/custom.xml</ruleset>
</rulesets>
<targetJdk>1.7</targetJdk>
<excludes>
<exclude>**/jsr166e/**</exclude>
<exclude>**/org/apache/lucene/**</exclude>
<exclude>**/org/apache/elasticsearch/common/Base64.java</exclude>
</excludes>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>findbugs-maven-plugin</artifactId>
<version>2.5.3</version>
<configuration>
<xmlOutput>true</xmlOutput>
<xmlOutputDirectory>target/site</xmlOutputDirectory>
<fork>true</fork>
<maxHeap>2048</maxHeap>
<timeout>1800000</timeout>
<onlyAnalyze>org.elasticsearch.-</onlyAnalyze>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>2.7</version>
<reportSets>
<reportSet>
<reports>
<report>index</report>
</reports>
</reportSet>
</reportSets>
</plugin>
</plugins>
</reporting>
</profile>
<!-- default profile, with randomization setting kicks in -->
<profile>
<id>default</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>com.carrotsearch.randomizedtesting</groupId>
<artifactId>junit4-maven-plugin</artifactId>
<configuration>
<argLine>${tests.jvm.argline}</argLine>
</configuration>
</plugin>
<plugin>
<groupId>com.mycila</groupId>
<artifactId>license-maven-plugin</artifactId>
<version>2.5</version>
<configuration>
<header>dev-tools/elasticsearch_license_header.txt</header>
<headerDefinitions>
<headerDefinition>dev-tools/license_header_definition.xml</headerDefinition>
</headerDefinitions>
<includes>
<include>src/main/java/org/elasticsearch/**/*.java</include>
<include>src/test/java/org/elasticsearch/**/*.java</include>
</includes>
<excludes>
<exclude>src/main/java/org/elasticsearch/common/inject/**</exclude>
<!-- Guice -->
<exclude>src/main/java/org/elasticsearch/common/geo/GeoHashUtils.java</exclude>
<exclude>src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java</exclude>
<exclude>src/main/java/org/elasticsearch/common/lucene/search/XFilteredQuery.java</exclude>
<exclude>src/main/java/org/apache/lucene/queryparser/XSimpleQueryParser.java</exclude>
<exclude>src/main/java/org/apache/lucene/**/X*.java</exclude>
<!-- t-digest -->
<exclude>src/main/java/org/elasticsearch/search/aggregations/metrics/percentiles/tdigest/TDigestState.java
</exclude>
<exclude>src/test/java/org/elasticsearch/search/aggregations/metrics/GroupTree.java</exclude>
</excludes>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<!-- profile for development that doesn't check forbidden-apis, no-commit validation or license headers run with mvn -Pdev -->
<profile>
<id>dev</id>
<properties>
<validate.skip>true</validate.skip>
</properties>
<build>
<plugins>
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<version>1.5.1</version>
<executions>
<execution>
<id>check-forbidden-apis</id>
<phase>none</phase>
</execution>
<execution>
<id>check-forbidden-test-apis</id>
<phase>none</phase>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<!-- license profile, to generate third party license file -->
<profile>
<id>license</id>
<activation>
<property>
<name>license.generation</name>
<value>true</value>
</property>
</activation>
<!-- not including license-maven-plugin is sufficient to expose default license -->
</profile>
<!-- jacoco coverage profile. This will insert -javaagent -->
<profile>
<id>coverage</id>
<activation>
<property>
<name>tests.coverage</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
<!-- must be on the classpath -->
<groupId>org.jacoco</groupId>
<artifactId>org.jacoco.agent</artifactId>
<classifier>runtime</classifier>
<version>0.6.4.201312101107</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.6.4.201312101107</version>
<executions>
<execution>
<id>default-prepare-agent</id>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>default-report</id>
<phase>prepare-package</phase>
<goals>
<goal>report</goal>
</goals>
</execution>
<execution>
<id>default-check</id>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
<configuration>
<excludes>
<exclude>jsr166e/**</exclude>
<exclude>org/apache/lucene/**</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>static</id>
<activation>
<property>
<name>tests.static</name>
<value>true</value>
</property>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>findbugs-maven-plugin</artifactId>
<version>2.5.3</version>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jxr-plugin</artifactId>
<version>2.3</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
<version>3.0.1</version>
<configuration>
<rulesets>
<ruleset>${basedir}/dev-tools/pmd/custom.xml</ruleset>
</rulesets>
<targetJdk>1.7</targetJdk>
<excludes>
<exclude>**/jsr166e/**</exclude>
<exclude>**/org/apache/lucene/**</exclude>
<exclude>**/org/apache/elasticsearch/common/Base64.java</exclude>
</excludes>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>findbugs-maven-plugin</artifactId>
<version>2.5.3</version>
<configuration>
<xmlOutput>true</xmlOutput>
<xmlOutputDirectory>target/site</xmlOutputDirectory>
<fork>true</fork>
<maxHeap>2048</maxHeap>
<timeout>1800000</timeout>
<onlyAnalyze>org.elasticsearch.-</onlyAnalyze>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>2.7</version>
<reportSets>
<reportSet>
<reports>
<report>index</report>
</reports>
</reportSet>
</reportSets>
</plugin>
</plugins>
</reporting>
</profile>
</profiles>
</project>

View File

@ -137,6 +137,12 @@ public class TransportClusterUpdateSettingsAction extends TransportMasterNodeOpe
return new ClusterUpdateSettingsResponse(updateSettingsAcked && acknowledged, transientUpdates.build(), persistentUpdates.build());
}
/**
 * Called when this task is rejected because the local node is no longer master.
 * The rejection is only logged at debug level; the listener still receives a
 * {@link ClusterUpdateSettingsResponse} built from {@code updateSettingsAcked} and the
 * transient and persistent updates, instead of a failure.
 *
 * @param source the source description of the rejected task (not used here)
 */
@Override
public void onNoLongerMaster(String source) {
    // Log message typo fixed: "preform" -> "perform".
    logger.debug("failed to perform reroute after cluster settings were updated - current node is no longer a master");
    listener.onResponse(new ClusterUpdateSettingsResponse(updateSettingsAcked, transientUpdates.build(), persistentUpdates.build()));
}
@Override
public void onFailure(String source, Throwable t) {
//if the reroute fails we only log

View File

@ -173,12 +173,12 @@ public class TransportRecoveryAction extends
@Override
protected ClusterBlockException checkGlobalBlock(ClusterState state, RecoveryRequest request) {
return state.blocks().globalBlockedException(ClusterBlockLevel.METADATA);
return state.blocks().globalBlockedException(ClusterBlockLevel.READ);
}
@Override
protected ClusterBlockException checkRequestBlock(ClusterState state, RecoveryRequest request, String[] concreteIndices) {
return state.blocks().indicesBlockedException(ClusterBlockLevel.METADATA, concreteIndices);
return state.blocks().indicesBlockedException(ClusterBlockLevel.READ, concreteIndices);
}
static class ShardRecoveryRequest extends BroadcastShardOperationRequest {

View File

@ -66,11 +66,11 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
/**
* Constructs a service component for running benchmarks
*
* @param settings Settings
* @param clusterService Cluster service
* @param threadPool Thread pool
* @param client Client
* @param transportService Transport service
* @param settings Settings
* @param clusterService Cluster service
* @param threadPool Thread pool
* @param client Client
* @param transportService Transport service
*/
@Inject
public BenchmarkService(Settings settings, ClusterService clusterService, ThreadPool threadPool,
@ -86,19 +86,22 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
}
@Override
protected void doStart() throws ElasticsearchException { }
protected void doStart() throws ElasticsearchException {
}
@Override
protected void doStop() throws ElasticsearchException { }
protected void doStop() throws ElasticsearchException {
}
@Override
protected void doClose() throws ElasticsearchException { }
protected void doClose() throws ElasticsearchException {
}
/**
* Lists actively running benchmarks on the cluster
*
* @param request Status request
* @param listener Response listener
* @param request Status request
* @param listener Response listener
*/
public void listBenchmarks(final BenchmarkStatusRequest request, final ActionListener<BenchmarkStatusResponse> listener) {
@ -171,8 +174,8 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
/**
* Executes benchmarks on the cluster
*
* @param request Benchmark request
* @param listener Response listener
* @param request Benchmark request
* @param listener Response listener
*/
public void startBenchmark(final BenchmarkRequest request, final ActionListener<BenchmarkResponse> listener) {
@ -228,7 +231,7 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
listener.onFailure(t);
}
}, (benchmarkResponse.state() != BenchmarkResponse.State.ABORTED) &&
(benchmarkResponse.state() != BenchmarkResponse.State.FAILED)));
(benchmarkResponse.state() != BenchmarkResponse.State.FAILED)));
}
private final boolean isBenchmarkNode(DiscoveryNode node) {
@ -403,6 +406,7 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
}
public abstract T newInstance();
protected abstract void sendResponse();
@Override
@ -593,7 +597,7 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
if (bmd != null) {
for (BenchmarkMetaData.Entry entry : bmd.entries()) {
if (request.benchmarkName().equals(entry.benchmarkId())){
if (request.benchmarkName().equals(entry.benchmarkId())) {
if (entry.state() != BenchmarkMetaData.State.SUCCESS && entry.state() != BenchmarkMetaData.State.FAILED) {
throw new ElasticsearchException("A benchmark with ID [" + request.benchmarkName() + "] is already running in state [" + entry.state() + "]");
}
@ -648,7 +652,7 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
@Override
protected BenchmarkMetaData.Entry process(BenchmarkMetaData.Entry entry) {
BenchmarkMetaData.State state = entry.state();
assert state == BenchmarkMetaData.State.STARTED || state == BenchmarkMetaData.State.ABORTED : "Expected state: STARTED or ABORTED but was: " + entry.state();
assert state == BenchmarkMetaData.State.STARTED || state == BenchmarkMetaData.State.ABORTED : "Expected state: STARTED or ABORTED but was: " + entry.state();
if (success) {
return new BenchmarkMetaData.Entry(entry, BenchmarkMetaData.State.SUCCESS);
} else {
@ -661,7 +665,7 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
private final String[] patterns;
public AbortBenchmarkTask(String[] patterns, BenchmarkStateListener listener) {
super("abort_benchmark", null , listener);
super("abort_benchmark", null, listener);
this.patterns = patterns;
}
@ -675,7 +679,7 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
}
}
public abstract class UpdateBenchmarkStateTask implements ProcessedClusterStateUpdateTask {
public abstract class UpdateBenchmarkStateTask extends ProcessedClusterStateUpdateTask {
private final String reason;
protected final String benchmarkId;
@ -702,7 +706,7 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
ImmutableList.Builder<BenchmarkMetaData.Entry> builder = new ImmutableList.Builder<BenchmarkMetaData.Entry>();
for (BenchmarkMetaData.Entry e : bmd.entries()) {
if (benchmarkId == null || match(e)) {
e = process(e) ;
e = process(e);
instances.add(e);
}
// Don't keep finished benchmarks around in cluster state
@ -741,7 +745,7 @@ public class BenchmarkService extends AbstractLifecycleComponent<BenchmarkServic
}
}
public abstract class BenchmarkStateChangeAction<R extends MasterNodeOperationRequest> implements TimeoutClusterStateUpdateTask {
public abstract class BenchmarkStateChangeAction<R extends MasterNodeOperationRequest> extends TimeoutClusterStateUpdateTask {
protected final R request;
public BenchmarkStateChangeAction(R request) {

View File

@ -28,7 +28,7 @@ import org.elasticsearch.common.unit.TimeValue;
* An extension of {@link ClusterStateUpdateTask} that allows the caller to be notified when
* all the nodes have acknowledged a cluster state update request
*/
public abstract class AckedClusterStateUpdateTask<Response> implements TimeoutClusterStateUpdateTask {
public abstract class AckedClusterStateUpdateTask<Response> extends TimeoutClusterStateUpdateTask {
private final ActionListener<Response> listener;
private final AckedRequest request;
@ -40,6 +40,7 @@ public abstract class AckedClusterStateUpdateTask<Response> implements TimeoutCl
/**
* Called to determine which nodes the acknowledgement is expected from
*
* @param discoveryNode a node
* @return true if the node is expected to send ack back, false otherwise
*/
@ -50,6 +51,7 @@ public abstract class AckedClusterStateUpdateTask<Response> implements TimeoutCl
/**
* Called once all the nodes have acknowledged the cluster state update request. Must be
* very lightweight execution, since it gets executed on the cluster service thread.
*
* @param t optional error that might have been thrown
*/
public void onAllNodesAcked(@Nullable Throwable t) {

View File

@ -110,4 +110,5 @@ public interface ClusterService extends LifecycleComponent<ClusterService> {
* Returns the tasks that are pending.
*/
List<PendingClusterTask> pendingTasks();
}

View File

@ -115,6 +115,8 @@ public class ClusterState implements ToXContent {
}
public static final long UNKNOWN_VERSION = -1;
private final long version;
private final RoutingTable routingTable;

View File

@ -0,0 +1,32 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster;
/**
* Base class for cluster state update tasks that should be executed
* even if the current node is not a master.
* <p>
* This is an abstract class rather than a marker interface: it extends
* {@link ClusterStateUpdateTask} and overrides {@link #runOnlyOnMaster()}
* to return {@code false}.
*/
public abstract class ClusterStateNonMasterUpdateTask extends ClusterStateUpdateTask {
/**
* @return always {@code false}, so the task is not restricted to the master node
*/
@Override
public boolean runOnlyOnMaster() {
return false;
}
}

View File

@ -19,19 +19,37 @@
package org.elasticsearch.cluster;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
/**
* A task that can update the cluster state.
*/
public interface ClusterStateUpdateTask {
abstract public class ClusterStateUpdateTask {
/**
* Update the cluster state based on the current state. Return the *same instance* if no state
* should be changed.
*/
ClusterState execute(ClusterState currentState) throws Exception;
abstract public ClusterState execute(ClusterState currentState) throws Exception;
/**
* A callback called when execute fails.
*/
void onFailure(String source, Throwable t);
abstract public void onFailure(String source, @Nullable Throwable t);
/**
* Indicates whether this task should only run if the current node is master.
*
* @return {@code true} in this default implementation; subclasses may override
*         it to return {@code false}
*/
public boolean runOnlyOnMaster() {
return true;
}
/**
 * Called when the task was rejected because the local node is no longer master.
 * This default implementation reports the rejection through
 * {@link #onFailure(String, Throwable)}, wrapped in an {@link EsRejectedExecutionException}.
 *
 * @param source the source description of the rejected task
 */
public void onNoLongerMaster(String source) {
    final String reason = "no longer master. source: [" + source + "]";
    final EsRejectedExecutionException rejection = new EsRejectedExecutionException(reason);
    onFailure(source, rejection);
}
}

View File

@ -0,0 +1,31 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster;
/**
 * Convenience base class combining {@link org.elasticsearch.cluster.ProcessedClusterStateUpdateTask}
 * with the behaviour of {@link org.elasticsearch.cluster.ClusterStateNonMasterUpdateTask}, so that
 * anonymous classes needing both can be created easily.
 */
public abstract class ProcessedClusterStateNonMasterUpdateTask extends ProcessedClusterStateUpdateTask {

    /**
     * @return always {@code false}, so the task is not restricted to the master node
     */
    @Override
    public boolean runOnlyOnMaster() {
        return false;
    }
}

View File

@ -23,11 +23,11 @@ package org.elasticsearch.cluster;
* An extension of {@link ClusterStateUpdateTask} that allows the caller to be notified when
* the cluster state update has been processed.
*/
public interface ProcessedClusterStateUpdateTask extends ClusterStateUpdateTask {
public abstract class ProcessedClusterStateUpdateTask extends ClusterStateUpdateTask {
/**
* Called when the result of the {@link #execute(ClusterState)} have been processed
* properly by all listeners.
*/
void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState);
public abstract void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState);
}

View File

@ -25,11 +25,11 @@ import org.elasticsearch.common.unit.TimeValue;
* An extension of {@link org.elasticsearch.cluster.ClusterStateUpdateTask} that allows a timeout
* to be associated with the task.
*/
public interface TimeoutClusterStateUpdateTask extends ProcessedClusterStateUpdateTask {
abstract public class TimeoutClusterStateUpdateTask extends ProcessedClusterStateUpdateTask {
/**
* If the cluster state update task wasn't processed by the provided timeout, call
* {@link #onFailure(String, Throwable)}
*/
TimeValue timeout();
abstract public TimeValue timeout();
}

View File

@ -108,6 +108,19 @@ public class ClusterBlocks {
return global.contains(block);
}
/**
 * Checks whether one of the global blocks has the given id.
 *
 * @param blockId the block id to look for
 * @return {@code true} if a global block whose {@code id()} equals {@code blockId} is present,
 *         {@code false} otherwise
 */
public boolean hasGlobalBlock(int blockId) {
    boolean found = false;
    for (ClusterBlock candidate : global) {
        if (candidate.id() == blockId) {
            found = true;
            break; // first match is enough
        }
    }
    return found;
}
/**
 * Checks whether {@code global(level)} yields at least one block.
 *
 * @param level the block level to query
 * @return {@code true} if the collection returned by {@code global(level)} is not empty
 */
public boolean hasGlobalBlock(ClusterBlockLevel level) {
    final int blocksAtLevel = global(level).size();
    return blocksAtLevel > 0;
}
/**
* Is there a global block with the provided status?
*/

View File

@ -149,10 +149,15 @@ public class RoutingService extends AbstractLifecycleComponent<RoutingService> i
return ClusterState.builder(currentState).routingResult(routingResult).build();
}
@Override
public void onNoLongerMaster(String source) {
// Intentionally a no-op: nothing is logged or reported when this task is
// rejected because the local node is no longer master.
}
@Override
public void onFailure(String source, Throwable t) {
ClusterState state = clusterService.state();
logger.error("unexpected failure during [{}], current state:\n{}", t, source, state.prettyPrint());
ClusterState state = clusterService.state();
logger.error("unexpected failure during [{}], current state:\n{}", t, source, state.prettyPrint());
}
});
routingTableDirty = false;

View File

@ -84,7 +84,7 @@ public class InternalClusterService extends AbstractLifecycleComponent<ClusterSe
private volatile ClusterState clusterState;
private final ClusterBlocks.Builder initialBlocks = ClusterBlocks.builder().addGlobalBlock(Discovery.NO_MASTER_BLOCK);
private final ClusterBlocks.Builder initialBlocks;
private volatile ScheduledFuture reconnectToNodes;
@ -104,6 +104,8 @@ public class InternalClusterService extends AbstractLifecycleComponent<ClusterSe
this.reconnectInterval = componentSettings.getAsTime("reconnect_interval", TimeValue.timeValueSeconds(10));
localNodeMasterListeners = new LocalNodeMasterListeners(threadPool);
initialBlocks = ClusterBlocks.builder().addGlobalBlock(discoveryService.getNoMasterBlock());
}
public NodeSettingsService settingsService() {
@ -134,7 +136,7 @@ public class InternalClusterService extends AbstractLifecycleComponent<ClusterSe
discoveryService.addLifecycleListener(new LifecycleListener() {
@Override
public void afterStart() {
submitStateUpdateTask("update local node", Priority.IMMEDIATE, new ClusterStateUpdateTask() {
submitStateUpdateTask("update local node", Priority.IMMEDIATE, new ClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) throws Exception {
return ClusterState.builder(currentState)
@ -144,7 +146,7 @@ public class InternalClusterService extends AbstractLifecycleComponent<ClusterSe
@Override
public void onFailure(String source, Throwable t) {
logger.warn("failed ot update local node", t);
logger.warn("failed to update local node", t);
}
});
}
@ -323,6 +325,11 @@ public class InternalClusterService extends AbstractLifecycleComponent<ClusterSe
}
logger.debug("processing [{}]: execute", source);
ClusterState previousClusterState = clusterState;
if (!previousClusterState.nodes().localNodeMaster() && updateTask.runOnlyOnMaster()) {
logger.debug("failing [{}]: local node is no longer master", source);
updateTask.onNoLongerMaster(source);
return;
}
ClusterState newClusterState;
try {
newClusterState = updateTask.execute(previousClusterState);
@ -379,20 +386,6 @@ public class InternalClusterService extends AbstractLifecycleComponent<ClusterSe
}
}
}
} else {
if (previousClusterState.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK) && !newClusterState.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK)) {
// force an update, its a fresh update from the master as we transition from a start of not having a master to having one
// have a fresh instances of routing and metadata to remove the chance that version might be the same
Builder builder = ClusterState.builder(newClusterState);
builder.routingTable(RoutingTable.builder(newClusterState.routingTable()));
builder.metaData(MetaData.builder(newClusterState.metaData()));
newClusterState = builder.build();
logger.debug("got first state from fresh master [{}]", newClusterState.nodes().masterNodeId());
} else if (newClusterState.version() < previousClusterState.version()) {
// we got a cluster state with older version, when we are *not* the master, let it in since it might be valid
// we check on version where applicable, like at ZenDiscovery#handleNewClusterStateFromMaster
logger.debug("got smaller cluster state when not master [" + newClusterState.version() + "<" + previousClusterState.version() + "] from source [" + source + "]");
}
}
newClusterState.status(ClusterState.ClusterStateStatus.BEING_APPLIED);
@ -720,5 +713,4 @@ public class InternalClusterService extends AbstractLifecycleComponent<ClusterSe
}
}
}
}

View File

@ -27,6 +27,7 @@ import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllo
import org.elasticsearch.cluster.routing.allocation.decider.*;
import org.elasticsearch.common.inject.AbstractModule;
import org.elasticsearch.discovery.DiscoverySettings;
import org.elasticsearch.discovery.zen.ZenDiscovery;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.indices.breaker.HierarchyCircuitBreakerService;
import org.elasticsearch.indices.cache.filter.IndicesFilterCache;
@ -57,6 +58,8 @@ public class ClusterDynamicSettingsModule extends AbstractModule {
clusterDynamicSettings.addDynamicSetting(DisableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_DISABLE_ALLOCATION);
clusterDynamicSettings.addDynamicSetting(DisableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_DISABLE_REPLICA_ALLOCATION);
clusterDynamicSettings.addDynamicSetting(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, Validator.INTEGER);
clusterDynamicSettings.addDynamicSetting(ZenDiscovery.SETTING_REJOIN_ON_MASTER_GONE, Validator.BOOLEAN);
clusterDynamicSettings.addDynamicSetting(DiscoverySettings.NO_MASTER_BLOCK);
clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_INCLUDE_GROUP + "*");
clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_EXCLUDE_GROUP + "*");
clusterDynamicSettings.addDynamicSetting(FilterAllocationDecider.CLUSTER_ROUTING_REQUIRE_GROUP + "*");

View File

@ -20,14 +20,11 @@
package org.elasticsearch.discovery;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlock;
import org.elasticsearch.cluster.block.ClusterBlockLevel;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.allocation.AllocationService;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.component.LifecycleComponent;
import org.elasticsearch.node.service.NodeService;
import org.elasticsearch.rest.RestStatus;
/**
* A pluggable module allowing to implement discovery of other nodes, publishing of the cluster
@ -36,8 +33,6 @@ import org.elasticsearch.rest.RestStatus;
*/
public interface Discovery extends LifecycleComponent<Discovery> {
final ClusterBlock NO_MASTER_BLOCK = new ClusterBlock(2, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
DiscoveryNode localNode();
void addListener(InitialStateDiscoveryListener listener);

View File

@ -22,6 +22,7 @@ package org.elasticsearch.discovery;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchTimeoutException;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlock;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
@ -38,6 +39,8 @@ import java.util.concurrent.TimeUnit;
*/
public class DiscoveryService extends AbstractLifecycleComponent<DiscoveryService> {
public static final String SETTING_INITIAL_STATE_TIMEOUT = "discovery.initial_state_timeout";
private static class InitialStateListener implements InitialStateDiscoveryListener {
private final CountDownLatch latch = new CountDownLatch(1);
@ -60,12 +63,18 @@ public class DiscoveryService extends AbstractLifecycleComponent<DiscoveryServic
private final TimeValue initialStateTimeout;
private final Discovery discovery;
private InitialStateListener initialStateListener;
private final DiscoverySettings discoverySettings;
@Inject
public DiscoveryService(Settings settings, Discovery discovery) {
public DiscoveryService(Settings settings, DiscoverySettings discoverySettings, Discovery discovery) {
super(settings);
this.discoverySettings = discoverySettings;
this.discovery = discovery;
this.initialStateTimeout = componentSettings.getAsTime("initial_state_timeout", TimeValue.timeValueSeconds(30));
this.initialStateTimeout = settings.getAsTime(SETTING_INITIAL_STATE_TIMEOUT, TimeValue.timeValueSeconds(30));
}
/**
 * Returns the cluster block to use while no master is known, as currently
 * reported by the injected {@link DiscoverySettings}.
 *
 * @return the value of {@link DiscoverySettings#getNoMasterBlock()}
 */
public ClusterBlock getNoMasterBlock() {
    return this.discoverySettings.getNoMasterBlock();
}
@Override

View File

@ -19,11 +19,17 @@
package org.elasticsearch.discovery;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.cluster.block.ClusterBlock;
import org.elasticsearch.cluster.block.ClusterBlockLevel;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.node.settings.NodeSettingsService;
import org.elasticsearch.rest.RestStatus;
import java.util.EnumSet;
/**
* Exposes common discovery settings that may be supported by all the different discovery implementations
@ -31,15 +37,24 @@ import org.elasticsearch.node.settings.NodeSettingsService;
public class DiscoverySettings extends AbstractComponent {
public static final String PUBLISH_TIMEOUT = "discovery.zen.publish_timeout";
public static final String NO_MASTER_BLOCK = "discovery.zen.no_master_block";
public static final TimeValue DEFAULT_PUBLISH_TIMEOUT = TimeValue.timeValueSeconds(30);
public static final String DEFAULT_NO_MASTER_BLOCK = "write";
public final static int NO_MASTER_BLOCK_ID = 2;
public final static ClusterBlock NO_MASTER_BLOCK_ALL = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, true, RestStatus.SERVICE_UNAVAILABLE, ClusterBlockLevel.ALL);
public final static ClusterBlock NO_MASTER_BLOCK_WRITES = new ClusterBlock(NO_MASTER_BLOCK_ID, "no master", true, false, RestStatus.SERVICE_UNAVAILABLE, EnumSet.of(ClusterBlockLevel.WRITE, ClusterBlockLevel.METADATA));
private volatile ClusterBlock noMasterBlock;
private volatile TimeValue publishTimeout = DEFAULT_PUBLISH_TIMEOUT;
@Inject
public DiscoverySettings(Settings settings, NodeSettingsService nodeSettingsService) {
super(settings);
nodeSettingsService.addListener(new ApplySettings());
this.noMasterBlock = parseNoMasterBlock(settings.get(NO_MASTER_BLOCK, DEFAULT_NO_MASTER_BLOCK));
this.publishTimeout = settings.getAsTime(PUBLISH_TIMEOUT, publishTimeout);
}
/**
@ -49,6 +64,10 @@ public class DiscoverySettings extends AbstractComponent {
return publishTimeout;
}
/**
 * Returns the cluster block that is currently configured for the "no master" situation.
 *
 * @return the current value of the {@code noMasterBlock} field
 */
public ClusterBlock getNoMasterBlock() {
    return this.noMasterBlock;
}
private class ApplySettings implements NodeSettingsService.Listener {
@Override
public void onRefreshSettings(Settings settings) {
@ -59,6 +78,24 @@ public class DiscoverySettings extends AbstractComponent {
publishTimeout = newPublishTimeout;
}
}
String newNoMasterBlockValue = settings.get(NO_MASTER_BLOCK);
if (newNoMasterBlockValue != null) {
ClusterBlock newNoMasterBlock = parseNoMasterBlock(newNoMasterBlockValue);
if (newNoMasterBlock != noMasterBlock) {
noMasterBlock = newNoMasterBlock;
}
}
}
}
/**
 * Translates the textual value of the no-master-block setting into the matching
 * predefined {@link ClusterBlock} constant.
 *
 * @param value the setting value; {@code "all"} and {@code "write"} are recognized
 * @return {@link #NO_MASTER_BLOCK_ALL} for {@code "all"}, {@link #NO_MASTER_BLOCK_WRITES} for {@code "write"}
 * @throws ElasticsearchIllegalArgumentException if {@code value} is any other string
 */
private ClusterBlock parseNoMasterBlock(String value) {
    if (value.equals("all")) {
        return NO_MASTER_BLOCK_ALL;
    }
    if (value.equals("write")) {
        return NO_MASTER_BLOCK_WRITES;
    }
    throw new ElasticsearchIllegalArgumentException("invalid master block [" + value + "]");
}
}

View File

@ -58,6 +58,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
private final TransportService transportService;
private final ClusterService clusterService;
private final DiscoveryService discoveryService;
private final DiscoveryNodeService discoveryNodeService;
private AllocationService allocationService;
private final ClusterName clusterName;
@ -77,7 +78,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
@Inject
public LocalDiscovery(Settings settings, ClusterName clusterName, TransportService transportService, ClusterService clusterService,
DiscoveryNodeService discoveryNodeService, Version version, DiscoverySettings discoverySettings) {
DiscoveryNodeService discoveryNodeService, Version version, DiscoverySettings discoverySettings, DiscoveryService discoveryService) {
super(settings);
this.clusterName = clusterName;
this.clusterService = clusterService;
@ -85,6 +86,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
this.discoveryNodeService = discoveryNodeService;
this.version = version;
this.discoverySettings = discoverySettings;
this.discoveryService = discoveryService;
}
@Override
@ -123,7 +125,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
// we are the first master (and the master)
master = true;
final LocalDiscovery master = firstMaster;
clusterService.submitStateUpdateTask("local-disco-initial_connect(master)", new ProcessedClusterStateUpdateTask() {
clusterService.submitStateUpdateTask("local-disco-initial_connect(master)", new ProcessedClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder();
@ -132,7 +134,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
}
nodesBuilder.localNodeId(master.localNode().id()).masterNodeId(master.localNode().id());
// remove the NO_MASTER block in this case
ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks()).removeGlobalBlock(Discovery.NO_MASTER_BLOCK);
ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks()).removeGlobalBlock(discoverySettings.getNoMasterBlock());
return ClusterState.builder(currentState).nodes(nodesBuilder).blocks(blocks).build();
}
@ -149,7 +151,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
} else if (firstMaster != null) {
// update as fast as we can the local node state with the new metadata (so we create indices for example)
final ClusterState masterState = firstMaster.clusterService.state();
clusterService.submitStateUpdateTask("local-disco(detected_master)", new ClusterStateUpdateTask() {
clusterService.submitStateUpdateTask("local-disco(detected_master)", new ClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
// make sure we have the local node id set, we might need it as a result of the new metadata
@ -165,7 +167,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
// tell the master to send the fact that we are here
final LocalDiscovery master = firstMaster;
firstMaster.clusterService.submitStateUpdateTask("local-disco-receive(from node[" + localNode + "])", new ProcessedClusterStateUpdateTask() {
firstMaster.clusterService.submitStateUpdateTask("local-disco-receive(from node[" + localNode + "])", new ProcessedClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder();
@ -225,7 +227,7 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
}
final LocalDiscovery master = firstMaster;
master.clusterService.submitStateUpdateTask("local-disco-update", new ClusterStateUpdateTask() {
master.clusterService.submitStateUpdateTask("local-disco-update", new ClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
DiscoveryNodes newNodes = currentState.nodes().removeDeadMembers(newMembers, master.localNode.id());
@ -305,13 +307,22 @@ public class LocalDiscovery extends AbstractLifecycleComponent<Discovery> implem
nodeSpecificClusterState.status(ClusterState.ClusterStateStatus.RECEIVED);
// ignore cluster state messages that do not include "me", not in the game yet...
if (nodeSpecificClusterState.nodes().localNode() != null) {
discovery.clusterService.submitStateUpdateTask("local-disco-receive(from master)", new ProcessedClusterStateUpdateTask() {
assert nodeSpecificClusterState.nodes().masterNode() != null : "received a cluster state without a master";
assert !nodeSpecificClusterState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock()) : "received a cluster state with a master block";
discovery.clusterService.submitStateUpdateTask("local-disco-receive(from master)", new ProcessedClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
if (nodeSpecificClusterState.version() < currentState.version() && Objects.equal(nodeSpecificClusterState.nodes().masterNodeId(), currentState.nodes().masterNodeId())) {
return currentState;
}
if (currentState.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock())) {
// it's a fresh update from the master as we transition from a state of not having a master to having one
logger.debug("got first state from fresh master [{}]", nodeSpecificClusterState.nodes().masterNodeId());
return nodeSpecificClusterState;
}
ClusterState.Builder builder = ClusterState.builder(nodeSpecificClusterState);
// if the routing table did not change, use the original one
if (nodeSpecificClusterState.routingTable().version() == currentState.routingTable().version()) {

View File

@ -22,9 +22,7 @@ package org.elasticsearch.discovery.zen;
import com.google.common.base.Objects;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.Version;
import org.elasticsearch.*;
import org.elasticsearch.cluster.*;
import org.elasticsearch.cluster.block.ClusterBlocks;
import org.elasticsearch.cluster.metadata.IndexMetaData;
@ -32,10 +30,10 @@ import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodeService;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.RoutingTable;
import org.elasticsearch.cluster.routing.allocation.AllocationService;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.component.Lifecycle;
import org.elasticsearch.common.inject.Inject;
@ -45,6 +43,7 @@ import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.DiscoveryService;
import org.elasticsearch.discovery.DiscoverySettings;
@ -56,19 +55,20 @@ import org.elasticsearch.discovery.zen.membership.MembershipAction;
import org.elasticsearch.discovery.zen.ping.ZenPing;
import org.elasticsearch.discovery.zen.ping.ZenPingService;
import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction;
import org.elasticsearch.gateway.GatewayService;
import org.elasticsearch.node.service.NodeService;
import org.elasticsearch.node.settings.NodeSettingsService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import static com.google.common.collect.Lists.newArrayList;
import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
@ -78,6 +78,16 @@ import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
*/
public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, DiscoveryNodesProvider {
public final static String SETTING_REJOIN_ON_MASTER_GONE = "discovery.zen.rejoin_on_master_gone";
public final static String SETTING_PING_TIMEOUT = "discovery.zen.ping.timeout";
public final static String SETTING_JOIN_TIMEOUT = "discovery.zen.join_timeout";
public final static String SETTING_JOIN_RETRY_ATTEMPTS = "discovery.zen.join_retry_attempts";
public final static String SETTING_JOIN_RETRY_DELAY = "discovery.zen.join_retry_delay";
public final static String SETTING_MAX_PINGS_FROM_ANOTHER_MASTER = "discovery.zen.max_pings_from_another_master";
public final static String SETTING_SEND_LEAVE_REQUEST = "discovery.zen.send_leave_request";
public final static String SETTING_MASTER_ELECTION_FILTER_CLIENT = "discovery.zen.master_election.filter_client";
public final static String SETTING_MASTER_ELECTION_FILTER_DATA = "discovery.zen.master_election.filter_data";
public static final String DISCOVERY_REJOIN_ACTION_NAME = "internal:discovery/zen/rejoin";
private final ThreadPool threadPool;
@ -86,6 +96,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
private AllocationService allocationService;
private final ClusterName clusterName;
private final DiscoveryNodeService discoveryNodeService;
private final DiscoverySettings discoverySettings;
private final ZenPingService pingService;
private final MasterFaultDetection masterFD;
private final NodesFaultDetection nodesFD;
@ -97,6 +108,14 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
private final TimeValue pingTimeout;
private final TimeValue joinTimeout;
/** how many retry attempts to perform if a join request failed with a retriable error */
private final int joinRetryAttempts;
/** how long to wait before performing another join attempt after a join request failed with a retriable error */
private final TimeValue joinRetryDelay;
/** how many pings from *another* master to tolerate before forcing a rejoin on other or local master */
private final int maxPingsFromAnotherMaster;
// a flag that should be used only for testing
private final boolean sendLeaveRequest;
@ -118,41 +137,61 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
private final AtomicBoolean initialStateSent = new AtomicBoolean();
private volatile boolean rejoinOnMasterGone;
@Nullable
private NodeService nodeService;
private final BlockingQueue<Tuple<DiscoveryNode, MembershipAction.JoinCallback>> processJoinRequests = ConcurrentCollections.newBlockingQueue();
@Inject
public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threadPool,
TransportService transportService, ClusterService clusterService, NodeSettingsService nodeSettingsService,
DiscoveryNodeService discoveryNodeService, ZenPingService pingService, Version version, DiscoverySettings discoverySettings) {
DiscoveryNodeService discoveryNodeService, ZenPingService pingService, ElectMasterService electMasterService, Version version,
DiscoverySettings discoverySettings) {
super(settings);
this.clusterName = clusterName;
this.threadPool = threadPool;
this.clusterService = clusterService;
this.transportService = transportService;
this.discoveryNodeService = discoveryNodeService;
this.discoverySettings = discoverySettings;
this.pingService = pingService;
this.version = version;
this.electMaster = electMasterService;
// also support direct discovery.zen settings, for cases when it gets extended
this.pingTimeout = settings.getAsTime("discovery.zen.ping.timeout", settings.getAsTime("discovery.zen.ping_timeout", componentSettings.getAsTime("ping_timeout", componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3)))));
this.joinTimeout = settings.getAsTime("discovery.zen.join_timeout", TimeValue.timeValueMillis(pingTimeout.millis() * 20));
this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true);
// keep using componentSettings for BWC, in case this class gets extended.
TimeValue pingTimeout = componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3));
pingTimeout = componentSettings.getAsTime("ping_timeout", pingTimeout);
pingTimeout = settings.getAsTime("discovery.zen.ping_timeout", pingTimeout);
this.pingTimeout = settings.getAsTime(SETTING_PING_TIMEOUT, pingTimeout);
this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false);
this.joinTimeout = settings.getAsTime(SETTING_JOIN_TIMEOUT, TimeValue.timeValueMillis(pingTimeout.millis() * 20));
this.joinRetryAttempts = settings.getAsInt(SETTING_JOIN_RETRY_ATTEMPTS, 3);
this.joinRetryDelay = settings.getAsTime(SETTING_JOIN_RETRY_DELAY, TimeValue.timeValueMillis(100));
this.maxPingsFromAnotherMaster = settings.getAsInt(SETTING_MAX_PINGS_FROM_ANOTHER_MASTER, 3);
this.sendLeaveRequest = settings.getAsBoolean(SETTING_SEND_LEAVE_REQUEST, true);
this.masterElectionFilterClientNodes = settings.getAsBoolean(SETTING_MASTER_ELECTION_FILTER_CLIENT, true);
this.masterElectionFilterDataNodes = settings.getAsBoolean(SETTING_MASTER_ELECTION_FILTER_DATA, false);
this.rejoinOnMasterGone = settings.getAsBoolean(SETTING_REJOIN_ON_MASTER_GONE, true);
// reject non-positive retry counts up front; report the offending *value* (not the setting name)
// so the message mirrors the max_pings_from_another_master check below
if (this.joinRetryAttempts < 1) {
    throw new ElasticsearchIllegalArgumentException("'" + SETTING_JOIN_RETRY_ATTEMPTS + "' must be a positive number. got [" + this.joinRetryAttempts + "]");
}
if (this.maxPingsFromAnotherMaster < 1) {
throw new ElasticsearchIllegalArgumentException("'" + SETTING_MAX_PINGS_FROM_ANOTHER_MASTER + "' must be a positive number. got [" + this.maxPingsFromAnotherMaster + "]");
}
logger.debug("using ping.timeout [{}], join.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, joinTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
this.electMaster = new ElectMasterService(settings);
nodeSettingsService.addListener(new ApplySettings());
this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this);
this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this, clusterName);
this.masterFD.addListener(new MasterNodeFailureListener());
this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService);
this.nodesFD.addListener(new NodeFailureListener());
this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService, clusterName);
this.nodesFD.addListener(new NodeFaultDetectionListener());
this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings, clusterName);
this.pingService.setNodesProvider(this);
@ -178,7 +217,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
final String nodeId = DiscoveryService.generateNodeId(settings);
localNode = new DiscoveryNode(settings.get("name"), nodeId, transportService.boundAddress().publishAddress(), nodeAttributes, version);
latestDiscoNodes = new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();
nodesFD.updateNodes(latestDiscoNodes);
nodesFD.updateNodes(latestDiscoNodes, ClusterState.UNKNOWN_VERSION);
pingService.start();
// do the join on a different thread, the DiscoveryService waits for 30s anyhow till it is discovered
@ -272,7 +311,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
throw new ElasticsearchIllegalStateException("Shouldn't publish state when not master");
}
latestDiscoNodes = clusterState.nodes();
nodesFD.updateNodes(clusterState.nodes());
nodesFD.updateNodes(clusterState.nodes(), clusterState.version());
publishClusterState.publish(clusterState, ackListener);
}
@ -295,6 +334,15 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
});
}
/**
 * Returns true if there is currently a background thread active for (re)joining the cluster.
 * Used for testing.
 *
 * @return {@code true} when {@code currentJoinThread} is set, {@code false} otherwise
 */
public boolean joiningCluster() {
    final boolean joinThreadPresent = currentJoinThread != null;
    return joinThreadPresent;
}
private void innerJoinCluster() {
boolean retry = true;
while (retry) {
@ -311,18 +359,24 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
if (localNode.equals(masterNode)) {
this.master = true;
nodesFD.start(); // start the nodes FD
clusterService.submitStateUpdateTask("zen-disco-join (elected_as_master)", Priority.URGENT, new ProcessedClusterStateUpdateTask() {
clusterService.submitStateUpdateTask("zen-disco-join (elected_as_master)", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
DiscoveryNodes.Builder builder = new DiscoveryNodes.Builder()
// Take into account the previous known nodes, if they happen not to be available
// then fault detection will remove these nodes.
DiscoveryNodes.Builder builder = new DiscoveryNodes.Builder(latestDiscoNodes)
.localNodeId(localNode.id())
.masterNodeId(localNode.id())
// put our local node
.put(localNode);
// update the fact that we are the master...
latestDiscoNodes = builder.build();
ClusterBlocks clusterBlocks = ClusterBlocks.builder().blocks(currentState.blocks()).removeGlobalBlock(NO_MASTER_BLOCK).build();
return ClusterState.builder(currentState).nodes(latestDiscoNodes).blocks(clusterBlocks).build();
ClusterBlocks clusterBlocks = ClusterBlocks.builder().blocks(currentState.blocks()).removeGlobalBlock(discoverySettings.getNoMasterBlock()).build();
currentState = ClusterState.builder(currentState).nodes(latestDiscoNodes).blocks(clusterBlocks).build();
// eagerly run reroute to remove dead nodes from routing table
RoutingAllocation.Result result = allocationService.reroute(currentState);
return ClusterState.builder(currentState).routingResult(result).build();
}
@Override
@ -337,30 +391,18 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
});
} else {
this.master = false;
try {
// first, make sure we can connect to the master
transportService.connectToNode(masterNode);
} catch (Exception e) {
logger.warn("failed to connect to master [{}], retrying...", e, masterNode);
retry = true;
continue;
}
// send join request
try {
membership.sendJoinRequestBlocking(masterNode, localNode, joinTimeout);
} catch (Exception e) {
if (e instanceof ElasticsearchException) {
logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ((ElasticsearchException) e).getDetailedMessage());
} else {
logger.info("failed to send join request to master [{}], reason [{}]", masterNode, e.getMessage());
}
if (logger.isTraceEnabled()) {
logger.trace("detailed failed reason", e);
}
// failed to send the join request, retry
retry = !joinElectedMaster(masterNode);
if (retry) {
continue;
}
if (latestDiscoNodes.masterNode() == null) {
logger.debug("no master node is set, despite of join request completing. retrying pings");
retry = true;
continue;
}
masterFD.start(masterNode, "initial_join");
// no need to submit the received cluster state, we will get it from the master when it publishes
// the fact that we joined
@ -368,6 +410,52 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
}
}
/**
 * Joins a newly elected master: connects to it and then sends a blocking join request.
 * <p>
 * A join request that fails with an {@link ElasticsearchIllegalStateException} (the master is
 * not yet ready) is retried, pausing {@code joinRetryDelay} between attempts, until
 * {@code joinRetryAttempts} attempts have been made. Any other failure, as well as a failure
 * to connect, gives up immediately.
 *
 * @param masterNode the node that was elected as master
 * @return true if the join request completed successfully, false otherwise
 */
private boolean joinElectedMaster(DiscoveryNode masterNode) {
    // a join request is pointless without a working connection to the master
    try {
        transportService.connectToNode(masterNode);
    } catch (Exception e) {
        logger.warn("failed to connect to master [{}], retrying...", e, masterNode);
        return false;
    }

    // counts only the attempts that failed with an illegal state (master not yet ready)
    int attemptsDone = 0;
    for (; ; ) {
        try {
            logger.trace("joining master {}", masterNode);
            membership.sendJoinRequestBlocking(masterNode, localNode, joinTimeout);
            return true;
        } catch (Throwable t) {
            final Throwable cause = ExceptionsHelper.unwrapCause(t);
            if (!(cause instanceof ElasticsearchIllegalStateException)) {
                // not a retriable failure: report it and give up
                if (logger.isTraceEnabled()) {
                    logger.trace("failed to send join request to master [{}]", t, masterNode);
                } else {
                    logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ExceptionsHelper.detailedMessage(t));
                }
                return false;
            }
            attemptsDone++;
            if (attemptsDone == this.joinRetryAttempts) {
                logger.info("failed to send join request to master [{}], reason [{}], tried [{}] times", masterNode, ExceptionsHelper.detailedMessage(t), attemptsDone);
                return false;
            }
            logger.trace("master {} failed with [{}]. retrying... (attempts done: [{}])", masterNode, ExceptionsHelper.detailedMessage(t), attemptsDone);
        }

        // back off before the next attempt; keep the interrupt flag set if we get interrupted
        try {
            Thread.sleep(this.joinRetryDelay.millis());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
private void handleLeaveRequest(final DiscoveryNode node) {
if (lifecycleState() != Lifecycle.State.STARTED) {
// not started, ignore a node failure
@ -389,6 +477,11 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
return ClusterState.builder(currentState).routingResult(routingResult).build();
}
/**
 * Intentionally a no-op for this task.
 *
 * @param source the source description of the submitted update task
 */
@Override
public void onNoLongerMaster(String source) {
    // ignoring (already logged)
    // NOTE(review): the logging presumably happens where the master loss is detected - confirm
}
@Override
public void onFailure(String source, Throwable t) {
logger.error("unexpected failure during [{}]", t, source);
@ -424,6 +517,11 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
return ClusterState.builder(currentState).routingResult(routingResult).build();
}
/**
 * Intentionally a no-op for this task.
 *
 * @param source the source description of the submitted update task
 */
@Override
public void onNoLongerMaster(String source) {
    // ignoring (already logged)
    // NOTE(review): the logging presumably happens where the master loss is detected - confirm
}
@Override
public void onFailure(String source, Throwable t) {
logger.error("unexpected failure during [{}]", t, source);
@ -457,6 +555,12 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
return currentState;
}
/**
 * Intentionally a no-op for this task.
 *
 * @param source the source description of the submitted update task
 */
@Override
public void onNoLongerMaster(String source) {
    // ignoring (already logged)
    // NOTE(review): the logging presumably happens where the master loss is detected - confirm
}
@Override
public void onFailure(String source, Throwable t) {
logger.error("unexpected failure during [{}]", t, source);
@ -481,7 +585,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
logger.info("master_left [{}], reason [{}]", masterNode, reason);
clusterService.submitStateUpdateTask("zen-disco-master_failed (" + masterNode + ")", Priority.IMMEDIATE, new ProcessedClusterStateUpdateTask() {
clusterService.submitStateUpdateTask("zen-disco-master_failed (" + masterNode + ")", Priority.IMMEDIATE, new ProcessedClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
if (!masterNode.id().equals(currentState.nodes().masterNodeId())) {
@ -493,6 +597,16 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
// make sure the old master node, which has failed, is not part of the nodes we publish
.remove(masterNode.id())
.masterNodeId(null).build();
latestDiscoNodes = discoveryNodes;
// flush any pending cluster states from old master, so it will not be set as master again
ArrayList<ProcessClusterState> pendingNewClusterStates = new ArrayList<>();
processNewClusterStates.drainTo(pendingNewClusterStates);
logger.trace("removed [{}] pending cluster states", pendingNewClusterStates.size());
if (rejoinOnMasterGone) {
return rejoin(ClusterState.builder(currentState).nodes(discoveryNodes).build(), "master left (reason = " + reason + ")");
}
if (!electMaster.hasEnoughMasterNodes(discoveryNodes)) {
return rejoin(ClusterState.builder(currentState).nodes(discoveryNodes).build(), "not enough master nodes after master left (reason = " + reason + ")");
@ -561,29 +675,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
clusterService.submitStateUpdateTask("zen-disco-master_receive_cluster_state_from_another_master [" + newState.nodes().masterNode() + "]", Priority.URGENT, new ProcessedClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
if (newState.version() > currentState.version()) {
logger.warn("received cluster state from [{}] which is also master but with a newer cluster_state, rejoining to cluster...", newState.nodes().masterNode());
return rejoin(currentState, "zen-disco-master_receive_cluster_state_from_another_master [" + newState.nodes().masterNode() + "]");
} else {
logger.warn("received cluster state from [{}] which is also master but with an older cluster_state, telling [{}] to rejoin the cluster", newState.nodes().masterNode(), newState.nodes().masterNode());
try {
// make sure we're connected to this node (connect to node does nothing if we're already connected)
// since the network connections are asymmetric, it may be that we received a state but have disconnected from the node
// in the past (after a master failure, for example)
transportService.connectToNode(newState.nodes().masterNode());
transportService.sendRequest(newState.nodes().masterNode(), DISCOVERY_REJOIN_ACTION_NAME, new RejoinClusterRequest(currentState.nodes().localNodeId()), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
@Override
public void handleException(TransportException exp) {
logger.warn("failed to send rejoin request to [{}]", exp, newState.nodes().masterNode());
}
});
} catch (Exception e) {
logger.warn("failed to send rejoin request to [{}]", e, newState.nodes().masterNode());
}
return currentState;
}
return handleAnotherMaster(currentState, newState.nodes().masterNode(), newState.version(), "via a new cluster state");
}
@Override
@ -610,7 +702,11 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
final ProcessClusterState processClusterState = new ProcessClusterState(newClusterState, newStateProcessed);
processNewClusterStates.add(processClusterState);
clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateUpdateTask() {
assert newClusterState.nodes().masterNode() != null : "received a cluster state without a master";
assert !newClusterState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock()) : "received a cluster state with a master block";
clusterService.submitStateUpdateTask("zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
// we already processed it in a previous event
@ -642,6 +738,11 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
// we are going to use it for sure, poll (remove) it
potentialState = processNewClusterStates.poll();
if (potentialState == null) {
// might happen if the queue is drained
break;
}
potentialState.processed = true;
if (potentialState.clusterState.version() > stateToProcess.clusterState.version()) {
@ -670,7 +771,16 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
masterFD.restart(latestDiscoNodes.masterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]");
}
if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) {
// it's a fresh update from the master, as we transition from a state of not having a master to having one
logger.debug("got first state from fresh master [{}]", updatedState.nodes().masterNodeId());
return updatedState;
}
// some optimizations to make sure we keep old objects where possible
ClusterState.Builder builder = ClusterState.builder(updatedState);
// if the routing table did not change, use the original one
if (updatedState.routingTable().version() == currentState.routingTable().version()) {
builder.routingTable(currentState.routingTable());
@ -726,37 +836,75 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
// validate the join request, will throw a failure if it fails, which will get back to the
// node calling the join request
membership.sendValidateJoinRequestBlocking(node, joinTimeout);
processJoinRequests.add(new Tuple<>(node, callback));
clusterService.submitStateUpdateTask("zen-disco-receive(join from node[" + node + "])", Priority.IMMEDIATE, new ProcessedClusterStateUpdateTask() {
private final List<Tuple<DiscoveryNode, MembershipAction.JoinCallback>> drainedTasks = new ArrayList<>();
@Override
public ClusterState execute(ClusterState currentState) {
if (currentState.nodes().nodeExists(node.id())) {
// the node already exists in the cluster
logger.info("received a join request for an existing node [{}]", node);
// still send a new cluster state, so it will be re published and possibly update the other node
return ClusterState.builder(currentState).build();
processJoinRequests.drainTo(drainedTasks);
if (drainedTasks.isEmpty()) {
return currentState;
}
DiscoveryNodes.Builder builder = DiscoveryNodes.builder(currentState.nodes());
for (DiscoveryNode existingNode : currentState.nodes()) {
if (node.address().equals(existingNode.address())) {
builder.remove(existingNode.id());
logger.warn("received join request from node [{}], but found existing node {} with same address, removing existing node", node, existingNode);
boolean modified = false;
DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder(currentState.nodes());
for (Tuple<DiscoveryNode, MembershipAction.JoinCallback> task : drainedTasks) {
DiscoveryNode node = task.v1();
if (currentState.nodes().nodeExists(node.id())) {
logger.debug("received a join request for an existing node [{}]", node);
} else {
modified = true;
nodesBuilder.put(node);
for (DiscoveryNode existingNode : currentState.nodes()) {
if (node.address().equals(existingNode.address())) {
nodesBuilder.remove(existingNode.id());
logger.warn("received join request from node [{}], but found existing node {} with same address, removing existing node", node, existingNode);
}
}
}
}
ClusterState.Builder stateBuilder = ClusterState.builder(currentState);
if (modified) {
latestDiscoNodes = nodesBuilder.build();
stateBuilder.nodes(latestDiscoNodes);
}
return stateBuilder.build();
}
@Override
public void onNoLongerMaster(String source) {
Exception e = new EsRejectedExecutionException("no longer master. source: [" + source + "]");
innerOnFailure(e);
}
void innerOnFailure(Throwable t) {
for (Tuple<DiscoveryNode, MembershipAction.JoinCallback> drainedTask : drainedTasks) {
try {
drainedTask.v2().onFailure(t);
} catch (Exception e) {
logger.error("error during task failure", e);
}
}
latestDiscoNodes = builder.build();
// add the new node now (will update latestDiscoNodes on publish)
return ClusterState.builder(currentState).nodes(latestDiscoNodes.newNode(node)).build();
}
@Override
public void onFailure(String source, Throwable t) {
logger.error("unexpected failure during [{}]", t, source);
callback.onFailure(t);
innerOnFailure(t);
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
callback.onSuccess();
for (Tuple<DiscoveryNode, MembershipAction.JoinCallback> drainedTask : drainedTasks) {
try {
drainedTask.v2().onSuccess();
} catch (Exception e) {
logger.error("unexpected error during [{}]", e, source);
}
}
}
});
}
@ -807,35 +955,36 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
List<DiscoveryNode> pingMasters = newArrayList();
for (ZenPing.PingResponse pingResponse : pingResponses) {
if (pingResponse.master() != null) {
pingMasters.add(pingResponse.master());
// We can't include the local node in the pingMasters list, otherwise we may end up electing ourselves without
// any checks / verifications from other nodes in ZenDiscovery#innerJoinCluster()
if (!localNode.equals(pingResponse.master())) {
pingMasters.add(pingResponse.master());
}
}
}
Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet();
possibleMasterNodes.add(localNode);
if (localNode.masterNode()) {
possibleMasterNodes.add(localNode);
}
for (ZenPing.PingResponse pingResponse : pingResponses) {
possibleMasterNodes.add(pingResponse.target());
}
// if we don't have enough master nodes, we bail, even if we get a response that indicates
// there is a master by other node, we don't see enough...
if (!electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
logger.trace("not enough master nodes [{}]", possibleMasterNodes);
return null;
}
if (pingMasters.isEmpty()) {
// lets tie break between discovered nodes
DiscoveryNode electedMaster = electMaster.electMaster(possibleMasterNodes);
if (localNode.equals(electedMaster)) {
return localNode;
// if we don't have enough master nodes, we bail, because there are not enough master to elect from
if (electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
return electMaster.electMaster(possibleMasterNodes);
} else {
logger.trace("not enough master nodes [{}]", possibleMasterNodes);
return null;
}
} else {
DiscoveryNode electedMaster = electMaster.electMaster(pingMasters);
if (electedMaster != null) {
return electedMaster;
}
assert !pingMasters.contains(localNode) : "local node should never be elected as master when other nodes indicate an active master";
// lets tie break between discovered nodes
return electMaster.electMaster(pingMasters);
}
return null;
}
private ClusterState rejoin(ClusterState clusterState, String reason) {
@ -845,28 +994,45 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
master = false;
ClusterBlocks clusterBlocks = ClusterBlocks.builder().blocks(clusterState.blocks())
.addGlobalBlock(NO_MASTER_BLOCK)
.addGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)
.addGlobalBlock(discoverySettings.getNoMasterBlock())
.build();
// clear the routing table, we have no master, so we need to recreate the routing when we reform the cluster
RoutingTable routingTable = RoutingTable.builder().build();
// we also clean the metadata, since we are going to recover it if we become master
MetaData metaData = MetaData.builder().build();
// clean the nodes, we are now not connected to anybody, since we try and reform the cluster
latestDiscoNodes = new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();
latestDiscoNodes = new DiscoveryNodes.Builder(latestDiscoNodes).masterNodeId(null).build();
asyncJoinCluster();
return ClusterState.builder(clusterState)
.blocks(clusterBlocks)
.nodes(latestDiscoNodes)
.routingTable(routingTable)
.metaData(metaData)
.build();
}
/**
 * Decides which of two competing masters has to step down. Must only be called while the local node
 * believes it is master (guarded by an assertion).
 * <p>
 * If the other master reports a cluster state version strictly higher than the local one, the local node
 * gives up mastership and the result of {@code rejoin(...)} is returned. Otherwise (same or lower version)
 * the other master is asked to rejoin via a {@code DISCOVERY_REJOIN_ACTION_NAME} request and the local
 * cluster state is returned unchanged. Failures while sending that request are only logged.
 *
 * @param localClusterState        the cluster state currently held by the local node
 * @param otherMaster              the node that also claims to be master
 * @param otherClusterStateVersion the cluster state version reported by {@code otherMaster}
 * @param reason                   description of how the other master was discovered, used for logging
 * @return the cluster state to continue with
 */
private ClusterState handleAnotherMaster(ClusterState localClusterState, final DiscoveryNode otherMaster, long otherClusterStateVersion, String reason) {
    assert master : "handleAnotherMaster called but current node is not a master";

    final boolean otherMasterIsAhead = otherClusterStateVersion > localClusterState.version();
    if (otherMasterIsAhead) {
        // the other side has seen more cluster state updates than we have, so we are the one stepping down
        return rejoin(localClusterState, "zen-disco-discovered another master with a new cluster_state [" + otherMaster + "][" + reason + "]");
    }

    logger.warn("discovered [{}] which is also master but with an older cluster_state, telling [{}] to rejoin the cluster ([{}])", otherMaster, otherMaster, reason);
    try {
        // Connections are asymmetric: we may have received something from the other master while having
        // dropped our own connection to it earlier (e.g. after a master failure). Connecting is a no-op
        // when we are already connected.
        transportService.connectToNode(otherMaster);
        final RejoinClusterRequest rejoinRequest = new RejoinClusterRequest(localClusterState.nodes().localNodeId());
        transportService.sendRequest(otherMaster, DISCOVERY_REJOIN_ACTION_NAME, rejoinRequest, new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
            @Override
            public void handleException(TransportException exp) {
                logger.warn("failed to send rejoin request to [{}]", exp, otherMaster);
            }
        });
    } catch (Exception e) {
        logger.warn("failed to send rejoin request to [{}]", e, otherMaster);
    }
    return localClusterState;
}
private void sendInitialStateEventIfNeeded() {
if (initialStateSent.compareAndSet(false, true)) {
for (InitialStateDiscoveryListener listener : initialStateListeners) {
@ -895,12 +1061,48 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
}
}
private class NodeFailureListener implements NodesFaultDetection.Listener {
/**
 * Handles the callbacks of {@link NodesFaultDetection.Listener}: node failures are forwarded to
 * {@code handleNodeFailure}, and fault detection pings that arrive while the local node is master are
 * counted so that, once enough of them were seen, a cluster state update task is submitted to resolve
 * which of the two masters should rejoin.
 */
private class NodeFaultDetectionListener extends NodesFaultDetection.Listener {
// number of pings (carrying master information) received while this node was master;
// reset to 0 when a ping arrives while we are not master and when the resolution task executes
private final AtomicInteger pingsWhileMaster = new AtomicInteger(0);
@Override
public void onNodeFailure(DiscoveryNode node, String reason) {
handleNodeFailure(node, reason);
}
/**
 * Called for a received fault detection ping request.
 *
 * @param pingRequest the received ping; its {@code masterNode()} and {@code clusterStateVersion()}
 *                    are used to resolve the conflict with the sender's master
 */
@Override
public void onPingReceived(final NodesFaultDetection.PingRequest pingRequest) {
// if we are master, we don't expect any fault detection from another node. If we get it
// means we potentially have two masters in the cluster.
if (!master) {
pingsWhileMaster.set(0);
return;
}
// nodes pre 1.4.0 do not send this information
if (pingRequest.masterNode() == null) {
return;
}
// tolerate pings until the counter reaches maxPingsFromAnotherMaster; below that we only trace-log
if (pingsWhileMaster.incrementAndGet() < maxPingsFromAnotherMaster) {
logger.trace("got a ping from another master {}. current ping count: [{}]", pingRequest.masterNode(), pingsWhileMaster.get());
return;
}
logger.debug("got a ping from another master {}. resolving who should rejoin. current ping count: [{}]", pingRequest.masterNode(), pingsWhileMaster.get());
clusterService.submitStateUpdateTask("ping from another master", Priority.URGENT, new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) throws Exception {
// start counting from scratch once the conflict is being handled
pingsWhileMaster.set(0);
return handleAnotherMaster(currentState, pingRequest.masterNode(), pingRequest.clusterStateVersion(), "node fd ping");
}
@Override
public void onFailure(String source, Throwable t) {
logger.debug("unexpected error during cluster state update task after pings from another master", t);
}
});
}
}
private class MasterNodeFailureListener implements MasterFaultDetection.Listener {
@ -922,6 +1124,10 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
}
}
/**
 * Returns the current value of the {@code rejoinOnMasterGone} flag.
 */
boolean isRejoinOnMasterGone() {
    return this.rejoinOnMasterGone;
}
static class RejoinClusterRequest extends TransportRequest {
private String fromNodeId;
@ -955,7 +1161,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
@Override
public void messageReceived(final RejoinClusterRequest request, final TransportChannel channel) throws Exception {
clusterService.submitStateUpdateTask("received a request to rejoin the cluster from [" + request.fromNodeId + "]", Priority.URGENT, new ClusterStateUpdateTask() {
clusterService.submitStateUpdateTask("received a request to rejoin the cluster from [" + request.fromNodeId + "]", Priority.URGENT, new ClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
try {
@ -966,6 +1172,11 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
return rejoin(currentState, "received a request to rejoin the cluster from [" + request.fromNodeId + "]");
}
@Override
public void onNoLongerMaster(String source) {
// already logged
}
@Override
public void onFailure(String source, Throwable t) {
logger.error("unexpected failure during [{}]", t, source);
@ -989,6 +1200,12 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
ZenDiscovery.this.electMaster.minimumMasterNodes(), minimumMasterNodes);
handleMinimumMasterNodesChanged(minimumMasterNodes);
}
boolean rejoinOnMasterGone = settings.getAsBoolean(SETTING_REJOIN_ON_MASTER_GONE, ZenDiscovery.this.rejoinOnMasterGone);
if (rejoinOnMasterGone != ZenDiscovery.this.rejoinOnMasterGone) {
logger.info("updating {} from [{}] to [{}]", SETTING_REJOIN_ON_MASTER_GONE, ZenDiscovery.this.rejoinOnMasterGone, rejoinOnMasterGone);
ZenDiscovery.this.rejoinOnMasterGone = rejoinOnMasterGone;
}
}
}
}

View File

@ -23,6 +23,7 @@ import com.google.common.collect.Lists;
import org.elasticsearch.common.inject.AbstractModule;
import org.elasticsearch.common.inject.multibindings.Multibinder;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.discovery.zen.ping.ZenPingService;
import org.elasticsearch.discovery.zen.ping.unicast.UnicastHostsProvider;
@ -44,6 +45,7 @@ public class ZenDiscoveryModule extends AbstractModule {
@Override
protected void configure() {
bind(ElectMasterService.class).asEagerSingleton();
bind(ZenPingService.class).asEagerSingleton();
Multibinder<UnicastHostsProvider> unicastHostsProviderMultibinder = Multibinder.newSetBinder(binder(), UnicastHostsProvider.class);
for (Class<? extends UnicastHostsProvider> unicastHostProvider : unicastHostProviders) {

View File

@ -24,12 +24,10 @@ import com.google.common.collect.Lists;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.*;
/**
*
@ -42,6 +40,7 @@ public class ElectMasterService extends AbstractComponent {
private volatile int minimumMasterNodes;
@Inject
public ElectMasterService(Settings settings) {
super(settings);
this.minimumMasterNodes = settings.getAsInt(DISCOVERY_ZEN_MINIMUM_MASTER_NODES, -1);
@ -69,6 +68,18 @@ public class ElectMasterService extends AbstractComponent {
return count >= minimumMasterNodes;
}
/**
 * Returns the given nodes sorted by likelihood of being elected as master, most likely first.
 * Non-master nodes are not removed; they are placed at the end of the result.
 *
 * @param nodes the nodes to order; they are copied into a new list before sorting
 * @return a new list holding all of the given nodes, ordered by {@code nodeComparator}
 */
public List<DiscoveryNode> sortByMasterLikelihood(Iterable<DiscoveryNode> nodes) {
    final List<DiscoveryNode> candidates = Lists.newArrayList(nodes);
    CollectionUtil.introSort(candidates, nodeComparator);
    return candidates;
}
/**
* Returns a list of the next possible masters.
*/
@ -120,6 +131,12 @@ public class ElectMasterService extends AbstractComponent {
/**
 * Orders nodes for which {@code masterNode()} is true before those for which it is false;
 * nodes that agree on that flag are ordered by their id.
 */
@Override
public int compare(DiscoveryNode o1, DiscoveryNode o2) {
    final boolean firstIsMaster = o1.masterNode();
    final boolean secondIsMaster = o2.masterNode();
    if (firstIsMaster != secondIsMaster) {
        // exactly one of the two is a master node: that one sorts first
        return firstIsMaster ? -1 : 1;
    }
    return o1.id().compareTo(o2.id());
}
}

View File

@ -0,0 +1,95 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery.zen.fd;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.TransportConnectionListener;
import org.elasticsearch.transport.TransportService;
import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
/**
 * Shared base class of {@link MasterFaultDetection} and {@link NodesFaultDetection}. It reads the
 * {@code discovery.zen.fd.*} settings in one place so that both fault detection flavours use the same
 * values, and it routes transport disconnect events to {@link #handleTransportDisconnect(DiscoveryNode)}.
 */
public abstract class FaultDetection extends AbstractComponent {

    public static final String SETTING_PING_INTERVAL = "discovery.zen.fd.ping_interval";
    public static final String SETTING_PING_TIMEOUT = "discovery.zen.fd.ping_timeout";
    public static final String SETTING_PING_RETRIES = "discovery.zen.fd.ping_retries";
    public static final String SETTING_CONNECT_ON_NETWORK_DISCONNECT = "discovery.zen.fd.connect_on_network_disconnect";
    public static final String SETTING_REGISTER_CONNECTION_LISTENER = "discovery.zen.fd.register_connection_listener";

    protected final ThreadPool threadPool;
    protected final TransportService transportService;
    protected final ClusterName clusterName;

    /** Value of {@link #SETTING_PING_INTERVAL}, 1 second unless configured. */
    protected final TimeValue pingInterval;
    /** Value of {@link #SETTING_PING_TIMEOUT}, 30 seconds unless configured. */
    protected final TimeValue pingRetryTimeout;
    /** Value of {@link #SETTING_PING_RETRIES}, 3 unless configured. */
    protected final int pingRetryCount;
    /** Value of {@link #SETTING_CONNECT_ON_NETWORK_DISCONNECT}, {@code false} unless configured. */
    protected final boolean connectOnNetworkDisconnect;

    // used mainly for testing, should always be true
    protected final boolean registerConnectionListener;
    protected final FDConnectionListener connectionListener;

    /**
     * Reads the fault detection settings and, unless disabled through
     * {@link #SETTING_REGISTER_CONNECTION_LISTENER}, registers a connection listener on the transport service.
     *
     * @param settings         the settings to read the {@code discovery.zen.fd.*} values from
     * @param threadPool       thread pool made available to subclasses
     * @param transportService transport service the connection listener is registered on
     * @param clusterName      cluster name made available to subclasses
     */
    public FaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName) {
        super(settings);

        this.pingInterval = settings.getAsTime(SETTING_PING_INTERVAL, timeValueSeconds(1));
        this.pingRetryTimeout = settings.getAsTime(SETTING_PING_TIMEOUT, timeValueSeconds(30));
        this.pingRetryCount = settings.getAsInt(SETTING_PING_RETRIES, 3);
        this.connectOnNetworkDisconnect = settings.getAsBoolean(SETTING_CONNECT_ON_NETWORK_DISCONNECT, false);
        this.registerConnectionListener = settings.getAsBoolean(SETTING_REGISTER_CONNECTION_LISTENER, true);

        this.threadPool = threadPool;
        this.clusterName = clusterName;
        this.transportService = transportService;

        this.connectionListener = new FDConnectionListener();
        if (registerConnectionListener) {
            transportService.addConnectionListener(connectionListener);
        }
    }

    /**
     * Removes the connection listener from the transport service.
     */
    public void close() {
        transportService.removeConnectionListener(connectionListener);
    }

    /**
     * This method will be called when the {@link org.elasticsearch.transport.TransportService} raised a node disconnected event
     */
    abstract void handleTransportDisconnect(DiscoveryNode node);

    /**
     * Forwards disconnect events to {@link #handleTransportDisconnect(DiscoveryNode)}; connect events are ignored.
     */
    private class FDConnectionListener implements TransportConnectionListener {

        @Override
        public void onNodeDisconnected(DiscoveryNode node) {
            handleTransportDisconnect(node);
        }

        @Override
        public void onNodeConnected(DiscoveryNode node) {
            // nothing to do on connect
        }
    }
}

View File

@ -20,9 +20,10 @@
package org.elasticsearch.discovery.zen.fd;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.settings.Settings;
@ -35,13 +36,12 @@ import java.io.IOException;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
import static org.elasticsearch.transport.TransportRequestOptions.options;
/**
* A fault detection that pings the master periodically to see if its alive.
*/
public class MasterFaultDetection extends AbstractComponent {
public class MasterFaultDetection extends FaultDetection {
public static final String MASTER_PING_ACTION_NAME = "internal:discovery/zen/fd/master_ping";
@ -52,29 +52,10 @@ public class MasterFaultDetection extends AbstractComponent {
void onDisconnectedFromMaster();
}
private final ThreadPool threadPool;
private final TransportService transportService;
private final DiscoveryNodesProvider nodesProvider;
private final CopyOnWriteArrayList<Listener> listeners = new CopyOnWriteArrayList<>();
private final boolean connectOnNetworkDisconnect;
private final TimeValue pingInterval;
private final TimeValue pingRetryTimeout;
private final int pingRetryCount;
// used mainly for testing, should always be true
private final boolean registerConnectionListener;
private final FDConnectionListener connectionListener;
private volatile MasterPinger masterPinger;
private final Object masterNodeMutex = new Object();
@ -85,25 +66,13 @@ public class MasterFaultDetection extends AbstractComponent {
private final AtomicBoolean notifiedMasterFailure = new AtomicBoolean();
public MasterFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, DiscoveryNodesProvider nodesProvider) {
super(settings);
this.threadPool = threadPool;
this.transportService = transportService;
public MasterFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService,
DiscoveryNodesProvider nodesProvider, ClusterName clusterName) {
super(settings, threadPool, transportService, clusterName);
this.nodesProvider = nodesProvider;
this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", true);
this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30));
this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3);
this.registerConnectionListener = componentSettings.getAsBoolean("register_connection_listener", true);
logger.debug("[master] uses ping_interval [{}], ping_timeout [{}], ping_retries [{}]", pingInterval, pingRetryTimeout, pingRetryCount);
this.connectionListener = new FDConnectionListener();
if (registerConnectionListener) {
transportService.addConnectionListener(connectionListener);
}
transportService.registerHandler(MASTER_PING_ACTION_NAME, new MasterPingRequestHandler());
}
@ -155,7 +124,8 @@ public class MasterFaultDetection extends AbstractComponent {
masterPinger.stop();
}
this.masterPinger = new MasterPinger();
// start the ping process
// we start pinging slightly later to allow the chosen master to complete its own master election
threadPool.schedule(pingInterval, ThreadPool.Names.SAME, masterPinger);
}
@ -181,13 +151,14 @@ public class MasterFaultDetection extends AbstractComponent {
}
public void close() {
super.close();
stop("closing");
this.listeners.clear();
transportService.removeConnectionListener(connectionListener);
transportService.removeHandler(MASTER_PING_ACTION_NAME);
}
private void handleTransportDisconnect(DiscoveryNode node) {
@Override
protected void handleTransportDisconnect(DiscoveryNode node) {
synchronized (masterNodeMutex) {
if (!node.equals(this.masterNode)) {
return;
@ -200,7 +171,8 @@ public class MasterFaultDetection extends AbstractComponent {
masterPinger.stop();
}
this.masterPinger = new MasterPinger();
threadPool.schedule(pingInterval, ThreadPool.Names.SAME, masterPinger);
// we use schedule with a 0 time value to run the pinger on the same pool it will run on later
threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, masterPinger);
} catch (Exception e) {
logger.trace("[master] [{}] transport disconnected (with verified connect)", masterNode);
notifyMasterFailure(masterNode, "transport disconnected (with verified connect)");
@ -237,17 +209,6 @@ public class MasterFaultDetection extends AbstractComponent {
}
}
private class FDConnectionListener implements TransportConnectionListener {
@Override
public void onNodeConnected(DiscoveryNode node) {
}
@Override
public void onNodeDisconnected(DiscoveryNode node) {
handleTransportDisconnect(node);
}
}
private class MasterPinger implements Runnable {
private volatile boolean running = true;
@ -268,8 +229,10 @@ public class MasterFaultDetection extends AbstractComponent {
threadPool.schedule(pingInterval, ThreadPool.Names.SAME, MasterPinger.this);
return;
}
transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, new MasterPingRequest(nodesProvider.nodes().localNode().id(), masterToPing.id()), options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout),
new BaseTransportResponseHandler<MasterPingResponseResponse>() {
final MasterPingRequest request = new MasterPingRequest(nodesProvider.nodes().localNode().id(), masterToPing.id(), clusterName);
final TransportRequestOptions options = options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout);
transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, request, options, new BaseTransportResponseHandler<MasterPingResponseResponse>() {
@Override
public MasterPingResponseResponse newInstance() {
return new MasterPingResponseResponse();
@ -326,7 +289,7 @@ public class MasterFaultDetection extends AbstractComponent {
notifyMasterFailure(masterToPing, "failed to ping, tried [" + pingRetryCount + "] times, each with maximum [" + pingRetryTimeout + "] timeout");
} else {
// resend the request, not reschedule, rely on send timeout
transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, new MasterPingRequest(nodesProvider.nodes().localNode().id(), masterToPing.id()), options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout), this);
transportService.sendRequest(masterToPing, MASTER_PING_ACTION_NAME, request, options, this);
}
}
}
@ -349,6 +312,14 @@ public class MasterFaultDetection extends AbstractComponent {
}
static class NotMasterException extends ElasticsearchIllegalStateException {
NotMasterException(String msg) {
super(msg);
}
NotMasterException() {
}
@Override
public Throwable fillInStackTrace() {
return null;
@ -377,6 +348,13 @@ public class MasterFaultDetection extends AbstractComponent {
if (!request.masterNodeId.equals(nodes.localNodeId())) {
throw new NotMasterException();
}
// ping from nodes of version < 1.4.0 will have the clustername set to null
if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
logger.trace("master fault detection ping request is targeted for a different [{}] cluster then us [{}]", request.clusterName, clusterName);
throw new NotMasterException("master fault detection ping request is targeted for a different [" + request.clusterName + "] cluster then us [" + clusterName + "]");
}
// if we are no longer master, fail...
if (!nodes.localNodeMaster()) {
throw new NoLongerMasterException();
@ -400,13 +378,15 @@ public class MasterFaultDetection extends AbstractComponent {
private String nodeId;
private String masterNodeId;
private ClusterName clusterName;
private MasterPingRequest() {
}
private MasterPingRequest(String nodeId, String masterNodeId) {
private MasterPingRequest(String nodeId, String masterNodeId, ClusterName clusterName) {
this.nodeId = nodeId;
this.masterNodeId = masterNodeId;
this.clusterName = clusterName;
}
@Override
@ -414,6 +394,9 @@ public class MasterFaultDetection extends AbstractComponent {
super.readFrom(in);
nodeId = in.readString();
masterNodeId = in.readString();
if (in.getVersion().onOrAfter(Version.V_1_4_0)) {
clusterName = ClusterName.readClusterName(in);
}
}
@Override
@ -421,6 +404,9 @@ public class MasterFaultDetection extends AbstractComponent {
super.writeTo(out);
out.writeString(nodeId);
out.writeString(masterNodeId);
if (out.getVersion().onOrAfter(Version.V_1_4_0)) {
clusterName.writeTo(out);
}
}
}

View File

@ -20,9 +20,11 @@
package org.elasticsearch.discovery.zen.fd;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.settings.Settings;
@ -35,68 +37,40 @@ import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import static org.elasticsearch.cluster.node.DiscoveryNodes.EMPTY_NODES;
import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
import static org.elasticsearch.transport.TransportRequestOptions.options;
/**
* A fault detection of multiple nodes.
*/
public class NodesFaultDetection extends AbstractComponent {
public class NodesFaultDetection extends FaultDetection {
public static final String PING_ACTION_NAME = "internal:discovery/zen/fd/ping";
public abstract static class Listener {
public static interface Listener {
public void onNodeFailure(DiscoveryNode node, String reason) {}
public void onPingReceived(PingRequest pingRequest) {}
void onNodeFailure(DiscoveryNode node, String reason);
}
private final ThreadPool threadPool;
private final TransportService transportService;
private final boolean connectOnNetworkDisconnect;
private final TimeValue pingInterval;
private final TimeValue pingRetryTimeout;
private final int pingRetryCount;
// used mainly for testing, should always be true
private final boolean registerConnectionListener;
private final CopyOnWriteArrayList<Listener> listeners = new CopyOnWriteArrayList<>();
private final ConcurrentMap<DiscoveryNode, NodeFD> nodesFD = newConcurrentMap();
private final FDConnectionListener connectionListener;
private volatile DiscoveryNodes latestNodes = EMPTY_NODES;
private volatile long clusterStateVersion = ClusterState.UNKNOWN_VERSION;
private volatile boolean running = false;
public NodesFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService) {
super(settings);
this.threadPool = threadPool;
this.transportService = transportService;
this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", true);
this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30));
this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3);
this.registerConnectionListener = componentSettings.getAsBoolean("register_connection_listener", true);
public NodesFaultDetection(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName) {
super(settings, threadPool, transportService, clusterName);
logger.debug("[node ] uses ping_interval [{}], ping_timeout [{}], ping_retries [{}]", pingInterval, pingRetryTimeout, pingRetryCount);
transportService.registerHandler(PING_ACTION_NAME, new PingRequestHandler());
this.connectionListener = new FDConnectionListener();
if (registerConnectionListener) {
transportService.addConnectionListener(connectionListener);
}
}
public void addListener(Listener listener) {
@ -107,9 +81,10 @@ public class NodesFaultDetection extends AbstractComponent {
listeners.remove(listener);
}
public void updateNodes(DiscoveryNodes nodes) {
public void updateNodes(DiscoveryNodes nodes, long clusterStateVersion) {
DiscoveryNodes prevNodes = latestNodes;
this.latestNodes = nodes;
this.clusterStateVersion = clusterStateVersion;
if (!running) {
return;
}
@ -121,7 +96,8 @@ public class NodesFaultDetection extends AbstractComponent {
}
if (!nodesFD.containsKey(newNode)) {
nodesFD.put(newNode, new NodeFD());
threadPool.schedule(pingInterval, ThreadPool.Names.SAME, new SendPingRequest(newNode));
// we use schedule with a 0 time value to run the pinger on the pool, as it will run later on
threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, new SendPingRequest(newNode));
}
}
for (DiscoveryNode removedNode : delta.removedNodes()) {
@ -146,12 +122,13 @@ public class NodesFaultDetection extends AbstractComponent {
}
public void close() {
super.close();
stop();
transportService.removeHandler(PING_ACTION_NAME);
transportService.removeConnectionListener(connectionListener);
}
private void handleTransportDisconnect(DiscoveryNode node) {
@Override
protected void handleTransportDisconnect(DiscoveryNode node) {
if (!latestNodes.nodeExists(node.id())) {
return;
}
@ -167,7 +144,8 @@ public class NodesFaultDetection extends AbstractComponent {
try {
transportService.connectToNode(node);
nodesFD.put(node, new NodeFD());
threadPool.schedule(pingInterval, ThreadPool.Names.SAME, new SendPingRequest(node));
// we use schedule with a 0 time value to run the pinger on the pool, as it will run later on
threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, new SendPingRequest(node));
} catch (Exception e) {
logger.trace("[node ] [{}] transport disconnected (with verified connect)", node);
notifyNodeFailure(node, "transport disconnected (with verified connect)");
@ -189,6 +167,19 @@ public class NodesFaultDetection extends AbstractComponent {
});
}
/**
 * Hands the received ping to every registered listener.
 * <p>
 * The listeners are not invoked on the calling thread; the notification loop is
 * submitted to the generic thread pool executor.
 *
 * @param pingRequest the ping request that was received
 */
private void notifyPingReceived(final PingRequest pingRequest) {
    final Runnable notifyListeners = new Runnable() {
        @Override
        public void run() {
            for (Listener registered : listeners) {
                registered.onPingReceived(pingRequest);
            }
        }
    };
    threadPool.generic().execute(notifyListeners);
}
private class SendPingRequest implements Runnable {
private final DiscoveryNode node;
@ -202,8 +193,9 @@ public class NodesFaultDetection extends AbstractComponent {
if (!running) {
return;
}
transportService.sendRequest(node, PING_ACTION_NAME, new PingRequest(node.id()), options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout),
new BaseTransportResponseHandler<PingResponse>() {
final PingRequest pingRequest = new PingRequest(node.id(), clusterName, latestNodes.localNode(), clusterStateVersion);
final TransportRequestOptions options = options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout);
transportService.sendRequest(node, PING_ACTION_NAME, pingRequest, options, new BaseTransportResponseHandler<PingResponse>() {
@Override
public PingResponse newInstance() {
return new PingResponse();
@ -250,8 +242,7 @@ public class NodesFaultDetection extends AbstractComponent {
}
} else {
// resend the request, not reschedule, rely on send timeout
transportService.sendRequest(node, PING_ACTION_NAME, new PingRequest(node.id()),
options().withType(TransportRequestOptions.Type.PING).withTimeout(pingRetryTimeout), this);
transportService.sendRequest(node, PING_ACTION_NAME, pingRequest, options, this);
}
}
}
@ -270,18 +261,6 @@ public class NodesFaultDetection extends AbstractComponent {
volatile boolean running = true;
}
private class FDConnectionListener implements TransportConnectionListener {
@Override
public void onNodeConnected(DiscoveryNode node) {
}
@Override
public void onNodeDisconnected(DiscoveryNode node) {
handleTransportDisconnect(node);
}
}
class PingRequestHandler extends BaseTransportRequestHandler<PingRequest> {
@Override
@ -296,6 +275,15 @@ public class NodesFaultDetection extends AbstractComponent {
if (!latestNodes.localNodeId().equals(request.nodeId)) {
throw new ElasticsearchIllegalStateException("Got pinged as node [" + request.nodeId + "], but I am node [" + latestNodes.localNodeId() + "]");
}
// PingRequest will have clusterName set to null if it came from a node of version <1.4.0
if (request.clusterName != null && !request.clusterName.equals(clusterName)) {
// Don't introduce new exception for bwc reasons
throw new ElasticsearchIllegalStateException("Got pinged with cluster name [" + request.clusterName + "], but I'm part of cluster [" + clusterName + "]");
}
notifyPingReceived(request);
channel.sendResponse(new PingResponse());
}
@ -306,28 +294,63 @@ public class NodesFaultDetection extends AbstractComponent {
}
static class PingRequest extends TransportRequest {
public static class PingRequest extends TransportRequest {
// the (assumed) node id we are pinging
private String nodeId;
private ClusterName clusterName;
private DiscoveryNode masterNode;
private long clusterStateVersion = ClusterState.UNKNOWN_VERSION;
PingRequest() {
}
PingRequest(String nodeId) {
PingRequest(String nodeId, ClusterName clusterName, DiscoveryNode masterNode, long clusterStateVersion) {
this.nodeId = nodeId;
this.clusterName = clusterName;
this.masterNode = masterNode;
this.clusterStateVersion = clusterStateVersion;
}
/**
 * Returns the id of the node this ping is addressed to, as assumed by the sender.
 */
public String nodeId() {
return nodeId;
}
/**
 * Returns the cluster name carried by this ping.
 * <p>
 * May be {@code null}: the field is only read from streams of version 1.4.0 or later.
 */
public ClusterName clusterName() {
return clusterName;
}
/**
 * Returns the node passed as {@code masterNode} when the request was built.
 * <p>
 * May be {@code null}: the field is only read from streams of version 1.4.0 or later.
 */
public DiscoveryNode masterNode() {
return masterNode;
}
/**
 * Returns the cluster state version carried by this ping.
 * <p>
 * Stays at {@code ClusterState.UNKNOWN_VERSION} when the request was read from a
 * stream older than 1.4.0, because the value is not on the wire in that case.
 */
public long clusterStateVersion() {
return clusterStateVersion;
}
/**
 * Reads this request from the stream.
 * <p>
 * The node id is always read. Cluster name, master node and cluster state version
 * are only read when the stream version is 1.4.0 or later; otherwise those fields
 * keep their initial values. The read order must mirror {@code writeTo}.
 *
 * @param in the stream to read from
 * @throws IOException if reading from the stream fails
 */
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
nodeId = in.readString();
// the extra fields are version gated, older streams do not contain them
if (in.getVersion().onOrAfter(Version.V_1_4_0)) {
clusterName = ClusterName.readClusterName(in);
masterNode = DiscoveryNode.readNode(in);
clusterStateVersion = in.readLong();
}
}
/**
 * Writes this request to the stream.
 * <p>
 * The node id is always written. Cluster name, master node and cluster state version
 * are only written when the stream version is 1.4.0 or later. Both {@code clusterName}
 * and {@code masterNode} are dereferenced in that case, so they must be non-null.
 * The write order must mirror {@code readFrom}.
 *
 * @param out the stream to write to
 * @throws IOException if writing to the stream fails
 */
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeString(nodeId);
// the extra fields are version gated, older streams must not receive them
if (out.getVersion().onOrAfter(Version.V_1_4_0)) {
clusterName.writeTo(out);
masterNode.writeTo(out);
out.writeLong(clusterStateVersion);
}
}
}

View File

@ -34,6 +34,7 @@ import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.discovery.zen.ping.multicast.MulticastZenPing;
import org.elasticsearch.discovery.zen.ping.unicast.UnicastHostsProvider;
import org.elasticsearch.discovery.zen.ping.unicast.UnicastZenPing;
@ -55,20 +56,20 @@ public class ZenPingService extends AbstractLifecycleComponent<ZenPing> implemen
// here for backward comp. with discovery plugins
public ZenPingService(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName, NetworkService networkService,
@Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
this(settings, threadPool, transportService, clusterName, networkService, Version.CURRENT, unicastHostsProviders);
ElectMasterService electMasterService, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
this(settings, threadPool, transportService, clusterName, networkService, Version.CURRENT, electMasterService, unicastHostsProviders);
}
@Inject
public ZenPingService(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName, NetworkService networkService,
Version version, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
Version version, ElectMasterService electMasterService, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
super(settings);
ImmutableList.Builder<ZenPing> zenPingsBuilder = ImmutableList.builder();
if (componentSettings.getAsBoolean("multicast.enabled", true)) {
zenPingsBuilder.add(new MulticastZenPing(settings, threadPool, transportService, clusterName, networkService, version));
}
// always add the unicast hosts, so it will be able to receive unicast requests even when working in multicast
zenPingsBuilder.add(new UnicastZenPing(settings, threadPool, transportService, clusterName, version, unicastHostsProviders));
zenPingsBuilder.add(new UnicastZenPing(settings, threadPool, transportService, clusterName, version, electMasterService, unicastHostsProviders));
this.zenPings = zenPingsBuilder.build();
}

View File

@ -19,8 +19,12 @@
package org.elasticsearch.discovery.zen.ping.unicast;
import com.carrotsearch.hppc.cursors.ObjectCursor;
import com.google.common.collect.Lists;
import org.elasticsearch.*;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
@ -35,6 +39,7 @@ import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.discovery.zen.ping.ZenPing;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.*;
@ -62,10 +67,11 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
private final ThreadPool threadPool;
private final TransportService transportService;
private final ClusterName clusterName;
private final ElectMasterService electMasterService;
private final int concurrentConnects;
private final DiscoveryNode[] nodes;
private final DiscoveryNode[] configuredTargetNodes;
private volatile DiscoveryNodesProvider nodesProvider;
@ -73,16 +79,18 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
private final Map<Integer, ConcurrentMap<DiscoveryNode, PingResponse>> receivedResponses = newConcurrentMap();
// a list of temporal responses a node will return for a request (holds requests from other nodes)
// a list of temporal responses a node will return for a request (holds requests from other nodes)
private final Queue<PingResponse> temporalResponses = ConcurrentCollections.newQueue();
private final CopyOnWriteArrayList<UnicastHostsProvider> hostsProviders = new CopyOnWriteArrayList<>();
public UnicastZenPing(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName, Version version, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
public UnicastZenPing(Settings settings, ThreadPool threadPool, TransportService transportService, ClusterName clusterName,
Version version, ElectMasterService electMasterService, @Nullable Set<UnicastHostsProvider> unicastHostsProviders) {
super(settings);
this.threadPool = threadPool;
this.transportService = transportService;
this.clusterName = clusterName;
this.electMasterService = electMasterService;
if (unicastHostsProviders != null) {
for (UnicastHostsProvider unicastHostsProvider : unicastHostsProviders) {
@ -99,20 +107,20 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
List<String> hosts = Lists.newArrayList(hostArr);
logger.debug("using initial hosts {}, with concurrent_connects [{}]", hosts, concurrentConnects);
List<DiscoveryNode> nodes = Lists.newArrayList();
List<DiscoveryNode> configuredTargetNodes = Lists.newArrayList();
int idCounter = 0;
for (String host : hosts) {
try {
TransportAddress[] addresses = transportService.addressesFromString(host);
// we only limit to 1 addresses, makes no sense to ping 100 ports
for (int i = 0; (i < addresses.length && i < LIMIT_PORTS_COUNT); i++) {
nodes.add(new DiscoveryNode("#zen_unicast_" + (++idCounter) + "#", addresses[i], version.minimumCompatibilityVersion()));
configuredTargetNodes.add(new DiscoveryNode("#zen_unicast_" + (++idCounter) + "#", addresses[i], version.minimumCompatibilityVersion()));
}
} catch (Exception e) {
throw new ElasticsearchIllegalArgumentException("Failed to resolve address for [" + host + "]", e);
}
}
this.nodes = nodes.toArray(new DiscoveryNode[nodes.size()]);
this.configuredTargetNodes = configuredTargetNodes.toArray(new DiscoveryNode[configuredTargetNodes.size()]);
transportService.registerHandler(ACTION_NAME, new UnicastPingRequestHandler());
}
@ -143,6 +151,13 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
this.nodesProvider = nodesProvider;
}
/**
 * Clears the queue of temporal ping responses, i.e. the cached responses built from
 * ping requests that other nodes sent to this node.
 */
public void clearTemporalReponses() {
temporalResponses.clear();
}
public PingResponse[] pingAndWait(TimeValue timeout) {
final AtomicReference<PingResponse[]> response = new AtomicReference<>();
final CountDownLatch latch = new CountDownLatch(1);
@ -237,18 +252,30 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
DiscoveryNodes discoNodes = nodesProvider.nodes();
pingRequest.pingResponse = new PingResponse(discoNodes.localNode(), discoNodes.masterNode(), clusterName);
HashSet<DiscoveryNode> nodesToPing = new HashSet<>(Arrays.asList(nodes));
HashSet<DiscoveryNode> nodesToPingSet = new HashSet<>();
for (PingResponse temporalResponse : temporalResponses) {
// Only send pings to nodes that have the same cluster name.
if (clusterName.equals(temporalResponse.clusterName())) {
nodesToPing.add(temporalResponse.target());
nodesToPingSet.add(temporalResponse.target());
}
}
for (UnicastHostsProvider provider : hostsProviders) {
nodesToPing.addAll(provider.buildDynamicNodes());
nodesToPingSet.addAll(provider.buildDynamicNodes());
}
// add all possible master nodes that were active in the last known cluster configuration
for (ObjectCursor<DiscoveryNode> masterNode : discoNodes.getMasterNodes().values()) {
nodesToPingSet.add(masterNode.value);
}
// sort the nodes by likelihood of being an active master
List<DiscoveryNode> sortedNodesToPing = electMasterService.sortByMasterLikelihood(nodesToPingSet);
// now add the unicast targets first
ArrayList<DiscoveryNode> nodesToPing = Lists.newArrayList(configuredTargetNodes);
nodesToPing.addAll(sortedNodesToPing);
final CountDownLatch latch = new CountDownLatch(nodesToPing.size());
for (final DiscoveryNode node : nodesToPing) {
// make sure we are connected

View File

@ -40,6 +40,7 @@ import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.*;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
/**
*
@ -85,12 +86,15 @@ public class PublishClusterStateAction extends AbstractComponent {
publish(clusterState, new AckClusterStatePublishResponseHandler(clusterState.nodes().size() - 1, ackListener));
}
private void publish(ClusterState clusterState, final ClusterStatePublishResponseHandler publishResponseHandler) {
private void publish(final ClusterState clusterState, final ClusterStatePublishResponseHandler publishResponseHandler) {
DiscoveryNode localNode = nodesProvider.nodes().localNode();
Map<Version, BytesReference> serializedStates = Maps.newHashMap();
final AtomicBoolean timedOutWaitingForNodes = new AtomicBoolean(false);
final TimeValue publishTimeout = discoverySettings.getPublishTimeout();
for (final DiscoveryNode node : clusterState.nodes()) {
if (node.equals(localNode)) {
continue;
@ -125,28 +129,30 @@ public class PublishClusterStateAction extends AbstractComponent {
@Override
public void handleResponse(TransportResponse.Empty response) {
if (timedOutWaitingForNodes.get()) {
logger.debug("node {} responded for cluster state [{}] (took longer than [{}])", node, clusterState.version(), publishTimeout);
}
publishResponseHandler.onResponse(node);
}
@Override
public void handleException(TransportException exp) {
logger.debug("failed to send cluster state to [{}]", exp, node);
logger.debug("failed to send cluster state to {}", exp, node);
publishResponseHandler.onFailure(node, exp);
}
});
} catch (Throwable t) {
logger.debug("error sending cluster state to [{}]", t, node);
logger.debug("error sending cluster state to {}", t, node);
publishResponseHandler.onFailure(node, t);
}
}
TimeValue publishTimeout = discoverySettings.getPublishTimeout();
if (publishTimeout.millis() > 0) {
// only wait if the publish timeout is configured...
try {
boolean awaited = publishResponseHandler.awaitAllNodes(publishTimeout);
if (!awaited) {
logger.debug("awaiting all nodes to process published state {} timed out, timeout {}", clusterState.version(), publishTimeout);
timedOutWaitingForNodes.set(!publishResponseHandler.awaitAllNodes(publishTimeout));
if (timedOutWaitingForNodes.get()) {
logger.debug("timed out waiting for all nodes to process published state [{}] (timeout [{}])", clusterState.version(), publishTimeout);
}
} catch (InterruptedException e) {
// ignore & restore interrupt

View File

@ -35,7 +35,6 @@ import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.DiscoveryService;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.threadpool.ThreadPool;
@ -134,12 +133,6 @@ public class GatewayService extends AbstractLifecycleComponent<GatewayService> i
if (lifecycle.stoppedOrClosed()) {
return;
}
if (event.state().blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK)) {
// we need to clear those flags, since we might need to recover again in case we disconnect
// from the cluster and then reconnect
recovered.set(false);
scheduledRecovery.set(false);
}
if (event.localNodeMaster() && event.state().blocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK)) {
checkStateMeetsSettingsAndMaybeRecover(event.state(), true);
}
@ -147,7 +140,7 @@ public class GatewayService extends AbstractLifecycleComponent<GatewayService> i
protected void checkStateMeetsSettingsAndMaybeRecover(ClusterState state, boolean asyncRecovery) {
DiscoveryNodes nodes = state.nodes();
if (state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK)) {
if (state.blocks().hasGlobalBlock(discoveryService.getNoMasterBlock())) {
logger.debug("not recovering from gateway, no master elected yet");
} else if (recoverAfterNodes != -1 && (nodes.masterAndDataNodes().size()) < recoverAfterNodes) {
logger.debug("not recovering from gateway, nodes_size (data+master) [" + nodes.masterAndDataNodes().size() + "] < recover_after_nodes [" + recoverAfterNodes + "]");

View File

@ -307,7 +307,7 @@ public class IndicesStore extends AbstractComponent implements ClusterStateListe
return;
}
clusterService.submitStateUpdateTask("indices_store", new ClusterStateUpdateTask() {
clusterService.submitStateUpdateTask("indices_store", new ClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) throws Exception {
if (clusterState.getVersion() != currentState.getVersion()) {

View File

@ -245,6 +245,10 @@ public class TransportService extends AbstractLifecycleComponent<TransportServic
}
}
/**
 * Looks up the request handler stored in {@code serverHandlers} under the given action name.
 *
 * @param action the action name to look up
 * @return the result of the lookup; presumably {@code null} when nothing is registered
 *         for the action (depends on the map type of serverHandlers, which is not visible here)
 */
protected TransportRequestHandler getHandler(String action) {
return serverHandlers.get(action);
}
class Adapter implements TransportServiceAdapter {
final MeanMetric rxMetric = new MeanMetric();

View File

@ -33,6 +33,7 @@ import org.elasticsearch.common.transport.BoundTransportAddress;
import org.elasticsearch.common.transport.LocalTransportAddress;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.*;
import org.elasticsearch.transport.support.TransportStatus;
@ -40,6 +41,8 @@ import org.elasticsearch.transport.support.TransportStatus;
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
@ -50,6 +53,7 @@ import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.new
public class LocalTransport extends AbstractLifecycleComponent<Transport> implements Transport {
private final ThreadPool threadPool;
private final ThreadPoolExecutor workers;
private final Version version;
private volatile TransportServiceAdapter transportServiceAdapter;
private volatile BoundTransportAddress boundAddress;
@ -58,13 +62,20 @@ public class LocalTransport extends AbstractLifecycleComponent<Transport> implem
private static final AtomicLong transportAddressIdGenerator = new AtomicLong();
private final ConcurrentMap<DiscoveryNode, LocalTransport> connectedNodes = newConcurrentMap();
public static final String TRANSPORT_LOCAL_ADDRESS = "transport.local_address";
public static final String TRANSPORT_LOCAL_ADDRESS = "transport.local.address";
public static final String TRANSPORT_LOCAL_WORKERS = "transport.local.workers";
public static final String TRANSPORT_LOCAL_QUEUE = "transport.local.queue";
@Inject
public LocalTransport(Settings settings, ThreadPool threadPool, Version version) {
super(settings);
this.threadPool = threadPool;
this.version = version;
int workerCount = this.settings.getAsInt(TRANSPORT_LOCAL_WORKERS, EsExecutors.boundedNumberOfProcessors(settings));
int queueSize = this.settings.getAsInt(TRANSPORT_LOCAL_QUEUE, -1);
logger.debug("creating [{}] workers, queue_size [{}]", workerCount, queueSize);
this.workers = EsExecutors.newFixed(workerCount, queueSize, EsExecutors.daemonThreadFactory(this.settings, "local_transport"));
}
@Override
@ -106,6 +117,13 @@ public class LocalTransport extends AbstractLifecycleComponent<Transport> implem
/**
 * Shuts down the local transport worker pool.
 * <p>
 * First requests an orderly shutdown and waits up to 10 seconds for it to complete,
 * then calls {@code shutdownNow()} unconditionally.
 */
@Override
protected void doClose() throws ElasticsearchException {
// orderly shutdown first: no new tasks are accepted
workers.shutdown();
try {
// bounded wait for the pool to terminate; the result is intentionally not checked
workers.awaitTermination(10, TimeUnit.SECONDS);
} catch (InterruptedException e) {
// restore the interrupt flag for the caller and continue with the forced shutdown
Thread.currentThread().interrupt();
}
// always follow up with a forced shutdown, whether or not the wait succeeded
workers.shutdownNow();
}
@Override
@ -185,7 +203,7 @@ public class LocalTransport extends AbstractLifecycleComponent<Transport> implem
transportServiceAdapter.sent(data.length);
threadPool.generic().execute(new Runnable() {
targetTransport.workers().execute(new Runnable() {
@Override
public void run() {
targetTransport.messageReceived(data, action, LocalTransport.this, version, requestId);
@ -193,8 +211,8 @@ public class LocalTransport extends AbstractLifecycleComponent<Transport> implem
});
}
ThreadPool threadPool() {
return this.threadPool;
ThreadPoolExecutor workers() {
return this.workers;
}
protected void messageReceived(byte[] data, String action, LocalTransport sourceTransport, Version version, @Nullable final Long sendRequestId) {

View File

@ -72,7 +72,7 @@ public class LocalTransportChannel implements TransportChannel {
response.writeTo(stream);
stream.close();
final byte[] data = bStream.bytes().toBytes();
targetTransport.threadPool().generic().execute(new Runnable() {
targetTransport.workers().execute(new Runnable() {
@Override
public void run() {
targetTransport.messageReceived(data, action, sourceTransport, version, null);
@ -98,7 +98,7 @@ public class LocalTransportChannel implements TransportChannel {
too.close();
}
final byte[] data = stream.bytes().toBytes();
targetTransport.threadPool().generic().execute(new Runnable() {
targetTransport.workers().execute(new Runnable() {
@Override
public void run() {
targetTransport.messageReceived(data, action, sourceTransport, version, null);

View File

@ -23,7 +23,6 @@ import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.action.support.master.TransportMasterNodeReadOperationAction;
import org.elasticsearch.cluster.*;
import org.elasticsearch.cluster.block.ClusterBlock;
@ -43,7 +42,7 @@ import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.DiscoveryService;
import org.elasticsearch.gateway.GatewayService;
import org.elasticsearch.node.NodeBuilder;
import org.elasticsearch.node.internal.InternalNode;
@ -53,7 +52,6 @@ import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
/**
* The tribe service holds a list of node clients connected to a list of tribe members, and uses their
@ -121,7 +119,7 @@ public class TribeService extends AbstractLifecycleComponent<TribeService> {
private final List<InternalNode> nodes = Lists.newCopyOnWriteArrayList();
@Inject
public TribeService(Settings settings, ClusterService clusterService) {
public TribeService(Settings settings, ClusterService clusterService, DiscoveryService discoveryService) {
super(settings);
this.clusterService = clusterService;
Map<String, Settings> nodesSettings = Maps.newHashMap(settings.getGroups("tribe", true));
@ -143,7 +141,7 @@ public class TribeService extends AbstractLifecycleComponent<TribeService> {
if (!nodes.isEmpty()) {
// remove the initial election / recovery blocks since we are not going to have a
// master elected in this single tribe node local "cluster"
clusterService.removeInitialStateBlock(Discovery.NO_MASTER_BLOCK);
clusterService.removeInitialStateBlock(discoveryService.getNoMasterBlock());
clusterService.removeInitialStateBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK);
if (settings.getAsBoolean("tribe.blocks.write", false)) {
clusterService.addInitialStateBlock(TRIBE_WRITE_BLOCK);
@ -222,7 +220,7 @@ public class TribeService extends AbstractLifecycleComponent<TribeService> {
@Override
public void clusterChanged(final ClusterChangedEvent event) {
logger.debug("[{}] received cluster event, [{}]", tribeName, event.source());
clusterService.submitStateUpdateTask("cluster event from " + tribeName + ", " + event.source(), new ClusterStateUpdateTask() {
clusterService.submitStateUpdateTask("cluster event from " + tribeName + ", " + event.source(), new ClusterStateNonMasterUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) throws Exception {
ClusterState tribeState = event.state();

View File

@ -19,6 +19,7 @@
package org.elasticsearch.cluster;
import com.google.common.base.Predicate;
import com.google.common.util.concurrent.ListenableFuture;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.tasks.PendingClusterTasksResponse;
@ -256,6 +257,58 @@ public class ClusterServiceTests extends ElasticsearchIntegrationTest {
assertThat(processedLatch.await(1, TimeUnit.SECONDS), equalTo(true));
}
/**
 * Starts a master node and a non-master node and submits two update tasks to the
 * cluster service of the non-master node: a plain {@code ClusterStateUpdateTask},
 * which is expected to fail, and a {@code ClusterStateNonMasterUpdateTask}, which is
 * expected to be executed.
 */
@Test
public void testMasterAwareExecution() throws Exception {
    Settings localDiscovery = settingsBuilder()
            .put("discovery.type", "local")
            .build();
    Settings nonMasterSettings = settingsBuilder().put(localDiscovery).put("node.master", false).build();

    ListenableFuture<String> masterNode = internalCluster().startNodeAsync(localDiscovery);
    ListenableFuture<String> nonMasterNode = internalCluster().startNodeAsync(nonMasterSettings);
    masterNode.get();
    ensureGreen(); // make sure we have a cluster

    ClusterService nonMasterClusterService = internalCluster().getInstance(ClusterService.class, nonMasterNode.get());

    // single-element array so the anonymous tasks below can write the outcome
    final boolean[] failed = {false};

    // a plain update task submitted on the non-master node is expected to end in onFailure
    final CountDownLatch plainTaskDone = new CountDownLatch(1);
    nonMasterClusterService.submitStateUpdateTask("test", new ClusterStateUpdateTask() {
        @Override
        public ClusterState execute(ClusterState currentState) throws Exception {
            plainTaskDone.countDown();
            return currentState;
        }

        @Override
        public void onFailure(String source, Throwable t) {
            failed[0] = true;
            plainTaskDone.countDown();
        }
    });
    plainTaskDone.await();
    assertTrue("cluster state update task was executed on a non-master", failed[0]);

    // start from "failed" so that only an actual execution can clear the flag
    failed[0] = true;
    final CountDownLatch nonMasterTaskDone = new CountDownLatch(1);
    nonMasterClusterService.submitStateUpdateTask("test", new ClusterStateNonMasterUpdateTask() {
        @Override
        public ClusterState execute(ClusterState currentState) throws Exception {
            failed[0] = false;
            nonMasterTaskDone.countDown();
            return currentState;
        }

        @Override
        public void onFailure(String source, Throwable t) {
            failed[0] = true;
            nonMasterTaskDone.countDown();
        }
    });
    nonMasterTaskDone.await();
    assertFalse("non-master cluster state update task was not executed", failed[0]);
}
@Test
public void testAckedUpdateTaskNoAckExpected() throws Exception {
Settings settings = settingsBuilder()
@ -655,7 +708,7 @@ public class ClusterServiceTests extends ElasticsearchIntegrationTest {
}
}
private static class BlockingTask implements ClusterStateUpdateTask {
private static class BlockingTask extends ClusterStateUpdateTask {
private final CountDownLatch latch = new CountDownLatch(1);
@Override
@ -674,7 +727,7 @@ public class ClusterServiceTests extends ElasticsearchIntegrationTest {
}
private static class PrioritiezedTask implements ClusterStateUpdateTask {
private static class PrioritiezedTask extends ClusterStateUpdateTask {
private final Priority priority;
private final CountDownLatch latch;

View File

@ -25,7 +25,7 @@ import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.DiscoverySettings;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
@ -60,7 +60,7 @@ public class MinimumMasterNodesTests extends ElasticsearchIntegrationTest {
logger.info("--> should be blocked, no master...");
ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
assertThat(state.nodes().size(), equalTo(1)); // verify that we still see the local node in the cluster state
logger.info("--> start second node, cluster should be formed");
@ -70,9 +70,9 @@ public class MinimumMasterNodesTests extends ElasticsearchIntegrationTest {
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
state = client().admin().cluster().prepareState().execute().actionGet().getState();
assertThat(state.nodes().size(), equalTo(2));
@ -98,11 +98,11 @@ public class MinimumMasterNodesTests extends ElasticsearchIntegrationTest {
awaitBusy(new Predicate<Object>() {
public boolean apply(Object obj) {
ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
return state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
return state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
}
});
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
assertThat(state.nodes().size(), equalTo(1)); // verify that we still see the local node in the cluster state
logger.info("--> starting the previous master node again...");
@ -112,9 +112,9 @@ public class MinimumMasterNodesTests extends ElasticsearchIntegrationTest {
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
state = client().admin().cluster().prepareState().execute().actionGet().getState();
assertThat(state.nodes().size(), equalTo(2));
@ -135,7 +135,7 @@ public class MinimumMasterNodesTests extends ElasticsearchIntegrationTest {
assertThat(awaitBusy(new Predicate<Object>() {
public boolean apply(Object obj) {
ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
return state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
return state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
}
}), equalTo(true));
@ -146,9 +146,9 @@ public class MinimumMasterNodesTests extends ElasticsearchIntegrationTest {
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(false));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(false));
state = client().admin().cluster().prepareState().execute().actionGet().getState();
assertThat(state.nodes().size(), equalTo(2));
@ -183,21 +183,21 @@ public class MinimumMasterNodesTests extends ElasticsearchIntegrationTest {
awaitBusy(new Predicate<Object>() {
public boolean apply(Object obj) {
ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
return state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
return state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
}
});
awaitBusy(new Predicate<Object>() {
public boolean apply(Object obj) {
ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
return state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
return state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
}
});
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK), equalTo(true));
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
logger.info("--> start two more nodes");
internalCluster().startNode(settings);
@ -298,9 +298,9 @@ public class MinimumMasterNodesTests extends ElasticsearchIntegrationTest {
boolean success = true;
for (Client client : internalCluster()) {
ClusterState state = client.admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
success &= state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK);
success &= state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
if (logger.isDebugEnabled()) {
logger.debug("Checking for NO_MASTER_BLOCK on client: {} NO_MASTER_BLOCK: [{}]", client, state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK));
logger.debug("Checking for NO_MASTER_BLOCK on client: {} NO_MASTER_BLOCK: [{}]", client, state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID));
}
}
return success;

View File

@ -19,14 +19,20 @@
package org.elasticsearch.cluster;
import com.google.common.base.Predicate;
import org.elasticsearch.action.ActionRequestBuilder;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.count.CountResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.percolate.PercolateSourceBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.DiscoverySettings;
import org.elasticsearch.discovery.MasterNotDiscoveredException;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.script.ScriptService;
@ -40,7 +46,7 @@ import java.util.HashMap;
import static org.elasticsearch.action.percolate.PercolateSourceBuilder.docBuilder;
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*;
import static org.hamcrest.Matchers.*;
/**
@ -61,6 +67,7 @@ public class NoMasterNodeTests extends ElasticsearchIntegrationTest {
.put("discovery.zen.minimum_master_nodes", 2)
.put("discovery.zen.ping_timeout", "200ms")
.put("discovery.initial_state_timeout", "500ms")
.put(DiscoverySettings.NO_MASTER_BLOCK, "all")
.build();
TimeValue timeout = TimeValue.timeValueMillis(200);
@ -75,7 +82,7 @@ public class NoMasterNodeTests extends ElasticsearchIntegrationTest {
@Override
public void run() {
ClusterState state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertTrue(state.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK));
assertTrue(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID));
}
});
@ -128,7 +135,7 @@ public class NoMasterNodeTests extends ElasticsearchIntegrationTest {
ClusterBlockException.class, RestStatus.SERVICE_UNAVAILABLE
);
checkWriteAction(autoCreateIndex, timeout,
checkWriteAction(false, timeout,
client().prepareUpdate("test", "type1", "1").setScript("test script", ScriptService.ScriptType.INLINE).setTimeout(timeout));
@ -136,7 +143,7 @@ public class NoMasterNodeTests extends ElasticsearchIntegrationTest {
client().prepareUpdate("no_index", "type1", "1").setScript("test script", ScriptService.ScriptType.INLINE).setTimeout(timeout));
checkWriteAction(autoCreateIndex, timeout,
checkWriteAction(false, timeout,
client().prepareIndex("test", "type1", "1").setSource(XContentFactory.jsonBuilder().startObject().endObject()).setTimeout(timeout));
checkWriteAction(autoCreateIndex, timeout,
@ -145,9 +152,7 @@ public class NoMasterNodeTests extends ElasticsearchIntegrationTest {
BulkRequestBuilder bulkRequestBuilder = client().prepareBulk();
bulkRequestBuilder.add(client().prepareIndex("test", "type1", "1").setSource(XContentFactory.jsonBuilder().startObject().endObject()));
bulkRequestBuilder.add(client().prepareIndex("test", "type1", "2").setSource(XContentFactory.jsonBuilder().startObject().endObject()));
// today, we clear the metadata on when there is no master, so it will go through the auto create logic and
// add it... (if autoCreate is set to true)
checkBulkAction(autoCreateIndex, bulkRequestBuilder);
checkBulkAction(false, bulkRequestBuilder);
bulkRequestBuilder = client().prepareBulk();
bulkRequestBuilder.add(client().prepareIndex("no_index", "type1", "1").setSource(XContentFactory.jsonBuilder().startObject().endObject()));
@ -203,4 +208,75 @@ public class NoMasterNodeTests extends ElasticsearchIntegrationTest {
}
}
}
/**
 * Exercises the "write" setting of the no-master block: after one of the two nodes is stopped and
 * the remaining node reports the no-master block, get/count/search requests are still answered
 * while update and index requests fail with a {@link ClusterBlockException}.
 */
@Test
public void testNoMasterActions_writeMasterBlock() throws Exception {
    final Settings nodeSettings = settingsBuilder()
            .put("discovery.type", "zen")
            .put("action.auto_create_index", false)
            .put("discovery.zen.minimum_master_nodes", 2)
            .put("discovery.zen.ping_timeout", "200ms")
            .put("discovery.initial_state_timeout", "500ms")
            .put(DiscoverySettings.NO_MASTER_BLOCK, "write")
            .build();

    // bring up two nodes; one of them is stopped further down so that the no-master block kicks in
    internalCluster().startNode(nodeSettings);
    internalCluster().startNode(nodeSettings);

    // create and populate the indices while a master is still available
    prepareCreate("test1").setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).get();
    prepareCreate("test2").setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 2, IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0).get();
    client().admin().cluster().prepareHealth("_all").setWaitForGreenStatus().get();
    client().prepareIndex("test1", "type1", "1").setSource("field", "value1").get();
    client().prepareIndex("test2", "type1", "1").setSource("field", "value1").get();
    refresh();
    ensureSearchable("test1", "test2");

    final ClusterStateResponse stateBeforeStop = client().admin().cluster().prepareState().get();
    logger.info("Cluster state:\n" + stateBeforeStop.getState().prettyPrint());

    internalCluster().stopRandomDataNode();

    // poll the local cluster state until the no-master block shows up
    final Predicate<Object> noMasterBlockVisible = new Predicate<Object>() {
        @Override
        public boolean apply(Object ignored) {
            return client().admin().cluster().prepareState().setLocal(true).get().getState()
                    .blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID);
        }
    };
    assertThat(awaitBusy(noMasterBlockVisible), equalTo(true));

    // read operations are expected to keep working under the "write" block
    final GetResponse existingDoc = client().prepareGet("test1", "type1", "1").get();
    assertExists(existingDoc);

    final CountResponse test1Count = client().prepareCount("test1").get();
    assertHitCount(test1Count, 1L);

    final SearchResponse test1Search = client().prepareSearch("test1").get();
    assertHitCount(test1Search, 1L);

    // test2 has two shards and no replicas; the test expects only one of them to answer now
    final CountResponse test2Count = client().prepareCount("test2").get();
    assertThat(test2Count.getTotalShards(), equalTo(2));
    assertThat(test2Count.getSuccessfulShards(), equalTo(1));

    final TimeValue timeout = TimeValue.timeValueMillis(200);

    // an update must be rejected, and only after (roughly) the request timeout has elapsed
    long startedAt = System.currentTimeMillis();
    try {
        client().prepareUpdate("test1", "type1", "1").setDoc("field", "value2").setTimeout(timeout).get();
        fail("Expected ClusterBlockException");
    } catch (ClusterBlockException blocked) {
        assertThat(System.currentTimeMillis() - startedAt, greaterThan(timeout.millis() - 50));
        assertThat(blocked.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
    }

    // the same holds for an index request
    startedAt = System.currentTimeMillis();
    try {
        client().prepareIndex("test1", "type1", "1").setSource(XContentFactory.jsonBuilder().startObject().endObject()).setTimeout(timeout).get();
        fail("Expected ClusterBlockException");
    } catch (ClusterBlockException blocked) {
        assertThat(System.currentTimeMillis() - startedAt, greaterThan(timeout.millis() - 50));
        assertThat(blocked.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
    }

    // start a replacement node and wait for a green two-node cluster before the test ends
    internalCluster().startNode(nodeSettings);
    client().admin().cluster().prepareHealth().setWaitForGreenStatus().setWaitForNodes("2").get();
}
}

View File

@ -0,0 +1,141 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.google.common.primitives.Ints;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.elasticsearch.test.InternalTestCluster;
import org.elasticsearch.test.SettingsSource;
import org.elasticsearch.transport.local.LocalTransport;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * {@link SettingsSource} used by discovery tests. Every node and every transport client receives
 * the same base settings: {@link #DEFAULT_SETTINGS} followed by the caller supplied extra settings.
 */
public class ClusterDiscoveryConfiguration extends SettingsSource {

    /** Settings shared by all configurations: local gateway and zen discovery. */
    public static Settings DEFAULT_SETTINGS = ImmutableSettings.settingsBuilder()
            .put("gateway.type", "local")
            .put("discovery.type", "zen")
            .build();

    final int numOfNodes;
    final Settings baseSettings;

    /**
     * @param numOfNodes number of nodes this configuration is meant for
     */
    public ClusterDiscoveryConfiguration(int numOfNodes) {
        this(numOfNodes, ImmutableSettings.EMPTY);
    }

    /**
     * @param numOfNodes    number of nodes this configuration is meant for
     * @param extraSettings settings added on top of {@link #DEFAULT_SETTINGS}
     */
    public ClusterDiscoveryConfiguration(int numOfNodes, Settings extraSettings) {
        this.numOfNodes = numOfNodes;
        this.baseSettings = ImmutableSettings.builder().put(DEFAULT_SETTINGS).put(extraSettings).build();
    }

    @Override
    public Settings node(int nodeOrdinal) {
        return baseSettings;
    }

    @Override
    public Settings transportClient() {
        return baseSettings;
    }

    /**
     * Configuration that disables multicast pinging and points every node at an explicit list of
     * unicast hosts, identified by node ordinal. The list may cover all nodes or only a subset.
     */
    public static class UnicastZen extends ClusterDiscoveryConfiguration {

        // shared by all instances in this JVM so that every configuration gets its own port window
        private final static AtomicInteger portRangeCounter = new AtomicInteger();

        private final int[] unicastHostOrdinals;
        private final int basePort;

        /** All {@code numOfNodes} nodes are listed as unicast hosts. */
        public UnicastZen(int numOfNodes) {
            this(numOfNodes, numOfNodes);
        }

        /** All {@code numOfNodes} nodes are listed as unicast hosts; {@code extraSettings} are added to the base settings. */
        public UnicastZen(int numOfNodes, Settings extraSettings) {
            this(numOfNodes, numOfNodes, extraSettings);
        }

        /** {@code numOfUnicastHosts} of the {@code numOfNodes} nodes are listed as unicast hosts. */
        public UnicastZen(int numOfNodes, int numOfUnicastHosts) {
            this(numOfNodes, numOfUnicastHosts, ImmutableSettings.EMPTY);
        }

        /**
         * @param numOfNodes        number of nodes this configuration is meant for
         * @param numOfUnicastHosts how many node ordinals to list as unicast hosts; when smaller than
         *                          {@code numOfNodes} the ordinals are picked at random
         * @param extraSettings     settings added on top of {@link #DEFAULT_SETTINGS}
         * @throws IllegalArgumentException if {@code numOfUnicastHosts} is greater than {@code numOfNodes}
         */
        public UnicastZen(int numOfNodes, int numOfUnicastHosts, Settings extraSettings) {
            super(numOfNodes, extraSettings);
            this.unicastHostOrdinals = chooseUnicastHostOrdinals(numOfNodes, numOfUnicastHosts);
            this.basePort = calcBasePort();
        }

        /** Uses exactly the given node ordinals as unicast hosts. */
        public UnicastZen(int numOfNodes, int[] unicastHostOrdinals) {
            this(numOfNodes, ImmutableSettings.EMPTY, unicastHostOrdinals);
        }

        /**
         * @param numOfNodes          number of nodes this configuration is meant for
         * @param extraSettings       settings added on top of {@link #DEFAULT_SETTINGS}
         * @param unicastHostOrdinals ordinals of the nodes to list as unicast hosts (stored as is, not copied)
         */
        public UnicastZen(int numOfNodes, Settings extraSettings, int[] unicastHostOrdinals) {
            super(numOfNodes, extraSettings);
            this.unicastHostOrdinals = unicastHostOrdinals;
            this.basePort = calcBasePort();
        }

        /**
         * Returns the ordinals to use as unicast hosts: all of them when the two counts are equal,
         * otherwise {@code numOfUnicastHosts} distinct random ordinals below {@code numOfNodes}.
         */
        private static int[] chooseUnicastHostOrdinals(int numOfNodes, int numOfUnicastHosts) {
            if (numOfUnicastHosts > numOfNodes) {
                // only numOfNodes distinct ordinals exist, so the random selection below could never finish
                throw new IllegalArgumentException("numOfUnicastHosts [" + numOfUnicastHosts
                        + "] must not be greater than numOfNodes [" + numOfNodes + "]");
            }
            if (numOfUnicastHosts == numOfNodes) {
                int[] allOrdinals = new int[numOfNodes];
                for (int i = 0; i < numOfNodes; i++) {
                    allOrdinals[i] = i;
                }
                return allOrdinals;
            }
            Set<Integer> ordinals = new HashSet<>(numOfUnicastHosts);
            while (ordinals.size() != numOfUnicastHosts) {
                ordinals.add(RandomizedTest.randomInt(numOfNodes - 1));
            }
            return Ints.toArray(ordinals);
        }

        /**
         * Computes the first transport port for this configuration from the child JVM id and a
         * per-JVM counter, so that concurrently running configurations use different ports.
         */
        private static int calcBasePort() {
            // NOTE(review): each JVM owns a window of 1000 ports and each configuration a window of 100,
            // so the tenth configuration created in a JVM spills into the next JVM's window, and for
            // CHILD_JVM_ID % 60 >= 56 the result is above 65535 - confirm these limits are never reached.
            return 10000 +
                    1000 * (ElasticsearchIntegrationTest.CHILD_JVM_ID % 60) + // up to 60 jvms
                    100 * portRangeCounter.incrementAndGet(); // 100 ports (nodes) per configuration
        }

        @Override
        public Settings node(int nodeOrdinal) {
            ImmutableSettings.Builder builder = ImmutableSettings.builder()
                    .put("discovery.zen.ping.multicast.enabled", false);

            String[] unicastHosts = new String[unicastHostOrdinals.length];
            String mode = baseSettings.get("node.mode", InternalTestCluster.NODE_MODE);
            if (mode.equals("local")) {
                // local transport: nodes are addressed by a name derived from their ordinal
                builder.put(LocalTransport.TRANSPORT_LOCAL_ADDRESS, "node_" + nodeOrdinal);
                for (int i = 0; i < unicastHosts.length; i++) {
                    unicastHosts[i] = "node_" + unicastHostOrdinals[i];
                }
            } else {
                // we need to pin the node port & host so we'd know where to point things
                builder.put("transport.tcp.port", basePort + nodeOrdinal);
                builder.put("transport.host", "localhost");
                for (int i = 0; i < unicastHosts.length; i++) {
                    unicastHosts[i] = "localhost:" + (basePort + unicastHostOrdinals[i]);
                }
            }
            builder.putArray("discovery.zen.ping.unicast.hosts", unicastHosts);
            // the base settings are added last
            return builder.put(super.node(nodeOrdinal)).build();
        }
    }
}

View File

@ -1,145 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery;
import com.google.common.base.Predicate;
import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.TransportModule;
import org.elasticsearch.transport.TransportService;
import org.junit.Test;
import java.util.Arrays;
import java.util.List;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
import static org.hamcrest.Matchers.*;
/**
* Integration test that simulates a two-way network failure between the elected master and one
* other node of a three node cluster configured with discovery.zen.minimum_master_nodes=2, and
* checks that the cut-off node reports no master and that the original master is kept afterwards.
*/
@ClusterScope(scope= Scope.SUITE, numDataNodes =0)
public class DiscoveryWithNetworkFailuresTests extends ElasticsearchIntegrationTest {
/**
* Starts three nodes, blocks traffic between the master and one non-master node in both
* directions, verifies the isolated node has no master, then removes the block and verifies
* all three nodes agree on the original master again. Annotated with AwaitsFix for issue 2488.
*/
@Test
@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/issues/2488")
public void failWithMinimumMasterNodesConfigured() throws Exception {
final Settings settings = ImmutableSettings.settingsBuilder()
.put("discovery.zen.minimum_master_nodes", 2)
.put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
// the helper methods at the bottom of this class cast the transport service to MockTransportService
.put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
.build();
List<String>nodes = internalCluster().startNodesAsync(3, settings).get();
// Wait until a green status has been reached and 3 nodes are part of the cluster
List<String> nodesList = Arrays.asList(nodes.toArray(new String[3]));
ClusterHealthResponse clusterHealthResponse = client().admin().cluster().prepareHealth()
.setWaitForEvents(Priority.LANGUID)
.setWaitForNodes("3")
.get();
assertThat(clusterHealthResponse.isTimedOut(), is(false));
// Figure out what is the elected master node
// (every node's local cluster state must list 3 nodes and name the same master)
DiscoveryNode masterDiscoNode = null;
for (String node : nodesList) {
ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.nodes().size(), equalTo(3));
if (masterDiscoNode == null) {
masterDiscoNode = state.nodes().masterNode();
} else {
assertThat(state.nodes().masterNode(), equalTo(masterDiscoNode));
}
}
assert masterDiscoNode != null;
logger.info("---> legit elected master node=" + masterDiscoNode);
final Client masterClient = internalCluster().masterClient();
// Everything is stable now, it is now time to simulate evil...
// Pick a node that isn't the elected master.
// (there is no break in the loop, so the last non-master node in list order is used)
String unluckyNode = null;
for (String node : nodesList) {
if (!node.equals(masterDiscoNode.getName())) {
unluckyNode = node;
}
}
assert unluckyNode != null;
// Simulate a network issue between the unlucky node and elected master node in both directions.
addFailToSendNoConnectRule(masterDiscoNode.getName(), unluckyNode);
addFailToSendNoConnectRule(unluckyNode, masterDiscoNode.getName());
try {
// Wait until the elected master has removed the unlucky node...
// NOTE(review): the boolean returned by awaitBusy is not checked, so a timed-out wait goes unnoticed here
awaitBusy(new Predicate<Object>() {
@Override
public boolean apply(Object input) {
return masterClient.admin().cluster().prepareState().setLocal(true).get().getState().nodes().size() == 2;
}
});
// The unlucky node must report *no* master node, since it can't connect to master and in fact it should
// continuously ping until network failures have been resolved.
Client isolatedNodeClient = internalCluster().client(unluckyNode);
ClusterState localClusterState = isolatedNodeClient.admin().cluster().prepareState().setLocal(true).get().getState();
DiscoveryNodes localDiscoveryNodes = localClusterState.nodes();
assertThat(localDiscoveryNodes.masterNode(), nullValue());
} finally {
// stop simulating network failures, from this point on the unlucky node is able to rejoin
// We also need to do this even if assertions fail, since otherwise the test framework can't work properly
clearNoConnectRule(masterDiscoNode.getName(), unluckyNode);
clearNoConnectRule(unluckyNode, masterDiscoNode.getName());
}
// Wait until the master node sees all 3 nodes again.
clusterHealthResponse = masterClient.admin().cluster().prepareHealth()
.setWaitForEvents(Priority.LANGUID)
.setWaitForNodes("3")
.get();
assertThat(clusterHealthResponse.isTimedOut(), is(false));
for (String node : nodesList) {
ClusterState state = internalCluster().client(node).admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
assertThat(state.nodes().size(), equalTo(3));
// The elected master shouldn't have changed, since the unlucky node never could have elected himself as
// master since m_m_n of 2 could never be satisfied.
assertThat(state.nodes().masterNode(), equalTo(masterDiscoNode));
}
}
// Installs, on fromNode's (mock) transport service, a fail-to-send / no-connect rule that targets toNode's local discovery node.
private void addFailToSendNoConnectRule(String fromNode, String toNode) {
TransportService mockTransportService = internalCluster().getInstance(TransportService.class, fromNode);
((MockTransportService) mockTransportService).addFailToSendNoConnectRule(internalCluster().getInstance(Discovery.class, toNode).localNode());
}
// Removes, from fromNode's (mock) transport service, the rule registered for toNode's local discovery node.
private void clearNoConnectRule(String fromNode, String toNode) {
TransportService mockTransportService = internalCluster().getInstance(TransportService.class, fromNode);
((MockTransportService) mockTransportService).clearRule(internalCluster().getInstance(Discovery.class, toNode).localNode());
}
}

View File

@ -0,0 +1,863 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery;
import com.google.common.base.Predicate;
import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlock;
import org.elasticsearch.cluster.block.ClusterBlockLevel;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.operation.hash.djb.DjbHashFunction;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.zen.ZenDiscovery;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.discovery.zen.fd.FaultDetection;
import org.elasticsearch.discovery.zen.membership.MembershipAction;
import org.elasticsearch.discovery.zen.ping.ZenPing;
import org.elasticsearch.discovery.zen.ping.ZenPingService;
import org.elasticsearch.discovery.zen.ping.unicast.UnicastZenPing;
import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.elasticsearch.test.InternalTestCluster;
import org.elasticsearch.test.disruption.*;
import org.elasticsearch.test.junit.annotations.TestLogging;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.*;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.hamcrest.Matchers.*;
/**
*/
@LuceneTestCase.Slow
@TestLogging("discovery.zen:TRACE")
@ClusterScope(scope = Scope.TEST, numDataNodes = 0, transportClientRatio = 0)
public class DiscoveryWithServiceDisruptions extends ElasticsearchIntegrationTest {
// Judging by the name, extra time granted for the cluster to heal after a disruption - TODO confirm against its usages.
private static final TimeValue DISRUPTION_HEALING_OVERHEAD = TimeValue.timeValueSeconds(40); // we use 30s as timeout in many places.
// Discovery configuration of the current test: reset to null by clearConfig() and created on demand by the start*Cluster helpers.
private ClusterDiscoveryConfiguration discoveryConfig;
/**
* Returns the settings of the node with the given ordinal, as produced by the current discovery configuration.
* NOTE(review): discoveryConfig is null until one of the start*Cluster helpers has run - presumably no node is started before that; confirm.
*/
@Override
protected Settings nodeSettings(int nodeOrdinal) {
return discoveryConfig.node(nodeOrdinal);
}
/**
* Drops the discovery configuration before each test, so that the next start*Cluster call builds a fresh one.
*/
@Before
public void clearConfig() {
discoveryConfig = null;
}
/**
* Fixes the value returned by this override to 3 (presumably the shard count used for indices created by these tests - TODO confirm in the base class).
*/
@Override
protected int numberOfShards() {
return 3;
}
/**
* Fixes the value returned by this override to 1 (presumably the replica count used for indices created by these tests - TODO confirm in the base class).
*/
@Override
protected int numberOfReplicas() {
return 1;
}
/**
* Starts a cluster with the given number of nodes. The -1 passed on is negative, which makes the
* start*Cluster helpers compute minimum master nodes as numberOfNodes / 2 + 1.
*/
private List<String> startCluster(int numberOfNodes) throws ExecutionException, InterruptedException {
return startCluster(numberOfNodes, -1);
}
/**
 * Starts a cluster with the given number of nodes, randomly choosing between multicast and
 * unicast discovery. For unicast no explicit host ordinals are passed.
 *
 * @param numberOfNodes     number of nodes to start
 * @param minimumMasterNode value for minimum master nodes; a negative value lets the helper pick numberOfNodes / 2 + 1
 * @return the list returned by the chosen helper (the started nodes)
 */
private List<String> startCluster(int numberOfNodes, int minimumMasterNode) throws ExecutionException, InterruptedException {
    return randomBoolean()
            ? startMulticastCluster(numberOfNodes, minimumMasterNode)
            : startUnicastCluster(numberOfNodes, null, minimumMasterNode);
}
// Settings applied by the start*Cluster helpers to every node: short fault-detection and publish timeouts so simulated failures are noticed quickly.
final static Settings DEFAULT_SETTINGS = ImmutableSettings.builder()
.put(FaultDetection.SETTING_PING_TIMEOUT, "1s") // for hitting simulated network failures quickly
.put(FaultDetection.SETTING_PING_RETRIES, "1") // for hitting simulated network failures quickly
.put("discovery.zen.join_timeout", "10s") // still long enough to induce failures, but not so long that the test times out
.put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
.put("http.enabled", false) // just to make test quicker
.put("gateway.local.list_timeout", "10s") // still long enough to induce failures, but not so long that the test times out
.put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName()) // use the mock transport service (presumably needed by the disruption schemes - TODO confirm)
.build();
/**
 * Starts {@code numberOfNodes} nodes using the plain (non-unicast) discovery configuration and
 * waits for the cluster to stabilise at that size.
 *
 * @param numberOfNodes     number of nodes to start
 * @param minimumMasterNode value for minimum master nodes; when negative, numberOfNodes / 2 + 1 is used
 * @return the value returned by {@code startNodesAsync} for the started nodes
 */
private List<String> startMulticastCluster(int numberOfNodes, int minimumMasterNode) throws ExecutionException, InterruptedException {
    final int effectiveMinimumMasterNodes = minimumMasterNode < 0 ? numberOfNodes / 2 + 1 : minimumMasterNode;

    // TODO: Rarely use default settings for some of these
    final Settings clusterSettings = ImmutableSettings.builder()
            .put(DEFAULT_SETTINGS)
            .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, effectiveMinimumMasterNodes)
            .build();

    // an already present configuration is kept as is; the settings built above are then unused
    if (discoveryConfig == null) {
        discoveryConfig = new ClusterDiscoveryConfiguration(numberOfNodes, clusterSettings);
    }

    final List<String> startedNodes = internalCluster().startNodesAsync(numberOfNodes).get();
    ensureStableCluster(numberOfNodes);
    return startedNodes;
}
/**
 * Starts {@code numberOfNodes} nodes using a unicast discovery configuration, waits for the
 * cluster to stabilise at that size and then clears the temporal ping responses of every
 * unicast ping implementation.
 *
 * @param numberOfNodes        number of nodes to start
 * @param unicastHostsOrdinals ordinals of the nodes to list as unicast hosts, or null to let the configuration decide
 * @param minimumMasterNode    value for minimum master nodes; when negative, numberOfNodes / 2 + 1 is used
 * @return the value returned by {@code startNodesAsync} for the started nodes
 */
private List<String> startUnicastCluster(int numberOfNodes, @Nullable int[] unicastHostsOrdinals, int minimumMasterNode) throws ExecutionException, InterruptedException {
    final int effectiveMinimumMasterNodes = minimumMasterNode < 0 ? numberOfNodes / 2 + 1 : minimumMasterNode;

    // TODO: Rarely use default settings for some of these
    final Settings unicastSettings = ImmutableSettings.builder()
            .put(DEFAULT_SETTINGS)
            .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES, effectiveMinimumMasterNodes)
            .build();

    // an already present configuration is kept as is
    if (discoveryConfig == null) {
        discoveryConfig = unicastHostsOrdinals == null
                ? new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, unicastSettings)
                : new ClusterDiscoveryConfiguration.UnicastZen(numberOfNodes, unicastSettings, unicastHostsOrdinals);
    }

    final List<String> startedNodes = internalCluster().startNodesAsync(numberOfNodes).get();
    ensureStableCluster(numberOfNodes);

    // TODO: this is a temporary solution so that nodes will not base their reaction to a partition based on previous successful results
    for (ZenPingService zenPingService : internalCluster().getInstances(ZenPingService.class)) {
        for (ZenPing ping : zenPingService.zenPings()) {
            if (!(ping instanceof UnicastZenPing)) {
                continue;
            }
            ((UnicastZenPing) ping).clearTemporalReponses();
        }
    }
    return startedNodes;
}
/**
 * Checks that a partial network partition does not lead to a split brain: a non-master node that
 * is disconnected from the elected master must end up without a master, and once the disruption
 * is lifted all nodes must still agree on the original master.
 * See https://github.com/elasticsearch/elasticsearch/issues/2488
 */
@Test
public void failWithMinimumMasterNodesConfigured() throws Exception {
    final List<String> clusterNodes = startCluster(3);

    // remember who was legitimately elected before anything is disrupted
    final String electedMaster = internalCluster().getMasterName();
    logger.info("---> legit elected master node=" + electedMaster);

    // choose, at random, one of the nodes that is not the elected master
    final Set<String> candidates = new HashSet<>(clusterNodes);
    candidates.remove(electedMaster);
    final String isolatedNode = randomFrom(candidates.toArray(Strings.EMPTY_ARRAY));

    // disconnect the chosen node and the elected master from each other
    final NetworkDisconnectPartition disconnect = new NetworkDisconnectPartition(electedMaster, isolatedNode, getRandom());
    setDisruptionScheme(disconnect);
    disconnect.startDisrupting();

    // the elected master is expected to drop the disconnected node, leaving a two node cluster
    ensureStableCluster(2, electedMaster);

    // The disconnected node cannot reach the master, so it must report *no* master and keep
    // pinging until the network heals. It may take a bit before it notices it has been cut off.
    assertNoMaster(isolatedNode);

    disconnect.stopDisrupting();

    // all three nodes are expected to form one cluster again
    ensureStableCluster(3);

    // The master must not have changed: with minimum master nodes of 2 the disconnected node
    // could never have elected itself.
    assertMaster(electedMaster, clusterNodes);
}
/**
 * Verify that the proper block is applied when nodes lose their master.
 * <p>
 * The same partition is started twice: first with the settings in effect after cluster start, expecting
 * {@link DiscoverySettings#NO_MASTER_BLOCK_WRITES} on the isolated node, and then after setting
 * {@link DiscoverySettings#NO_MASTER_BLOCK} to "all", expecting {@link DiscoverySettings#NO_MASTER_BLOCK_ALL}.
 */
@Test
@TestLogging(value = "cluster.service:TRACE,indices.recovery:TRACE")
public void testVerifyApiBlocksDuringPartition() throws Exception {
    startCluster(3);

    // Makes sure that the get request can be executed on each node locally:
    assertAcked(prepareCreate("test").setSettings(ImmutableSettings.builder()
            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
    ));

    // Everything is stable now, it is now time to simulate evil...
    // but first make sure we have no initializing shards and all is green
    // (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
    ensureGreen("test");

    NetworkPartition networkPartition = addRandomPartition();

    final String isolatedNode = networkPartition.getMinoritySide().get(0);
    final String nonIsolatedNode = networkPartition.getMajoritySide().get(0);

    // Simulate a network issue between the unlucky node and the rest of the cluster.
    networkPartition.startDisrupting();

    // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
    // continuously ping until network failures have been resolved. However
    // it may take a bit before the node detects it has been cut off from the elected master
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_WRITES, TimeValue.timeValueSeconds(10));

    // the stability check goes through the non-isolated node, so that is the node reported in the log
    logger.info("wait until elected master has been removed and a new 2 node cluster was formed (via [{}])", nonIsolatedNode);
    ensureStableCluster(2, nonIsolatedNode);

    // every node on the majority side must still have a master and must not carry any global block
    for (String node : networkPartition.getMajoritySide()) {
        ClusterState nodeState = getNodeClusterState(node);
        boolean hasMaster = nodeState.nodes().getMasterNode() != null;
        boolean hasGlobalBlocks = !nodeState.blocks().global().isEmpty();
        if (!hasMaster || hasGlobalBlocks) {
            fail("node [" + node + "] has no master or has blocks, despite of being on the right side of the partition. State dump:\n"
                    + nodeState.prettyPrint());
        }
    }

    networkPartition.stopDisrupting();

    // Wait until the master node sees all 3 nodes again.
    ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkPartition.expectedTimeToHeal().millis()));

    logger.info("Verify no master block with {} set to {}", DiscoverySettings.NO_MASTER_BLOCK, "all");
    client().admin().cluster().prepareUpdateSettings()
            .setTransientSettings(ImmutableSettings.builder().put(DiscoverySettings.NO_MASTER_BLOCK, "all"))
            .get();

    networkPartition.startDisrupting();

    // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
    // continuously ping until network failures have been resolved. However
    // it may take a bit before the node detects it has been cut off from the elected master
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_ALL, TimeValue.timeValueSeconds(10));

    // make sure we have stable cluster & cross partition recoveries are canceled by the removal of the missing node
    // the unresponsive partition causes recoveries to only time out after 15m (default) and these will cause
    // the test to fail due to unfreed resources
    ensureStableCluster(2, nonIsolatedNode);
}
/**
 * Cuts the elected master off from the other nodes, waits for the remaining nodes to form a cluster, heals the
 * partition and then checks that every node reports the same cluster state.
 */
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
    final List<String> nodes = startCluster(3);

    assertAcked(prepareCreate("test")
            .setSettings(ImmutableSettings.builder()
                    .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))
            ));
    ensureGreen();

    final String isolatedMaster = internalCluster().getMasterName();
    final NetworkPartition isolation = addRandomIsolation(isolatedMaster);
    isolation.startDisrupting();

    final String survivor = isolation.getMajoritySide().get(0);

    // the remaining two nodes have to form a cluster of their own
    ensureStableCluster(2, survivor);

    // the isolated node has to notice that it lost its master role
    assertNoMaster(isolatedMaster, TimeValue.timeValueSeconds(40));

    // heal the partition and wait for the full cluster
    isolation.stopDisrupting();
    final long healTimeoutMillis = DISRUPTION_HEALING_OVERHEAD.millis() + isolation.expectedTimeToHeal().millis();
    ensureStableCluster(3, new TimeValue(healTimeoutMillis));

    logger.info("issue a reroute");
    // reroute explicitly rather than waiting for the background reroute of RerouteService
    assertAcked(client().admin().cluster().prepareReroute());
    // then wait until the reroute is done and the cluster has settled
    ensureGreen("test");

    // compare every node's local cluster state against the one of the first node
    ClusterState referenceState = null;
    for (String node : nodes) {
        final ClusterState candidateState = getNodeClusterState(node);
        if (referenceState == null) {
            referenceState = candidateState;
            continue;
        }
        try {
            assertEquals("unequal versions", referenceState.version(), candidateState.version());
            assertEquals("unequal node count", referenceState.nodes().size(), candidateState.nodes().size());
            assertEquals("different masters ", referenceState.nodes().masterNodeId(), candidateState.nodes().masterNodeId());
            assertEquals("different meta data version", referenceState.metaData().version(), candidateState.metaData().version());
            final String referenceRouting = referenceState.routingTable().prettyPrint();
            final String candidateRouting = candidateState.routingTable().prettyPrint();
            if (!referenceRouting.equals(candidateRouting)) {
                fail("different routing");
            }
        } catch (AssertionError mismatch) {
            // re-report the mismatch together with both state dumps
            fail("failed comparing cluster state: " + mismatch.getMessage() + "\n" +
                    "--- cluster state of node [" + nodes.get(0) + "]: ---\n" + referenceState.prettyPrint() +
                    "\n--- cluster state [" + node + "]: ---\n" + candidateState.prettyPrint());
        }
    }
}
/**
 * Test that we do not lose documents whose indexing request was successful, under a randomly selected disruption scheme.
 * We also collect & report the type of indexing failures that occur.
 * <p>
 * One indexer thread is started per node. Each indexer is throttled by its own semaphore: releasing {@code k}
 * permits lets it attempt {@code k} index requests, and every attempt (successful or not) counts down the latch
 * currently stored in {@code countDownLatchRef}. After each disruption round all acknowledged documents are read
 * back through every node.
 */
@Test
@LuceneTestCase.AwaitsFix(bugUrl = "needs some more work to stabilize")
@TestLogging("action.index:TRACE,action.get:TRACE,discovery:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
public void testAckedIndexing() throws Exception {
    final List<String> nodes = startCluster(3);

    assertAcked(prepareCreate("test")
            .setSettings(ImmutableSettings.builder()
                    .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))
            ));
    ensureGreen();

    ServiceDisruptionScheme disruptionScheme = addRandomDisruptionScheme();
    logger.info("disruption scheme [{}] added", disruptionScheme);

    final ConcurrentHashMap<String, String> ackedDocs = new ConcurrentHashMap<>(); // id -> node sent.

    final AtomicBoolean stop = new AtomicBoolean(false);
    List<Thread> indexers = new ArrayList<>(nodes.size());
    List<Semaphore> semaphores = new ArrayList<>(nodes.size());
    final AtomicInteger idGenerator = new AtomicInteger(0);
    final AtomicReference<CountDownLatch> countDownLatchRef = new AtomicReference<>();
    final List<Exception> expectedExceptions = Collections.synchronizedList(new ArrayList<Exception>());

    logger.info("starting indexers");
    try {
        for (final String node : nodes) {
            final Semaphore semaphore = new Semaphore(0);
            semaphores.add(semaphore);
            final Client client = client(node);
            final String name = "indexer_" + indexers.size();
            final int numPrimaries = getNumShards("test").numPrimaries;
            Thread thread = new Thread(new Runnable() {
                @Override
                public void run() {
                    while (!stop.get()) {
                        String id = null;
                        try {
                            // wake up periodically so that the stop flag is re-checked
                            if (!semaphore.tryAcquire(10, TimeUnit.SECONDS)) {
                                continue;
                            }
                            logger.info("[{}] Acquired semaphore and it has {} permits left", name, semaphore.availablePermits());
                            try {
                                id = Integer.toString(idGenerator.incrementAndGet());
                                int shard = ((InternalTestCluster) cluster()).getInstance(DjbHashFunction.class).hash(id) % numPrimaries;
                                logger.trace("[{}] indexing id [{}] through node [{}] targeting shard [{}]", name, id, node, shard);
                                IndexResponse response = client.prepareIndex("test", "type", id).setSource("{}").setTimeout("1s").get();
                                assertThat(response.getVersion(), equalTo(1L));
                                ackedDocs.put(id, node);
                                logger.trace("[{}] indexed id [{}] through node [{}]", name, id, node);
                            } catch (ElasticsearchException e) {
                                // indexing failures are tolerated while disrupting; they are reported at the end
                                expectedExceptions.add(e);
                                logger.trace("[{}] failed id [{}] through node [{}]", e, name, id, node);
                            } finally {
                                // every attempt counts, whether it was acknowledged or not
                                countDownLatchRef.get().countDown();
                                logger.trace("[{}] decreased counter : {}", name, countDownLatchRef.get().getCount());
                            }
                        } catch (InterruptedException e) {
                            // fine - semaphore interrupt
                        } catch (Throwable t) {
                            // NOTE(review): this also catches an AssertionError from the version check above,
                            // which is therefore only logged and does not fail the test - confirm this is intended
                            logger.info("unexpected exception in background thread of [{}]", t, node);
                        }
                    }
                }
            });

            thread.setName(name);
            thread.setDaemon(true);
            thread.start();
            indexers.add(thread);
        }

        int docsPerIndexer = randomInt(3);
        logger.info("indexing " + docsPerIndexer + " docs per indexer before partition");
        countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
        for (Semaphore semaphore : semaphores) {
            semaphore.release(docsPerIndexer);
        }
        assertTrue(countDownLatchRef.get().await(1, TimeUnit.MINUTES));

        for (int iter = 1 + randomInt(2); iter > 0; iter--) {
            logger.info("starting disruptions & indexing (iteration [{}])", iter);
            disruptionScheme.startDisrupting();

            docsPerIndexer = 1 + randomInt(5);
            logger.info("indexing " + docsPerIndexer + " docs per indexer during partition");
            countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
            // use the test's random source so that the release order is reproducible from the test seed
            Collections.shuffle(semaphores, getRandom());
            for (Semaphore semaphore : semaphores) {
                assertThat(semaphore.availablePermits(), equalTo(0));
                semaphore.release(docsPerIndexer);
            }
            assertTrue(countDownLatchRef.get().await(60000 + disruptionScheme.expectedTimeToHeal().millis() * (docsPerIndexer * indexers.size()), TimeUnit.MILLISECONDS));

            logger.info("stopping disruption");
            disruptionScheme.stopDisrupting();
            ensureStableCluster(3, TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() + DISRUPTION_HEALING_OVERHEAD.millis()));
            ensureGreen("test");

            logger.info("validating successful docs");
            for (String node : nodes) {
                try {
                    logger.debug("validating through node [{}]", node);
                    for (String id : ackedDocs.keySet()) {
                        assertTrue("doc [" + id + "] indexed via node [" + ackedDocs.get(id) + "] not found",
                                client(node).prepareGet("test", "type", id).setPreference("_local").get().isExists());
                    }
                } catch (AssertionError e) {
                    // add the node that was used for the lookup to the failure
                    throw new AssertionError(e.getMessage() + " (checked via node [" + node + "])", e);
                }
            }

            logger.info("done validating (iteration [{}])", iter);
        }
    } finally {
        if (expectedExceptions.size() > 0) {
            StringBuilder sb = new StringBuilder("Indexing exceptions during disruption:");
            for (Exception e : expectedExceptions) {
                sb.append("\n").append(e.getMessage());
            }
            logger.debug(sb.toString());
        }
        logger.info("shutting down indexers");
        stop.set(true);
        for (Thread indexer : indexers) {
            indexer.interrupt();
            indexer.join(60000);
        }
    }
}
/**
 * Simulates a long GC on the elected master, expects the other nodes to elect a replacement and checks that
 * the whole cluster agrees on the new master once the disruption is removed.
 */
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
public void testMasterNodeGCs() throws Exception {
    // TODO: on mac OS multicast threads are shared between nodes and we therefore we can't simulate GC and stop pinging for just one node
    // find a way to block thread creation in the generic thread pool to avoid this.
    final List<String> allNodes = startUnicastCluster(3, null, -1);

    final String formerMaster = internalCluster().getMasterName();

    // the configured GC is very long, which is fine because the disruption is removed once it has had an effect
    final SingleNodeDisruption gcDisruption = new LongGCDisruption(formerMaster, getRandom(), 100, 200, 30000, 60000);
    internalCluster().setDisruptionScheme(gcDisruption);
    gcDisruption.startDisrupting();

    final Set<String> followerSet = new HashSet<>(allNodes);
    followerSet.remove(formerMaster);
    final List<String> followers = new ArrayList<>(followerSet);

    logger.info("waiting for nodes to de-elect master [{}]", formerMaster);
    for (String follower : followerSet) {
        assertDifferentMaster(follower, formerMaster);
    }

    logger.info("waiting for nodes to elect a new master");
    ensureStableCluster(2, followers.get(0));

    logger.info("waiting for any pinging to stop");
    for (final String node : followers) {
        final boolean settled = awaitBusy(new Predicate<Object>() {
            @Override
            public boolean apply(Object input) {
                return !((ZenDiscovery) internalCluster().getInstance(Discovery.class, node)).joiningCluster();
            }
        }, 30, TimeUnit.SECONDS);
        assertTrue("node [" + node + "] is still joining master", settled);
    }

    // end the simulated GC
    gcDisruption.stopDisrupting();
    final long healTimeoutMillis = DISRUPTION_HEALING_OVERHEAD.millis() + gcDisruption.expectedTimeToHeal().millis();
    ensureStableCluster(3, new TimeValue(healTimeoutMillis), followers.get(0));

    // every node has to report the same, newly elected master
    final String electedMaster = internalCluster().getMasterName();
    assertThat(electedMaster, not(equalTo(formerMaster)));
    assertMaster(electedMaster, allNodes);
}
/**
 * Indexes a document on the majority side of a partition and checks that, after the partition is healed,
 * the document can be read locally through every node, including the one that was isolated.
 *
 * @throws Exception on any failure while driving the cluster
 */
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE,cluster.service:TRACE,indices.recovery:TRACE,indices.cluster:TRACE")
public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
    final List<String> startedNodes = startCluster(3);

    assertAcked(prepareCreate("test")
            .setSettings(ImmutableSettings.builder()
                    .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)
            )
            .get());
    ensureGreen("test");

    // work on a shuffled copy so that the isolated node is picked at random
    final List<String> shuffledNodes = new ArrayList<>(startedNodes);
    Collections.shuffle(shuffledNodes, getRandom());
    final String isolatedNode = shuffledNodes.get(0);
    final String notIsolatedNode = shuffledNodes.get(1);

    final ServiceDisruptionScheme isolation = addRandomIsolation(isolatedNode);
    isolation.startDisrupting();
    ensureStableCluster(2, notIsolatedNode);
    assertFalse(client(notIsolatedNode).admin().cluster().prepareHealth("test").setWaitForYellowStatus().get().isTimedOut());

    // index through the majority side while the partition is active
    final IndexResponse indexResponse = internalCluster().client(notIsolatedNode).prepareIndex("test", "type").setSource("field", "value").get();
    assertThat(indexResponse.getVersion(), equalTo(1l));
    final String docId = indexResponse.getId();

    logger.info("Verifying if document exists via node[" + notIsolatedNode + "]");
    GetResponse getResponse = internalCluster().client(notIsolatedNode).prepareGet("test", "type", docId)
            .setPreference("_local")
            .get();
    assertThat(getResponse.isExists(), is(true));
    assertThat(getResponse.getVersion(), equalTo(1l));
    assertThat(getResponse.getId(), equalTo(indexResponse.getId()));

    isolation.stopDisrupting();

    ensureStableCluster(3);
    ensureGreen("test");

    // after healing, each node has to serve the document from its local copy
    for (String node : shuffledNodes) {
        logger.info("Verifying if document exists after isolating node[" + isolatedNode + "] via node[" + node + "]");
        getResponse = internalCluster().client(node).prepareGet("test", "type", docId)
                .setPreference("_local")
                .get();
        assertThat(getResponse.isExists(), is(true));
        assertThat(getResponse.getVersion(), equalTo(1l));
        assertThat(getResponse.getId(), equalTo(indexResponse.getId()));
    }
}
/**
 * A 4 node cluster with m_m_n set to 3 and each node has one unicast endpoint. One node partitions from the master node.
 * The temporal unicast responses are empty. When the partition is solved the one ping response contains a master node.
 * The rejoining node should take this master node and connect.
 */
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE")
public void unicastSinglePingResponseContainsMaster() throws Exception {
    List<String> nodes = startUnicastCluster(4, new int[]{0}, -1);
    // Figure out what is the elected master node
    final String masterNode = internalCluster().getMasterName();
    logger.info("---> legit elected master node=" + masterNode);
    List<String> otherNodes = new ArrayList<>(nodes);
    otherNodes.remove(masterNode);
    otherNodes.remove(nodes.get(0)); // <-- Don't isolate the node that is in the unicast endpoint for all the other nodes.
    final String isolatedNode = otherNodes.get(0);

    // Forcefully clean temporal response lists on all nodes. Otherwise the node in the unicast host list
    // includes all the other nodes that have pinged it and the issue doesn't manifest
    for (ZenPingService pingService : internalCluster().getInstances(ZenPingService.class)) {
        for (ZenPing zenPing : pingService.zenPings()) {
            // same guard as in startUnicastCluster: only unicast pings keep temporal responses
            if (zenPing instanceof UnicastZenPing) {
                ((UnicastZenPing) zenPing).clearTemporalReponses();
            }
        }
    }

    // Simulate a network issue between the isolated node and elected master node in both directions.
    NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(masterNode, isolatedNode, getRandom());
    setDisruptionScheme(networkDisconnect);
    networkDisconnect.startDisrupting();

    // Wait until the elected master has removed the isolated node...
    ensureStableCluster(3, masterNode);

    // The isolated node must report no master, so it starts with pinging
    assertNoMaster(isolatedNode);

    networkDisconnect.stopDisrupting();

    // Wait until the master node sees all 4 nodes again.
    ensureStableCluster(4);

    // The elected master shouldn't have changed, since the isolated node never could have elected himself as
    // master since m_m_n of 3 could never be satisfied.
    assertMaster(masterNode, nodes);
}
/**
 * A 3 node cluster in which every node has the first node as its single unicast host. That unicast target is
 * disconnected from the two other nodes; the other nodes must still form a 2 node cluster, the target must end
 * up without a master, and all 3 nodes must form one cluster again after the disruption is stopped.
 */
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE")
public void isolatedUnicastNodes() throws Exception {
    List<String> nodes = startUnicastCluster(3, new int[]{0}, -1);
    // The first node is the unicast host of all nodes; it forms one side of the partition on its own
    final String unicastTarget = nodes.get(0);

    Set<String> unicastTargetSide = new HashSet<>();
    unicastTargetSide.add(unicastTarget);

    Set<String> restOfClusterSide = new HashSet<>();
    restOfClusterSide.addAll(nodes);
    restOfClusterSide.remove(unicastTarget);

    // Forcefully clean temporal response lists on all nodes. Otherwise the node in the unicast host list
    // includes all the other nodes that have pinged it and the issue doesn't manifest
    for (ZenPingService pingService : internalCluster().getInstances(ZenPingService.class)) {
        for (ZenPing zenPing : pingService.zenPings()) {
            // same guard as in startUnicastCluster: only unicast pings keep temporal responses
            if (zenPing instanceof UnicastZenPing) {
                ((UnicastZenPing) zenPing).clearTemporalReponses();
            }
        }
    }

    // Simulate a network issue between the unicast target node and the rest of the cluster
    NetworkDisconnectPartition networkDisconnect = new NetworkDisconnectPartition(unicastTargetSide, restOfClusterSide, getRandom());
    setDisruptionScheme(networkDisconnect);
    networkDisconnect.startDisrupting();

    // Wait until the rest of the cluster has formed a stable 2 node cluster without the unicast target
    ensureStableCluster(2, nodes.get(1));

    // The isolated unicast target must report no master, so it starts with pinging
    assertNoMaster(unicastTarget);

    networkDisconnect.stopDisrupting();

    // Wait until all 3 nodes are part of one cluster again.
    ensureStableCluster(3);
}
/**
 * Checks that a node manages to join the cluster even though cluster state publishing from the master
 * to that node fails for a while.
 */
@Test
@TestLogging("discovery.zen:TRACE,action:TRACE")
public void testClusterJoinDespiteOfPublishingIssues() throws Exception {
    final List<String> nodes = startCluster(2, 1);

    final String masterNode = internalCluster().getMasterName();
    final String nonMasterNode = masterNode.equals(nodes.get(0)) ? nodes.get(1) : nodes.get(0);

    // node view as seen by the non master: its local node is the non master itself
    final DiscoveryNodes discoveryNodes = internalCluster().getInstance(ClusterService.class, nonMasterNode).state().nodes();

    logger.info("blocking requests from non master [{}] to master [{}]", nonMasterNode, masterNode);
    final MockTransportService nonMasterTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, nonMasterNode);
    nonMasterTransportService.addFailToSendNoConnectRule(discoveryNodes.masterNode());

    assertNoMaster(nonMasterNode);

    logger.info("blocking cluster state publishing from master [{}] to non master [{}]", masterNode, nonMasterNode);
    final MockTransportService masterTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, masterNode);
    masterTransportService.addFailToSendNoConnectRule(discoveryNodes.localNode(), PublishClusterStateAction.ACTION_NAME);

    logger.info("allowing requests from non master [{}] to master [{}], waiting for two join request", nonMasterNode, masterNode);
    final CountDownLatch joinRequests = new CountDownLatch(2);
    // pass every request through to the original transport, counting the join requests on the way
    nonMasterTransportService.addDelegate(discoveryNodes.masterNode(), new MockTransportService.DelegateTransport(nonMasterTransportService.original()) {
        @Override
        public void sendRequest(DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException {
            if (action.equals(MembershipAction.DISCOVERY_JOIN_ACTION_NAME)) {
                joinRequests.countDown();
            }
            super.sendRequest(node, requestId, action, request, options);
        }
    });

    joinRequests.await();

    logger.info("waiting for cluster to reform");
    masterTransportService.clearRule(discoveryNodes.localNode());
    nonMasterTransportService.clearRule(discoveryNodes.masterNode());

    ensureStableCluster(2);
}
/**
 * Creates either a {@link NetworkUnresponsivePartition} or a {@link NetworkDisconnectPartition}, chosen at
 * random, and installs it as the disruption scheme.
 *
 * @return the installed partition; the caller still has to start it
 */
protected NetworkPartition addRandomPartition() {
    final NetworkPartition chosen = randomBoolean()
            ? new NetworkUnresponsivePartition(getRandom())
            : new NetworkDisconnectPartition(getRandom());
    setDisruptionScheme(chosen);
    return chosen;
}
/**
 * Installs a randomly chosen partition type that puts {@code isolatedNode} on one side and all other nodes
 * of the internal cluster on the other side.
 *
 * @param isolatedNode name of the node that forms a side of its own
 * @return the installed partition; the caller still has to start it
 */
protected NetworkPartition addRandomIsolation(String isolatedNode) {
    final Set<String> isolatedSide = new HashSet<>();
    isolatedSide.add(isolatedNode);

    final Set<String> remainingSide = new HashSet<>(Arrays.asList(internalCluster().getNodeNames()));
    remainingSide.remove(isolatedNode);

    final NetworkPartition chosen = randomBoolean()
            ? new NetworkUnresponsivePartition(isolatedSide, remainingSide, getRandom())
            : new NetworkDisconnectPartition(isolatedSide, remainingSide, getRandom());

    internalCluster().setDisruptionScheme(chosen);
    return chosen;
}
/**
 * Picks one of the available disruption schemes at random and installs it as the disruption scheme.
 * <p>
 * The pick is driven by {@code getRandom()}, the same random source that is handed to the schemes, so that
 * the selection follows the test's random source instead of an unseeded one.
 *
 * @return the installed scheme; the caller still has to start it
 */
private ServiceDisruptionScheme addRandomDisruptionScheme() {
    List<ServiceDisruptionScheme> list = Arrays.asList(
            new NetworkUnresponsivePartition(getRandom()),
            new NetworkDelaysPartition(getRandom()),
            new NetworkDisconnectPartition(getRandom()),
            new SlowClusterStateProcessing(getRandom())
    );
    // Collections.shuffle(list) would fall back to a default, unseeded source of randomness
    Collections.shuffle(list, getRandom());
    ServiceDisruptionScheme selected = list.get(0);
    setDisruptionScheme(selected);
    return selected;
}
/** Waits up to 30 seconds for a stable cluster of {@code nodeCount} nodes, checked via a randomly chosen node. */
private void ensureStableCluster(int nodeCount) {
    final TimeValue defaultTimeout = TimeValue.timeValueSeconds(30);
    ensureStableCluster(nodeCount, defaultTimeout, null);
}
/** Waits up to {@code timeValue} for a stable cluster of {@code nodeCount} nodes, checked via a randomly chosen node. */
private void ensureStableCluster(int nodeCount, TimeValue timeValue) {
    final String anyNode = null; // null lets the three-argument variant pick the access node
    ensureStableCluster(nodeCount, timeValue, anyNode);
}
/** Waits up to 30 seconds for a stable cluster of {@code nodeCount} nodes, checked via {@code viaNode}. */
private void ensureStableCluster(int nodeCount, @Nullable String viaNode) {
    final TimeValue defaultTimeout = TimeValue.timeValueSeconds(30);
    ensureStableCluster(nodeCount, defaultTimeout, viaNode);
}
/**
 * Issues a cluster health request that waits for {@code nodeCount} nodes, no relocating shards and the
 * processing of pending events, and fails the test if the request times out.
 *
 * @param nodeCount expected number of nodes
 * @param timeValue timeout of the health request
 * @param viaNode   node whose client is used; when {@code null} a random node of the internal cluster is used
 */
private void ensureStableCluster(int nodeCount, TimeValue timeValue, @Nullable String viaNode) {
    final String accessNode = viaNode != null ? viaNode : randomFrom(internalCluster().getNodeNames());
    logger.debug("ensuring cluster is stable with [{}] nodes. access node: [{}]. timeout: [{}]", nodeCount, accessNode, timeValue);

    final ClusterHealthResponse health = client(accessNode).admin().cluster().prepareHealth()
            .setWaitForEvents(Priority.LANGUID)
            .setWaitForNodes(Integer.toString(nodeCount))
            .setTimeout(timeValue)
            .setWaitForRelocatingShards(0)
            .get();

    if (health.isTimedOut()) {
        // include the state seen through the access node to make the failure easier to diagnose
        final ClusterStateResponse lastState = client(accessNode).admin().cluster().prepareState().get();
        fail("failed to reach a stable cluster of [" + nodeCount + "] nodes. Tried via [" + accessNode + "]. last cluster state:\n"
                + lastState.getState().prettyPrint());
    }
    assertThat(health.isTimedOut(), is(false));
}
/** Fetches the cluster state through the client of {@code node}, with the request's local flag set. */
private ClusterState getNodeClusterState(String node) {
    final ClusterStateResponse localResponse = client(node).admin().cluster().prepareState().setLocal(true).get();
    return localResponse.getState();
}
/** Waits up to 10 seconds for {@code node} to report no master; no blocks are checked. */
private void assertNoMaster(final String node) throws Exception {
    final TimeValue defaultWait = TimeValue.timeValueSeconds(10);
    assertNoMaster(node, null, defaultWait);
}
/** Waits up to {@code maxWaitTime} for {@code node} to report no master; no blocks are checked. */
private void assertNoMaster(final String node, TimeValue maxWaitTime) throws Exception {
    final ClusterBlock noExpectedBlock = null;
    assertNoMaster(node, noExpectedBlock, maxWaitTime);
}
/**
 * Repeatedly checks, for at most {@code maxWaitTime}, that the local cluster state of {@code node} has no
 * master node and, if requested, carries a global block for every level of {@code expectedBlocks}.
 *
 * @param node           node whose local cluster state is inspected
 * @param expectedBlocks block whose levels must be present as global blocks; {@code null} skips the block check
 * @param maxWaitTime    maximum time to keep retrying
 */
private void assertNoMaster(final String node, @Nullable final ClusterBlock expectedBlocks, TimeValue maxWaitTime) throws Exception {
    assertBusy(new Runnable() {
        @Override
        public void run() {
            ClusterState state = getNodeClusterState(node);
            assertNull("node [" + node + "] still has [" + state.nodes().masterNode() + "] as master", state.nodes().masterNode());
            if (expectedBlocks != null) {
                for (ClusterBlockLevel level : expectedBlocks.levels()) {
                    // the assertion fails when the block is missing, so the message has to say so
                    assertTrue("node [" + node + "] does not have level [" + level + "] in its blocks", state.getBlocks().hasGlobalBlock(level));
                }
            }
        }
    }, maxWaitTime.getMillis(), TimeUnit.MILLISECONDS);
}
/**
 * Repeatedly checks, for at most 10 seconds, that the master reported by the local cluster state of
 * {@code node} is not {@code oldMasterNode}. A node without any master satisfies the check.
 */
private void assertDifferentMaster(final String node, final String oldMasterNode) throws Exception {
    final Runnable masterChanged = new Runnable() {
        @Override
        public void run() {
            final ClusterState localState = getNodeClusterState(node);
            final String currentMaster = localState.nodes().masterNode() != null
                    ? localState.nodes().masterNode().name()
                    : null;
            logger.trace("[{}] master is [{}]", node, localState.nodes().masterNode());
            assertThat("node [" + node + "] still has [" + currentMaster + "] as master",
                    oldMasterNode, not(equalTo(currentMaster)));
        }
    };
    assertBusy(masterChanged, 10, TimeUnit.SECONDS);
}
/**
 * Asserts that the local cluster state of every node in {@code nodes} contains exactly {@code nodes.size()}
 * nodes and names {@code masterNode} as master.
 *
 * @param masterNode name of the expected master
 * @param nodes      nodes whose local cluster state is checked
 */
private void assertMaster(String masterNode, List<String> nodes) {
    for (String node : nodes) {
        ClusterState state = getNodeClusterState(node);
        String failMsgSuffix = "cluster_state:\n" + state.prettyPrint();
        assertThat("wrong node count on [" + node + "]. " + failMsgSuffix, state.nodes().size(), equalTo(nodes.size()));
        // report a missing master with the state dump instead of failing with a bare NullPointerException
        assertNotNull("no master on node [" + node + "]. " + failMsgSuffix, state.nodes().masterNode());
        assertThat("wrong master on node [" + node + "]. " + failMsgSuffix, state.nodes().masterNode().name(), equalTo(masterNode));
    }
}
}

View File

@ -0,0 +1,219 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery;
import com.google.common.collect.ImmutableMap;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
import org.elasticsearch.discovery.zen.fd.FaultDetection;
import org.elasticsearch.discovery.zen.fd.MasterFaultDetection;
import org.elasticsearch.discovery.zen.fd.NodesFaultDetection;
import org.elasticsearch.node.service.NodeService;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.TransportConnectionListener;
import org.elasticsearch.transport.local.LocalTransport;
import org.hamcrest.Matcher;
import org.hamcrest.Matchers;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import static org.hamcrest.Matchers.equalTo;
public class ZenFaultDetectionTests extends ElasticsearchTestCase {
protected ThreadPool threadPool;
protected static final Version version0 = Version.fromId(/*0*/99);
protected DiscoveryNode nodeA;
protected MockTransportService serviceA;
protected static final Version version1 = Version.fromId(199);
protected DiscoveryNode nodeB;
protected MockTransportService serviceB;
@Before
public void setUp() throws Exception {
super.setUp();
threadPool = new ThreadPool(getClass().getName());
serviceA = build(ImmutableSettings.builder().put("name", "TS_A").build(), version0);
nodeA = new DiscoveryNode("TS_A", "TS_A", serviceA.boundAddress().publishAddress(), ImmutableMap.<String, String>of(), version0);
serviceB = build(ImmutableSettings.builder().put("name", "TS_B").build(), version1);
nodeB = new DiscoveryNode("TS_B", "TS_B", serviceB.boundAddress().publishAddress(), ImmutableMap.<String, String>of(), version1);
// wait till all nodes are properly connected and the event has been sent, so tests in this class
// will not get this callback called on the connections done in this setup
final CountDownLatch latch = new CountDownLatch(4);
TransportConnectionListener waitForConnection = new TransportConnectionListener() {
@Override
public void onNodeConnected(DiscoveryNode node) {
latch.countDown();
}
@Override
public void onNodeDisconnected(DiscoveryNode node) {
fail("disconnect should not be called " + node);
}
};
serviceA.addConnectionListener(waitForConnection);
serviceB.addConnectionListener(waitForConnection);
serviceA.connectToNode(nodeB);
serviceA.connectToNode(nodeA);
serviceB.connectToNode(nodeA);
serviceB.connectToNode(nodeB);
assertThat("failed to wait for all nodes to connect", latch.await(5, TimeUnit.SECONDS), equalTo(true));
serviceA.removeConnectionListener(waitForConnection);
serviceB.removeConnectionListener(waitForConnection);
}
@After
public void tearDown() throws Exception {
super.tearDown();
serviceA.close();
serviceB.close();
threadPool.shutdown();
}
protected MockTransportService build(Settings settings, Version version) {
MockTransportService transportService = new MockTransportService(ImmutableSettings.EMPTY, new LocalTransport(settings, threadPool, version), threadPool);
transportService.start();
return transportService;
}
private DiscoveryNodes buildNodesForA(boolean master) {
DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
builder.put(nodeA);
builder.put(nodeB);
builder.localNodeId(nodeA.id());
builder.masterNodeId(master ? nodeA.id() : nodeB.id());
return builder.build();
}
private DiscoveryNodes buildNodesForB(boolean master) {
DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
builder.put(nodeA);
builder.put(nodeB);
builder.localNodeId(nodeB.id());
builder.masterNodeId(master ? nodeB.id() : nodeA.id());
return builder.build();
}
/**
 * Runs {@link NodesFaultDetection} on node A, stops node B's transport service and checks that
 * A's listener is told about the failure of B. The reported reason is expected to contain
 * "verified" when {@link FaultDetection#SETTING_CONNECT_ON_NETWORK_DISCONNECT} is enabled and
 * not to contain it otherwise.
 */
@Test
public void testNodesFaultDetectionConnectOnDisconnect() throws InterruptedException {
    ImmutableSettings.Builder settings = ImmutableSettings.builder();
    boolean shouldRetry = randomBoolean();
    // make sure we don't ping
    settings.put(FaultDetection.SETTING_CONNECT_ON_NETWORK_DISCONNECT, shouldRetry)
            .put(FaultDetection.SETTING_PING_INTERVAL, "5m");

    NodesFaultDetection nodesFD = new NodesFaultDetection(settings.build(), threadPool, serviceA, new ClusterName("test"));
    nodesFD.start();
    nodesFD.updateNodes(buildNodesForA(true), -1);

    // single-element arrays let the anonymous listener hand results back to the test thread
    final String[] failureReason = new String[1];
    final DiscoveryNode[] failureNode = new DiscoveryNode[1];
    final CountDownLatch notified = new CountDownLatch(1);
    nodesFD.addListener(new NodesFaultDetection.Listener() {
        @Override
        public void onNodeFailure(DiscoveryNode node, String reason) {
            failureNode[0] = node;
            failureReason[0] = reason;
            notified.countDown();
        }
    });

    // will raise a disconnect on A
    serviceB.stop();
    // fail with a clear message on timeout instead of a confusing null comparison below
    assertThat("timed out waiting for the node failure notification", notified.await(30, TimeUnit.SECONDS), equalTo(true));

    assertEquals(nodeB, failureNode[0]);
    Matcher<String> matcher = Matchers.containsString("verified");
    if (!shouldRetry) {
        matcher = Matchers.not(matcher);
    }
    assertThat(failureReason[0], matcher);
}
/**
 * Runs {@link MasterFaultDetection} on node A with node B as the master, stops node B's
 * transport service and checks that A's listener is told about the failure of B. The reported
 * reason is expected to contain "verified" when
 * {@link FaultDetection#SETTING_CONNECT_ON_NETWORK_DISCONNECT} is enabled and not to contain it
 * otherwise.
 */
@Test
public void testMasterFaultDetectionConnectOnDisconnect() throws InterruptedException {
    ImmutableSettings.Builder settings = ImmutableSettings.builder();
    boolean shouldRetry = randomBoolean();
    // make sure we don't ping
    settings.put(FaultDetection.SETTING_CONNECT_ON_NETWORK_DISCONNECT, shouldRetry)
            .put(FaultDetection.SETTING_PING_INTERVAL, "5m");
    ClusterName clusterName = new ClusterName(randomAsciiOfLengthBetween(3, 20));

    // view from node A in which node B is the master
    final DiscoveryNodes nodes = buildNodesForA(false);
    MasterFaultDetection masterFD = new MasterFaultDetection(settings.build(), threadPool, serviceA,
            new DiscoveryNodesProvider() {
                @Override
                public DiscoveryNodes nodes() {
                    return nodes;
                }

                @Override
                public NodeService nodeService() {
                    return null;
                }
            },
            clusterName
    );
    masterFD.start(nodeB, "test");

    // single-element arrays let the anonymous listener hand results back to the test thread
    final String[] failureReason = new String[1];
    final DiscoveryNode[] failureNode = new DiscoveryNode[1];
    final CountDownLatch notified = new CountDownLatch(1);
    masterFD.addListener(new MasterFaultDetection.Listener() {
        @Override
        public void onMasterFailure(DiscoveryNode masterNode, String reason) {
            failureNode[0] = masterNode;
            failureReason[0] = reason;
            notified.countDown();
        }

        @Override
        public void onDisconnectedFromMaster() {
        }
    });

    // will raise a disconnect on A
    serviceB.stop();
    // fail with a clear message on timeout instead of a confusing null comparison below
    assertThat("timed out waiting for the master failure notification", notified.await(30, TimeUnit.SECONDS), equalTo(true));

    assertEquals(nodeB, failureNode[0]);
    Matcher<String> matcher = Matchers.containsString("verified");
    if (!shouldRetry) {
        matcher = Matchers.not(matcher);
    }
    assertThat(failureReason[0], matcher);
}
}

View File

@ -26,7 +26,6 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
import org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
import org.elasticsearch.transport.local.LocalTransport;
import org.junit.Before;
import org.junit.Test;
@ -38,47 +37,24 @@ import static org.hamcrest.Matchers.equalTo;
@ClusterScope(scope = Scope.TEST, numDataNodes = 0)
public class ZenUnicastDiscoveryTests extends ElasticsearchIntegrationTest {
private static int currentNumNodes = -1;
static int currentBaseHttpPort = -1;
static int currentNumOfUnicastHosts = -1;
@Before
public void setUP() throws Exception {
ElasticsearchIntegrationTest.beforeClass();
currentNumNodes = randomIntBetween(3, 5);
currentNumOfUnicastHosts = randomIntBetween(1, currentNumNodes);
currentBaseHttpPort = 25000 + randomInt(100);
}
private ClusterDiscoveryConfiguration discoveryConfig;
@Override
protected Settings nodeSettings(int nodeOrdinal) {
ImmutableSettings.Builder builder = ImmutableSettings.settingsBuilder()
.put(super.nodeSettings(nodeOrdinal))
.put("discovery.type", "zen")
.put("discovery.zen.ping.multicast.enabled", false)
.put("http.enabled", false); // just to make test quicker
return discoveryConfig.node(nodeOrdinal);
}
String[] unicastHosts = new String[currentNumOfUnicastHosts];
if (internalCluster().getDefaultSettings().get("node.mode").equals("local")) {
builder.put(LocalTransport.TRANSPORT_LOCAL_ADDRESS, "unicast_test_" + nodeOrdinal);
for (int i = 0; i < unicastHosts.length; i++) {
unicastHosts[i] = "unicast_test_" + i;
}
} else {
// we need to pin the node ports so we'd know where to point things
builder.put("transport.tcp.port", currentBaseHttpPort + nodeOrdinal);
for (int i = 0; i < unicastHosts.length; i++) {
unicastHosts[i] = "localhost:" + (currentBaseHttpPort + i);
}
}
builder.putArray("discovery.zen.ping.unicast.hosts", unicastHosts);
return builder.build();
/**
 * Drops the discovery configuration before each test; every test method assigns its own
 * {@code discoveryConfig} before starting nodes.
 */
@Before
public void clearConfig() {
discoveryConfig = null;
}
@Test
public void testNormalClusterForming() throws ExecutionException, InterruptedException {
int currentNumNodes = randomIntBetween(3, 5);
int currentNumOfUnicastHosts = randomIntBetween(1, currentNumNodes);
discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(currentNumNodes, currentNumOfUnicastHosts);
internalCluster().startNodesAsync(currentNumNodes).get();
if (client().admin().cluster().prepareHealth().setWaitForNodes("" + currentNumNodes).get().isTimedOut()) {
@ -92,9 +68,12 @@ public class ZenUnicastDiscoveryTests extends ElasticsearchIntegrationTest {
// test fails, because 2 nodes elect themselves as master and the health request times out b/c waiting_for_nodes=N
// can't be satisfied.
public void testMinimumMasterNodes() throws Exception {
int currentNumNodes = randomIntBetween(3, 5);
int currentNumOfUnicastHosts = randomIntBetween(1, currentNumNodes);
final Settings settings = ImmutableSettings.settingsBuilder().put("discovery.zen.minimum_master_nodes", currentNumNodes / 2 + 1).build();
discoveryConfig = new ClusterDiscoveryConfiguration.UnicastZen(currentNumNodes, currentNumOfUnicastHosts, settings);
List<String> nodes = internalCluster().startNodesAsync(currentNumNodes, settings).get();
List<String> nodes = internalCluster().startNodesAsync(currentNumNodes).get();
ensureGreen();

View File

@ -0,0 +1,105 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery.zen;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.transport.DummyTransportAddress;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import java.util.*;
/**
 * Unit tests for {@link ElectMasterService}: ordering nodes by master likelihood and electing a
 * master from a random set of nodes.
 */
public class ElectMasterServiceTest extends ElasticsearchTestCase {

    /** Creates an {@link ElectMasterService} configured with empty settings. */
    ElectMasterService electMasterService() {
        return new ElectMasterService(ImmutableSettings.EMPTY);
    }

    /**
     * Generates a shuffled list of nodes named {@code n_0 .. n_(count-1)}, where the count is
     * drawn with {@code scaledRandomIntBetween(1, 100)}. Each node randomly carries either a
     * {@code master=true} or a {@code master=false} attribute.
     */
    List<DiscoveryNode> generateRandomNodes() {
        int count = scaledRandomIntBetween(1, 100);
        List<DiscoveryNode> nodes = new ArrayList<>(count);

        Map<String, String> master = new HashMap<>();
        master.put("master", "true");
        Map<String, String> nonMaster = new HashMap<>();
        nonMaster.put("master", "false");

        for (int i = 0; i < count; i++) {
            Map<String, String> attributes = randomBoolean() ? master : nonMaster;
            DiscoveryNode node = new DiscoveryNode("n_" + i, "n_" + i, DummyTransportAddress.INSTANCE, attributes, Version.CURRENT);
            nodes.add(node);
        }

        // shuffle so that the tests do not depend on the creation order
        Collections.shuffle(nodes, getRandom());
        return nodes;
    }

    /**
     * Checks the ordering produced by {@code sortByMasterLikelihood}: once a non-master node has
     * been seen no master node may follow, and consecutive master nodes are in strictly
     * ascending id order.
     */
    @Test
    public void sortByMasterLikelihood() {
        List<DiscoveryNode> nodes = generateRandomNodes();
        List<DiscoveryNode> sortedNodes = electMasterService().sortByMasterLikelihood(nodes);
        assertEquals(nodes.size(), sortedNodes.size());

        DiscoveryNode prevNode = sortedNodes.get(0);
        for (int i = 1; i < sortedNodes.size(); i++) {
            DiscoveryNode node = sortedNodes.get(i);
            if (!prevNode.masterNode()) {
                assertFalse(node.masterNode());
            } else if (node.masterNode()) {
                assertTrue(prevNode.id().compareTo(node.id()) < 0);
            }
            prevNode = node;
        }
    }

    /**
     * Elects a master (only when {@code hasEnoughMasterNodes} allows it) and checks the outcome:
     * no master when there are no master nodes or fewer than the configured minimum, otherwise
     * a master whose id is not greater than the id of any master node.
     */
    @Test
    public void electMaster() {
        List<DiscoveryNode> nodes = generateRandomNodes();
        ElectMasterService service = electMasterService();
        int minMasterNodes = randomIntBetween(0, nodes.size());
        service.minimumMasterNodes(minMasterNodes);

        int masterNodes = 0;
        for (DiscoveryNode node : nodes) {
            if (node.masterNode()) {
                masterNodes++;
            }
        }

        DiscoveryNode master = null;
        if (service.hasEnoughMasterNodes(nodes)) {
            master = service.electMaster(nodes);
        }

        if (masterNodes == 0) {
            assertNull(master);
        } else if (minMasterNodes > 0 && masterNodes < minMasterNodes) {
            assertNull(master);
        } else {
            // report a missing master as an assertion failure rather than an NPE in the loop below
            assertNotNull("expected a master to be elected", master);
            for (DiscoveryNode node : nodes) {
                if (node.masterNode()) {
                    assertTrue(master.id().compareTo(node.id()) <= 0);
                }
            }
        }
    }
}

View File

@ -0,0 +1,102 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.discovery.zen;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.indices.recovery.RecoveryResponse;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.zen.fd.FaultDetection;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.junit.Test;
import static org.hamcrest.Matchers.*;
/**
 * Integration tests for {@link ZenDiscovery}: the rejoin-on-master-gone option and the number of
 * shard recoveries reported for an index before and after the elected master is stopped.
 */
@ElasticsearchIntegrationTest.ClusterScope(scope = ElasticsearchIntegrationTest.Scope.TEST, numDataNodes = 0, numClientNodes = 0)
public class ZenDiscoveryRejoinOnMaster extends ElasticsearchIntegrationTest {

    /**
     * The rejoin-on-master-gone flag is expected to be {@code true} on a freshly started node and
     * {@code false} after a transient cluster settings update disables it.
     */
    @Test
    public void testChangeRejoinOnMasterOptionIsDynamic() throws Exception {
        // To override the local setting if set externally
        Settings zenOnly = ImmutableSettings.settingsBuilder().put("discovery.type", "zen").build();
        String startedNode = internalCluster().startNode(zenOnly);

        ZenDiscovery discovery = (ZenDiscovery) internalCluster().getInstance(Discovery.class, startedNode);
        assertThat(discovery.isRejoinOnMasterGone(), is(true));

        client().admin().cluster().prepareUpdateSettings()
                .setTransientSettings(ImmutableSettings.builder().put(ZenDiscovery.SETTING_REJOIN_ON_MASTER_GONE, false))
                .get();

        assertThat(discovery.isRejoinOnMasterGone(), is(false));
    }

    /**
     * Starts two master-only and two data-only nodes, stops the elected master and checks that the
     * number of recoveries reported for the index is the same once a different master is in place.
     */
    @Test
    public void testNoShardRelocationsOccurWhenElectedMasterNodeFails() throws Exception {
        // settings shared by every node in this test
        Settings sharedSettings = ImmutableSettings.builder()
                .put(FaultDetection.SETTING_PING_TIMEOUT, "1s")
                .put(FaultDetection.SETTING_PING_RETRIES, "1")
                .put("discovery.type", "zen")
                .build();

        // two nodes that hold no data
        Settings masterOnlySettings = ImmutableSettings.builder()
                .put("node.data", false)
                .put(sharedSettings)
                .build();
        internalCluster().startNodesAsync(2, masterOnlySettings).get();

        // two nodes that are not master eligible
        Settings dataOnlySettings = ImmutableSettings.builder()
                .put("node.master", false)
                .put(sharedSettings)
                .build();
        internalCluster().startNodesAsync(2, dataOnlySettings).get();

        ClusterHealthResponse health = client().admin().cluster().prepareHealth()
                .setWaitForEvents(Priority.LANGUID)
                .setWaitForNodes("4")
                .setWaitForRelocatingShards(0)
                .get();
        assertThat(health.isTimedOut(), is(false));

        createIndex("test");
        ensureSearchable("test");
        int recoveriesBefore = recoveryCount("test");

        final String oldMaster = internalCluster().getMasterName();
        internalCluster().stopCurrentMasterNode();
        // wait until some other node reports itself as master
        assertBusy(new Runnable() {
            @Override
            public void run() {
                String newMaster = internalCluster().getMasterName();
                assertThat(newMaster, notNullValue());
                assertThat(newMaster, not(equalTo(oldMaster)));
            }
        });
        ensureSearchable("test");

        int recoveriesAfter = recoveryCount("test");
        assertThat(recoveriesAfter, equalTo(recoveriesBefore));
    }

    /** Returns the number of shard recovery entries currently reported for the given index. */
    private int recoveryCount(String index) {
        RecoveryResponse recoveries = client().admin().indices().prepareRecoveries(index).get();
        return recoveries.shardResponses().get(index).size();
    }
}

View File

@ -30,6 +30,7 @@ import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.discovery.zen.ping.ZenPing;
import org.elasticsearch.node.service.NodeService;
import org.elasticsearch.test.ElasticsearchTestCase;
@ -55,6 +56,7 @@ public class UnicastZenPingTests extends ElasticsearchTestCase {
ThreadPool threadPool = new ThreadPool(getClass().getName());
ClusterName clusterName = new ClusterName("test");
NetworkService networkService = new NetworkService(settings);
ElectMasterService electMasterService = new ElectMasterService(settings);
NettyTransport transportA = new NettyTransport(settings, threadPool, networkService, BigArrays.NON_RECYCLING_INSTANCE, Version.CURRENT);
final TransportService transportServiceA = new TransportService(transportA, threadPool).start();
@ -73,7 +75,7 @@ public class UnicastZenPingTests extends ElasticsearchTestCase {
addressB.address().getAddress().getHostAddress() + ":" + addressB.address().getPort())
.build();
UnicastZenPing zenPingA = new UnicastZenPing(hostsSettings, threadPool, transportServiceA, clusterName, Version.CURRENT, null);
UnicastZenPing zenPingA = new UnicastZenPing(hostsSettings, threadPool, transportServiceA, clusterName, Version.CURRENT, electMasterService, null);
zenPingA.setNodesProvider(new DiscoveryNodesProvider() {
@Override
public DiscoveryNodes nodes() {
@ -87,7 +89,7 @@ public class UnicastZenPingTests extends ElasticsearchTestCase {
});
zenPingA.start();
UnicastZenPing zenPingB = new UnicastZenPing(hostsSettings, threadPool, transportServiceB, clusterName, Version.CURRENT, null);
UnicastZenPing zenPingB = new UnicastZenPing(hostsSettings, threadPool, transportServiceB, clusterName, Version.CURRENT, electMasterService, null);
zenPingB.setNodesProvider(new DiscoveryNodesProvider() {
@Override
public DiscoveryNodes nodes() {

View File

@ -33,6 +33,7 @@ import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.DiscoverySettings;
import org.elasticsearch.discovery.zen.fd.FaultDetection;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.elasticsearch.test.junit.annotations.TestLogging;
import org.elasticsearch.test.transport.MockTransportService;
@ -54,8 +55,8 @@ public class TransportIndexFailuresTest extends ElasticsearchIntegrationTest {
private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
.put("discovery.type", "zen") // <-- To override the local setting if set externally
.put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
.put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
.put(FaultDetection.SETTING_PING_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
.put(FaultDetection.SETTING_PING_RETRIES, "1") // <-- for hitting simulated network failures quickly
.put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
.put("discovery.zen.minimum_master_nodes", 1)
.put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())

View File

@ -30,7 +30,7 @@ import org.elasticsearch.test.junit.annotations.TestLogging;
import org.junit.Test;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.*;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
/**
@ -54,7 +54,7 @@ public class FullRollingRestartTests extends ElasticsearchIntegrationTest {
@Test
@Slow
@TestLogging("indices.cluster:TRACE,cluster.service:TRACE")
@TestLogging("indices.cluster:TRACE,cluster.service:TRACE,action.search:TRACE,indices.recovery:TRACE")
public void testFullRollingRestart() throws Exception {
internalCluster().startNode();
createIndex("test");

View File

@ -43,7 +43,6 @@ import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.*;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoTimeout;
import static org.hamcrest.Matchers.equalTo;
public class RecoveryWhileUnderLoadTests extends ElasticsearchIntegrationTest {

View File

@ -217,7 +217,7 @@ public class BackgroundIndexer implements AutoCloseable {
setBudget(numOfDocs);
}
/** Stop all background threads **/
/** Stop all background threads * */
public void stop() throws InterruptedException {
if (stop.get()) {
return;

View File

@ -97,6 +97,7 @@ import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.search.SearchService;
import org.elasticsearch.test.client.RandomizingClient;
import org.elasticsearch.test.disruption.ServiceDisruptionScheme;
import org.hamcrest.Matchers;
import org.junit.*;
@ -581,6 +582,7 @@ public abstract class ElasticsearchIntegrationTest extends ElasticsearchTestCase
boolean success = false;
try {
logger.info("[{}#{}]: cleaning up after test", getTestClass().getSimpleName(), getTestName());
clearDisruptionScheme();
final Scope currentClusterScope = getCurrentClusterScope();
try {
if (currentClusterScope != Scope.TEST) {
@ -644,6 +646,13 @@ public abstract class ElasticsearchIntegrationTest extends ElasticsearchTestCase
}
/**
 * Returns a client without targeting a particular node; delegates to
 * {@link #client(String)} with a {@code null} node name.
 */
public static Client client() {
return client(null);
}
public static Client client(@Nullable String node) {
if (node != null) {
return internalCluster().client(node);
}
Client client = cluster().client();
if (frequently()) {
client = new RandomizingClient(client, getRandom());
@ -689,6 +698,15 @@ public abstract class ElasticsearchIntegrationTest extends ElasticsearchTestCase
return between(minimumNumberOfReplicas(), maximumNumberOfReplicas());
}
/**
 * Installs the given disruption scheme on the internal test cluster.
 *
 * @param scheme the scheme handed to {@code internalCluster().setDisruptionScheme}
 */
public void setDisruptionScheme(ServiceDisruptionScheme scheme) {
internalCluster().setDisruptionScheme(scheme);
}
/**
 * Asks the internal test cluster to clear its disruption scheme.
 */
public void clearDisruptionScheme() {
internalCluster().clearDisruptionScheme();
}
/**
* Returns a settings object used in {@link #createIndex(String...)} and {@link #prepareCreate(String)} and friends.
* This method can be overwritten by subclasses to set defaults for the indices that are created by the test.
@ -889,7 +907,7 @@ public abstract class ElasticsearchIntegrationTest extends ElasticsearchTestCase
* It is useful to ensure that all action on the cluster have finished and all shards that were currently relocating
* are now allocated and started.
*/
public ClusterHealthStatus ensureGreen(String... indices) {
public ClusterHealthStatus ensureGreen(String... indices) {
ClusterHealthResponse actionGet = client().admin().cluster()
.health(Requests.clusterHealthRequest(indices).waitForGreenStatus().waitForEvents(Priority.LANGUID).waitForRelocatingShards(0)).actionGet();
if (actionGet.isTimedOut()) {

View File

@ -76,6 +76,7 @@ import org.elasticsearch.plugins.PluginsService;
import org.elasticsearch.search.SearchService;
import org.elasticsearch.test.cache.recycler.MockBigArraysModule;
import org.elasticsearch.test.cache.recycler.MockPageCacheRecyclerModule;
import org.elasticsearch.test.disruption.ServiceDisruptionScheme;
import org.elasticsearch.test.engine.MockEngineModule;
import org.elasticsearch.test.store.MockFSIndexStoreModule;
import org.elasticsearch.test.transport.AssertingLocalTransport;
@ -106,6 +107,7 @@ import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import static org.elasticsearch.test.ElasticsearchTestCase.assertBusy;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoTimeout;
import static org.hamcrest.Matchers.equalTo;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat;
/**
@ -150,7 +152,7 @@ public final class InternalTestCluster extends TestCluster {
static final boolean DEFAULT_ENABLE_RANDOM_BENCH_NODES = true;
static final String NODE_MODE = nodeMode();
public static final String NODE_MODE = nodeMode();
/* sorted map to make traverse order reproducible, concurrent since we do checks on it not within a sync block */
private final NavigableMap<String, NodeAndClient> nodes = new TreeMap<>();
@ -187,6 +189,7 @@ public final class InternalTestCluster extends TestCluster {
*/
private final String nodePrefix;
private ServiceDisruptionScheme activeDisruptionScheme;
public InternalTestCluster(long clusterSeed, int minNumDataNodes, int maxNumDataNodes, String clusterName, int numClientNodes, boolean enableRandomBenchNodes,
int jvmOrdinal, String nodePrefix) {
@ -222,7 +225,7 @@ public final class InternalTestCluster extends TestCluster {
this.numSharedClientNodes = numClientNodes;
}
}
assert this.numSharedClientNodes >=0;
assert this.numSharedClientNodes >= 0;
this.enableRandomBenchNodes = enableRandomBenchNodes;
@ -251,7 +254,7 @@ public final class InternalTestCluster extends TestCluster {
if (numOfDataPaths > 0) {
StringBuilder dataPath = new StringBuilder();
for (int i = 0; i < numOfDataPaths; i++) {
dataPath.append(new File("data/d"+i).getAbsolutePath()).append(',');
dataPath.append(new File("data/d" + i).getAbsolutePath()).append(',');
}
builder.put("path.data", dataPath.toString());
}
@ -275,7 +278,7 @@ public final class InternalTestCluster extends TestCluster {
public static String nodeMode() {
Builder builder = ImmutableSettings.builder();
if (Strings.isEmpty(System.getProperty("es.node.mode"))&& Strings.isEmpty(System.getProperty("es.node.local"))) {
if (Strings.isEmpty(System.getProperty("es.node.mode")) && Strings.isEmpty(System.getProperty("es.node.local"))) {
return "local"; // default if nothing is specified
}
if (Strings.hasLength(System.getProperty("es.node.mode"))) {
@ -296,6 +299,10 @@ public final class InternalTestCluster extends TestCluster {
return clusterName;
}
/**
 * Returns the names (the keys of the {@code nodes} map) of the nodes currently registered in
 * this cluster, copied into a new array.
 */
public String[] getNodeNames() {
return nodes.keySet().toArray(Strings.EMPTY_ARRAY);
}
private static boolean isLocalTransportConfigured() {
if ("local".equals(System.getProperty("es.node.mode", "network"))) {
return true;
@ -328,7 +335,7 @@ public final class InternalTestCluster extends TestCluster {
//.put("index.store.type", random.nextInt(10) == 0 ? MockRamIndexStoreModule.class.getName() : MockFSIndexStoreModule.class.getName())
// decrease the routing schedule so new nodes will be added quickly - some random value between 30 and 80 ms
.put("cluster.routing.schedule", (30 + random.nextInt(50)) + "ms")
// default to non gateway
// default to non gateway
.put("gateway.type", "none")
.put(SETTING_CLUSTER_NODE_SEED, seed);
if (ENABLE_MOCK_MODULES && usually(random)) {
@ -352,7 +359,7 @@ public final class InternalTestCluster extends TestCluster {
builder.put(SearchService.KEEPALIVE_INTERVAL_KEY, TimeValue.timeValueSeconds(10 + random.nextInt(5 * 60)));
}
if (random.nextBoolean()) { // sometimes set a
builder.put(SearchService.DEFAUTL_KEEPALIVE_KEY, TimeValue.timeValueSeconds(100 + random.nextInt(5*60)));
builder.put(SearchService.DEFAUTL_KEEPALIVE_KEY, TimeValue.timeValueSeconds(100 + random.nextInt(5 * 60)));
}
if (random.nextBoolean()) {
// change threadpool types to make sure we don't have components that rely on the type of thread pools
@ -493,6 +500,7 @@ public final class InternalTestCluster extends TestCluster {
while (limit.hasNext()) {
NodeAndClient next = limit.next();
nodesToRemove.add(next);
removeDistruptionSchemeFromNode(next);
next.close();
}
for (NodeAndClient toRemove : nodesToRemove) {
@ -667,6 +675,10 @@ public final class InternalTestCluster extends TestCluster {
@Override
public void close() {
if (this.open.compareAndSet(true, false)) {
if (activeDisruptionScheme != null) {
activeDisruptionScheme.testClusterClosed();
activeDisruptionScheme = null;
}
IOUtils.closeWhileHandlingException(nodes.values());
nodes.clear();
executor.shutdownNow();
@ -777,7 +789,6 @@ public final class InternalTestCluster extends TestCluster {
public static final String TRANSPORT_CLIENT_PREFIX = "transport_client_";
static class TransportClientFactory {
private static TransportClientFactory NO_SNIFF_CLIENT_FACTORY = new TransportClientFactory(false, ImmutableSettings.EMPTY);
private static TransportClientFactory SNIFF_CLIENT_FACTORY = new TransportClientFactory(true, ImmutableSettings.EMPTY);
@ -831,10 +842,6 @@ public final class InternalTestCluster extends TestCluster {
}
private synchronized void reset(boolean wipeData) throws IOException {
randomlyResetClients();
if (wipeData) {
wipeDataDirectories();
}
// clear all rules for mock transport services
for (NodeAndClient nodeAndClient : nodes.values()) {
TransportService transportService = nodeAndClient.node.injector().getInstance(TransportService.class);
@ -842,6 +849,10 @@ public final class InternalTestCluster extends TestCluster {
((MockTransportService) transportService).clearAllRules();
}
}
randomlyResetClients();
if (wipeData) {
wipeDataDirectories();
}
if (nextNodeId.get() == sharedNodesSeeds.length && nodes.size() == sharedNodesSeeds.length) {
logger.debug("Cluster hasn't changed - moving out - nodes: [{}] nextNodeId: [{}] numSharedNodes: [{}]", nodes.keySet(), nextNodeId.get(), sharedNodesSeeds.length);
return;
@ -1030,6 +1041,7 @@ public final class InternalTestCluster extends TestCluster {
NodeAndClient nodeAndClient = getRandomNodeAndClient(new DataNodePredicate());
if (nodeAndClient != null) {
logger.info("Closing random node [{}] ", nodeAndClient.name);
removeDistruptionSchemeFromNode(nodeAndClient);
nodes.remove(nodeAndClient.name);
nodeAndClient.close();
}
@ -1049,6 +1061,7 @@ public final class InternalTestCluster extends TestCluster {
});
if (nodeAndClient != null) {
logger.info("Closing filtered random node [{}] ", nodeAndClient.name);
removeDistruptionSchemeFromNode(nodeAndClient);
nodes.remove(nodeAndClient.name);
nodeAndClient.close();
}
@ -1063,6 +1076,7 @@ public final class InternalTestCluster extends TestCluster {
String masterNodeName = getMasterName();
assert nodes.containsKey(masterNodeName);
logger.info("Closing master node [{}] ", masterNodeName);
removeDistruptionSchemeFromNode(nodes.get(masterNodeName));
NodeAndClient remove = nodes.remove(masterNodeName);
remove.close();
}
@ -1074,6 +1088,7 @@ public final class InternalTestCluster extends TestCluster {
NodeAndClient nodeAndClient = getRandomNodeAndClient(Predicates.not(new MasterNodePredicate(getMasterName())));
if (nodeAndClient != null) {
logger.info("Closing random non master node [{}] current master [{}] ", nodeAndClient.name, getMasterName());
removeDistruptionSchemeFromNode(nodeAndClient);
nodes.remove(nodeAndClient.name);
nodeAndClient.close();
}
@ -1127,6 +1142,9 @@ public final class InternalTestCluster extends TestCluster {
if (!callback.doRestart(nodeAndClient.name)) {
logger.info("Closing node [{}] during restart", nodeAndClient.name);
toRemove.add(nodeAndClient);
if (activeDisruptionScheme != null) {
activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
}
nodeAndClient.close();
}
}
@ -1141,18 +1159,33 @@ public final class InternalTestCluster extends TestCluster {
for (NodeAndClient nodeAndClient : nodes.values()) {
callback.doAfterNodes(numNodesRestarted++, nodeAndClient.nodeClient());
logger.info("Restarting node [{}] ", nodeAndClient.name);
if (activeDisruptionScheme != null) {
activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
}
nodeAndClient.restart(callback);
if (activeDisruptionScheme != null) {
activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
}
}
} else {
int numNodesRestarted = 0;
for (NodeAndClient nodeAndClient : nodes.values()) {
callback.doAfterNodes(numNodesRestarted++, nodeAndClient.nodeClient());
logger.info("Stopping node [{}] ", nodeAndClient.name);
if (activeDisruptionScheme != null) {
activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
}
nodeAndClient.node.close();
}
for (NodeAndClient nodeAndClient : nodes.values()) {
logger.info("Starting node [{}] ", nodeAndClient.name);
if (activeDisruptionScheme != null) {
activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
}
nodeAndClient.restart(callback);
if (activeDisruptionScheme != null) {
activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
}
}
}
}
@ -1193,7 +1226,10 @@ public final class InternalTestCluster extends TestCluster {
}
private String getMasterName() {
/**
* get the name of the current master node
*/
public String getMasterName() {
try {
ClusterState state = client().admin().cluster().prepareState().execute().actionGet().getState();
return state.nodes().masterNode().name();
@ -1350,6 +1386,7 @@ public final class InternalTestCluster extends TestCluster {
dataDirToClean.addAll(Arrays.asList(nodeEnv.nodeDataLocations()));
}
nodes.put(nodeAndClient.name, nodeAndClient);
applyDisruptionSchemeToNode(nodeAndClient);
}
public void closeNonSharedNodes(boolean wipeData) throws IOException {
@ -1371,6 +1408,48 @@ public final class InternalTestCluster extends TestCluster {
return hasFilterCache;
}
/**
 * Makes the given scheme the active disruption scheme of this cluster.
 *
 * @param scheme the scheme to apply to the cluster and remember as active
 */
public void setDisruptionScheme(ServiceDisruptionScheme scheme) {
// a previously active scheme is removed before the new one is applied
clearDisruptionScheme();
scheme.applyToCluster(this);
activeDisruptionScheme = scheme;
}
/**
 * Removes the active disruption scheme, if any, from the cluster, waits for the scheme's
 * expected healing time and then asserts that a cluster health request waiting for all
 * registered nodes and zero relocating shards does not time out.
 */
public void clearDisruptionScheme() {
    if (activeDisruptionScheme == null) {
        return;
    }

    final TimeValue healingTime = activeDisruptionScheme.expectedTimeToHeal();
    logger.info("Clearing active scheme {}, expected healing time {}", activeDisruptionScheme, healingTime);
    activeDisruptionScheme.removeFromCluster(this);

    // We don't know which scheme was picked. Some schemes don't partition the cluster but only
    // slow processing down, so cluster health alone can't tell whether they have been cleared
    // and we need to sleep for the expected healing time.
    final boolean mustWait = healingTime != null && healingTime.millis() > 0;
    if (mustWait) {
        try {
            Thread.sleep(healingTime.millis());
        } catch (InterruptedException e) {
            // keep the interrupt visible to callers and carry on with the health check
            Thread.currentThread().interrupt();
        }
    }

    final boolean healthTimedOut = client().admin().cluster().prepareHealth()
            .setWaitForNodes("" + nodes.size())
            .setWaitForRelocatingShards(0)
            .get().isTimedOut();
    assertFalse("cluster failed to form after disruption was healed", healthTimedOut);

    activeDisruptionScheme = null;
}
/**
 * Applies the active disruption scheme to the given node; does nothing when no scheme is active.
 *
 * @param nodeAndClient the node, which is expected to be registered in {@code nodes} already
 */
private void applyDisruptionSchemeToNode(NodeAndClient nodeAndClient) {
    if (activeDisruptionScheme == null) {
        return;
    }
    final String nodeName = nodeAndClient.name;
    assert nodes.containsKey(nodeName);
    activeDisruptionScheme.applyToNode(nodeName, this);
}
/**
 * Removes the active disruption scheme from the given node; does nothing when no scheme is active.
 *
 * @param nodeAndClient the node, which is expected to still be registered in {@code nodes}
 */
private void removeDistruptionSchemeFromNode(NodeAndClient nodeAndClient) {
    if (activeDisruptionScheme == null) {
        return;
    }
    final String nodeName = nodeAndClient.name;
    assert nodes.containsKey(nodeName);
    activeDisruptionScheme.removeFromNode(nodeName, this);
}
/**
 * Returns the registered nodes accepted by {@code DataNodePredicate}.
 * <p>
 * NOTE(review): Guava documents {@code Collections2.filter} as returning a live view of the
 * backing collection, so the filtering happens when the result is iterated, after this
 * method's monitor has been released - confirm that callers iterate it safely.
 */
private synchronized Collection<NodeAndClient> dataNodeAndClients() {
return Collections2.filter(nodes.values(), new DataNodePredicate());
}

View File

@ -20,7 +20,7 @@ package org.elasticsearch.test;
import org.elasticsearch.common.settings.Settings;
abstract class SettingsSource {
public abstract class SettingsSource {
public static final SettingsSource EMPTY = new SettingsSource() {
@Override
@ -35,7 +35,7 @@ abstract class SettingsSource {
};
/**
* @return the settings for the node represented by the given ordinal, or {@code null} if there are no settings defined
* @return the settings for the node represented by the given ordinal, or {@code null} if there are no settings defined
*/
public abstract Settings node(int nodeOrdinal);

View File

@ -0,0 +1,177 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.disruption;
import org.elasticsearch.common.unit.TimeValue;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
/**
 * Simulates long garbage collection pauses on a single node: a background worker repeatedly suspends
 * every live thread whose name contains {@code [nodeName]} for a random duration, resumes them, and then
 * waits a random interval before the next pause.
 * <p>
 * All durations and intervals are in milliseconds. When a minimum equals (or exceeds) its maximum, the
 * minimum is used as a fixed value.
 */
public class LongGCDisruption extends SingleNodeDisruption {

    /** {@code true} while the background worker should keep producing pauses */
    volatile boolean disrupting;
    /** the background thread driving the pauses, or {@code null} when not running */
    volatile Thread worker;

    final long intervalBetweenDelaysMin;
    final long intervalBetweenDelaysMax;
    final long delayDurationMin;
    final long delayDurationMax;

    /** Disrupts a node that is picked later, with the default timings. */
    public LongGCDisruption(Random random) {
        this(null, random);
    }

    /** Disrupts the given node with the default timings (100-200ms between pauses, 300-20000ms per pause). */
    public LongGCDisruption(String disruptedNode, Random random) {
        this(disruptedNode, random, 100, 200, 300, 20000);
    }

    /** Disrupts the given node with explicit timings (all in milliseconds). */
    public LongGCDisruption(String disruptedNode, Random random, long intervalBetweenDelaysMin,
                            long intervalBetweenDelaysMax, long delayDurationMin, long delayDurationMax) {
        this(random, intervalBetweenDelaysMin, intervalBetweenDelaysMax, delayDurationMin, delayDurationMax);
        this.disruptedNode = disruptedNode;
    }

    /** Disrupts a node that is picked later, with explicit timings (all in milliseconds). */
    public LongGCDisruption(Random random,
                            long intervalBetweenDelaysMin, long intervalBetweenDelaysMax, long delayDurationMin,
                            long delayDurationMax) {
        super(random);
        this.intervalBetweenDelaysMin = intervalBetweenDelaysMin;
        this.intervalBetweenDelaysMax = intervalBetweenDelaysMax;
        this.delayDurationMin = delayDurationMin;
        this.delayDurationMax = delayDurationMax;
    }

    /** source of unique suffixes for the worker thread names */
    final static AtomicInteger thread_ids = new AtomicInteger();

    /** Starts a daemon thread that keeps simulating GC pauses until {@link #stopDisrupting()} is called. */
    @Override
    public void startDisrupting() {
        disrupting = true;
        worker = new Thread(new BackgroundWorker(), "long_gc_simulation_" + thread_ids.incrementAndGet());
        worker.setDaemon(true);
        worker.start();
    }

    /**
     * Signals the background worker to stop, interrupts it and waits (bounded) for it to finish.
     * Does nothing if no worker is running.
     */
    @Override
    public void stopDisrupting() {
        // read the volatile field once so the null check and the calls below see the same thread
        final Thread currentWorker = worker;
        if (currentWorker == null) {
            return;
        }
        logger.info("stopping long GCs on [{}]", disruptedNode);
        disrupting = false;
        currentWorker.interrupt();
        try {
            currentWorker.join(2 * (intervalBetweenDelaysMax + delayDurationMax));
        } catch (InterruptedException e) {
            logger.info("background thread failed to stop");
            // keep the interrupt visible to the caller instead of silently dropping it
            Thread.currentThread().interrupt();
        }
        worker = null;
    }

    final static Pattern[] unsafeClasses = new Pattern[]{
            // logging has shared JVM locks - we may suspend a thread and block other nodes from doing their thing
            Pattern.compile("Logger")
    };

    /**
     * Suspends all live threads of the given node that are not yet in {@code nodeThreads}.
     *
     * @param node        name of the node; its threads are recognized by {@code [node]} in the thread name
     * @param nodeThreads the threads suspended so far; newly suspended threads are added to it
     * @return {@code true} if at least one new thread was encountered in this pass (even if it had to be
     *         released again because it was executing unsafe code), meaning another pass is needed
     */
    private boolean stopNodeThreads(String node, Set<Thread> nodeThreads) {
        Set<Thread> allThreadsSet = Thread.getAllStackTraces().keySet();
        boolean stopped = false;
        final String nodeThreadNamePart = "[" + node + "]";
        for (Thread thread : allThreadsSet) {
            String name = thread.getName();
            if (name.contains(nodeThreadNamePart)) {
                if (thread.isAlive() && nodeThreads.add(thread)) {
                    stopped = true;
                    thread.suspend();
                    // double check the thread is not in a shared resource like logging. If so, let it go and come back..
                    if (isInUnsafeCode(thread)) {
                        thread.resume();
                        nodeThreads.remove(thread);
                    }
                }
            }
        }
        return stopped;
    }

    /** @return {@code true} if any frame of the thread's current stack belongs to a class matching {@link #unsafeClasses} */
    private static boolean isInUnsafeCode(Thread thread) {
        for (StackTraceElement stackElement : thread.getStackTrace()) {
            String className = stackElement.getClassName();
            for (Pattern unsafePattern : unsafeClasses) {
                if (unsafePattern.matcher(className).find()) {
                    return true;
                }
            }
        }
        return false;
    }

    /** Resumes every thread in the given set. */
    private void resumeThreads(Set<Thread> threads) {
        for (Thread thread : threads) {
            thread.resume();
        }
    }

    /**
     * Suspends the threads of the disrupted node for the given duration and resumes them afterwards.
     * The threads are always resumed, also when this thread is interrupted while waiting.
     */
    private void simulateLongGC(final TimeValue duration) throws InterruptedException {
        final String disruptionNodeCopy = disruptedNode;
        if (disruptionNodeCopy == null) {
            return;
        }
        logger.info("node [{}] goes into GC for for [{}]", disruptionNodeCopy, duration);
        final Set<Thread> nodeThreads = new HashSet<>();
        try {
            // keep making passes until a pass finds no new thread to suspend
            while (stopNodeThreads(disruptionNodeCopy, nodeThreads)) ;
            if (!nodeThreads.isEmpty()) {
                Thread.sleep(duration.millis());
            }
        } finally {
            logger.info("node [{}] resumes from GC", disruptionNodeCopy);
            resumeThreads(nodeThreads);
        }
    }

    /** No extra healing time is expected once the threads have been resumed. */
    @Override
    public TimeValue expectedTimeToHeal() {
        return TimeValue.timeValueMillis(0);
    }

    /**
     * Picks a random duration in {@code [min, max)} milliseconds. Falls back to {@code min} when the range is
     * empty, because {@link Random#nextInt(int)} rejects non-positive bounds, and clamps the range to
     * {@link Integer#MAX_VALUE} so the narrowing cast cannot overflow.
     */
    private TimeValue randomDuration(long min, long max) {
        final long range = max - min;
        if (range <= 0) {
            return new TimeValue(min);
        }
        return new TimeValue(min + random.nextInt((int) Math.min(range, Integer.MAX_VALUE)));
    }

    /** Alternates between a simulated GC pause and a quiet interval until the disruption is stopped. */
    class BackgroundWorker implements Runnable {

        @Override
        public void run() {
            while (disrupting && disruptedNode != null) {
                try {
                    simulateLongGC(randomDuration(delayDurationMin, delayDurationMax));
                    TimeValue interval = randomDuration(intervalBetweenDelaysMin, intervalBetweenDelaysMax);
                    if (disrupting && disruptedNode != null) {
                        Thread.sleep(interval.millis());
                    }
                } catch (InterruptedException e) {
                    // expected when stopDisrupting() interrupts us; the loop condition decides whether to exit
                } catch (Exception e) {
                    logger.error("error in background worker", e);
                }
            }
        }
    }
}

View File

@ -0,0 +1,92 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.disruption;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.test.transport.MockTransportService;
import java.util.Random;
import java.util.Set;
/**
 * A network partition that does not drop traffic between the two sides but delays it. The delay is picked
 * randomly from {@code [delayMin, delayMax)} milliseconds each time the disruption is started; when both
 * bounds are equal, that fixed value is used.
 */
public class NetworkDelaysPartition extends NetworkPartition {

    static long DEFAULT_DELAY_MIN = 10000;
    static long DEFAULT_DELAY_MAX = 90000;

    final long delayMin;
    final long delayMax;

    /** the delay chosen by the last call to {@link #startDisrupting()}; {@code null} before the first start */
    TimeValue duration;

    public NetworkDelaysPartition(Random random) {
        this(random, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX);
    }

    public NetworkDelaysPartition(Random random, long delayMin, long delayMax) {
        super(random);
        this.delayMin = delayMin;
        this.delayMax = delayMax;
    }

    public NetworkDelaysPartition(String node1, String node2, Random random) {
        this(node1, node2, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX, random);
    }

    public NetworkDelaysPartition(String node1, String node2, long delayMin, long delayMax, Random random) {
        super(node1, node2, random);
        this.delayMin = delayMin;
        this.delayMax = delayMax;
    }

    public NetworkDelaysPartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, Random random) {
        this(nodesSideOne, nodesSideTwo, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX, random);
    }

    public NetworkDelaysPartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, long delayMin, long delayMax, Random random) {
        super(nodesSideOne, nodesSideTwo, random);
        this.delayMin = delayMin;
        this.delayMax = delayMax;
    }

    /** Picks the delay for this round of disruption and then applies it to all node pairs. */
    @Override
    public synchronized void startDisrupting() {
        final long range = delayMax - delayMin;
        // Random.nextInt rejects non-positive bounds, so an empty range means "always delayMin";
        // the clamp keeps the narrowing cast from overflowing
        final long offset = range > 0 ? random.nextInt((int) Math.min(range, Integer.MAX_VALUE)) : 0;
        duration = new TimeValue(delayMin + offset);
        super.startDisrupting();
    }

    /**
     * Delays traffic in both directions between the two nodes. Each transport service gets a rule for the
     * node on the other side, which are exactly the rules the inherited {@code removeDisruption} clears.
     */
    @Override
    void applyDisruption(DiscoveryNode node1, MockTransportService transportService1,
                         DiscoveryNode node2, MockTransportService transportService2) {
        transportService1.addUnresponsiveRule(node2, duration);
        transportService2.addUnresponsiveRule(node1, duration);
    }

    @Override
    protected String getPartitionDescription() {
        return "network delays for [" + duration + "]";
    }

    /** The longest possible delay, after which all delayed traffic is expected to have gone through. */
    @Override
    public TimeValue expectedTimeToHeal() {
        return TimeValue.timeValueMillis(delayMax);
    }
}

View File

@ -0,0 +1,59 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.disruption;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.test.transport.MockTransportService;
import java.util.Random;
import java.util.Set;
/**
 * A network partition in which sending to a node on the other side fails with a simulated
 * "not connected" error, in both directions.
 */
public class NetworkDisconnectPartition extends NetworkPartition {

    /** Creates a partition without predefined sides. */
    public NetworkDisconnectPartition(final Random random) {
        super(random);
    }

    /** Creates a partition between exactly two nodes. */
    public NetworkDisconnectPartition(final String node1, final String node2, final Random random) {
        super(node1, node2, random);
    }

    /** Creates a partition between two explicit groups of nodes. */
    public NetworkDisconnectPartition(final Set<String> nodesSideOne, final Set<String> nodesSideTwo, final Random random) {
        super(nodesSideOne, nodesSideTwo, random);
    }

    /** Installs a fail-to-send rule on each node's transport service, targeting the opposite node. */
    @Override
    void applyDisruption(final DiscoveryNode node1, final MockTransportService transportService1,
                         final DiscoveryNode node2, final MockTransportService transportService2) {
        transportService1.addFailToSendNoConnectRule(node2);
        transportService2.addFailToSendNoConnectRule(node1);
    }

    @Override
    protected String getPartitionDescription() {
        return "disconnected";
    }

    /** No extra healing time is expected once the rules are removed. */
    @Override
    public TimeValue expectedTimeToHeal() {
        return TimeValue.timeValueSeconds(0);
    }
}

View File

@ -0,0 +1,202 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.disruption;
import com.google.common.collect.ImmutableList;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.test.InternalTestCluster;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.TransportService;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
/**
 * Base class for disruption schemes that split the nodes of a test cluster into two sides and disturb
 * the transport layer between every pair of nodes from opposite sides. Subclasses define the actual
 * disturbance in {@link #applyDisruption}.
 * <p>
 * The sides are either given explicitly, or (when created with only a {@link Random}) filled in
 * automatically as nodes are applied to the scheme.
 */
public abstract class NetworkPartition implements ServiceDisruptionScheme {

    protected final ESLogger logger = Loggers.getLogger(getClass());

    final Set<String> nodesSideOne;
    final Set<String> nodesSideTwo;
    /** when {@code true}, nodes applied to this scheme are assigned to one of the two sides */
    volatile boolean autoExpand;
    protected final Random random;
    protected volatile InternalTestCluster cluster;
    /** {@code true} between a successful {@link #startDisrupting()} and the matching {@link #stopDisrupting()} */
    protected volatile boolean activeDisruption = false;

    /** Creates a partition with empty sides that are populated as nodes are applied. */
    public NetworkPartition(Random random) {
        // derive a private generator so this scheme does not consume further values from the caller's one
        this.random = new Random(random.nextLong());
        nodesSideOne = new HashSet<>();
        nodesSideTwo = new HashSet<>();
        autoExpand = true;
    }

    /** Creates a fixed partition between two single nodes. */
    public NetworkPartition(String node1, String node2, Random random) {
        this(random);
        nodesSideOne.add(node1);
        nodesSideTwo.add(node2);
        autoExpand = false;
    }

    /** Creates a fixed partition between two groups of nodes. */
    public NetworkPartition(Set<String> nodesSideOne, Set<String> nodesSideTwo, Random random) {
        this(random);
        this.nodesSideOne.addAll(nodesSideOne);
        this.nodesSideTwo.addAll(nodesSideTwo);
        autoExpand = false;
    }

    /** @return an immutable snapshot of the node names on side one */
    public List<String> getNodesSideOne() {
        return ImmutableList.copyOf(nodesSideOne);
    }

    /** @return an immutable snapshot of the node names on side two */
    public List<String> getNodesSideTwo() {
        return ImmutableList.copyOf(nodesSideTwo);
    }

    /** @return the larger side; side one when both sides have the same size */
    public List<String> getMajoritySide() {
        if (nodesSideOne.size() >= nodesSideTwo.size()) {
            return getNodesSideOne();
        } else {
            return getNodesSideTwo();
        }
    }

    /** @return the smaller side; side two when both sides have the same size */
    public List<String> getMinoritySide() {
        if (nodesSideOne.size() >= nodesSideTwo.size()) {
            return getNodesSideTwo();
        } else {
            return getNodesSideOne();
        }
    }

    /** Remembers the cluster and, when auto-expanding, distributes all its current nodes over the two sides. */
    @Override
    public void applyToCluster(InternalTestCluster cluster) {
        this.cluster = cluster;
        if (autoExpand) {
            for (String node : cluster.getNodeNames()) {
                applyToNode(node, cluster);
            }
        }
    }

    @Override
    public void removeFromCluster(InternalTestCluster cluster) {
        stopDisrupting();
    }

    /**
     * Assigns the node to a side when auto-expanding and the node is not assigned yet. Empty sides are
     * filled first (side one, then side two); afterwards the side is chosen randomly.
     */
    @Override
    public synchronized void applyToNode(String node, InternalTestCluster cluster) {
        if (!autoExpand || nodesSideOne.contains(node) || nodesSideTwo.contains(node)) {
            return;
        }
        if (nodesSideOne.isEmpty()) {
            nodesSideOne.add(node);
        } else if (nodesSideTwo.isEmpty()) {
            nodesSideTwo.add(node);
        } else if (random.nextBoolean()) {
            nodesSideOne.add(node);
        } else {
            nodesSideTwo.add(node);
        }
    }

    /**
     * Removes the disruption between the given node and every node on the opposite side. Nodes that are
     * not part of the partition are ignored without resolving any of their services.
     */
    @Override
    public synchronized void removeFromNode(String node, InternalTestCluster cluster) {
        final Set<String> otherSideNodes;
        if (nodesSideOne.contains(node)) {
            otherSideNodes = nodesSideTwo;
        } else if (nodesSideTwo.contains(node)) {
            otherSideNodes = nodesSideOne;
        } else {
            // not part of this partition: return before touching the node's services
            return;
        }
        // NOTE(review): the node stays registered on its side, so a later stopDisrupting() will look it up
        // again - confirm this is intended for nodes that leave the cluster for good
        MockTransportService transportService = (MockTransportService) cluster.getInstance(TransportService.class, node);
        DiscoveryNode discoveryNode = discoveryNode(node);
        for (String node2 : otherSideNodes) {
            MockTransportService transportService2 = (MockTransportService) cluster.getInstance(TransportService.class, node2);
            DiscoveryNode discoveryNode2 = discoveryNode(node2);
            removeDisruption(discoveryNode, transportService, discoveryNode2, transportService2);
        }
    }

    @Override
    public synchronized void testClusterClosed() {

    }

    /** @return a short description of the kind of partition, used in log messages */
    protected abstract String getPartitionDescription();

    /** Resolves the {@link DiscoveryNode} of the named node through the cluster given to {@link #applyToCluster}. */
    protected DiscoveryNode discoveryNode(String node) {
        return cluster.getInstance(Discovery.class, node).localNode();
    }

    /** Applies the disruption to every pair of nodes from opposite sides. Does nothing if either side is empty. */
    @Override
    public synchronized void startDisrupting() {
        if (nodesSideOne.size() == 0 || nodesSideTwo.size() == 0) {
            return;
        }
        logger.info("nodes {} will be partitioned from {}. partition type [{}]", nodesSideOne, nodesSideTwo, getPartitionDescription());
        activeDisruption = true;
        for (String node1 : nodesSideOne) {
            MockTransportService transportService1 = (MockTransportService) cluster.getInstance(TransportService.class, node1);
            DiscoveryNode discoveryNode1 = discoveryNode(node1);
            for (String node2 : nodesSideTwo) {
                DiscoveryNode discoveryNode2 = discoveryNode(node2);
                MockTransportService transportService2 = (MockTransportService) cluster.getInstance(TransportService.class, node2);
                applyDisruption(discoveryNode1, transportService1, discoveryNode2, transportService2);
            }
        }
    }

    /** Removes the disruption from every pair of nodes. Does nothing if no disruption is active or a side is empty. */
    @Override
    public synchronized void stopDisrupting() {
        if (nodesSideOne.size() == 0 || nodesSideTwo.size() == 0 || !activeDisruption) {
            return;
        }
        logger.info("restoring partition between nodes {} & nodes {}", nodesSideOne, nodesSideTwo);
        for (String node1 : nodesSideOne) {
            MockTransportService transportService1 = (MockTransportService) cluster.getInstance(TransportService.class, node1);
            DiscoveryNode discoveryNode1 = discoveryNode(node1);
            for (String node2 : nodesSideTwo) {
                DiscoveryNode discoveryNode2 = discoveryNode(node2);
                MockTransportService transportService2 = (MockTransportService) cluster.getInstance(TransportService.class, node2);
                removeDisruption(discoveryNode1, transportService1, discoveryNode2, transportService2);
            }
        }
        activeDisruption = false;
    }

    /** Installs the subclass-specific disturbance between one node of side one and one node of side two. */
    abstract void applyDisruption(DiscoveryNode node1, MockTransportService transportService1,
                                  DiscoveryNode node2, MockTransportService transportService2);

    /** Clears, on each of the two transport services, the rule registered for the opposite node. */
    protected void removeDisruption(DiscoveryNode node1, MockTransportService transportService1,
                                    DiscoveryNode node2, MockTransportService transportService2) {
        transportService1.clearRule(node2);
        transportService2.clearRule(node1);
    }
}

View File

@ -0,0 +1,58 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.disruption;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.test.transport.MockTransportService;
import java.util.Random;
import java.util.Set;
/**
 * A network partition in which each side installs an "unresponsive" rule for the nodes on the
 * other side, in both directions.
 */
public class NetworkUnresponsivePartition extends NetworkPartition {

    /** Creates a partition without predefined sides. */
    public NetworkUnresponsivePartition(final Random random) {
        super(random);
    }

    /** Creates a partition between exactly two nodes. */
    public NetworkUnresponsivePartition(final String node1, final String node2, final Random random) {
        super(node1, node2, random);
    }

    /** Creates a partition between two explicit groups of nodes. */
    public NetworkUnresponsivePartition(final Set<String> nodesSideOne, final Set<String> nodesSideTwo, final Random random) {
        super(nodesSideOne, nodesSideTwo, random);
    }

    /** Installs an unresponsive rule on each node's transport service, targeting the opposite node. */
    @Override
    void applyDisruption(final DiscoveryNode node1, final MockTransportService transportService1,
                         final DiscoveryNode node2, final MockTransportService transportService2) {
        transportService1.addUnresponsiveRule(node2);
        transportService2.addUnresponsiveRule(node1);
    }

    @Override
    protected String getPartitionDescription() {
        return "unresponsive";
    }

    /** No extra healing time is expected once the rules are removed. */
    @Override
    public TimeValue expectedTimeToHeal() {
        return TimeValue.timeValueSeconds(0);
    }
}

View File

@ -0,0 +1,66 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.disruption;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.test.InternalTestCluster;
/**
 * A {@link ServiceDisruptionScheme} that never disrupts anything: every callback is empty and the
 * expected healing time is zero.
 */
public class NoOpDisruptionScheme implements ServiceDisruptionScheme {

    // ---- cluster level callbacks ----

    @Override
    public void applyToCluster(final InternalTestCluster cluster) {
        // intentionally empty
    }

    @Override
    public void removeFromCluster(final InternalTestCluster cluster) {
        // intentionally empty
    }

    @Override
    public void testClusterClosed() {
        // intentionally empty
    }

    // ---- node level callbacks ----

    @Override
    public void applyToNode(final String node, final InternalTestCluster cluster) {
        // intentionally empty
    }

    @Override
    public void removeFromNode(final String node, final InternalTestCluster cluster) {
        // intentionally empty
    }

    // ---- disruption control ----

    @Override
    public void startDisrupting() {
        // intentionally empty
    }

    @Override
    public void stopDisrupting() {
        // intentionally empty
    }

    @Override
    public TimeValue expectedTimeToHeal() {
        return TimeValue.timeValueSeconds(0);
    }
}

View File

@ -0,0 +1,42 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.disruption;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.test.InternalTestCluster;
/**
 * Contract for a disruption that can be installed on an {@link InternalTestCluster} and switched on and
 * off during a test.
 */
public interface ServiceDisruptionScheme {

    /** Attaches this scheme to the given test cluster. */
    void applyToCluster(InternalTestCluster cluster);

    /** Detaches this scheme from the given test cluster. */
    void removeFromCluster(InternalTestCluster cluster);

    /** Makes this scheme aware of the named node of the given cluster. */
    void applyToNode(String node, InternalTestCluster cluster);

    /** Removes the named node of the given cluster from this scheme. */
    void removeFromNode(String node, InternalTestCluster cluster);

    /** Starts the actual disruption. */
    void startDisrupting();

    /** Stops the actual disruption. */
    void stopDisrupting();

    /** Callback for the test cluster being closed (presumably invoked by the cluster itself - confirm against the caller). */
    void testClusterClosed();

    /** @return how long the cluster is expected to need to recover after this scheme has been removed */
    TimeValue expectedTimeToHeal();

}

View File

@ -0,0 +1,83 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.disruption;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.test.InternalTestCluster;
import java.util.Random;
/**
 * Base class for disruption schemes that target exactly one node of the test cluster. The node is either
 * given at construction time or picked randomly when the scheme is applied to a cluster.
 */
public abstract class SingleNodeDisruption implements ServiceDisruptionScheme {

    protected final ESLogger logger = Loggers.getLogger(getClass());

    /** name of the node being disrupted, or {@code null} if none has been chosen (or it was removed) */
    protected volatile String disruptedNode;
    protected volatile InternalTestCluster cluster;
    protected final Random random;


    /** Creates a scheme that disrupts the given node. */
    public SingleNodeDisruption(String disruptedNode, Random random) {
        this(random);
        this.disruptedNode = disruptedNode;
    }

    /** Creates a scheme whose target node is picked when it is applied to a cluster. */
    public SingleNodeDisruption(Random random) {
        // derive a private generator so this scheme does not consume further values from the caller's one
        this.random = new Random(random.nextLong());
    }

    /**
     * Remembers the cluster and, if no target node was chosen yet, picks a random one from the cluster's
     * current nodes. When the cluster has no nodes, no target is chosen and the scheme stays without one.
     */
    @Override
    public void applyToCluster(InternalTestCluster cluster) {
        this.cluster = cluster;
        if (disruptedNode == null) {
            String[] nodes = cluster.getNodeNames();
            // Random.nextInt rejects a bound of 0, so an empty cluster must be skipped explicitly
            if (nodes.length > 0) {
                disruptedNode = nodes[random.nextInt(nodes.length)];
            }
        }
    }

    /** Stops disrupting the target node (if any) and forgets it. */
    @Override
    public void removeFromCluster(InternalTestCluster cluster) {
        // read the volatile field once so the null check and the call use the same value
        final String node = disruptedNode;
        if (node != null) {
            removeFromNode(node, cluster);
        }
    }

    @Override
    public synchronized void applyToNode(String node, InternalTestCluster cluster) {

    }

    /** Stops the disruption and forgets the target node, but only if {@code node} is the target node. */
    @Override
    public synchronized void removeFromNode(String node, InternalTestCluster cluster) {
        if (disruptedNode == null) {
            return;
        }
        if (!node.equals(disruptedNode)) {
            return;
        }
        stopDisrupting();
        disruptedNode = null;
    }

    /** Forgets the target node. */
    @Override
    public synchronized void testClusterClosed() {
        disruptedNode = null;
    }

}

View File

@ -0,0 +1,153 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.disruption;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateNonMasterUpdateTask;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.unit.TimeValue;
import java.util.Random;
import java.util.concurrent.CountDownLatch;
/**
 * Slows down cluster state processing on a single node: a background worker repeatedly submits an
 * immediate-priority cluster state task that just sleeps for a random duration, then waits a random
 * interval before submitting the next one.
 * <p>
 * All durations and intervals are in milliseconds. When a minimum equals (or exceeds) its maximum, the
 * minimum is used as a fixed value.
 */
public class SlowClusterStateProcessing extends SingleNodeDisruption {

    /** {@code true} while the background worker should keep submitting delaying tasks */
    volatile boolean disrupting;
    /** the background thread driving the delays, or {@code null} when not running */
    volatile Thread worker;

    final long intervalBetweenDelaysMin;
    final long intervalBetweenDelaysMax;
    final long delayDurationMin;
    final long delayDurationMax;


    /** Disrupts a node that is picked later, with the default timings. */
    public SlowClusterStateProcessing(Random random) {
        this(null, random);
    }

    /** Disrupts the given node with the default timings (100-200ms between delays, 300-20000ms per delay). */
    public SlowClusterStateProcessing(String disruptedNode, Random random) {
        this(disruptedNode, random, 100, 200, 300, 20000);
    }

    /** Disrupts the given node with explicit timings (all in milliseconds). */
    public SlowClusterStateProcessing(String disruptedNode, Random random, long intervalBetweenDelaysMin,
                                      long intervalBetweenDelaysMax, long delayDurationMin, long delayDurationMax) {
        this(random, intervalBetweenDelaysMin, intervalBetweenDelaysMax, delayDurationMin, delayDurationMax);
        this.disruptedNode = disruptedNode;
    }

    /** Disrupts a node that is picked later, with explicit timings (all in milliseconds). */
    public SlowClusterStateProcessing(Random random,
                                      long intervalBetweenDelaysMin, long intervalBetweenDelaysMax, long delayDurationMin,
                                      long delayDurationMax) {
        super(random);
        this.intervalBetweenDelaysMin = intervalBetweenDelaysMin;
        this.intervalBetweenDelaysMax = intervalBetweenDelaysMax;
        this.delayDurationMin = delayDurationMin;
        this.delayDurationMax = delayDurationMax;
    }


    /** Starts a daemon thread that keeps delaying cluster state processing until {@link #stopDisrupting()} is called. */
    @Override
    public void startDisrupting() {
        disrupting = true;
        worker = new Thread(new BackgroundWorker());
        worker.setDaemon(true);
        worker.start();
    }

    /**
     * Signals the background worker to stop, interrupts it and waits (bounded) for it to finish.
     * Does nothing if no worker is running.
     */
    @Override
    public void stopDisrupting() {
        // read the volatile field once so the null check and the calls below see the same thread
        final Thread currentWorker = worker;
        if (currentWorker == null) {
            return;
        }
        logger.info("stopping to slow down cluster state processing on [{}]", disruptedNode);
        disrupting = false;
        currentWorker.interrupt();
        try {
            currentWorker.join(2 * (intervalBetweenDelaysMax + delayDurationMax));
        } catch (InterruptedException e) {
            logger.info("background thread failed to stop");
            // keep the interrupt visible to the caller instead of silently dropping it
            Thread.currentThread().interrupt();
        }
        worker = null;
    }


    /**
     * Submits a cluster state task to the disrupted node that sleeps for the given duration, and waits
     * until that task has run or failed.
     *
     * @return {@code false} if there is no disrupted node or its cluster service could not be obtained,
     *         {@code true} once the delaying task has finished
     * @throws InterruptedException if this thread is interrupted a second time while waiting for the task
     */
    private boolean interruptClusterStateProcessing(final TimeValue duration) throws InterruptedException {
        final String disruptionNodeCopy = disruptedNode;
        if (disruptionNodeCopy == null) {
            return false;
        }
        logger.info("delaying cluster state updates on node [{}] for [{}]", disruptionNodeCopy, duration);
        final CountDownLatch countDownLatch = new CountDownLatch(1);
        ClusterService clusterService = cluster.getInstance(ClusterService.class, disruptionNodeCopy);
        if (clusterService == null) {
            return false;
        }
        clusterService.submitStateUpdateTask("service_disruption_delay", Priority.IMMEDIATE, new ClusterStateNonMasterUpdateTask() {

            @Override
            public ClusterState execute(ClusterState currentState) throws Exception {
                // the sleep is the whole point: it keeps the task busy for the requested duration
                Thread.sleep(duration.millis());
                countDownLatch.countDown();
                return currentState;
            }

            @Override
            public void onFailure(String source, Throwable t) {
                countDownLatch.countDown();
            }
        });
        try {
            countDownLatch.await();
        } catch (InterruptedException e) {
            // try to wait again, we really want the cluster state thread to be freed up when stopping disruption
            countDownLatch.await();
        }
        return true;
    }

    /** No extra healing time is expected once the delaying stops. */
    @Override
    public TimeValue expectedTimeToHeal() {
        return TimeValue.timeValueMillis(0);
    }

    /**
     * Picks a random duration in {@code [min, max)} milliseconds. Falls back to {@code min} when the range is
     * empty, because {@link Random#nextInt(int)} rejects non-positive bounds, and clamps the range to
     * {@link Integer#MAX_VALUE} so the narrowing cast cannot overflow.
     */
    private TimeValue randomDuration(long min, long max) {
        final long range = max - min;
        if (range <= 0) {
            return new TimeValue(min);
        }
        return new TimeValue(min + random.nextInt((int) Math.min(range, Integer.MAX_VALUE)));
    }

    /** Alternates between a delayed cluster state task and a quiet interval until the disruption is stopped. */
    class BackgroundWorker implements Runnable {

        @Override
        public void run() {
            while (disrupting && disruptedNode != null) {
                try {
                    TimeValue delay = randomDuration(delayDurationMin, delayDurationMax);
                    if (!interruptClusterStateProcessing(delay)) {
                        continue;
                    }
                    TimeValue interval = randomDuration(intervalBetweenDelaysMin, intervalBetweenDelaysMax);
                    if (disrupting && disruptedNode != null) {
                        Thread.sleep(interval.millis());
                    }
                } catch (InterruptedException e) {
                    // expected when stopDisrupting() interrupts us; the loop condition decides whether to exit
                } catch (Exception e) {
                    logger.error("error in background worker", e);
                }
            }
        }
    }

}

View File

@ -24,14 +24,21 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.component.Lifecycle;
import org.elasticsearch.common.component.LifecycleListener;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.BytesStreamInput;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.network.NetworkService;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.BoundTransportAddress;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.*;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
@ -46,6 +53,7 @@ public class MockTransportService extends TransportService {
public MockTransportService(Settings settings, Transport transport, ThreadPool threadPool) {
// the superclass is handed a LookupTestTransport wrapper around the given transport
super(settings, new LookupTestTransport(transport), threadPool);
// keep the unwrapped transport; the rule delegates created by this class are built on top of it
this.original = transport;
}
/**
@ -92,12 +100,19 @@ public class MockTransportService extends TransportService {
});
}
/**
 * Varargs convenience overload: collects the given action names into a set and delegates to
 * {@link #addFailToSendNoConnectRule(DiscoveryNode, Set)}, which adds a rule that will cause matching
 * operations to throw ConnectTransportExceptions.
 */
public void addFailToSendNoConnectRule(DiscoveryNode node, final String... blockedActions) {
    final Set<String> actions = new HashSet<>(Arrays.asList(blockedActions));
    addFailToSendNoConnectRule(node, actions);
}
/**
* Adds a rule that will cause matching operations to throw ConnectTransportExceptions
*/
public void addFailToSendNoConnectRule(DiscoveryNode node, final Set<String> blockedActions) {
((LookupTestTransport) transport).transports.put(node.getAddress(), new DelegateTransport(original) {
addDelegate(node, new DelegateTransport(original) {
@Override
public void connectToNode(DiscoveryNode node) throws ConnectTransportException {
original.connectToNode(node);
@ -124,7 +139,6 @@ public class MockTransportService extends TransportService {
* and failing to connect once the rule was added.
*/
public void addUnresponsiveRule(DiscoveryNode node) {
// TODO add a parameter to delay the connect timeout?
addDelegate(node, new DelegateTransport(original) {
@Override
public void connectToNode(DiscoveryNode node) throws ConnectTransportException {
@ -143,8 +157,101 @@ public class MockTransportService extends TransportService {
});
}
/**
 * Adds a rule that delays every send request and connection attempt to the given node, simulating a node
 * that is temporarily unresponsive. The remaining delay shrinks as time passes: once {@code duration} has
 * elapsed since the rule was added, calls are forwarded to the original transport immediately.
 *
 * @param node     the node to which communication is delayed
 * @param duration the amount of time to delay sending and connecting.
 */
public void addUnresponsiveRule(DiscoveryNode node, final TimeValue duration) {
    // monotonic clock: elapsed time must not be affected by wall-clock adjustments
    final long startNanos = System.nanoTime();

    addDelegate(node, new DelegateTransport(original) {

        /** @return the part of {@code duration} that has not elapsed yet; zero or negative once it is over */
        TimeValue getDelay() {
            final long elapsedMillis = (System.nanoTime() - startNanos) / 1000000L;
            return new TimeValue(duration.millis() - elapsedMillis);
        }

        @Override
        public void connectToNode(DiscoveryNode node) throws ConnectTransportException {
            TimeValue delay = getDelay();
            if (delay.millis() <= 0) {
                original.connectToNode(node);
                return;
            }

            // TODO: Replace with proper setting
            TimeValue connectingTimeout = NetworkService.TcpSettings.TCP_DEFAULT_CONNECT_TIMEOUT;
            try {
                if (delay.millis() < connectingTimeout.millis()) {
                    // the delay ends before the connect timeout: connect late
                    Thread.sleep(delay.millis());
                    original.connectToNode(node);
                } else {
                    // the delay outlasts the connect timeout: fail after the timeout
                    Thread.sleep(connectingTimeout.millis());
                    throw new ConnectTransportException(node, "UNRESPONSIVE: simulated");
                }
            } catch (InterruptedException e) {
                // restore the interrupt flag so callers further up can still observe the interruption
                Thread.currentThread().interrupt();
                throw new ConnectTransportException(node, "UNRESPONSIVE: interrupted while sleeping", e);
            }
        }

        @Override
        public void connectToNodeLight(DiscoveryNode node) throws ConnectTransportException {
            TimeValue delay = getDelay();
            if (delay.millis() <= 0) {
                original.connectToNodeLight(node);
                return;
            }

            // TODO: Replace with proper setting
            TimeValue connectingTimeout = NetworkService.TcpSettings.TCP_DEFAULT_CONNECT_TIMEOUT;
            try {
                if (delay.millis() < connectingTimeout.millis()) {
                    // the delay ends before the connect timeout: connect late
                    Thread.sleep(delay.millis());
                    original.connectToNodeLight(node);
                } else {
                    // the delay outlasts the connect timeout: fail after the timeout
                    Thread.sleep(connectingTimeout.millis());
                    throw new ConnectTransportException(node, "UNRESPONSIVE: simulated");
                }
            } catch (InterruptedException e) {
                // restore the interrupt flag so callers further up can still observe the interruption
                Thread.currentThread().interrupt();
                throw new ConnectTransportException(node, "UNRESPONSIVE: interrupted while sleeping", e);
            }
        }

        @Override
        public void sendRequest(final DiscoveryNode node, final long requestId, final String action, TransportRequest request, final TransportRequestOptions options) throws IOException, TransportException {
            // delayed sending - even if larger then the request timeout to simulated a potential late response from target node

            TimeValue delay = getDelay();
            if (delay.millis() <= 0) {
                original.sendRequest(node, requestId, action, request, options);
                return;
            }

            // poor mans request cloning...
            // the request is serialized now and read back into a fresh instance, which is what gets sent later
            TransportRequestHandler handler = MockTransportService.this.getHandler(action);
            BytesStreamOutput bStream = new BytesStreamOutput();
            request.writeTo(bStream);
            final TransportRequest clonedRequest = handler.newInstance();
            clonedRequest.readFrom(new BytesStreamInput(bStream.bytes()));

            threadPool.schedule(delay, ThreadPool.Names.GENERIC, new AbstractRunnable() {
                @Override
                public void run() {
                    try {
                        original.sendRequest(node, requestId, action, clonedRequest, options);
                    } catch (Throwable e) {
                        logger.debug("failed to send delayed request", e);
                    }
                }
            });
        }
    });
}
/**
* Adds a new delegate transport that is used for communication with the given node.
*
* @return <tt>true</tt> iff no other delegate was registered for this node before, otherwise <tt>false</tt>
*/
public boolean addDelegate(DiscoveryNode node, DelegateTransport transport) {
@ -209,12 +316,11 @@ public class MockTransportService extends TransportService {
protected final Transport transport;
/**
 * Creates a delegate around the given transport.
 *
 * @param transport the transport to wrap; stored in the {@code transport} field
 */
public DelegateTransport(Transport transport) {
this.transport = transport;
}
@Override
public void transportServiceAdapter(TransportServiceAdapter service) {
transport.transportServiceAdapter(service);