Deprecate and ignore join timeout (#60872)
There is no point in timing out a join attempt any more once a cluster is entirely in 7.x. Timing out and retrying with the same master is pointless, and an in-flight join attempt to one master no longer blocks attempts to join other masters. This commit deprecates this unnecessary setting and removes its effect from the joining process. Relates #60873 which removes this setting in master.
This commit is contained in:
parent
235e5ed3ea
commit
f44c28b595
|
@ -43,9 +43,9 @@ compatibility. Support for the old name will be removed in a future version.
|
||||||
Specifies whether {es} should form a multiple-node cluster. By default, {es}
|
Specifies whether {es} should form a multiple-node cluster. By default, {es}
|
||||||
discovers other nodes when forming a cluster and allows other nodes to join
|
discovers other nodes when forming a cluster and allows other nodes to join
|
||||||
the cluster later. If `discovery.type` is set to `single-node`, {es} forms a
|
the cluster later. If `discovery.type` is set to `single-node`, {es} forms a
|
||||||
single-node cluster and suppresses the timeouts set by
|
single-node cluster and suppresses the timeout set by
|
||||||
`cluster.publish.timeout` and `cluster.join.timeout`. For more information
|
`cluster.publish.timeout`. For more information about when you might use
|
||||||
about when you might use this setting, see <<single-node-discovery>>.
|
this setting, see <<single-node-discovery>>.
|
||||||
|
|
||||||
`cluster.initial_master_nodes`::
|
`cluster.initial_master_nodes`::
|
||||||
|
|
||||||
|
@ -196,9 +196,9 @@ or may become unstable or intolerant of certain failures.
|
||||||
|
|
||||||
`cluster.join.timeout`::
|
`cluster.join.timeout`::
|
||||||
|
|
||||||
Sets how long a node will wait after sending a request to join a cluster
|
deprecated[7.10, Has no effect in 7.x clusters] Sets how long a node will
|
||||||
before it considers the request to have failed and retries, unless
|
wait after sending a request to join a version 6.8 master before it
|
||||||
`discovery.type` is set to `single-node`. Defaults to `60s`.
|
considers the request to have failed and retries. Defaults to `60s`.
|
||||||
|
|
||||||
`cluster.max_voting_config_exclusions`::
|
`cluster.max_voting_config_exclusions`::
|
||||||
|
|
||||||
|
|
|
@ -67,9 +67,7 @@ to the voting configuration if it is appropriate to do so.
|
||||||
|
|
||||||
During master election or when joining an existing formed cluster, a node
|
During master election or when joining an existing formed cluster, a node
|
||||||
sends a join request to the master in order to be officially added to the
|
sends a join request to the master in order to be officially added to the
|
||||||
cluster. You can use the `cluster.join.timeout` setting to configure how long a
|
cluster.
|
||||||
node waits after sending a request to join a cluster. Its default value is `60s`.
|
|
||||||
See <<modules-discovery-settings>>.
|
|
||||||
|
|
||||||
[discrete]
|
[discrete]
|
||||||
[[modules-discovery-removing-nodes]]
|
[[modules-discovery-removing-nodes]]
|
||||||
|
|
|
@ -52,15 +52,11 @@ public class IndexingMasterFailoverIT extends ESIntegTestCase {
|
||||||
public void testMasterFailoverDuringIndexingWithMappingChanges() throws Throwable {
|
public void testMasterFailoverDuringIndexingWithMappingChanges() throws Throwable {
|
||||||
logger.info("--> start 4 nodes, 3 master, 1 data");
|
logger.info("--> start 4 nodes, 3 master, 1 data");
|
||||||
|
|
||||||
final Settings sharedSettings = Settings.builder()
|
|
||||||
.put("cluster.join.timeout", "10s") // still long to induce failures but not too long so test won't time out
|
|
||||||
.build();
|
|
||||||
|
|
||||||
internalCluster().setBootstrapMasterNodeIndex(2);
|
internalCluster().setBootstrapMasterNodeIndex(2);
|
||||||
|
|
||||||
internalCluster().startMasterOnlyNodes(3, sharedSettings);
|
internalCluster().startMasterOnlyNodes(3, Settings.EMPTY);
|
||||||
|
|
||||||
String dataNode = internalCluster().startDataOnlyNode(sharedSettings);
|
String dataNode = internalCluster().startDataOnlyNode(Settings.EMPTY);
|
||||||
|
|
||||||
logger.info("--> wait for all nodes to join the cluster");
|
logger.info("--> wait for all nodes to join the cluster");
|
||||||
ensureStableCluster(4);
|
ensureStableCluster(4);
|
||||||
|
|
|
@ -34,7 +34,6 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
import org.elasticsearch.cluster.routing.RerouteService;
|
import org.elasticsearch.cluster.routing.RerouteService;
|
||||||
import org.elasticsearch.cluster.routing.allocation.AllocationService;
|
import org.elasticsearch.cluster.routing.allocation.AllocationService;
|
||||||
import org.elasticsearch.cluster.service.MasterService;
|
import org.elasticsearch.cluster.service.MasterService;
|
||||||
import org.elasticsearch.common.Nullable;
|
|
||||||
import org.elasticsearch.common.Priority;
|
import org.elasticsearch.common.Priority;
|
||||||
import org.elasticsearch.common.collect.Tuple;
|
import org.elasticsearch.common.collect.Tuple;
|
||||||
import org.elasticsearch.common.io.stream.StreamInput;
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
|
@ -43,7 +42,6 @@ import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.common.unit.TimeValue;
|
import org.elasticsearch.common.unit.TimeValue;
|
||||||
import org.elasticsearch.discovery.zen.MembershipAction;
|
import org.elasticsearch.discovery.zen.MembershipAction;
|
||||||
import org.elasticsearch.discovery.zen.ZenDiscovery;
|
import org.elasticsearch.discovery.zen.ZenDiscovery;
|
||||||
import org.elasticsearch.discovery.DiscoveryModule;
|
|
||||||
import org.elasticsearch.monitor.NodeHealthService;
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
import org.elasticsearch.monitor.StatusInfo;
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.threadpool.ThreadPool;
|
import org.elasticsearch.threadpool.ThreadPool;
|
||||||
|
@ -83,22 +81,21 @@ public class JoinHelper {
|
||||||
public static final String VALIDATE_JOIN_ACTION_NAME = "internal:cluster/coordination/join/validate";
|
public static final String VALIDATE_JOIN_ACTION_NAME = "internal:cluster/coordination/join/validate";
|
||||||
public static final String START_JOIN_ACTION_NAME = "internal:cluster/coordination/start_join";
|
public static final String START_JOIN_ACTION_NAME = "internal:cluster/coordination/start_join";
|
||||||
|
|
||||||
// the timeout for each join attempt
|
// the timeout for Zen1 join attempts
|
||||||
public static final Setting<TimeValue> JOIN_TIMEOUT_SETTING =
|
public static final Setting<TimeValue> JOIN_TIMEOUT_SETTING =
|
||||||
Setting.timeSetting("cluster.join.timeout",
|
Setting.timeSetting("cluster.join.timeout",
|
||||||
TimeValue.timeValueMillis(60000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);
|
TimeValue.timeValueMillis(60000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope, Setting.Property.Deprecated);
|
||||||
|
|
||||||
private final MasterService masterService;
|
private final MasterService masterService;
|
||||||
private final TransportService transportService;
|
private final TransportService transportService;
|
||||||
private final JoinTaskExecutor joinTaskExecutor;
|
private final JoinTaskExecutor joinTaskExecutor;
|
||||||
|
|
||||||
@Nullable // if using single-node discovery
|
private final TimeValue joinTimeout; // only used for Zen1 joining
|
||||||
private final TimeValue joinTimeout;
|
|
||||||
private final NodeHealthService nodeHealthService;
|
private final NodeHealthService nodeHealthService;
|
||||||
|
|
||||||
private final Set<Tuple<DiscoveryNode, JoinRequest>> pendingOutgoingJoins = Collections.synchronizedSet(new HashSet<>());
|
private final Set<Tuple<DiscoveryNode, JoinRequest>> pendingOutgoingJoins = Collections.synchronizedSet(new HashSet<>());
|
||||||
|
|
||||||
private AtomicReference<FailedJoinAttempt> lastFailedJoinAttempt = new AtomicReference<>();
|
private final AtomicReference<FailedJoinAttempt> lastFailedJoinAttempt = new AtomicReference<>();
|
||||||
|
|
||||||
JoinHelper(Settings settings, AllocationService allocationService, MasterService masterService,
|
JoinHelper(Settings settings, AllocationService allocationService, MasterService masterService,
|
||||||
TransportService transportService, LongSupplier currentTermSupplier, Supplier<ClusterState> currentStateSupplier,
|
TransportService transportService, LongSupplier currentTermSupplier, Supplier<ClusterState> currentStateSupplier,
|
||||||
|
@ -108,7 +105,7 @@ public class JoinHelper {
|
||||||
this.masterService = masterService;
|
this.masterService = masterService;
|
||||||
this.transportService = transportService;
|
this.transportService = transportService;
|
||||||
this.nodeHealthService = nodeHealthService;
|
this.nodeHealthService = nodeHealthService;
|
||||||
this.joinTimeout = DiscoveryModule.isSingleNodeDiscovery(settings) ? null : JOIN_TIMEOUT_SETTING.get(settings);
|
this.joinTimeout = JOIN_TIMEOUT_SETTING.get(settings);
|
||||||
this.joinTaskExecutor = new JoinTaskExecutor(settings, allocationService, logger, rerouteService) {
|
this.joinTaskExecutor = new JoinTaskExecutor(settings, allocationService, logger, rerouteService) {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -286,15 +283,17 @@ public class JoinHelper {
|
||||||
logger.debug("attempting to join {} with {}", destination, joinRequest);
|
logger.debug("attempting to join {} with {}", destination, joinRequest);
|
||||||
final String actionName;
|
final String actionName;
|
||||||
final TransportRequest transportRequest;
|
final TransportRequest transportRequest;
|
||||||
|
final TransportRequestOptions transportRequestOptions;
|
||||||
if (Coordinator.isZen1Node(destination)) {
|
if (Coordinator.isZen1Node(destination)) {
|
||||||
actionName = MembershipAction.DISCOVERY_JOIN_ACTION_NAME;
|
actionName = MembershipAction.DISCOVERY_JOIN_ACTION_NAME;
|
||||||
transportRequest = new MembershipAction.JoinRequest(transportService.getLocalNode());
|
transportRequest = new MembershipAction.JoinRequest(transportService.getLocalNode());
|
||||||
|
transportRequestOptions = TransportRequestOptions.builder().withTimeout(joinTimeout).build();
|
||||||
} else {
|
} else {
|
||||||
actionName = JOIN_ACTION_NAME;
|
actionName = JOIN_ACTION_NAME;
|
||||||
transportRequest = joinRequest;
|
transportRequest = joinRequest;
|
||||||
|
transportRequestOptions = TransportRequestOptions.EMPTY;
|
||||||
}
|
}
|
||||||
transportService.sendRequest(destination, actionName, transportRequest,
|
transportService.sendRequest(destination, actionName, transportRequest, transportRequestOptions,
|
||||||
TransportRequestOptions.builder().withTimeout(joinTimeout).build(),
|
|
||||||
new TransportResponseHandler<Empty>() {
|
new TransportResponseHandler<Empty>() {
|
||||||
@Override
|
@Override
|
||||||
public Empty read(StreamInput in) {
|
public Empty read(StreamInput in) {
|
||||||
|
@ -363,9 +362,7 @@ public class JoinHelper {
|
||||||
} else {
|
} else {
|
||||||
actionName = VALIDATE_JOIN_ACTION_NAME;
|
actionName = VALIDATE_JOIN_ACTION_NAME;
|
||||||
}
|
}
|
||||||
transportService.sendRequest(node, actionName,
|
transportService.sendRequest(node, actionName, new ValidateJoinRequest(state),
|
||||||
new ValidateJoinRequest(state),
|
|
||||||
TransportRequestOptions.builder().withTimeout(joinTimeout).build(),
|
|
||||||
new ActionListenerResponseHandler<>(listener, i -> Empty.INSTANCE, ThreadPool.Names.GENERIC));
|
new ActionListenerResponseHandler<>(listener, i -> Empty.INSTANCE, ThreadPool.Names.GENERIC));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,7 +24,6 @@ import org.elasticsearch.cluster.block.ClusterBlock;
|
||||||
import org.elasticsearch.cluster.block.ClusterBlockLevel;
|
import org.elasticsearch.cluster.block.ClusterBlockLevel;
|
||||||
import org.elasticsearch.cluster.coordination.Coordinator;
|
import org.elasticsearch.cluster.coordination.Coordinator;
|
||||||
import org.elasticsearch.cluster.coordination.FollowersChecker;
|
import org.elasticsearch.cluster.coordination.FollowersChecker;
|
||||||
import org.elasticsearch.cluster.coordination.JoinHelper;
|
|
||||||
import org.elasticsearch.cluster.coordination.LeaderChecker;
|
import org.elasticsearch.cluster.coordination.LeaderChecker;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||||
import org.elasticsearch.common.Nullable;
|
import org.elasticsearch.common.Nullable;
|
||||||
|
@ -126,7 +125,6 @@ public abstract class AbstractDisruptionTestCase extends ESIntegTestCase {
|
||||||
.put(LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly
|
.put(LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly
|
||||||
.put(FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), "5s") // for hitting simulated network failures quickly
|
.put(FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), "5s") // for hitting simulated network failures quickly
|
||||||
.put(FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly
|
.put(FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly
|
||||||
.put(JoinHelper.JOIN_TIMEOUT_SETTING.getKey(), "10s") // still long to induce failures but not too long so test won't time out
|
|
||||||
.put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "5s") // <-- for hitting simulated network failures quickly
|
.put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "5s") // <-- for hitting simulated network failures quickly
|
||||||
.put(TransportSettings.CONNECT_TIMEOUT.getKey(), "10s") // Network delay disruption waits for the min between this
|
.put(TransportSettings.CONNECT_TIMEOUT.getKey(), "10s") // Network delay disruption waits for the min between this
|
||||||
// value and the time of disruption and does not recover immediately
|
// value and the time of disruption and does not recover immediately
|
||||||
|
|
Loading…
Reference in New Issue