Today we do not allow a node to start if its filesystem is readonly, but it is possible for a filesystem to become readonly while the node is running. We don't currently have any infrastructure in place to make sure that Elasticsearch behaves well if this happens. A node that cannot write to disk may be poisonous to the rest of the cluster. With this commit we periodically verify that nodes' filesystems are writable. If a node fails these writability checks then it is removed from the cluster and prevented from re-joining until the checks start passing again. Closes #45286 Co-authored-by: Bukhtawar Khan <bukhtawar7152@gmail.com>
This commit is contained in:
parent
a8220ad51e
commit
46c8d00852
|
@ -1046,6 +1046,11 @@ public class ElasticsearchException extends RuntimeException implements ToXConte
|
||||||
org.elasticsearch.indices.recovery.PeerRecoveryNotFound.class,
|
org.elasticsearch.indices.recovery.PeerRecoveryNotFound.class,
|
||||||
org.elasticsearch.indices.recovery.PeerRecoveryNotFound::new,
|
org.elasticsearch.indices.recovery.PeerRecoveryNotFound::new,
|
||||||
158,
|
158,
|
||||||
|
Version.V_7_9_0),
|
||||||
|
NODE_HEALTH_CHECK_FAILURE_EXCEPTION(
|
||||||
|
org.elasticsearch.cluster.coordination.NodeHealthCheckFailureException.class,
|
||||||
|
org.elasticsearch.cluster.coordination.NodeHealthCheckFailureException::new,
|
||||||
|
159,
|
||||||
Version.V_7_9_0);
|
Version.V_7_9_0);
|
||||||
|
|
||||||
final Class<? extends ElasticsearchException> exceptionClass;
|
final Class<? extends ElasticsearchException> exceptionClass;
|
||||||
|
|
|
@ -32,6 +32,7 @@ import org.elasticsearch.common.transport.TransportAddress;
|
||||||
import org.elasticsearch.common.unit.TimeValue;
|
import org.elasticsearch.common.unit.TimeValue;
|
||||||
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
|
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
|
||||||
import org.elasticsearch.gateway.GatewayMetaState;
|
import org.elasticsearch.gateway.GatewayMetaState;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.threadpool.ThreadPool;
|
import org.elasticsearch.threadpool.ThreadPool;
|
||||||
import org.elasticsearch.threadpool.ThreadPool.Names;
|
import org.elasticsearch.threadpool.ThreadPool.Names;
|
||||||
|
|
||||||
|
@ -44,6 +45,7 @@ import java.util.stream.Collectors;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING;
|
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
|
|
||||||
public class ClusterFormationFailureHelper {
|
public class ClusterFormationFailureHelper {
|
||||||
private static final Logger logger = LogManager.getLogger(ClusterFormationFailureHelper.class);
|
private static final Logger logger = LogManager.getLogger(ClusterFormationFailureHelper.class);
|
||||||
|
@ -124,18 +126,24 @@ public class ClusterFormationFailureHelper {
|
||||||
private final List<DiscoveryNode> foundPeers;
|
private final List<DiscoveryNode> foundPeers;
|
||||||
private final long currentTerm;
|
private final long currentTerm;
|
||||||
private final ElectionStrategy electionStrategy;
|
private final ElectionStrategy electionStrategy;
|
||||||
|
private final StatusInfo statusInfo;
|
||||||
|
|
||||||
ClusterFormationState(Settings settings, ClusterState clusterState, List<TransportAddress> resolvedAddresses,
|
ClusterFormationState(Settings settings, ClusterState clusterState, List<TransportAddress> resolvedAddresses,
|
||||||
List<DiscoveryNode> foundPeers, long currentTerm, ElectionStrategy electionStrategy) {
|
List<DiscoveryNode> foundPeers, long currentTerm, ElectionStrategy electionStrategy,
|
||||||
|
StatusInfo statusInfo) {
|
||||||
this.settings = settings;
|
this.settings = settings;
|
||||||
this.clusterState = clusterState;
|
this.clusterState = clusterState;
|
||||||
this.resolvedAddresses = resolvedAddresses;
|
this.resolvedAddresses = resolvedAddresses;
|
||||||
this.foundPeers = foundPeers;
|
this.foundPeers = foundPeers;
|
||||||
this.currentTerm = currentTerm;
|
this.currentTerm = currentTerm;
|
||||||
this.electionStrategy = electionStrategy;
|
this.electionStrategy = electionStrategy;
|
||||||
|
this.statusInfo = statusInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
String getDescription() {
|
String getDescription() {
|
||||||
|
if (statusInfo.getStatus() == UNHEALTHY) {
|
||||||
|
return String.format(Locale.ROOT, "this node is unhealthy: %s", statusInfo.getInfo());
|
||||||
|
}
|
||||||
final List<String> clusterStateNodes = StreamSupport.stream(clusterState.nodes().getMasterNodes().values().spliterator(), false)
|
final List<String> clusterStateNodes = StreamSupport.stream(clusterState.nodes().getMasterNodes().values().spliterator(), false)
|
||||||
.map(n -> n.value.toString()).collect(Collectors.toList());
|
.map(n -> n.value.toString()).collect(Collectors.toList());
|
||||||
|
|
||||||
|
|
|
@ -70,6 +70,8 @@ import org.elasticsearch.discovery.PeerFinder;
|
||||||
import org.elasticsearch.discovery.SeedHostsProvider;
|
import org.elasticsearch.discovery.SeedHostsProvider;
|
||||||
import org.elasticsearch.discovery.SeedHostsResolver;
|
import org.elasticsearch.discovery.SeedHostsResolver;
|
||||||
import org.elasticsearch.discovery.zen.PendingClusterStateStats;
|
import org.elasticsearch.discovery.zen.PendingClusterStateStats;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.threadpool.Scheduler;
|
import org.elasticsearch.threadpool.Scheduler;
|
||||||
import org.elasticsearch.threadpool.ThreadPool.Names;
|
import org.elasticsearch.threadpool.ThreadPool.Names;
|
||||||
import org.elasticsearch.transport.TransportResponse.Empty;
|
import org.elasticsearch.transport.TransportResponse.Empty;
|
||||||
|
@ -94,6 +96,7 @@ import java.util.stream.StreamSupport;
|
||||||
import static org.elasticsearch.cluster.coordination.NoMasterBlockService.NO_MASTER_BLOCK_ID;
|
import static org.elasticsearch.cluster.coordination.NoMasterBlockService.NO_MASTER_BLOCK_ID;
|
||||||
import static org.elasticsearch.gateway.ClusterStateUpdaters.hideStateIfNotRecovered;
|
import static org.elasticsearch.gateway.ClusterStateUpdaters.hideStateIfNotRecovered;
|
||||||
import static org.elasticsearch.gateway.GatewayService.STATE_NOT_RECOVERED_BLOCK;
|
import static org.elasticsearch.gateway.GatewayService.STATE_NOT_RECOVERED_BLOCK;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
|
|
||||||
public class Coordinator extends AbstractLifecycleComponent implements Discovery {
|
public class Coordinator extends AbstractLifecycleComponent implements Discovery {
|
||||||
|
|
||||||
|
@ -153,6 +156,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
private Optional<Join> lastJoin;
|
private Optional<Join> lastJoin;
|
||||||
private JoinHelper.JoinAccumulator joinAccumulator;
|
private JoinHelper.JoinAccumulator joinAccumulator;
|
||||||
private Optional<CoordinatorPublication> currentPublication = Optional.empty();
|
private Optional<CoordinatorPublication> currentPublication = Optional.empty();
|
||||||
|
private final NodeHealthService nodeHealthService;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param nodeName The name of the node, used to name the {@link java.util.concurrent.ExecutorService} of the {@link SeedHostsResolver}.
|
* @param nodeName The name of the node, used to name the {@link java.util.concurrent.ExecutorService} of the {@link SeedHostsResolver}.
|
||||||
|
@ -162,7 +166,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
NamedWriteableRegistry namedWriteableRegistry, AllocationService allocationService, MasterService masterService,
|
NamedWriteableRegistry namedWriteableRegistry, AllocationService allocationService, MasterService masterService,
|
||||||
Supplier<CoordinationState.PersistedState> persistedStateSupplier, SeedHostsProvider seedHostsProvider,
|
Supplier<CoordinationState.PersistedState> persistedStateSupplier, SeedHostsProvider seedHostsProvider,
|
||||||
ClusterApplier clusterApplier, Collection<BiConsumer<DiscoveryNode, ClusterState>> onJoinValidators, Random random,
|
ClusterApplier clusterApplier, Collection<BiConsumer<DiscoveryNode, ClusterState>> onJoinValidators, Random random,
|
||||||
RerouteService rerouteService, ElectionStrategy electionStrategy) {
|
RerouteService rerouteService, ElectionStrategy electionStrategy, NodeHealthService nodeHealthService) {
|
||||||
this.settings = settings;
|
this.settings = settings;
|
||||||
this.transportService = transportService;
|
this.transportService = transportService;
|
||||||
this.masterService = masterService;
|
this.masterService = masterService;
|
||||||
|
@ -172,7 +176,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
this.electionStrategy = electionStrategy;
|
this.electionStrategy = electionStrategy;
|
||||||
this.joinHelper = new JoinHelper(settings, allocationService, masterService, transportService,
|
this.joinHelper = new JoinHelper(settings, allocationService, masterService, transportService,
|
||||||
this::getCurrentTerm, this::getStateForMasterService, this::handleJoinRequest, this::joinLeaderInTerm, this.onJoinValidators,
|
this::getCurrentTerm, this::getStateForMasterService, this::handleJoinRequest, this::joinLeaderInTerm, this.onJoinValidators,
|
||||||
rerouteService);
|
rerouteService, nodeHealthService);
|
||||||
this.persistedStateSupplier = persistedStateSupplier;
|
this.persistedStateSupplier = persistedStateSupplier;
|
||||||
this.noMasterBlockService = new NoMasterBlockService(settings, clusterSettings);
|
this.noMasterBlockService = new NoMasterBlockService(settings, clusterSettings);
|
||||||
this.lastKnownLeader = Optional.empty();
|
this.lastKnownLeader = Optional.empty();
|
||||||
|
@ -182,14 +186,16 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
this.publishInfoTimeout = PUBLISH_INFO_TIMEOUT_SETTING.get(settings);
|
this.publishInfoTimeout = PUBLISH_INFO_TIMEOUT_SETTING.get(settings);
|
||||||
this.random = random;
|
this.random = random;
|
||||||
this.electionSchedulerFactory = new ElectionSchedulerFactory(settings, random, transportService.getThreadPool());
|
this.electionSchedulerFactory = new ElectionSchedulerFactory(settings, random, transportService.getThreadPool());
|
||||||
this.preVoteCollector = new PreVoteCollector(transportService, this::startElection, this::updateMaxTermSeen, electionStrategy);
|
this.preVoteCollector = new PreVoteCollector(transportService, this::startElection, this::updateMaxTermSeen, electionStrategy,
|
||||||
|
nodeHealthService);
|
||||||
configuredHostsResolver = new SeedHostsResolver(nodeName, settings, transportService, seedHostsProvider);
|
configuredHostsResolver = new SeedHostsResolver(nodeName, settings, transportService, seedHostsProvider);
|
||||||
this.peerFinder = new CoordinatorPeerFinder(settings, transportService,
|
this.peerFinder = new CoordinatorPeerFinder(settings, transportService,
|
||||||
new HandshakingTransportAddressConnector(settings, transportService), configuredHostsResolver);
|
new HandshakingTransportAddressConnector(settings, transportService), configuredHostsResolver);
|
||||||
this.publicationHandler = new PublicationTransportHandler(transportService, namedWriteableRegistry,
|
this.publicationHandler = new PublicationTransportHandler(transportService, namedWriteableRegistry,
|
||||||
this::handlePublishRequest, this::handleApplyCommit);
|
this::handlePublishRequest, this::handleApplyCommit);
|
||||||
this.leaderChecker = new LeaderChecker(settings, transportService, this::onLeaderFailure);
|
this.leaderChecker = new LeaderChecker(settings, transportService, this::onLeaderFailure, nodeHealthService);
|
||||||
this.followersChecker = new FollowersChecker(settings, transportService, this::onFollowerCheckRequest, this::removeNode);
|
this.followersChecker = new FollowersChecker(settings, transportService, this::onFollowerCheckRequest, this::removeNode,
|
||||||
|
nodeHealthService);
|
||||||
this.nodeRemovalExecutor = new NodeRemovalClusterStateTaskExecutor(allocationService, logger);
|
this.nodeRemovalExecutor = new NodeRemovalClusterStateTaskExecutor(allocationService, logger);
|
||||||
this.clusterApplier = clusterApplier;
|
this.clusterApplier = clusterApplier;
|
||||||
masterService.setClusterStateSupplier(this::getStateForMasterService);
|
masterService.setClusterStateSupplier(this::getStateForMasterService);
|
||||||
|
@ -202,12 +208,13 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
transportService::getLocalNode);
|
transportService::getLocalNode);
|
||||||
this.clusterFormationFailureHelper = new ClusterFormationFailureHelper(settings, this::getClusterFormationState,
|
this.clusterFormationFailureHelper = new ClusterFormationFailureHelper(settings, this::getClusterFormationState,
|
||||||
transportService.getThreadPool(), joinHelper::logLastFailedJoinAttempt);
|
transportService.getThreadPool(), joinHelper::logLastFailedJoinAttempt);
|
||||||
|
this.nodeHealthService = nodeHealthService;
|
||||||
}
|
}
|
||||||
|
|
||||||
private ClusterFormationState getClusterFormationState() {
|
private ClusterFormationState getClusterFormationState() {
|
||||||
return new ClusterFormationState(settings, getStateForMasterService(), peerFinder.getLastResolvedAddresses(),
|
return new ClusterFormationState(settings, getStateForMasterService(), peerFinder.getLastResolvedAddresses(),
|
||||||
Stream.concat(Stream.of(getLocalNode()), StreamSupport.stream(peerFinder.getFoundPeers().spliterator(), false))
|
Stream.concat(Stream.of(getLocalNode()), StreamSupport.stream(peerFinder.getFoundPeers().spliterator(), false))
|
||||||
.collect(Collectors.toList()), getCurrentTerm(), electionStrategy);
|
.collect(Collectors.toList()), getCurrentTerm(), electionStrategy, nodeHealthService.getHealth());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void onLeaderFailure(Exception e) {
|
private void onLeaderFailure(Exception e) {
|
||||||
|
@ -1230,6 +1237,12 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final StatusInfo statusInfo = nodeHealthService.getHealth();
|
||||||
|
if (statusInfo.getStatus() == UNHEALTHY) {
|
||||||
|
logger.debug("skip prevoting as local node is unhealthy: [{}]", statusInfo.getInfo());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (prevotingRound != null) {
|
if (prevotingRound != null) {
|
||||||
prevotingRound.close();
|
prevotingRound.close();
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,6 +34,8 @@ import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.common.unit.TimeValue;
|
import org.elasticsearch.common.unit.TimeValue;
|
||||||
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
|
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
|
||||||
import org.elasticsearch.discovery.zen.NodesFaultDetection;
|
import org.elasticsearch.discovery.zen.NodesFaultDetection;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.threadpool.ThreadPool.Names;
|
import org.elasticsearch.threadpool.ThreadPool.Names;
|
||||||
import org.elasticsearch.transport.ConnectTransportException;
|
import org.elasticsearch.transport.ConnectTransportException;
|
||||||
import org.elasticsearch.transport.Transport;
|
import org.elasticsearch.transport.Transport;
|
||||||
|
@ -57,6 +59,7 @@ import java.util.function.Consumer;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
|
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The FollowersChecker is responsible for allowing a leader to check that its followers are still connected and healthy. On deciding that a
|
* The FollowersChecker is responsible for allowing a leader to check that its followers are still connected and healthy. On deciding that a
|
||||||
|
@ -97,16 +100,17 @@ public class FollowersChecker {
|
||||||
private final Set<DiscoveryNode> faultyNodes = new HashSet<>();
|
private final Set<DiscoveryNode> faultyNodes = new HashSet<>();
|
||||||
|
|
||||||
private final TransportService transportService;
|
private final TransportService transportService;
|
||||||
|
private final NodeHealthService nodeHealthService;
|
||||||
private volatile FastResponseState fastResponseState;
|
private volatile FastResponseState fastResponseState;
|
||||||
|
|
||||||
public FollowersChecker(Settings settings, TransportService transportService,
|
public FollowersChecker(Settings settings, TransportService transportService,
|
||||||
Consumer<FollowerCheckRequest> handleRequestAndUpdateState,
|
Consumer<FollowerCheckRequest> handleRequestAndUpdateState,
|
||||||
BiConsumer<DiscoveryNode, String> onNodeFailure) {
|
BiConsumer<DiscoveryNode, String> onNodeFailure, NodeHealthService nodeHealthService) {
|
||||||
this.settings = settings;
|
this.settings = settings;
|
||||||
this.transportService = transportService;
|
this.transportService = transportService;
|
||||||
this.handleRequestAndUpdateState = handleRequestAndUpdateState;
|
this.handleRequestAndUpdateState = handleRequestAndUpdateState;
|
||||||
this.onNodeFailure = onNodeFailure;
|
this.onNodeFailure = onNodeFailure;
|
||||||
|
this.nodeHealthService = nodeHealthService;
|
||||||
|
|
||||||
followerCheckInterval = FOLLOWER_CHECK_INTERVAL_SETTING.get(settings);
|
followerCheckInterval = FOLLOWER_CHECK_INTERVAL_SETTING.get(settings);
|
||||||
followerCheckTimeout = FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings);
|
followerCheckTimeout = FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings);
|
||||||
|
@ -167,8 +171,15 @@ public class FollowersChecker {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void handleFollowerCheck(FollowerCheckRequest request, TransportChannel transportChannel) throws IOException {
|
private void handleFollowerCheck(FollowerCheckRequest request, TransportChannel transportChannel) throws IOException {
|
||||||
FastResponseState responder = this.fastResponseState;
|
final StatusInfo statusInfo = nodeHealthService.getHealth();
|
||||||
|
if (statusInfo.getStatus() == UNHEALTHY) {
|
||||||
|
final String message
|
||||||
|
= "handleFollowerCheck: node is unhealthy [" + statusInfo.getInfo() + "], rejecting " + statusInfo.getInfo();
|
||||||
|
logger.debug(message);
|
||||||
|
throw new NodeHealthCheckFailureException(message);
|
||||||
|
}
|
||||||
|
|
||||||
|
final FastResponseState responder = this.fastResponseState;
|
||||||
if (responder.mode == Mode.FOLLOWER && responder.term == request.term) {
|
if (responder.mode == Mode.FOLLOWER && responder.term == request.term) {
|
||||||
logger.trace("responding to {} on fast path", request);
|
logger.trace("responding to {} on fast path", request);
|
||||||
transportChannel.sendResponse(Empty.INSTANCE);
|
transportChannel.sendResponse(Empty.INSTANCE);
|
||||||
|
@ -340,6 +351,9 @@ public class FollowersChecker {
|
||||||
|| exp.getCause() instanceof ConnectTransportException) {
|
|| exp.getCause() instanceof ConnectTransportException) {
|
||||||
logger.debug(() -> new ParameterizedMessage("{} disconnected", FollowerChecker.this), exp);
|
logger.debug(() -> new ParameterizedMessage("{} disconnected", FollowerChecker.this), exp);
|
||||||
reason = "disconnected";
|
reason = "disconnected";
|
||||||
|
} else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
|
||||||
|
logger.debug(() -> new ParameterizedMessage("{} health check failed", FollowerChecker.this), exp);
|
||||||
|
reason = "health check failed";
|
||||||
} else {
|
} else {
|
||||||
logger.debug(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp);
|
logger.debug(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp);
|
||||||
scheduleNextWakeUp();
|
scheduleNextWakeUp();
|
||||||
|
|
|
@ -44,6 +44,8 @@ import org.elasticsearch.common.unit.TimeValue;
|
||||||
import org.elasticsearch.discovery.zen.MembershipAction;
|
import org.elasticsearch.discovery.zen.MembershipAction;
|
||||||
import org.elasticsearch.discovery.zen.ZenDiscovery;
|
import org.elasticsearch.discovery.zen.ZenDiscovery;
|
||||||
import org.elasticsearch.discovery.DiscoveryModule;
|
import org.elasticsearch.discovery.DiscoveryModule;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.threadpool.ThreadPool;
|
import org.elasticsearch.threadpool.ThreadPool;
|
||||||
import org.elasticsearch.threadpool.ThreadPool.Names;
|
import org.elasticsearch.threadpool.ThreadPool.Names;
|
||||||
import org.elasticsearch.transport.TransportChannel;
|
import org.elasticsearch.transport.TransportChannel;
|
||||||
|
@ -71,6 +73,8 @@ import java.util.function.Function;
|
||||||
import java.util.function.LongSupplier;
|
import java.util.function.LongSupplier;
|
||||||
import java.util.function.Supplier;
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
|
|
||||||
public class JoinHelper {
|
public class JoinHelper {
|
||||||
|
|
||||||
private static final Logger logger = LogManager.getLogger(JoinHelper.class);
|
private static final Logger logger = LogManager.getLogger(JoinHelper.class);
|
||||||
|
@ -90,6 +94,7 @@ public class JoinHelper {
|
||||||
|
|
||||||
@Nullable // if using single-node discovery
|
@Nullable // if using single-node discovery
|
||||||
private final TimeValue joinTimeout;
|
private final TimeValue joinTimeout;
|
||||||
|
private final NodeHealthService nodeHealthService;
|
||||||
|
|
||||||
private final Set<Tuple<DiscoveryNode, JoinRequest>> pendingOutgoingJoins = Collections.synchronizedSet(new HashSet<>());
|
private final Set<Tuple<DiscoveryNode, JoinRequest>> pendingOutgoingJoins = Collections.synchronizedSet(new HashSet<>());
|
||||||
|
|
||||||
|
@ -98,9 +103,11 @@ public class JoinHelper {
|
||||||
JoinHelper(Settings settings, AllocationService allocationService, MasterService masterService,
|
JoinHelper(Settings settings, AllocationService allocationService, MasterService masterService,
|
||||||
TransportService transportService, LongSupplier currentTermSupplier, Supplier<ClusterState> currentStateSupplier,
|
TransportService transportService, LongSupplier currentTermSupplier, Supplier<ClusterState> currentStateSupplier,
|
||||||
BiConsumer<JoinRequest, JoinCallback> joinHandler, Function<StartJoinRequest, Join> joinLeaderInTerm,
|
BiConsumer<JoinRequest, JoinCallback> joinHandler, Function<StartJoinRequest, Join> joinLeaderInTerm,
|
||||||
Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators, RerouteService rerouteService) {
|
Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators, RerouteService rerouteService,
|
||||||
|
NodeHealthService nodeHealthService) {
|
||||||
this.masterService = masterService;
|
this.masterService = masterService;
|
||||||
this.transportService = transportService;
|
this.transportService = transportService;
|
||||||
|
this.nodeHealthService = nodeHealthService;
|
||||||
this.joinTimeout = DiscoveryModule.isSingleNodeDiscovery(settings) ? null : JOIN_TIMEOUT_SETTING.get(settings);
|
this.joinTimeout = DiscoveryModule.isSingleNodeDiscovery(settings) ? null : JOIN_TIMEOUT_SETTING.get(settings);
|
||||||
this.joinTaskExecutor = new JoinTaskExecutor(settings, allocationService, logger, rerouteService) {
|
this.joinTaskExecutor = new JoinTaskExecutor(settings, allocationService, logger, rerouteService) {
|
||||||
|
|
||||||
|
@ -268,6 +275,11 @@ public class JoinHelper {
|
||||||
|
|
||||||
public void sendJoinRequest(DiscoveryNode destination, long term, Optional<Join> optionalJoin, Runnable onCompletion) {
|
public void sendJoinRequest(DiscoveryNode destination, long term, Optional<Join> optionalJoin, Runnable onCompletion) {
|
||||||
assert destination.isMasterNode() : "trying to join master-ineligible " + destination;
|
assert destination.isMasterNode() : "trying to join master-ineligible " + destination;
|
||||||
|
final StatusInfo statusInfo = nodeHealthService.getHealth();
|
||||||
|
if (statusInfo.getStatus() == UNHEALTHY) {
|
||||||
|
logger.debug("dropping join request to [{}]: [{}]", destination, statusInfo.getInfo());
|
||||||
|
return;
|
||||||
|
}
|
||||||
final JoinRequest joinRequest = new JoinRequest(transportService.getLocalNode(), term, optionalJoin);
|
final JoinRequest joinRequest = new JoinRequest(transportService.getLocalNode(), term, optionalJoin);
|
||||||
final Tuple<DiscoveryNode, JoinRequest> dedupKey = Tuple.tuple(destination, joinRequest);
|
final Tuple<DiscoveryNode, JoinRequest> dedupKey = Tuple.tuple(destination, joinRequest);
|
||||||
if (pendingOutgoingJoins.add(dedupKey)) {
|
if (pendingOutgoingJoins.add(dedupKey)) {
|
||||||
|
|
|
@ -34,6 +34,8 @@ import org.elasticsearch.common.settings.Setting;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.common.unit.TimeValue;
|
import org.elasticsearch.common.unit.TimeValue;
|
||||||
import org.elasticsearch.discovery.zen.MasterFaultDetection;
|
import org.elasticsearch.discovery.zen.MasterFaultDetection;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.threadpool.ThreadPool.Names;
|
import org.elasticsearch.threadpool.ThreadPool.Names;
|
||||||
import org.elasticsearch.transport.ConnectTransportException;
|
import org.elasticsearch.transport.ConnectTransportException;
|
||||||
import org.elasticsearch.transport.NodeDisconnectedException;
|
import org.elasticsearch.transport.NodeDisconnectedException;
|
||||||
|
@ -55,6 +57,8 @@ import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The LeaderChecker is responsible for allowing followers to check that the currently elected leader is still connected and healthy. We are
|
* The LeaderChecker is responsible for allowing followers to check that the currently elected leader is still connected and healthy. We are
|
||||||
* fairly lenient, possibly allowing multiple checks to fail before considering the leader to be faulty, to allow for the leader to
|
* fairly lenient, possibly allowing multiple checks to fail before considering the leader to be faulty, to allow for the leader to
|
||||||
|
@ -88,18 +92,21 @@ public class LeaderChecker {
|
||||||
private final int leaderCheckRetryCount;
|
private final int leaderCheckRetryCount;
|
||||||
private final TransportService transportService;
|
private final TransportService transportService;
|
||||||
private final Consumer<Exception> onLeaderFailure;
|
private final Consumer<Exception> onLeaderFailure;
|
||||||
|
private final NodeHealthService nodeHealthService;
|
||||||
|
|
||||||
private AtomicReference<CheckScheduler> currentChecker = new AtomicReference<>();
|
private AtomicReference<CheckScheduler> currentChecker = new AtomicReference<>();
|
||||||
|
|
||||||
private volatile DiscoveryNodes discoveryNodes;
|
private volatile DiscoveryNodes discoveryNodes;
|
||||||
|
|
||||||
LeaderChecker(final Settings settings, final TransportService transportService, final Consumer<Exception> onLeaderFailure) {
|
LeaderChecker(final Settings settings, final TransportService transportService, final Consumer<Exception> onLeaderFailure,
|
||||||
|
NodeHealthService nodeHealthService) {
|
||||||
this.settings = settings;
|
this.settings = settings;
|
||||||
leaderCheckInterval = LEADER_CHECK_INTERVAL_SETTING.get(settings);
|
leaderCheckInterval = LEADER_CHECK_INTERVAL_SETTING.get(settings);
|
||||||
leaderCheckTimeout = LEADER_CHECK_TIMEOUT_SETTING.get(settings);
|
leaderCheckTimeout = LEADER_CHECK_TIMEOUT_SETTING.get(settings);
|
||||||
leaderCheckRetryCount = LEADER_CHECK_RETRY_COUNT_SETTING.get(settings);
|
leaderCheckRetryCount = LEADER_CHECK_RETRY_COUNT_SETTING.get(settings);
|
||||||
this.transportService = transportService;
|
this.transportService = transportService;
|
||||||
this.onLeaderFailure = onLeaderFailure;
|
this.onLeaderFailure = onLeaderFailure;
|
||||||
|
this.nodeHealthService = nodeHealthService;
|
||||||
|
|
||||||
transportService.registerRequestHandler(LEADER_CHECK_ACTION_NAME, Names.SAME, false, false, LeaderCheckRequest::new,
|
transportService.registerRequestHandler(LEADER_CHECK_ACTION_NAME, Names.SAME, false, false, LeaderCheckRequest::new,
|
||||||
(request, channel, task) -> {
|
(request, channel, task) -> {
|
||||||
|
@ -169,8 +176,13 @@ public class LeaderChecker {
|
||||||
private void handleLeaderCheck(LeaderCheckRequest request) {
|
private void handleLeaderCheck(LeaderCheckRequest request) {
|
||||||
final DiscoveryNodes discoveryNodes = this.discoveryNodes;
|
final DiscoveryNodes discoveryNodes = this.discoveryNodes;
|
||||||
assert discoveryNodes != null;
|
assert discoveryNodes != null;
|
||||||
|
final StatusInfo statusInfo = nodeHealthService.getHealth();
|
||||||
if (discoveryNodes.isLocalNodeElectedMaster() == false) {
|
if (statusInfo.getStatus() == UNHEALTHY) {
|
||||||
|
final String message = "rejecting leader check from [" + request.getSender() + "] " +
|
||||||
|
"since node is unhealthy [" + statusInfo.getInfo() + "]";
|
||||||
|
logger.debug(message);
|
||||||
|
throw new NodeHealthCheckFailureException(message);
|
||||||
|
} else if (discoveryNodes.isLocalNodeElectedMaster() == false) {
|
||||||
logger.debug("rejecting leader check on non-master {}", request);
|
logger.debug("rejecting leader check on non-master {}", request);
|
||||||
throw new CoordinationStateRejectedException(
|
throw new CoordinationStateRejectedException(
|
||||||
"rejecting leader check from [" + request.getSender() + "] sent to a node that is no longer the master");
|
"rejecting leader check from [" + request.getSender() + "] sent to a node that is no longer the master");
|
||||||
|
@ -266,8 +278,12 @@ public class LeaderChecker {
|
||||||
"leader [{}] disconnected during check", leader), exp);
|
"leader [{}] disconnected during check", leader), exp);
|
||||||
leaderFailed(new ConnectTransportException(leader, "disconnected during check", exp));
|
leaderFailed(new ConnectTransportException(leader, "disconnected during check", exp));
|
||||||
return;
|
return;
|
||||||
|
} else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
|
||||||
|
logger.debug(new ParameterizedMessage(
|
||||||
|
"leader [{}] health check failed", leader), exp);
|
||||||
|
leaderFailed(new NodeHealthCheckFailureException("node [" + leader + "] failed health checks", exp));
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
long failureCount = failureCountSinceLastSuccess.incrementAndGet();
|
long failureCount = failureCountSinceLastSuccess.incrementAndGet();
|
||||||
if (failureCount >= leaderCheckRetryCount) {
|
if (failureCount >= leaderCheckRetryCount) {
|
||||||
logger.debug(new ParameterizedMessage(
|
logger.debug(new ParameterizedMessage(
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.cluster.coordination;
|
||||||
|
|
||||||
|
import org.elasticsearch.ElasticsearchException;
|
||||||
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This exception is thrown if the File system is reported unhealthy by @{@link org.elasticsearch.monitor.fs.FsHealthService}
|
||||||
|
* and this nodes needs to be removed from the cluster
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class NodeHealthCheckFailureException extends ElasticsearchException {
|
||||||
|
|
||||||
|
public NodeHealthCheckFailureException(String msg, Object... args) {
|
||||||
|
super(msg, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
public NodeHealthCheckFailureException(StreamInput in) throws IOException {
|
||||||
|
super(in);
|
||||||
|
}
|
||||||
|
}
|
|
@ -29,6 +29,8 @@ import org.elasticsearch.common.Nullable;
|
||||||
import org.elasticsearch.common.collect.Tuple;
|
import org.elasticsearch.common.collect.Tuple;
|
||||||
import org.elasticsearch.common.io.stream.StreamInput;
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
import org.elasticsearch.common.lease.Releasable;
|
import org.elasticsearch.common.lease.Releasable;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.threadpool.ThreadPool.Names;
|
import org.elasticsearch.threadpool.ThreadPool.Names;
|
||||||
import org.elasticsearch.transport.TransportException;
|
import org.elasticsearch.transport.TransportException;
|
||||||
import org.elasticsearch.transport.TransportResponseHandler;
|
import org.elasticsearch.transport.TransportResponseHandler;
|
||||||
|
@ -40,6 +42,8 @@ import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.function.LongConsumer;
|
import java.util.function.LongConsumer;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
|
|
||||||
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
|
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
|
||||||
|
|
||||||
public class PreVoteCollector {
|
public class PreVoteCollector {
|
||||||
|
@ -52,16 +56,18 @@ public class PreVoteCollector {
|
||||||
private final Runnable startElection;
|
private final Runnable startElection;
|
||||||
private final LongConsumer updateMaxTermSeen;
|
private final LongConsumer updateMaxTermSeen;
|
||||||
private final ElectionStrategy electionStrategy;
|
private final ElectionStrategy electionStrategy;
|
||||||
|
private NodeHealthService nodeHealthService;
|
||||||
|
|
||||||
// Tuple for simple atomic updates. null until the first call to `update()`.
|
// Tuple for simple atomic updates. null until the first call to `update()`.
|
||||||
private volatile Tuple<DiscoveryNode, PreVoteResponse> state; // DiscoveryNode component is null if there is currently no known leader.
|
private volatile Tuple<DiscoveryNode, PreVoteResponse> state; // DiscoveryNode component is null if there is currently no known leader.
|
||||||
|
|
||||||
PreVoteCollector(final TransportService transportService, final Runnable startElection, final LongConsumer updateMaxTermSeen,
|
PreVoteCollector(final TransportService transportService, final Runnable startElection, final LongConsumer updateMaxTermSeen,
|
||||||
final ElectionStrategy electionStrategy) {
|
final ElectionStrategy electionStrategy, NodeHealthService nodeHealthService) {
|
||||||
this.transportService = transportService;
|
this.transportService = transportService;
|
||||||
this.startElection = startElection;
|
this.startElection = startElection;
|
||||||
this.updateMaxTermSeen = updateMaxTermSeen;
|
this.updateMaxTermSeen = updateMaxTermSeen;
|
||||||
this.electionStrategy = electionStrategy;
|
this.electionStrategy = electionStrategy;
|
||||||
|
this.nodeHealthService = nodeHealthService;
|
||||||
|
|
||||||
transportService.registerRequestHandler(REQUEST_PRE_VOTE_ACTION_NAME, Names.GENERIC, false, false,
|
transportService.registerRequestHandler(REQUEST_PRE_VOTE_ACTION_NAME, Names.GENERIC, false, false,
|
||||||
PreVoteRequest::new,
|
PreVoteRequest::new,
|
||||||
|
@ -106,6 +112,13 @@ public class PreVoteCollector {
|
||||||
final DiscoveryNode leader = state.v1();
|
final DiscoveryNode leader = state.v1();
|
||||||
final PreVoteResponse response = state.v2();
|
final PreVoteResponse response = state.v2();
|
||||||
|
|
||||||
|
final StatusInfo statusInfo = nodeHealthService.getHealth();
|
||||||
|
if (statusInfo.getStatus() == UNHEALTHY) {
|
||||||
|
String message = "rejecting " + request + " on unhealthy node: [" + statusInfo.getInfo() + "]";
|
||||||
|
logger.debug(message);
|
||||||
|
throw new NodeHealthCheckFailureException(message);
|
||||||
|
}
|
||||||
|
|
||||||
if (leader == null) {
|
if (leader == null) {
|
||||||
return response;
|
return response;
|
||||||
}
|
}
|
||||||
|
|
|
@ -97,6 +97,7 @@ import org.elasticsearch.indices.breaker.HierarchyCircuitBreakerService;
|
||||||
import org.elasticsearch.indices.fielddata.cache.IndicesFieldDataCache;
|
import org.elasticsearch.indices.fielddata.cache.IndicesFieldDataCache;
|
||||||
import org.elasticsearch.indices.recovery.RecoverySettings;
|
import org.elasticsearch.indices.recovery.RecoverySettings;
|
||||||
import org.elasticsearch.indices.store.IndicesStore;
|
import org.elasticsearch.indices.store.IndicesStore;
|
||||||
|
import org.elasticsearch.monitor.fs.FsHealthService;
|
||||||
import org.elasticsearch.monitor.fs.FsService;
|
import org.elasticsearch.monitor.fs.FsService;
|
||||||
import org.elasticsearch.monitor.jvm.JvmGcMonitorService;
|
import org.elasticsearch.monitor.jvm.JvmGcMonitorService;
|
||||||
import org.elasticsearch.monitor.jvm.JvmService;
|
import org.elasticsearch.monitor.jvm.JvmService;
|
||||||
|
@ -549,7 +550,10 @@ public final class ClusterSettings extends AbstractScopedSettings {
|
||||||
HandshakingTransportAddressConnector.PROBE_CONNECT_TIMEOUT_SETTING,
|
HandshakingTransportAddressConnector.PROBE_CONNECT_TIMEOUT_SETTING,
|
||||||
HandshakingTransportAddressConnector.PROBE_HANDSHAKE_TIMEOUT_SETTING,
|
HandshakingTransportAddressConnector.PROBE_HANDSHAKE_TIMEOUT_SETTING,
|
||||||
DiscoveryUpgradeService.BWC_PING_TIMEOUT_SETTING,
|
DiscoveryUpgradeService.BWC_PING_TIMEOUT_SETTING,
|
||||||
DiscoveryUpgradeService.ENABLE_UNSAFE_BOOTSTRAPPING_ON_UPGRADE_SETTING)));
|
DiscoveryUpgradeService.ENABLE_UNSAFE_BOOTSTRAPPING_ON_UPGRADE_SETTING,
|
||||||
|
FsHealthService.ENABLED_SETTING,
|
||||||
|
FsHealthService.REFRESH_INTERVAL_SETTING,
|
||||||
|
FsHealthService.SLOW_PATH_LOGGING_THRESHOLD_SETTING)));
|
||||||
|
|
||||||
public static List<SettingUpgrader<?>> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList(
|
public static List<SettingUpgrader<?>> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList(
|
||||||
SniffConnectionStrategy.SEARCH_REMOTE_CLUSTER_SEEDS_UPGRADER,
|
SniffConnectionStrategy.SEARCH_REMOTE_CLUSTER_SEEDS_UPGRADER,
|
||||||
|
|
|
@ -39,6 +39,7 @@ import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.common.transport.TransportAddress;
|
import org.elasticsearch.common.transport.TransportAddress;
|
||||||
import org.elasticsearch.discovery.zen.ZenDiscovery;
|
import org.elasticsearch.discovery.zen.ZenDiscovery;
|
||||||
import org.elasticsearch.gateway.GatewayMetaState;
|
import org.elasticsearch.gateway.GatewayMetaState;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
import org.elasticsearch.plugins.DiscoveryPlugin;
|
import org.elasticsearch.plugins.DiscoveryPlugin;
|
||||||
import org.elasticsearch.threadpool.ThreadPool;
|
import org.elasticsearch.threadpool.ThreadPool;
|
||||||
import org.elasticsearch.transport.TransportService;
|
import org.elasticsearch.transport.TransportService;
|
||||||
|
@ -91,7 +92,7 @@ public class DiscoveryModule {
|
||||||
NamedWriteableRegistry namedWriteableRegistry, NetworkService networkService, MasterService masterService,
|
NamedWriteableRegistry namedWriteableRegistry, NetworkService networkService, MasterService masterService,
|
||||||
ClusterApplier clusterApplier, ClusterSettings clusterSettings, List<DiscoveryPlugin> plugins,
|
ClusterApplier clusterApplier, ClusterSettings clusterSettings, List<DiscoveryPlugin> plugins,
|
||||||
AllocationService allocationService, Path configFile, GatewayMetaState gatewayMetaState,
|
AllocationService allocationService, Path configFile, GatewayMetaState gatewayMetaState,
|
||||||
RerouteService rerouteService) {
|
RerouteService rerouteService, NodeHealthService nodeHealthService) {
|
||||||
final Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators = new ArrayList<>();
|
final Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators = new ArrayList<>();
|
||||||
final Map<String, Supplier<SeedHostsProvider>> hostProviders = new HashMap<>();
|
final Map<String, Supplier<SeedHostsProvider>> hostProviders = new HashMap<>();
|
||||||
hostProviders.put("settings", () -> new SettingsBasedSeedHostsProvider(settings, transportService));
|
hostProviders.put("settings", () -> new SettingsBasedSeedHostsProvider(settings, transportService));
|
||||||
|
@ -153,7 +154,7 @@ public class DiscoveryModule {
|
||||||
settings, clusterSettings,
|
settings, clusterSettings,
|
||||||
transportService, namedWriteableRegistry, allocationService, masterService, gatewayMetaState::getPersistedState,
|
transportService, namedWriteableRegistry, allocationService, masterService, gatewayMetaState::getPersistedState,
|
||||||
seedHostsProvider, clusterApplier, joinValidators, new Random(Randomness.get().nextLong()), rerouteService,
|
seedHostsProvider, clusterApplier, joinValidators, new Random(Randomness.get().nextLong()), rerouteService,
|
||||||
electionStrategy);
|
electionStrategy, nodeHealthService);
|
||||||
} else if (ZEN_DISCOVERY_TYPE.equals(discoveryType)) {
|
} else if (ZEN_DISCOVERY_TYPE.equals(discoveryType)) {
|
||||||
discovery = new ZenDiscovery(settings, threadPool, transportService, namedWriteableRegistry, masterService, clusterApplier,
|
discovery = new ZenDiscovery(settings, threadPool, transportService, namedWriteableRegistry, masterService, clusterApplier,
|
||||||
clusterSettings, seedHostsProvider, allocationService, joinValidators, rerouteService);
|
clusterSettings, seedHostsProvider, allocationService, joinValidators, rerouteService);
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.monitor;
|
||||||
|
|
||||||
|
@FunctionalInterface
|
||||||
|
public interface NodeHealthService {
|
||||||
|
|
||||||
|
StatusInfo getHealth();
|
||||||
|
}
|
|
@ -0,0 +1,50 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.monitor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class that represents the Health status for a node as determined by {@link NodeHealthService} and provides additional
|
||||||
|
* info explaining the reasons
|
||||||
|
*/
|
||||||
|
public class StatusInfo {
|
||||||
|
|
||||||
|
public enum Status { HEALTHY, UNHEALTHY }
|
||||||
|
|
||||||
|
private Status status;
|
||||||
|
private String info;
|
||||||
|
|
||||||
|
public StatusInfo(Status status, String info) {
|
||||||
|
this.status = status;
|
||||||
|
this.info = info;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getInfo() {
|
||||||
|
return info;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Status getStatus() {
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "status[" + status + "]" + ", info[" + info + "]";
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,182 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.monitor.fs;
|
||||||
|
|
||||||
|
import org.apache.logging.log4j.LogManager;
|
||||||
|
import org.apache.logging.log4j.Logger;
|
||||||
|
import org.apache.logging.log4j.message.ParameterizedMessage;
|
||||||
|
import org.elasticsearch.common.Nullable;
|
||||||
|
import org.elasticsearch.common.UUIDs;
|
||||||
|
import org.elasticsearch.common.component.AbstractLifecycleComponent;
|
||||||
|
import org.elasticsearch.common.settings.ClusterSettings;
|
||||||
|
import org.elasticsearch.common.settings.Setting;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.common.unit.TimeValue;
|
||||||
|
import org.elasticsearch.core.internal.io.IOUtils;
|
||||||
|
import org.elasticsearch.env.NodeEnvironment;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
|
import org.elasticsearch.threadpool.Scheduler;
|
||||||
|
import org.elasticsearch.threadpool.ThreadPool;
|
||||||
|
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.function.LongSupplier;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs periodically and attempts to create a temp file to see if the filesystem is writable. If not then it marks the
|
||||||
|
* path as unhealthy.
|
||||||
|
*/
|
||||||
|
public class FsHealthService extends AbstractLifecycleComponent implements NodeHealthService {
|
||||||
|
|
||||||
|
private static final Logger logger = LogManager.getLogger(FsHealthService.class);
|
||||||
|
private final ThreadPool threadPool;
|
||||||
|
private volatile boolean enabled;
|
||||||
|
private final TimeValue refreshInterval;
|
||||||
|
private volatile TimeValue slowPathLoggingThreshold;
|
||||||
|
private final NodeEnvironment nodeEnv;
|
||||||
|
private final LongSupplier currentTimeMillisSupplier;
|
||||||
|
private volatile Scheduler.Cancellable scheduledFuture;
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
private volatile Set<Path> unhealthyPaths;
|
||||||
|
|
||||||
|
public static final Setting<Boolean> ENABLED_SETTING =
|
||||||
|
Setting.boolSetting("monitor.fs.health.enabled", true, Setting.Property.NodeScope, Setting.Property.Dynamic);
|
||||||
|
public static final Setting<TimeValue> REFRESH_INTERVAL_SETTING =
|
||||||
|
Setting.timeSetting("monitor.fs.health.refresh_interval", TimeValue.timeValueSeconds(120), TimeValue.timeValueMillis(1),
|
||||||
|
Setting.Property.NodeScope);
|
||||||
|
public static final Setting<TimeValue> SLOW_PATH_LOGGING_THRESHOLD_SETTING =
|
||||||
|
Setting.timeSetting("monitor.fs.health.slow_path_logging_threshold", TimeValue.timeValueSeconds(5), TimeValue.timeValueMillis(1),
|
||||||
|
Setting.Property.NodeScope, Setting.Property.Dynamic);
|
||||||
|
|
||||||
|
|
||||||
|
public FsHealthService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool, NodeEnvironment nodeEnv) {
|
||||||
|
this.threadPool = threadPool;
|
||||||
|
this.enabled = ENABLED_SETTING.get(settings);
|
||||||
|
this.refreshInterval = REFRESH_INTERVAL_SETTING.get(settings);
|
||||||
|
this.slowPathLoggingThreshold = SLOW_PATH_LOGGING_THRESHOLD_SETTING.get(settings);
|
||||||
|
this.currentTimeMillisSupplier = threadPool::relativeTimeInMillis;
|
||||||
|
this.nodeEnv = nodeEnv;
|
||||||
|
clusterSettings.addSettingsUpdateConsumer(SLOW_PATH_LOGGING_THRESHOLD_SETTING, this::setSlowPathLoggingThreshold);
|
||||||
|
clusterSettings.addSettingsUpdateConsumer(ENABLED_SETTING, this::setEnabled);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void doStart() {
|
||||||
|
scheduledFuture = threadPool.scheduleWithFixedDelay(new FsHealthMonitor(), refreshInterval,
|
||||||
|
ThreadPool.Names.GENERIC);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void doStop() {
|
||||||
|
scheduledFuture.cancel();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void doClose() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEnabled(boolean enabled) {
|
||||||
|
this.enabled = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSlowPathLoggingThreshold(TimeValue slowPathLoggingThreshold) {
|
||||||
|
this.slowPathLoggingThreshold = slowPathLoggingThreshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public StatusInfo getHealth() {
|
||||||
|
StatusInfo statusInfo;
|
||||||
|
Set<Path> unhealthyPaths = this.unhealthyPaths;
|
||||||
|
if (enabled == false) {
|
||||||
|
statusInfo = new StatusInfo(HEALTHY, "health check disabled");
|
||||||
|
} else if (unhealthyPaths == null) {
|
||||||
|
statusInfo = new StatusInfo(HEALTHY, "health check passed");
|
||||||
|
} else {
|
||||||
|
String info = "health check failed on [" + unhealthyPaths.stream()
|
||||||
|
.map(k -> k.toString()).collect(Collectors.joining(",")) + "]";
|
||||||
|
statusInfo = new StatusInfo(UNHEALTHY, info);
|
||||||
|
}
|
||||||
|
return statusInfo;
|
||||||
|
}
|
||||||
|
|
||||||
|
class FsHealthMonitor implements Runnable {
|
||||||
|
|
||||||
|
private static final String TEMP_FILE_NAME = ".es_temp_file";
|
||||||
|
private byte[] byteToWrite;
|
||||||
|
|
||||||
|
FsHealthMonitor(){
|
||||||
|
this.byteToWrite = UUIDs.randomBase64UUID().getBytes(StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
try {
|
||||||
|
if (enabled) {
|
||||||
|
monitorFSHealth();
|
||||||
|
logger.debug("health check succeeded");
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("health check failed", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void monitorFSHealth() {
|
||||||
|
Set<Path> currentUnhealthyPaths = null;
|
||||||
|
for (Path path : nodeEnv.nodeDataPaths()) {
|
||||||
|
long executionStartTime = currentTimeMillisSupplier.getAsLong();
|
||||||
|
try {
|
||||||
|
if (Files.exists(path)) {
|
||||||
|
Path tempDataPath = path.resolve(TEMP_FILE_NAME);
|
||||||
|
Files.deleteIfExists(tempDataPath);
|
||||||
|
try (OutputStream os = Files.newOutputStream(tempDataPath, StandardOpenOption.CREATE_NEW)) {
|
||||||
|
os.write(byteToWrite);
|
||||||
|
IOUtils.fsync(tempDataPath, false);
|
||||||
|
}
|
||||||
|
Files.delete(tempDataPath);
|
||||||
|
final long elapsedTime = currentTimeMillisSupplier.getAsLong() - executionStartTime;
|
||||||
|
if (elapsedTime > slowPathLoggingThreshold.millis()) {
|
||||||
|
logger.warn("health check of [{}] took [{}ms] which is above the warn threshold of [{}]",
|
||||||
|
path, elapsedTime, slowPathLoggingThreshold);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error(new ParameterizedMessage("health check of [{}] failed", path), ex);
|
||||||
|
if (currentUnhealthyPaths == null) {
|
||||||
|
currentUnhealthyPaths = new HashSet<>(1);
|
||||||
|
}
|
||||||
|
currentUnhealthyPaths.add(path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unhealthyPaths = currentUnhealthyPaths;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -123,6 +123,7 @@ import org.elasticsearch.indices.recovery.RecoverySettings;
|
||||||
import org.elasticsearch.indices.store.IndicesStore;
|
import org.elasticsearch.indices.store.IndicesStore;
|
||||||
import org.elasticsearch.ingest.IngestService;
|
import org.elasticsearch.ingest.IngestService;
|
||||||
import org.elasticsearch.monitor.MonitorService;
|
import org.elasticsearch.monitor.MonitorService;
|
||||||
|
import org.elasticsearch.monitor.fs.FsHealthService;
|
||||||
import org.elasticsearch.monitor.jvm.JvmInfo;
|
import org.elasticsearch.monitor.jvm.JvmInfo;
|
||||||
import org.elasticsearch.persistent.PersistentTasksClusterService;
|
import org.elasticsearch.persistent.PersistentTasksClusterService;
|
||||||
import org.elasticsearch.persistent.PersistentTasksExecutor;
|
import org.elasticsearch.persistent.PersistentTasksExecutor;
|
||||||
|
@ -406,6 +407,8 @@ public class Node implements Closeable {
|
||||||
modules.add(pluginModule);
|
modules.add(pluginModule);
|
||||||
}
|
}
|
||||||
final MonitorService monitorService = new MonitorService(settings, nodeEnvironment, threadPool, clusterInfoService);
|
final MonitorService monitorService = new MonitorService(settings, nodeEnvironment, threadPool, clusterInfoService);
|
||||||
|
final FsHealthService fsHealthService = new FsHealthService(settings, clusterService.getClusterSettings(), threadPool,
|
||||||
|
nodeEnvironment);
|
||||||
ClusterModule clusterModule = new ClusterModule(settings, clusterService, clusterPlugins, clusterInfoService);
|
ClusterModule clusterModule = new ClusterModule(settings, clusterService, clusterPlugins, clusterInfoService);
|
||||||
modules.add(clusterModule);
|
modules.add(clusterModule);
|
||||||
IndicesModule indicesModule = new IndicesModule(pluginsService.filterPlugins(MapperPlugin.class));
|
IndicesModule indicesModule = new IndicesModule(pluginsService.filterPlugins(MapperPlugin.class));
|
||||||
|
@ -570,7 +573,8 @@ public class Node implements Closeable {
|
||||||
final DiscoveryModule discoveryModule = new DiscoveryModule(settings, threadPool, transportService, namedWriteableRegistry,
|
final DiscoveryModule discoveryModule = new DiscoveryModule(settings, threadPool, transportService, namedWriteableRegistry,
|
||||||
networkService, clusterService.getMasterService(), clusterService.getClusterApplierService(),
|
networkService, clusterService.getMasterService(), clusterService.getClusterApplierService(),
|
||||||
clusterService.getClusterSettings(), pluginsService.filterPlugins(DiscoveryPlugin.class),
|
clusterService.getClusterSettings(), pluginsService.filterPlugins(DiscoveryPlugin.class),
|
||||||
clusterModule.getAllocationService(), environment.configFile(), gatewayMetaState, rerouteService);
|
clusterModule.getAllocationService(), environment.configFile(), gatewayMetaState, rerouteService,
|
||||||
|
fsHealthService);
|
||||||
this.nodeService = new NodeService(settings, threadPool, monitorService, discoveryModule.getDiscovery(),
|
this.nodeService = new NodeService(settings, threadPool, monitorService, discoveryModule.getDiscovery(),
|
||||||
transportService, indicesService, pluginsService, circuitBreakerService, scriptService,
|
transportService, indicesService, pluginsService, circuitBreakerService, scriptService,
|
||||||
httpServerTransport, ingestService, clusterService, settingsModule.getSettingsFilter(), responseCollectorService,
|
httpServerTransport, ingestService, clusterService, settingsModule.getSettingsFilter(), responseCollectorService,
|
||||||
|
@ -653,6 +657,7 @@ public class Node implements Closeable {
|
||||||
b.bind(RestoreService.class).toInstance(restoreService);
|
b.bind(RestoreService.class).toInstance(restoreService);
|
||||||
b.bind(RerouteService.class).toInstance(rerouteService);
|
b.bind(RerouteService.class).toInstance(rerouteService);
|
||||||
b.bind(ShardLimitValidator.class).toInstance(shardLimitValidator);
|
b.bind(ShardLimitValidator.class).toInstance(shardLimitValidator);
|
||||||
|
b.bind(FsHealthService.class).toInstance(fsHealthService);
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
injector = modules.createInjector();
|
injector = modules.createInjector();
|
||||||
|
@ -747,6 +752,7 @@ public class Node implements Closeable {
|
||||||
injector.getInstance(SnapshotShardsService.class).start();
|
injector.getInstance(SnapshotShardsService.class).start();
|
||||||
injector.getInstance(RepositoriesService.class).start();
|
injector.getInstance(RepositoriesService.class).start();
|
||||||
injector.getInstance(SearchService.class).start();
|
injector.getInstance(SearchService.class).start();
|
||||||
|
injector.getInstance(FsHealthService.class).start();
|
||||||
nodeService.getMonitorService().start();
|
nodeService.getMonitorService().start();
|
||||||
|
|
||||||
final ClusterService clusterService = injector.getInstance(ClusterService.class);
|
final ClusterService clusterService = injector.getInstance(ClusterService.class);
|
||||||
|
@ -883,6 +889,7 @@ public class Node implements Closeable {
|
||||||
// we close indices first, so operations won't be allowed on it
|
// we close indices first, so operations won't be allowed on it
|
||||||
injector.getInstance(ClusterService.class).stop();
|
injector.getInstance(ClusterService.class).stop();
|
||||||
injector.getInstance(NodeConnectionsService.class).stop();
|
injector.getInstance(NodeConnectionsService.class).stop();
|
||||||
|
injector.getInstance(FsHealthService.class).stop();
|
||||||
nodeService.getMonitorService().stop();
|
nodeService.getMonitorService().stop();
|
||||||
injector.getInstance(GatewayService.class).stop();
|
injector.getInstance(GatewayService.class).stop();
|
||||||
injector.getInstance(SearchService.class).stop();
|
injector.getInstance(SearchService.class).stop();
|
||||||
|
@ -939,6 +946,8 @@ public class Node implements Closeable {
|
||||||
toClose.add(injector.getInstance(Discovery.class));
|
toClose.add(injector.getInstance(Discovery.class));
|
||||||
toClose.add(() -> stopWatch.stop().start("monitor"));
|
toClose.add(() -> stopWatch.stop().start("monitor"));
|
||||||
toClose.add(nodeService.getMonitorService());
|
toClose.add(nodeService.getMonitorService());
|
||||||
|
toClose.add(() -> stopWatch.stop().start("fsHealth"));
|
||||||
|
toClose.add(injector.getInstance(FsHealthService.class));
|
||||||
toClose.add(() -> stopWatch.stop().start("gateway"));
|
toClose.add(() -> stopWatch.stop().start("gateway"));
|
||||||
toClose.add(injector.getInstance(GatewayService.class));
|
toClose.add(injector.getInstance(GatewayService.class));
|
||||||
toClose.add(() -> stopWatch.stop().start("search"));
|
toClose.add(() -> stopWatch.stop().start("search"));
|
||||||
|
|
|
@ -72,6 +72,7 @@ import org.elasticsearch.indices.InvalidIndexTemplateException;
|
||||||
import org.elasticsearch.indices.recovery.PeerRecoveryNotFound;
|
import org.elasticsearch.indices.recovery.PeerRecoveryNotFound;
|
||||||
import org.elasticsearch.indices.recovery.RecoverFilesRecoveryException;
|
import org.elasticsearch.indices.recovery.RecoverFilesRecoveryException;
|
||||||
import org.elasticsearch.ingest.IngestProcessorException;
|
import org.elasticsearch.ingest.IngestProcessorException;
|
||||||
|
import org.elasticsearch.cluster.coordination.NodeHealthCheckFailureException;
|
||||||
import org.elasticsearch.repositories.RepositoryException;
|
import org.elasticsearch.repositories.RepositoryException;
|
||||||
import org.elasticsearch.rest.RestStatus;
|
import org.elasticsearch.rest.RestStatus;
|
||||||
import org.elasticsearch.rest.action.admin.indices.AliasesNotFoundException;
|
import org.elasticsearch.rest.action.admin.indices.AliasesNotFoundException;
|
||||||
|
@ -828,6 +829,7 @@ public class ExceptionSerializationTests extends ESTestCase {
|
||||||
ids.put(156, RetentionLeaseInvalidRetainingSeqNoException.class);
|
ids.put(156, RetentionLeaseInvalidRetainingSeqNoException.class);
|
||||||
ids.put(157, IngestProcessorException.class);
|
ids.put(157, IngestProcessorException.class);
|
||||||
ids.put(158, PeerRecoveryNotFound.class);
|
ids.put(158, PeerRecoveryNotFound.class);
|
||||||
|
ids.put(159, NodeHealthCheckFailureException.class);
|
||||||
|
|
||||||
Map<Class<? extends ElasticsearchException>, Integer> reverse = new HashMap<>();
|
Map<Class<? extends ElasticsearchException>, Integer> reverse = new HashMap<>();
|
||||||
for (Map.Entry<Integer, Class<? extends ElasticsearchException>> entry : ids.entrySet()) {
|
for (Map.Entry<Integer, Class<? extends ElasticsearchException>> entry : ids.entrySet()) {
|
||||||
|
|
|
@ -30,6 +30,7 @@ import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.common.transport.TransportAddress;
|
import org.elasticsearch.common.transport.TransportAddress;
|
||||||
import org.elasticsearch.gateway.GatewayMetaState;
|
import org.elasticsearch.gateway.GatewayMetaState;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
@ -43,6 +44,8 @@ import static java.util.Collections.emptySet;
|
||||||
import static java.util.Collections.singletonList;
|
import static java.util.Collections.singletonList;
|
||||||
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.BOOTSTRAP_PLACEHOLDER_PREFIX;
|
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.BOOTSTRAP_PLACEHOLDER_PREFIX;
|
||||||
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING;
|
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
import static org.hamcrest.Matchers.is;
|
import static org.hamcrest.Matchers.is;
|
||||||
|
@ -77,7 +80,8 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
final ClusterFormationFailureHelper clusterFormationFailureHelper = new ClusterFormationFailureHelper(settingsBuilder.build(),
|
final ClusterFormationFailureHelper clusterFormationFailureHelper = new ClusterFormationFailureHelper(settingsBuilder.build(),
|
||||||
() -> {
|
() -> {
|
||||||
warningCount.incrementAndGet();
|
warningCount.incrementAndGet();
|
||||||
return new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 0L, electionStrategy);
|
return new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 0L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
},
|
},
|
||||||
deterministicTaskQueue.getThreadPool(), logLastFailedJoinAttemptWarningCount::incrementAndGet);
|
deterministicTaskQueue.getThreadPool(), logLastFailedJoinAttemptWarningCount::incrementAndGet);
|
||||||
|
|
||||||
|
@ -147,19 +151,22 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
.metadata(Metadata.builder().coordinationMetadata(CoordinationMetadata.builder().term(4L).build()))
|
.metadata(Metadata.builder().coordinationMetadata(CoordinationMetadata.builder().term(4L).build()))
|
||||||
.nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId())).build();
|
.nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId())).build();
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 15L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 15L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info"))
|
||||||
.getDescription(),
|
.getDescription(),
|
||||||
is("master not discovered yet: have discovered []; discovery will continue using [] from hosts providers " +
|
is("master not discovered yet: have discovered []; discovery will continue using [] from hosts providers " +
|
||||||
"and [] from last-known cluster state; node term 15, last-accepted version 12 in term 4"));
|
"and [] from last-known cluster state; node term 15, last-accepted version 12 in term 4"));
|
||||||
|
|
||||||
final TransportAddress otherAddress = buildNewFakeTransportAddress();
|
final TransportAddress otherAddress = buildNewFakeTransportAddress();
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 16L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 16L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info"))
|
||||||
.getDescription(),
|
.getDescription(),
|
||||||
is("master not discovered yet: have discovered []; discovery will continue using [" + otherAddress +
|
is("master not discovered yet: have discovered []; discovery will continue using [" + otherAddress +
|
||||||
"] from hosts providers and [] from last-known cluster state; node term 16, last-accepted version 12 in term 4"));
|
"] from hosts providers and [] from last-known cluster state; node term 16, last-accepted version 12 in term 4"));
|
||||||
|
|
||||||
final DiscoveryNode otherNode = new DiscoveryNode("other", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode otherNode = new DiscoveryNode("other", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 17L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 17L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info"))
|
||||||
.getDescription(),
|
.getDescription(),
|
||||||
is("master not discovered yet: have discovered [" + otherNode + "]; discovery will continue using [] from hosts providers " +
|
is("master not discovered yet: have discovered [" + otherNode + "]; discovery will continue using [] from hosts providers " +
|
||||||
"and [] from last-known cluster state; node term 17, last-accepted version 12 in term 4"));
|
"and [] from last-known cluster state; node term 17, last-accepted version 12 in term 4"));
|
||||||
|
@ -174,12 +181,33 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
.build())
|
.build())
|
||||||
.nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId())).build();
|
.nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId())).build();
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 15L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 15L, electionStrategy,
|
||||||
.getDescription(),
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered yet: have discovered []; discovery will continue using [] from hosts providers " +
|
is("master not discovered yet: have discovered []; discovery will continue using [] from hosts providers " +
|
||||||
"and [] from last-known cluster state; node term 15, last-accepted version 42 in term 0"));
|
"and [] from last-known cluster state; node term 15, last-accepted version 42 in term 0"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testDescriptionOnUnhealthyNodes() {
|
||||||
|
final DiscoveryNode dataNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
|
||||||
|
ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT)
|
||||||
|
.version(12L).nodes(DiscoveryNodes.builder().add(dataNode).localNodeId(dataNode.getId())).build();
|
||||||
|
|
||||||
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 15L, electionStrategy,
|
||||||
|
new StatusInfo(UNHEALTHY, "unhealthy-info"))
|
||||||
|
.getDescription(),
|
||||||
|
is("this node is unhealthy: unhealthy-info"));
|
||||||
|
|
||||||
|
final DiscoveryNode masterNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), emptyMap(),
|
||||||
|
org.elasticsearch.common.collect.Set.of(DiscoveryNodeRole.MASTER_ROLE), Version.CURRENT);
|
||||||
|
clusterState = ClusterState.builder(ClusterName.DEFAULT)
|
||||||
|
.version(12L).nodes(DiscoveryNodes.builder().add(masterNode).localNodeId(masterNode.getId())).build();
|
||||||
|
|
||||||
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 15L, electionStrategy,
|
||||||
|
new StatusInfo(UNHEALTHY, "unhealthy-info"))
|
||||||
|
.getDescription(),
|
||||||
|
is("this node is unhealthy: unhealthy-info"));
|
||||||
|
}
|
||||||
|
|
||||||
public void testDescriptionBeforeBootstrapping() {
|
public void testDescriptionBeforeBootstrapping() {
|
||||||
final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
final ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT)
|
final ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT)
|
||||||
|
@ -187,14 +215,16 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
.metadata(Metadata.builder().coordinationMetadata(CoordinationMetadata.builder().term(4L).build()))
|
.metadata(Metadata.builder().coordinationMetadata(CoordinationMetadata.builder().term(4L).build()))
|
||||||
.nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId())).build();
|
.nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId())).build();
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 1L, electionStrategy).getDescription(),
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 1L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and " +
|
is("master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and " +
|
||||||
"[cluster.initial_master_nodes] is empty on this node: have discovered []; " +
|
"[cluster.initial_master_nodes] is empty on this node: have discovered []; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 1, last-accepted version 7 in term 4"));
|
"] from last-known cluster state; node term 1, last-accepted version 7 in term 4"));
|
||||||
|
|
||||||
final TransportAddress otherAddress = buildNewFakeTransportAddress();
|
final TransportAddress otherAddress = buildNewFakeTransportAddress();
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 2L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 2L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info"))
|
||||||
.getDescription(),
|
.getDescription(),
|
||||||
is("master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and " +
|
is("master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and " +
|
||||||
"[cluster.initial_master_nodes] is empty on this node: have discovered []; " +
|
"[cluster.initial_master_nodes] is empty on this node: have discovered []; " +
|
||||||
|
@ -202,7 +232,8 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
"] from last-known cluster state; node term 2, last-accepted version 7 in term 4"));
|
"] from last-known cluster state; node term 2, last-accepted version 7 in term 4"));
|
||||||
|
|
||||||
final DiscoveryNode otherNode = new DiscoveryNode("other", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode otherNode = new DiscoveryNode("other", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 3L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 3L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info"))
|
||||||
.getDescription(),
|
.getDescription(),
|
||||||
is("master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and " +
|
is("master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and " +
|
||||||
"[cluster.initial_master_nodes] is empty on this node: have discovered [" + otherNode + "]; " +
|
"[cluster.initial_master_nodes] is empty on this node: have discovered [" + otherNode + "]; " +
|
||||||
|
@ -210,7 +241,8 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
"] from last-known cluster state; node term 3, last-accepted version 7 in term 4"));
|
"] from last-known cluster state; node term 3, last-accepted version 7 in term 4"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.builder().putList(INITIAL_MASTER_NODES_SETTING.getKey(), "other").build(),
|
assertThat(new ClusterFormationState(Settings.builder().putList(INITIAL_MASTER_NODES_SETTING.getKey(), "other").build(),
|
||||||
clusterState, emptyList(), emptyList(), 4L, electionStrategy).getDescription(),
|
clusterState, emptyList(), emptyList(), 4L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and " +
|
is("master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and " +
|
||||||
"this node must discover master-eligible nodes [other] to bootstrap a cluster: have discovered []; " +
|
"this node must discover master-eligible nodes [other] to bootstrap a cluster: have discovered []; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
|
@ -240,31 +272,32 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
final ClusterState clusterState = state(localNode,
|
final ClusterState clusterState = state(localNode,
|
||||||
VotingConfiguration.MUST_JOIN_ELECTED_MASTER.getNodeIds().toArray(new String[0]));
|
VotingConfiguration.MUST_JOIN_ELECTED_MASTER.getNodeIds().toArray(new String[0]));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 0L, electionStrategy).getDescription(),
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 0L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered yet and this node was detached from its previous cluster, " +
|
is("master not discovered yet and this node was detached from its previous cluster, " +
|
||||||
"have discovered []; " +
|
"have discovered []; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
final TransportAddress otherAddress = buildNewFakeTransportAddress();
|
final TransportAddress otherAddress = buildNewFakeTransportAddress();
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 0L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 0L, electionStrategy,
|
||||||
.getDescription(),
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered yet and this node was detached from its previous cluster, " +
|
is("master not discovered yet and this node was detached from its previous cluster, " +
|
||||||
"have discovered []; " +
|
"have discovered []; " +
|
||||||
"discovery will continue using [" + otherAddress + "] from hosts providers and [" + localNode +
|
"discovery will continue using [" + otherAddress + "] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
final DiscoveryNode otherNode = new DiscoveryNode("otherNode", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode otherNode = new DiscoveryNode("otherNode", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 0L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 0L, electionStrategy,
|
||||||
.getDescription(),
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered yet and this node was detached from its previous cluster, " +
|
is("master not discovered yet and this node was detached from its previous cluster, " +
|
||||||
"have discovered [" + otherNode + "]; " +
|
"have discovered [" + otherNode + "]; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
final DiscoveryNode yetAnotherNode = new DiscoveryNode("yetAnotherNode", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode yetAnotherNode = new DiscoveryNode("yetAnotherNode", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(yetAnotherNode), 0L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(yetAnotherNode), 0L, electionStrategy,
|
||||||
.getDescription(),
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered yet and this node was detached from its previous cluster, " +
|
is("master not discovered yet and this node was detached from its previous cluster, " +
|
||||||
"have discovered [" + yetAnotherNode + "]; " +
|
"have discovered [" + yetAnotherNode + "]; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
|
@ -277,109 +310,110 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
|
|
||||||
final ClusterState clusterState = state(localNode, "otherNode");
|
final ClusterState clusterState = state(localNode, "otherNode");
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 0L, electionStrategy).getDescription(),
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 0L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires a node with id [otherNode], " +
|
is("master not discovered or elected yet, an election requires a node with id [otherNode], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
final TransportAddress otherAddress = buildNewFakeTransportAddress();
|
final TransportAddress otherAddress = buildNewFakeTransportAddress();
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 0L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 0L, electionStrategy,
|
||||||
.getDescription(),
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires a node with id [otherNode], " +
|
is("master not discovered or elected yet, an election requires a node with id [otherNode], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [" + otherAddress + "] from hosts providers and [" + localNode +
|
"discovery will continue using [" + otherAddress + "] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
final DiscoveryNode otherNode = new DiscoveryNode("otherNode", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode otherNode = new DiscoveryNode("otherNode", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 0L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 0L, electionStrategy,
|
||||||
.getDescription(),
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires a node with id [otherNode], " +
|
is("master not discovered or elected yet, an election requires a node with id [otherNode], " +
|
||||||
"have discovered [" + otherNode + "] which is a quorum; " +
|
"have discovered [" + otherNode + "] which is a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
final DiscoveryNode yetAnotherNode = new DiscoveryNode("yetAnotherNode", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode yetAnotherNode = new DiscoveryNode("yetAnotherNode", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(yetAnotherNode), 0L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(yetAnotherNode), 0L, electionStrategy,
|
||||||
.getDescription(),
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires a node with id [otherNode], " +
|
is("master not discovered or elected yet, an election requires a node with id [otherNode], " +
|
||||||
"have discovered [" + yetAnotherNode + "] which is not a quorum; " +
|
"have discovered [" + yetAnotherNode + "] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2"), emptyList(), emptyList(), 0L, electionStrategy)
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2"), emptyList(), emptyList(), 0L, electionStrategy,
|
||||||
.getDescription(),
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires two nodes with ids [n1, n2], " +
|
is("master not discovered or elected yet, an election requires two nodes with ids [n1, n2], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3"), emptyList(), emptyList(), 0L,
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3"), emptyList(), emptyList(), 0L,
|
||||||
electionStrategy).getDescription(),
|
electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires at least 2 nodes with ids from [n1, n2, n3], " +
|
is("master not discovered or elected yet, an election requires at least 2 nodes with ids from [n1, n2, n3], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", BOOTSTRAP_PLACEHOLDER_PREFIX + "n3"),
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", BOOTSTRAP_PLACEHOLDER_PREFIX + "n3"),
|
||||||
emptyList(), emptyList(), 0L, electionStrategy).getDescription(),
|
emptyList(), emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires 2 nodes with ids [n1, n2], " +
|
is("master not discovered or elected yet, an election requires 2 nodes with ids [n1, n2], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3", "n4"), emptyList(), emptyList(), 0L,
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3", "n4"), emptyList(), emptyList(), 0L,
|
||||||
electionStrategy).getDescription(),
|
electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires at least 3 nodes with ids from [n1, n2, n3, n4], " +
|
is("master not discovered or elected yet, an election requires at least 3 nodes with ids from [n1, n2, n3, n4], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3", "n4", "n5"), emptyList(), emptyList(), 0L,
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3", "n4", "n5"), emptyList(), emptyList(), 0L,
|
||||||
electionStrategy).getDescription(),
|
electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires at least 3 nodes with ids from [n1, n2, n3, n4, n5], " +
|
is("master not discovered or elected yet, an election requires at least 3 nodes with ids from [n1, n2, n3, n4, n5], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3", "n4", BOOTSTRAP_PLACEHOLDER_PREFIX + "n5"),
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3", "n4", BOOTSTRAP_PLACEHOLDER_PREFIX + "n5"),
|
||||||
emptyList(), emptyList(), 0L, electionStrategy).getDescription(),
|
emptyList(), emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires at least 3 nodes with ids from [n1, n2, n3, n4], " +
|
is("master not discovered or elected yet, an election requires at least 3 nodes with ids from [n1, n2, n3, n4], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3",
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, "n1", "n2", "n3",
|
||||||
BOOTSTRAP_PLACEHOLDER_PREFIX + "n4", BOOTSTRAP_PLACEHOLDER_PREFIX + "n5"), emptyList(), emptyList(), 0L, electionStrategy)
|
BOOTSTRAP_PLACEHOLDER_PREFIX + "n4", BOOTSTRAP_PLACEHOLDER_PREFIX + "n5"), emptyList(), emptyList(), 0L, electionStrategy,
|
||||||
.getDescription(),
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires 3 nodes with ids [n1, n2, n3], " +
|
is("master not discovered or elected yet, an election requires 3 nodes with ids [n1, n2, n3], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, new String[]{"n1"}, new String[]{"n1"}), emptyList(),
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, new String[]{"n1"}, new String[]{"n1"}), emptyList(),
|
||||||
emptyList(), 0L, electionStrategy).getDescription(),
|
emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires a node with id [n1], " +
|
is("master not discovered or elected yet, an election requires a node with id [n1], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, new String[]{"n1"}, new String[]{"n2"}), emptyList(),
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, new String[]{"n1"}, new String[]{"n2"}), emptyList(),
|
||||||
emptyList(), 0L, electionStrategy).getDescription(),
|
emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires a node with id [n1] and a node with id [n2], " +
|
is("master not discovered or elected yet, an election requires a node with id [n1] and a node with id [n2], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, new String[]{"n1"}, new String[]{"n2", "n3"}), emptyList(),
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, new String[]{"n1"}, new String[]{"n2", "n3"}), emptyList(),
|
||||||
emptyList(), 0L, electionStrategy).getDescription(),
|
emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires a node with id [n1] and two nodes with ids [n2, n3], " +
|
is("master not discovered or elected yet, an election requires a node with id [n1] and two nodes with ids [n2, n3], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
"discovery will continue using [] from hosts providers and [" + localNode +
|
"discovery will continue using [] from hosts providers and [" + localNode +
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, new String[]{"n1"}, new String[]{"n2", "n3", "n4"}),
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, new String[]{"n1"}, new String[]{"n2", "n3", "n4"}),
|
||||||
emptyList(), emptyList(), 0L, electionStrategy).getDescription(),
|
emptyList(), emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires a node with id [n1] and " +
|
is("master not discovered or elected yet, an election requires a node with id [n1] and " +
|
||||||
"at least 2 nodes with ids from [n2, n3, n4], " +
|
"at least 2 nodes with ids from [n2, n3, n4], " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
|
@ -400,7 +434,8 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
.lastCommittedConfiguration(config(configNodeIds)).build())).build();
|
.lastCommittedConfiguration(config(configNodeIds)).build())).build();
|
||||||
|
|
||||||
assertThat(
|
assertThat(
|
||||||
new ClusterFormationState(Settings.EMPTY, stateWithOtherNodes, emptyList(), emptyList(), 0L, electionStrategy).getDescription(),
|
new ClusterFormationState(Settings.EMPTY, stateWithOtherNodes, emptyList(), emptyList(), 0L, electionStrategy,
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
|
|
||||||
// nodes from last-known cluster state could be in either order
|
// nodes from last-known cluster state could be in either order
|
||||||
is(oneOf(
|
is(oneOf(
|
||||||
|
@ -415,7 +450,7 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
|
||||||
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0")));
|
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0")));
|
||||||
|
|
||||||
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, GatewayMetaState.STALE_STATE_CONFIG_NODE_ID), emptyList(),
|
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, GatewayMetaState.STALE_STATE_CONFIG_NODE_ID), emptyList(),
|
||||||
emptyList(), 0L, electionStrategy).getDescription(),
|
emptyList(), 0L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(),
|
||||||
is("master not discovered or elected yet, an election requires one or more nodes that have already participated as " +
|
is("master not discovered or elected yet, an election requires one or more nodes that have already participated as " +
|
||||||
"master-eligible nodes in the cluster but this node was not master-eligible the last time it joined the cluster, " +
|
"master-eligible nodes in the cluster but this node was not master-eligible the last time it joined the cluster, " +
|
||||||
"have discovered [] which is not a quorum; " +
|
"have discovered [] which is not a quorum; " +
|
||||||
|
|
|
@ -45,6 +45,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||||
import org.elasticsearch.discovery.DiscoveryModule;
|
import org.elasticsearch.discovery.DiscoveryModule;
|
||||||
import org.elasticsearch.discovery.zen.PublishClusterStateStats;
|
import org.elasticsearch.discovery.zen.PublishClusterStateStats;
|
||||||
import org.elasticsearch.gateway.GatewayService;
|
import org.elasticsearch.gateway.GatewayService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.test.MockLogAppender;
|
import org.elasticsearch.test.MockLogAppender;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -54,6 +55,7 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ -75,8 +77,11 @@ import static org.elasticsearch.cluster.coordination.NoMasterBlockService.NO_MAS
|
||||||
import static org.elasticsearch.cluster.coordination.NoMasterBlockService.NO_MASTER_BLOCK_WRITES;
|
import static org.elasticsearch.cluster.coordination.NoMasterBlockService.NO_MASTER_BLOCK_WRITES;
|
||||||
import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION;
|
import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION;
|
||||||
import static org.elasticsearch.discovery.PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING;
|
import static org.elasticsearch.discovery.PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
import static org.elasticsearch.test.NodeRoles.nonMasterNode;
|
import static org.elasticsearch.test.NodeRoles.nonMasterNode;
|
||||||
import static org.hamcrest.Matchers.allOf;
|
import static org.hamcrest.Matchers.allOf;
|
||||||
|
import static org.hamcrest.Matchers.anyOf;
|
||||||
import static org.hamcrest.Matchers.containsString;
|
import static org.hamcrest.Matchers.containsString;
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
|
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
|
||||||
|
@ -160,6 +165,77 @@ public class CoordinatorTests extends AbstractCoordinatorTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testUnhealthyNodesGetsRemoved() {
|
||||||
|
AtomicReference<StatusInfo> healthStatusInfo = new AtomicReference<>(
|
||||||
|
new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
try (Cluster cluster = new Cluster(3)) {
|
||||||
|
cluster.runRandomly();
|
||||||
|
cluster.stabilise();
|
||||||
|
|
||||||
|
final ClusterNode leader = cluster.getAnyLeader();
|
||||||
|
logger.info("--> adding two new healthy nodes");
|
||||||
|
ClusterNode newNode1 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings,
|
||||||
|
() -> healthStatusInfo.get());
|
||||||
|
ClusterNode newNode2 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings,
|
||||||
|
() -> healthStatusInfo.get());
|
||||||
|
cluster.clusterNodes.add(newNode1);
|
||||||
|
cluster.clusterNodes.add(newNode2);
|
||||||
|
cluster.stabilise(
|
||||||
|
// The first pinging discovers the master
|
||||||
|
defaultMillis(DISCOVERY_FIND_PEERS_INTERVAL_SETTING)
|
||||||
|
// One message delay to send a join
|
||||||
|
+ DEFAULT_DELAY_VARIABILITY
|
||||||
|
// Commit a new cluster state with the new node(s). Might be split into multiple commits, and each might need a
|
||||||
|
// followup reconfiguration
|
||||||
|
+ 2 * 2 * DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
|
||||||
|
|
||||||
|
{
|
||||||
|
assertThat(leader.coordinator.getMode(), is(Mode.LEADER));
|
||||||
|
final VotingConfiguration lastCommittedConfiguration = leader.getLastAppliedClusterState().getLastCommittedConfiguration();
|
||||||
|
assertThat(lastCommittedConfiguration + " should be all nodes", lastCommittedConfiguration.getNodeIds(),
|
||||||
|
equalTo(cluster.clusterNodes.stream().map(ClusterNode::getId).collect(Collectors.toSet())));
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("setting auto-shrink reconfiguration to true");
|
||||||
|
leader.submitSetAutoShrinkVotingConfiguration(true);
|
||||||
|
cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
|
||||||
|
assertTrue(CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION.get(leader.getLastAppliedClusterState().metadata().settings()));
|
||||||
|
|
||||||
|
logger.info("--> changing health of newly added nodes to unhealthy");
|
||||||
|
healthStatusInfo.getAndSet(new StatusInfo(UNHEALTHY, "unhealthy-info"));
|
||||||
|
|
||||||
|
cluster.stabilise(Math.max(
|
||||||
|
// Each follower may have just sent a leader check, which receives no response
|
||||||
|
defaultMillis(LEADER_CHECK_TIMEOUT_SETTING)
|
||||||
|
// then wait for the follower to check the leader
|
||||||
|
+ defaultMillis(LEADER_CHECK_INTERVAL_SETTING)
|
||||||
|
// then wait for the exception response
|
||||||
|
+ DEFAULT_DELAY_VARIABILITY,
|
||||||
|
|
||||||
|
// ALSO the leader may have just sent a follower check, which receives no response
|
||||||
|
defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING)
|
||||||
|
// wait for the leader to check its followers
|
||||||
|
+ defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING)
|
||||||
|
// then wait for the exception response
|
||||||
|
+ DEFAULT_DELAY_VARIABILITY)
|
||||||
|
|
||||||
|
// FINALLY:
|
||||||
|
|
||||||
|
// wait for the removal to be committed
|
||||||
|
+ DEFAULT_CLUSTER_STATE_UPDATE_DELAY
|
||||||
|
// then wait for the followup reconfiguration
|
||||||
|
+ DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
|
||||||
|
{
|
||||||
|
final ClusterNode newLeader = cluster.getAnyLeader();
|
||||||
|
final VotingConfiguration lastCommittedConfiguration
|
||||||
|
= newLeader.getLastAppliedClusterState().getLastCommittedConfiguration();
|
||||||
|
assertThat(lastCommittedConfiguration + " should be 3 nodes", lastCommittedConfiguration.getNodeIds().size(), equalTo(3));
|
||||||
|
assertFalse(lastCommittedConfiguration.getNodeIds().contains(newNode1.getId()));
|
||||||
|
assertFalse(lastCommittedConfiguration.getNodeIds().contains(newNode2.getId()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void testNodesJoinAfterStableCluster() {
|
public void testNodesJoinAfterStableCluster() {
|
||||||
try (Cluster cluster = new Cluster(randomIntBetween(1, 5))) {
|
try (Cluster cluster = new Cluster(randomIntBetween(1, 5))) {
|
||||||
cluster.runRandomly();
|
cluster.runRandomly();
|
||||||
|
@ -489,6 +565,66 @@ public class CoordinatorTests extends AbstractCoordinatorTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testUnHealthyLeaderRemoved() {
|
||||||
|
AtomicReference<StatusInfo> nodeHealthServiceStatus = new AtomicReference<>(new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
try (Cluster cluster = new Cluster(randomIntBetween(1, 3), true, Settings.EMPTY,
|
||||||
|
() -> nodeHealthServiceStatus.get())) {
|
||||||
|
cluster.runRandomly();
|
||||||
|
cluster.stabilise();
|
||||||
|
|
||||||
|
final ClusterNode leader = cluster.getAnyLeader();
|
||||||
|
|
||||||
|
logger.info("--> adding three new healthy nodes");
|
||||||
|
ClusterNode newNode1 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings,
|
||||||
|
() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
ClusterNode newNode2 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings,
|
||||||
|
() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
ClusterNode newNode3 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings,
|
||||||
|
() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
cluster.clusterNodes.add(newNode1);
|
||||||
|
cluster.clusterNodes.add(newNode2);
|
||||||
|
cluster.clusterNodes.add(newNode3);
|
||||||
|
cluster.stabilise(
|
||||||
|
// The first pinging discovers the master
|
||||||
|
defaultMillis(DISCOVERY_FIND_PEERS_INTERVAL_SETTING)
|
||||||
|
// One message delay to send a join
|
||||||
|
+ DEFAULT_DELAY_VARIABILITY
|
||||||
|
// Commit a new cluster state with the new node(s). Might be split into multiple commits, and each might need a
|
||||||
|
// followup reconfiguration
|
||||||
|
+ 3 * 2 * DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
|
||||||
|
|
||||||
|
logger.info("--> changing health status of leader {} to unhealthy", leader);
|
||||||
|
nodeHealthServiceStatus.getAndSet(new StatusInfo(UNHEALTHY, "unhealthy-info"));
|
||||||
|
|
||||||
|
cluster.stabilise(
|
||||||
|
// first wait for all the followers to notice the leader has gone
|
||||||
|
(defaultMillis(LEADER_CHECK_INTERVAL_SETTING) + defaultMillis(LEADER_CHECK_TIMEOUT_SETTING))
|
||||||
|
// then wait for a follower to be promoted to leader
|
||||||
|
+ DEFAULT_ELECTION_DELAY
|
||||||
|
// and the first publication times out because of the unresponsive node
|
||||||
|
+ defaultMillis(PUBLISH_TIMEOUT_SETTING)
|
||||||
|
// there might be a term bump causing another election
|
||||||
|
+ DEFAULT_ELECTION_DELAY
|
||||||
|
|
||||||
|
// then wait for both of:
|
||||||
|
+ Math.max(
|
||||||
|
// 1. the term bumping publication to time out
|
||||||
|
defaultMillis(PUBLISH_TIMEOUT_SETTING),
|
||||||
|
// 2. the new leader to notice that the old leader is unresponsive
|
||||||
|
(defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING) + defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING))
|
||||||
|
)
|
||||||
|
|
||||||
|
// then wait for the new leader to commit a state without the old leader
|
||||||
|
+ DEFAULT_CLUSTER_STATE_UPDATE_DELAY
|
||||||
|
// then wait for the followup reconfiguration
|
||||||
|
+ DEFAULT_CLUSTER_STATE_UPDATE_DELAY
|
||||||
|
);
|
||||||
|
|
||||||
|
assertThat(cluster.getAnyLeader().getId(), anyOf(equalTo(newNode1.getId()), equalTo(newNode2.getId()),
|
||||||
|
equalTo(newNode3.getId())));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void testFollowerDisconnectionDetectedQuickly() {
|
public void testFollowerDisconnectionDetectedQuickly() {
|
||||||
try (Cluster cluster = new Cluster(randomIntBetween(3, 5))) {
|
try (Cluster cluster = new Cluster(randomIntBetween(3, 5))) {
|
||||||
cluster.runRandomly();
|
cluster.runRandomly();
|
||||||
|
@ -1024,7 +1160,8 @@ public class CoordinatorTests extends AbstractCoordinatorTestCase {
|
||||||
|
|
||||||
final ClusterNode newNode = cluster1.new ClusterNode(nextNodeIndex.getAndIncrement(),
|
final ClusterNode newNode = cluster1.new ClusterNode(nextNodeIndex.getAndIncrement(),
|
||||||
nodeInOtherCluster.getLocalNode(), n -> cluster1.new MockPersistedState(n, nodeInOtherCluster.persistedState,
|
nodeInOtherCluster.getLocalNode(), n -> cluster1.new MockPersistedState(n, nodeInOtherCluster.persistedState,
|
||||||
Function.identity(), Function.identity()), nodeInOtherCluster.nodeSettings);
|
Function.identity(), Function.identity()), nodeInOtherCluster.nodeSettings,
|
||||||
|
() -> new StatusInfo(StatusInfo.Status.HEALTHY, "healthy-info"));
|
||||||
|
|
||||||
cluster1.clusterNodes.add(newNode);
|
cluster1.clusterNodes.add(newNode);
|
||||||
|
|
||||||
|
|
|
@ -29,6 +29,8 @@ import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||||
import org.elasticsearch.common.io.stream.StreamInput;
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.common.settings.Settings.Builder;
|
import org.elasticsearch.common.settings.Settings.Builder;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
import org.elasticsearch.test.EqualsHashCodeTestUtils;
|
import org.elasticsearch.test.EqualsHashCodeTestUtils;
|
||||||
import org.elasticsearch.test.EqualsHashCodeTestUtils.CopyFunction;
|
import org.elasticsearch.test.EqualsHashCodeTestUtils.CopyFunction;
|
||||||
|
@ -63,6 +65,8 @@ import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_C
|
||||||
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING;
|
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING;
|
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING;
|
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||||
import static org.elasticsearch.transport.TransportService.HANDSHAKE_ACTION_NAME;
|
import static org.elasticsearch.transport.TransportService.HANDSHAKE_ACTION_NAME;
|
||||||
import static org.hamcrest.Matchers.contains;
|
import static org.hamcrest.Matchers.contains;
|
||||||
|
@ -109,7 +113,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
assert false : fcr;
|
assert false : fcr;
|
||||||
}, (node, reason) -> {
|
}, (node, reason) -> {
|
||||||
assert false : node;
|
assert false : node;
|
||||||
});
|
}, () -> new StatusInfo(StatusInfo.Status.HEALTHY, "healthy-info"));
|
||||||
|
|
||||||
followersChecker.setCurrentNodes(discoveryNodesHolder[0]);
|
followersChecker.setCurrentNodes(discoveryNodesHolder[0]);
|
||||||
deterministicTaskQueue.runAllTasks();
|
deterministicTaskQueue.runAllTasks();
|
||||||
|
@ -179,7 +183,8 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
testBehaviourOfFailingNode(settings, () -> null,
|
testBehaviourOfFailingNode(settings, () -> null,
|
||||||
"followers check retry count exceeded",
|
"followers check retry count exceeded",
|
||||||
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis()
|
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis()
|
||||||
+ FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) * FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings).millis());
|
+ FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) * FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings).millis(),
|
||||||
|
() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFailsNodeThatRejectsCheck() {
|
public void testFailsNodeThatRejectsCheck() {
|
||||||
|
@ -196,7 +201,8 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
throw new ElasticsearchException("simulated exception");
|
throw new ElasticsearchException("simulated exception");
|
||||||
},
|
},
|
||||||
"followers check retry count exceeded",
|
"followers check retry count exceeded",
|
||||||
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis());
|
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis(),
|
||||||
|
() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFailureCounterResetsOnSuccess() {
|
public void testFailureCounterResetsOnSuccess() {
|
||||||
|
@ -229,13 +235,13 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
},
|
},
|
||||||
"followers check retry count exceeded",
|
"followers check retry count exceeded",
|
||||||
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) * (maxRecoveries + 1) - 1)
|
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) * (maxRecoveries + 1) - 1)
|
||||||
* FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis());
|
* FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis(), () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFailsNodeThatIsDisconnected() {
|
public void testFailsNodeThatIsDisconnected() {
|
||||||
testBehaviourOfFailingNode(Settings.EMPTY, () -> {
|
testBehaviourOfFailingNode(Settings.EMPTY, () -> {
|
||||||
throw new ConnectTransportException(null, "simulated exception");
|
throw new ConnectTransportException(null, "simulated exception");
|
||||||
}, "disconnected", 0);
|
}, "disconnected", 0, () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFailsNodeThatDisconnects() {
|
public void testFailsNodeThatDisconnects() {
|
||||||
|
@ -278,7 +284,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
}, (node, reason) -> {
|
}, (node, reason) -> {
|
||||||
assertTrue(nodeFailed.compareAndSet(false, true));
|
assertTrue(nodeFailed.compareAndSet(false, true));
|
||||||
assertThat(reason, equalTo("disconnected"));
|
assertThat(reason, equalTo("disconnected"));
|
||||||
});
|
}, () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
|
||||||
DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().add(localNode).add(otherNode).localNodeId(localNode.getId()).build();
|
DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().add(localNode).add(otherNode).localNodeId(localNode.getId()).build();
|
||||||
followersChecker.setCurrentNodes(discoveryNodes);
|
followersChecker.setCurrentNodes(discoveryNodes);
|
||||||
|
@ -290,8 +296,23 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
assertThat(followersChecker.getFaultyNodes(), contains(otherNode));
|
assertThat(followersChecker.getFaultyNodes(), contains(otherNode));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testFailsNodeThatIsUnhealthy() {
|
||||||
|
final Builder settingsBuilder = Settings.builder();
|
||||||
|
if (randomBoolean()) {
|
||||||
|
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
|
||||||
|
}
|
||||||
|
if (randomBoolean()) {
|
||||||
|
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
|
||||||
|
}
|
||||||
|
final Settings settings = settingsBuilder.build();
|
||||||
|
|
||||||
|
testBehaviourOfFailingNode(settings, () -> {
|
||||||
|
throw new NodeHealthCheckFailureException("non writable exception");
|
||||||
|
}, "health check failed", 0, () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
}
|
||||||
|
|
||||||
private void testBehaviourOfFailingNode(Settings testSettings, Supplier<TransportResponse.Empty> responder, String failureReason,
|
private void testBehaviourOfFailingNode(Settings testSettings, Supplier<TransportResponse.Empty> responder, String failureReason,
|
||||||
long expectedFailureTime) {
|
long expectedFailureTime, NodeHealthService nodeHealthService) {
|
||||||
final DiscoveryNode localNode = new DiscoveryNode("local-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode localNode = new DiscoveryNode("local-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
final DiscoveryNode otherNode = new DiscoveryNode("other-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode otherNode = new DiscoveryNode("other-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
final Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), localNode.getName()).put(testSettings).build();
|
final Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), localNode.getName()).put(testSettings).build();
|
||||||
|
@ -339,7 +360,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
}, (node, reason) -> {
|
}, (node, reason) -> {
|
||||||
assertTrue(nodeFailed.compareAndSet(false, true));
|
assertTrue(nodeFailed.compareAndSet(false, true));
|
||||||
assertThat(reason, equalTo(failureReason));
|
assertThat(reason, equalTo(failureReason));
|
||||||
});
|
}, nodeHealthService);
|
||||||
|
|
||||||
DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().add(localNode).add(otherNode).localNodeId(localNode.getId()).build();
|
DiscoveryNodes discoveryNodes = DiscoveryNodes.builder().add(localNode).add(otherNode).localNodeId(localNode.getId()).build();
|
||||||
followersChecker.setCurrentNodes(discoveryNodes);
|
followersChecker.setCurrentNodes(discoveryNodes);
|
||||||
|
@ -401,6 +422,71 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testUnhealthyNodeRejectsImmediately(){
|
||||||
|
|
||||||
|
final DiscoveryNode leader = new DiscoveryNode("leader", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
final DiscoveryNode follower = new DiscoveryNode("follower", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
final Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), follower.getName()).build();
|
||||||
|
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
|
||||||
|
|
||||||
|
final MockTransport mockTransport = new MockTransport() {
|
||||||
|
@Override
|
||||||
|
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
|
||||||
|
throw new AssertionError("no requests expected");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
final TransportService transportService = mockTransport.createTransportService(settings, deterministicTaskQueue.getThreadPool(),
|
||||||
|
TransportService.NOOP_TRANSPORT_INTERCEPTOR, boundTransportAddress -> follower, null, emptySet());
|
||||||
|
transportService.start();
|
||||||
|
transportService.acceptIncomingRequests();
|
||||||
|
|
||||||
|
final AtomicBoolean calledCoordinator = new AtomicBoolean();
|
||||||
|
final AtomicReference<RuntimeException> coordinatorException = new AtomicReference<>();
|
||||||
|
|
||||||
|
final FollowersChecker followersChecker = new FollowersChecker(settings, transportService,
|
||||||
|
fcr -> {
|
||||||
|
assertTrue(calledCoordinator.compareAndSet(false, true));
|
||||||
|
final RuntimeException exception = coordinatorException.get();
|
||||||
|
if (exception != null) {
|
||||||
|
throw exception;
|
||||||
|
}
|
||||||
|
}, (node, reason) -> {
|
||||||
|
assert false : node;
|
||||||
|
}, () -> new StatusInfo(UNHEALTHY, "unhealthy-info"));
|
||||||
|
|
||||||
|
final long leaderTerm = randomLongBetween(2, Long.MAX_VALUE);
|
||||||
|
final long followerTerm = randomLongBetween(1, leaderTerm - 1);
|
||||||
|
followersChecker.updateFastResponseState(followerTerm, Mode.FOLLOWER);
|
||||||
|
final AtomicReference<TransportException> receivedException = new AtomicReference<>();
|
||||||
|
transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(leaderTerm, leader),
|
||||||
|
new TransportResponseHandler<TransportResponse.Empty>() {
|
||||||
|
@Override
|
||||||
|
public TransportResponse.Empty read(StreamInput in) {
|
||||||
|
return TransportResponse.Empty.INSTANCE;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void handleResponse(TransportResponse.Empty response) {
|
||||||
|
fail("unexpected success");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void handleException(TransportException exp) {
|
||||||
|
assertThat(exp, not(nullValue()));
|
||||||
|
assertTrue(receivedException.compareAndSet(null, exp));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String executor() {
|
||||||
|
return Names.SAME;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
deterministicTaskQueue.runAllTasks();
|
||||||
|
assertFalse(calledCoordinator.get());
|
||||||
|
assertThat(receivedException.get(), not(nullValue()));
|
||||||
|
}
|
||||||
|
|
||||||
public void testResponder() {
|
public void testResponder() {
|
||||||
final DiscoveryNode leader = new DiscoveryNode("leader", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode leader = new DiscoveryNode("leader", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
final DiscoveryNode follower = new DiscoveryNode("follower", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode follower = new DiscoveryNode("follower", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
@ -431,7 +517,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
}, (node, reason) -> {
|
}, (node, reason) -> {
|
||||||
assert false : node;
|
assert false : node;
|
||||||
});
|
}, () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
|
||||||
{
|
{
|
||||||
// Does not call into the coordinator in the normal case
|
// Does not call into the coordinator in the normal case
|
||||||
|
@ -560,7 +646,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
||||||
assert false : fcr;
|
assert false : fcr;
|
||||||
}, (node, reason) -> {
|
}, (node, reason) -> {
|
||||||
assert false : node;
|
assert false : node;
|
||||||
});
|
},() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
followersChecker.setCurrentNodes(discoveryNodes);
|
followersChecker.setCurrentNodes(discoveryNodes);
|
||||||
List<DiscoveryNode> followerTargets = Stream.of(capturingTransport.getCapturedRequestsAndClear())
|
List<DiscoveryNode> followerTargets = Stream.of(capturingTransport.getCapturedRequestsAndClear())
|
||||||
.map(cr -> cr.node).collect(Collectors.toList());
|
.map(cr -> cr.node).collect(Collectors.toList());
|
||||||
|
|
|
@ -29,6 +29,7 @@ import org.elasticsearch.cluster.metadata.Metadata;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNode;
|
import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.discovery.zen.MembershipAction;
|
import org.elasticsearch.discovery.zen.MembershipAction;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
import org.elasticsearch.test.transport.CapturingTransport;
|
import org.elasticsearch.test.transport.CapturingTransport;
|
||||||
import org.elasticsearch.test.transport.CapturingTransport.CapturedRequest;
|
import org.elasticsearch.test.transport.CapturingTransport.CapturedRequest;
|
||||||
|
@ -40,7 +41,10 @@ import org.elasticsearch.transport.TransportService;
|
||||||
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||||
import static org.hamcrest.Matchers.containsString;
|
import static org.hamcrest.Matchers.containsString;
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
@ -58,7 +62,8 @@ public class JoinHelperTests extends ESTestCase {
|
||||||
x -> localNode, null, Collections.emptySet());
|
x -> localNode, null, Collections.emptySet());
|
||||||
JoinHelper joinHelper = new JoinHelper(Settings.EMPTY, null, null, transportService, () -> 0L, () -> null,
|
JoinHelper joinHelper = new JoinHelper(Settings.EMPTY, null, null, transportService, () -> 0L, () -> null,
|
||||||
(joinRequest, joinCallback) -> { throw new AssertionError(); }, startJoinRequest -> { throw new AssertionError(); },
|
(joinRequest, joinCallback) -> { throw new AssertionError(); }, startJoinRequest -> { throw new AssertionError(); },
|
||||||
Collections.emptyList(), (s, p, r) -> {});
|
Collections.emptyList(), (s, p, r) -> {},
|
||||||
|
() -> new StatusInfo(HEALTHY, "info"));
|
||||||
transportService.start();
|
transportService.start();
|
||||||
|
|
||||||
DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), Version.CURRENT);
|
DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
@ -164,7 +169,7 @@ public class JoinHelperTests extends ESTestCase {
|
||||||
x -> localNode, null, Collections.emptySet());
|
x -> localNode, null, Collections.emptySet());
|
||||||
new JoinHelper(Settings.EMPTY, null, null, transportService, () -> 0L, () -> localClusterState,
|
new JoinHelper(Settings.EMPTY, null, null, transportService, () -> 0L, () -> localClusterState,
|
||||||
(joinRequest, joinCallback) -> { throw new AssertionError(); }, startJoinRequest -> { throw new AssertionError(); },
|
(joinRequest, joinCallback) -> { throw new AssertionError(); }, startJoinRequest -> { throw new AssertionError(); },
|
||||||
Collections.emptyList(), (s, p, r) -> {}); // registers request handler
|
Collections.emptyList(), (s, p, r) -> {}, null); // registers request handler
|
||||||
transportService.start();
|
transportService.start();
|
||||||
transportService.acceptIncomingRequests();
|
transportService.acceptIncomingRequests();
|
||||||
|
|
||||||
|
@ -183,4 +188,54 @@ public class JoinHelperTests extends ESTestCase {
|
||||||
assertThat(coordinationStateRejectedException.getMessage(), containsString(localClusterState.metadata().clusterUUID()));
|
assertThat(coordinationStateRejectedException.getMessage(), containsString(localClusterState.metadata().clusterUUID()));
|
||||||
assertThat(coordinationStateRejectedException.getMessage(), containsString(otherClusterState.metadata().clusterUUID()));
|
assertThat(coordinationStateRejectedException.getMessage(), containsString(otherClusterState.metadata().clusterUUID()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testJoinFailureOnUnhealthyNodes() {
|
||||||
|
DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(
|
||||||
|
Settings.builder().put(NODE_NAME_SETTING.getKey(), "node0").build(), random());
|
||||||
|
CapturingTransport capturingTransport = new CapturingTransport();
|
||||||
|
DiscoveryNode localNode = new DiscoveryNode("node0", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
TransportService transportService = capturingTransport.createTransportService(Settings.EMPTY,
|
||||||
|
deterministicTaskQueue.getThreadPool(), TransportService.NOOP_TRANSPORT_INTERCEPTOR,
|
||||||
|
x -> localNode, null, Collections.emptySet());
|
||||||
|
AtomicReference<StatusInfo> nodeHealthServiceStatus = new AtomicReference<>
|
||||||
|
(new StatusInfo(UNHEALTHY, "unhealthy-info"));
|
||||||
|
JoinHelper joinHelper = new JoinHelper(Settings.EMPTY, null, null, transportService, () -> 0L, () -> null,
|
||||||
|
(joinRequest, joinCallback) -> { throw new AssertionError(); }, startJoinRequest -> { throw new AssertionError(); },
|
||||||
|
Collections.emptyList(), (s, p, r) -> {}, () -> nodeHealthServiceStatus.get());
|
||||||
|
transportService.start();
|
||||||
|
|
||||||
|
DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
DiscoveryNode node2 = new DiscoveryNode("node2", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
|
||||||
|
assertFalse(joinHelper.isJoinPending());
|
||||||
|
|
||||||
|
// check that sending a join to node1 doesn't work
|
||||||
|
Optional<Join> optionalJoin1 = randomBoolean() ? Optional.empty() :
|
||||||
|
Optional.of(new Join(localNode, node1, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
|
||||||
|
joinHelper.sendJoinRequest(node1, randomNonNegativeLong(), optionalJoin1);
|
||||||
|
CapturedRequest[] capturedRequests1 = capturingTransport.getCapturedRequestsAndClear();
|
||||||
|
assertThat(capturedRequests1.length, equalTo(0));
|
||||||
|
|
||||||
|
assertFalse(joinHelper.isJoinPending());
|
||||||
|
|
||||||
|
// check that sending a join to node2 doesn't work
|
||||||
|
Optional<Join> optionalJoin2 = randomBoolean() ? Optional.empty() :
|
||||||
|
Optional.of(new Join(localNode, node2, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
|
||||||
|
|
||||||
|
transportService.start();
|
||||||
|
joinHelper.sendJoinRequest(node2, randomNonNegativeLong(), optionalJoin2);
|
||||||
|
|
||||||
|
CapturedRequest[] capturedRequests2 = capturingTransport.getCapturedRequestsAndClear();
|
||||||
|
assertThat(capturedRequests2.length, equalTo(0));
|
||||||
|
|
||||||
|
assertFalse(joinHelper.isJoinPending());
|
||||||
|
|
||||||
|
nodeHealthServiceStatus.getAndSet(new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
// check that sending another join to node1 now works again
|
||||||
|
joinHelper.sendJoinRequest(node1, 0L, optionalJoin1);
|
||||||
|
CapturedRequest[] capturedRequests1a = capturingTransport.getCapturedRequestsAndClear();
|
||||||
|
assertThat(capturedRequests1a.length, equalTo(1));
|
||||||
|
CapturedRequest capturedRequest1a = capturedRequests1a[0];
|
||||||
|
assertEquals(node1, capturedRequest1a.node);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,6 +27,7 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||||
import org.elasticsearch.common.io.stream.StreamInput;
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
import org.elasticsearch.test.EqualsHashCodeTestUtils;
|
import org.elasticsearch.test.EqualsHashCodeTestUtils;
|
||||||
import org.elasticsearch.test.EqualsHashCodeTestUtils.CopyFunction;
|
import org.elasticsearch.test.EqualsHashCodeTestUtils.CopyFunction;
|
||||||
|
@ -43,12 +44,15 @@ import org.elasticsearch.transport.TransportService;
|
||||||
|
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
import static java.util.Collections.emptySet;
|
import static java.util.Collections.emptySet;
|
||||||
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_ACTION_NAME;
|
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_ACTION_NAME;
|
||||||
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_INTERVAL_SETTING;
|
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_INTERVAL_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING;
|
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING;
|
import static org.elasticsearch.cluster.coordination.LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||||
import static org.elasticsearch.transport.TransportService.HANDSHAKE_ACTION_NAME;
|
import static org.elasticsearch.transport.TransportService.HANDSHAKE_ACTION_NAME;
|
||||||
import static org.elasticsearch.transport.TransportService.NOOP_TRANSPORT_INTERCEPTOR;
|
import static org.elasticsearch.transport.TransportService.NOOP_TRANSPORT_INTERCEPTOR;
|
||||||
|
@ -152,7 +156,7 @@ public class LeaderCheckerTests extends ESTestCase {
|
||||||
e -> {
|
e -> {
|
||||||
assertThat(e.getMessage(), matchesRegex("node \\[.*\\] failed \\[[1-9][0-9]*\\] consecutive checks"));
|
assertThat(e.getMessage(), matchesRegex("node \\[.*\\] failed \\[[1-9][0-9]*\\] consecutive checks"));
|
||||||
assertTrue(leaderFailed.compareAndSet(false, true));
|
assertTrue(leaderFailed.compareAndSet(false, true));
|
||||||
});
|
}, () -> new StatusInfo(StatusInfo.Status.HEALTHY, "healthy-info"));
|
||||||
|
|
||||||
logger.info("--> creating first checker");
|
logger.info("--> creating first checker");
|
||||||
leaderChecker.updateLeader(leader1);
|
leaderChecker.updateLeader(leader1);
|
||||||
|
@ -257,7 +261,7 @@ public class LeaderCheckerTests extends ESTestCase {
|
||||||
e -> {
|
e -> {
|
||||||
assertThat(e.getMessage(), anyOf(endsWith("disconnected"), endsWith("disconnected during check")));
|
assertThat(e.getMessage(), anyOf(endsWith("disconnected"), endsWith("disconnected during check")));
|
||||||
assertTrue(leaderFailed.compareAndSet(false, true));
|
assertTrue(leaderFailed.compareAndSet(false, true));
|
||||||
});
|
}, () -> new StatusInfo(StatusInfo.Status.HEALTHY, "healthy-info"));
|
||||||
|
|
||||||
leaderChecker.updateLeader(leader);
|
leaderChecker.updateLeader(leader);
|
||||||
{
|
{
|
||||||
|
@ -314,23 +318,112 @@ public class LeaderCheckerTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testFollowerFailsImmediatelyOnHealthCheckFailure() {
|
||||||
|
final DiscoveryNode localNode = new DiscoveryNode("local-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
final DiscoveryNode leader = new DiscoveryNode("leader", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
|
||||||
|
final Response[] responseHolder = new Response[]{Response.SUCCESS};
|
||||||
|
|
||||||
|
final Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), localNode.getId()).build();
|
||||||
|
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
|
||||||
|
final MockTransport mockTransport = new MockTransport() {
|
||||||
|
@Override
|
||||||
|
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
|
||||||
|
if (action.equals(HANDSHAKE_ACTION_NAME)) {
|
||||||
|
handleResponse(requestId, new TransportService.HandshakeResponse(node, ClusterName.DEFAULT, Version.CURRENT));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
assertThat(action, equalTo(LEADER_CHECK_ACTION_NAME));
|
||||||
|
assertEquals(node, leader);
|
||||||
|
final Response response = responseHolder[0];
|
||||||
|
|
||||||
|
deterministicTaskQueue.scheduleNow(new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
switch (response) {
|
||||||
|
case SUCCESS:
|
||||||
|
handleResponse(requestId, Empty.INSTANCE);
|
||||||
|
break;
|
||||||
|
case REMOTE_ERROR:
|
||||||
|
handleRemoteError(requestId, new NodeHealthCheckFailureException("simulated error"));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return response + " response to request " + requestId;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
final TransportService transportService = mockTransport.createTransportService(settings,
|
||||||
|
deterministicTaskQueue.getThreadPool(), NOOP_TRANSPORT_INTERCEPTOR, boundTransportAddress -> localNode, null, emptySet());
|
||||||
|
transportService.start();
|
||||||
|
transportService.acceptIncomingRequests();
|
||||||
|
|
||||||
|
final AtomicBoolean leaderFailed = new AtomicBoolean();
|
||||||
|
final LeaderChecker leaderChecker = new LeaderChecker(settings, transportService,
|
||||||
|
e -> {
|
||||||
|
assertThat(e.getMessage(), endsWith("failed health checks"));
|
||||||
|
assertTrue(leaderFailed.compareAndSet(false, true));
|
||||||
|
}, () -> new StatusInfo(StatusInfo.Status.HEALTHY, "healthy-info"));
|
||||||
|
|
||||||
|
leaderChecker.updateLeader(leader);
|
||||||
|
|
||||||
|
{
|
||||||
|
while (deterministicTaskQueue.getCurrentTimeMillis() < 10 * LEADER_CHECK_INTERVAL_SETTING.get(Settings.EMPTY).millis()) {
|
||||||
|
deterministicTaskQueue.runAllRunnableTasks();
|
||||||
|
deterministicTaskQueue.advanceTime();
|
||||||
|
}
|
||||||
|
|
||||||
|
deterministicTaskQueue.runAllRunnableTasks();
|
||||||
|
assertFalse(leaderFailed.get());
|
||||||
|
|
||||||
|
responseHolder[0] = Response.REMOTE_ERROR;
|
||||||
|
|
||||||
|
deterministicTaskQueue.advanceTime();
|
||||||
|
deterministicTaskQueue.runAllRunnableTasks();
|
||||||
|
|
||||||
|
assertTrue(leaderFailed.get());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void testLeaderBehaviour() {
|
public void testLeaderBehaviour() {
|
||||||
final DiscoveryNode localNode = new DiscoveryNode("local-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode localNode = new DiscoveryNode("local-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
final DiscoveryNode otherNode = new DiscoveryNode("other-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode otherNode = new DiscoveryNode("other-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
final Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), localNode.getId()).build();
|
final Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), localNode.getId()).build();
|
||||||
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
|
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
|
||||||
final CapturingTransport capturingTransport = new CapturingTransport();
|
final CapturingTransport capturingTransport = new CapturingTransport();
|
||||||
|
AtomicReference<StatusInfo> nodeHealthServiceStatus = new AtomicReference<>(new StatusInfo(UNHEALTHY, "unhealthy-info"));
|
||||||
|
|
||||||
final TransportService transportService = capturingTransport.createTransportService(settings,
|
final TransportService transportService = capturingTransport.createTransportService(settings,
|
||||||
deterministicTaskQueue.getThreadPool(), NOOP_TRANSPORT_INTERCEPTOR, boundTransportAddress -> localNode, null, emptySet());
|
deterministicTaskQueue.getThreadPool(), NOOP_TRANSPORT_INTERCEPTOR, boundTransportAddress -> localNode, null, emptySet());
|
||||||
transportService.start();
|
transportService.start();
|
||||||
transportService.acceptIncomingRequests();
|
transportService.acceptIncomingRequests();
|
||||||
|
|
||||||
final LeaderChecker leaderChecker = new LeaderChecker(settings, transportService, e -> fail("shouldn't be checking anything"));
|
final LeaderChecker leaderChecker = new LeaderChecker(settings, transportService, e -> fail("shouldn't be checking anything"),
|
||||||
|
() -> nodeHealthServiceStatus.get());
|
||||||
|
|
||||||
final DiscoveryNodes discoveryNodes
|
final DiscoveryNodes discoveryNodes
|
||||||
= DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId()).masterNodeId(localNode.getId()).build();
|
= DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId()).masterNodeId(localNode.getId()).build();
|
||||||
|
|
||||||
|
{
|
||||||
|
leaderChecker.setCurrentNodes(discoveryNodes);
|
||||||
|
|
||||||
|
final CapturingTransportResponseHandler handler = new CapturingTransportResponseHandler();
|
||||||
|
transportService.sendRequest(localNode, LEADER_CHECK_ACTION_NAME, new LeaderCheckRequest(otherNode), handler);
|
||||||
|
deterministicTaskQueue.runAllTasks();
|
||||||
|
|
||||||
|
assertFalse(handler.successfulResponseReceived);
|
||||||
|
assertThat(handler.transportException.getRootCause(), instanceOf(NodeHealthCheckFailureException.class));
|
||||||
|
NodeHealthCheckFailureException cause = (NodeHealthCheckFailureException) handler.transportException.getRootCause();
|
||||||
|
assertThat(cause.getMessage(), equalTo("rejecting leader check from [" + otherNode
|
||||||
|
+ "] since node is unhealthy [unhealthy-info]"));
|
||||||
|
}
|
||||||
|
|
||||||
|
nodeHealthServiceStatus.getAndSet(new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
{
|
{
|
||||||
leaderChecker.setCurrentNodes(discoveryNodes);
|
leaderChecker.setCurrentNodes(discoveryNodes);
|
||||||
|
|
||||||
|
|
|
@ -38,6 +38,8 @@ import org.elasticsearch.common.settings.ClusterSettings;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.common.util.concurrent.BaseFuture;
|
import org.elasticsearch.common.util.concurrent.BaseFuture;
|
||||||
import org.elasticsearch.common.util.concurrent.FutureUtils;
|
import org.elasticsearch.common.util.concurrent.FutureUtils;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.node.Node;
|
import org.elasticsearch.node.Node;
|
||||||
import org.elasticsearch.test.ClusterServiceUtils;
|
import org.elasticsearch.test.ClusterServiceUtils;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
|
@ -73,6 +75,7 @@ import static java.util.Collections.emptyList;
|
||||||
import static java.util.Collections.emptyMap;
|
import static java.util.Collections.emptyMap;
|
||||||
import static java.util.Collections.singleton;
|
import static java.util.Collections.singleton;
|
||||||
import static java.util.Collections.singletonList;
|
import static java.util.Collections.singletonList;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
import static org.elasticsearch.transport.TransportService.HANDSHAKE_ACTION_NAME;
|
import static org.elasticsearch.transport.TransportService.HANDSHAKE_ACTION_NAME;
|
||||||
import static org.hamcrest.Matchers.containsString;
|
import static org.hamcrest.Matchers.containsString;
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
@ -121,13 +124,13 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
.blocks(ClusterBlocks.EMPTY_CLUSTER_BLOCK).build();
|
.blocks(ClusterBlocks.EMPTY_CLUSTER_BLOCK).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setupFakeMasterServiceAndCoordinator(long term, ClusterState initialState) {
|
private void setupFakeMasterServiceAndCoordinator(long term, ClusterState initialState, NodeHealthService nodeHealthService) {
|
||||||
deterministicTaskQueue
|
deterministicTaskQueue
|
||||||
= new DeterministicTaskQueue(Settings.builder().put(Node.NODE_NAME_SETTING.getKey(), "test").build(), random());
|
= new DeterministicTaskQueue(Settings.builder().put(Node.NODE_NAME_SETTING.getKey(), "test").build(), random());
|
||||||
final ThreadPool fakeThreadPool = deterministicTaskQueue.getThreadPool();
|
final ThreadPool fakeThreadPool = deterministicTaskQueue.getThreadPool();
|
||||||
FakeThreadPoolMasterService fakeMasterService = new FakeThreadPoolMasterService("test_node","test",
|
FakeThreadPoolMasterService fakeMasterService = new FakeThreadPoolMasterService("test_node","test",
|
||||||
fakeThreadPool, deterministicTaskQueue::scheduleNow);
|
fakeThreadPool, deterministicTaskQueue::scheduleNow);
|
||||||
setupMasterServiceAndCoordinator(term, initialState, fakeMasterService, fakeThreadPool, Randomness.get());
|
setupMasterServiceAndCoordinator(term, initialState, fakeMasterService, fakeThreadPool, Randomness.get(), nodeHealthService);
|
||||||
fakeMasterService.setClusterStatePublisher((event, publishListener, ackListener) -> {
|
fakeMasterService.setClusterStatePublisher((event, publishListener, ackListener) -> {
|
||||||
coordinator.handlePublishRequest(new PublishRequest(event.state()));
|
coordinator.handlePublishRequest(new PublishRequest(event.state()));
|
||||||
publishListener.onResponse(null);
|
publishListener.onResponse(null);
|
||||||
|
@ -143,13 +146,14 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
clusterStateRef.set(event.state());
|
clusterStateRef.set(event.state());
|
||||||
publishListener.onResponse(null);
|
publishListener.onResponse(null);
|
||||||
});
|
});
|
||||||
setupMasterServiceAndCoordinator(term, initialState, masterService, threadPool, new Random(Randomness.get().nextLong()));
|
setupMasterServiceAndCoordinator(term, initialState, masterService, threadPool, new Random(Randomness.get().nextLong()),
|
||||||
|
() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
masterService.setClusterStateSupplier(clusterStateRef::get);
|
masterService.setClusterStateSupplier(clusterStateRef::get);
|
||||||
masterService.start();
|
masterService.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setupMasterServiceAndCoordinator(long term, ClusterState initialState, MasterService masterService,
|
private void setupMasterServiceAndCoordinator(long term, ClusterState initialState, MasterService masterService,
|
||||||
ThreadPool threadPool, Random random) {
|
ThreadPool threadPool, Random random, NodeHealthService nodeHealthService) {
|
||||||
if (this.masterService != null || coordinator != null) {
|
if (this.masterService != null || coordinator != null) {
|
||||||
throw new IllegalStateException("method setupMasterServiceAndCoordinator can only be called once");
|
throw new IllegalStateException("method setupMasterServiceAndCoordinator can only be called once");
|
||||||
}
|
}
|
||||||
|
@ -179,7 +183,7 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
() -> new InMemoryPersistedState(term, initialState), r -> emptyList(),
|
() -> new InMemoryPersistedState(term, initialState), r -> emptyList(),
|
||||||
new NoOpClusterApplier(),
|
new NoOpClusterApplier(),
|
||||||
Collections.emptyList(),
|
Collections.emptyList(),
|
||||||
random, (s, p, r) -> {}, ElectionStrategy.DEFAULT_INSTANCE);
|
random, (s, p, r) -> {}, ElectionStrategy.DEFAULT_INSTANCE, nodeHealthService);
|
||||||
transportService.start();
|
transportService.start();
|
||||||
transportService.acceptIncomingRequests();
|
transportService.acceptIncomingRequests();
|
||||||
transport = capturingTransport;
|
transport = capturingTransport;
|
||||||
|
@ -270,7 +274,8 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
long initialTerm = randomLongBetween(1, 10);
|
long initialTerm = randomLongBetween(1, 10);
|
||||||
long initialVersion = randomLongBetween(1, 10);
|
long initialVersion = randomLongBetween(1, 10);
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
||||||
VotingConfiguration.of(randomFrom(node0, node1))));
|
VotingConfiguration.of(randomFrom(node0, node1))),
|
||||||
|
() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
assertFalse(isLocalNodeElectedMaster());
|
assertFalse(isLocalNodeElectedMaster());
|
||||||
assertNull(coordinator.getStateForMasterService().nodes().getMasterNodeId());
|
assertNull(coordinator.getStateForMasterService().nodes().getMasterNodeId());
|
||||||
long newTerm = initialTerm + randomLongBetween(1, 10);
|
long newTerm = initialTerm + randomLongBetween(1, 10);
|
||||||
|
@ -290,7 +295,7 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
long initialTerm = randomLongBetween(1, 10);
|
long initialTerm = randomLongBetween(1, 10);
|
||||||
long initialVersion = randomLongBetween(1, 10);
|
long initialVersion = randomLongBetween(1, 10);
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
||||||
VotingConfiguration.of(node1)));
|
VotingConfiguration.of(node1)), () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
assertFalse(isLocalNodeElectedMaster());
|
assertFalse(isLocalNodeElectedMaster());
|
||||||
long newTerm = initialTerm + randomLongBetween(1, 10);
|
long newTerm = initialTerm + randomLongBetween(1, 10);
|
||||||
long higherVersion = initialVersion + randomLongBetween(1, 10);
|
long higherVersion = initialVersion + randomLongBetween(1, 10);
|
||||||
|
@ -306,7 +311,7 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
long initialTerm = randomLongBetween(1, 10);
|
long initialTerm = randomLongBetween(1, 10);
|
||||||
long initialVersion = randomLongBetween(1, 10);
|
long initialVersion = randomLongBetween(1, 10);
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
||||||
VotingConfiguration.of(node0)));
|
VotingConfiguration.of(node0)), () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
assertFalse(isLocalNodeElectedMaster());
|
assertFalse(isLocalNodeElectedMaster());
|
||||||
long newTerm = initialTerm + randomLongBetween(1, 10);
|
long newTerm = initialTerm + randomLongBetween(1, 10);
|
||||||
long higherVersion = initialVersion + randomLongBetween(1, 10);
|
long higherVersion = initialVersion + randomLongBetween(1, 10);
|
||||||
|
@ -320,7 +325,7 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
long initialTerm = randomLongBetween(1, 10);
|
long initialTerm = randomLongBetween(1, 10);
|
||||||
long initialVersion = randomLongBetween(1, 10);
|
long initialVersion = randomLongBetween(1, 10);
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
||||||
VotingConfiguration.of(node0)));
|
VotingConfiguration.of(node0)), () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
assertFalse(isLocalNodeElectedMaster());
|
assertFalse(isLocalNodeElectedMaster());
|
||||||
long newTerm = initialTerm + randomLongBetween(1, 10);
|
long newTerm = initialTerm + randomLongBetween(1, 10);
|
||||||
joinNodeAndRun(new JoinRequest(node0, newTerm, Optional.of(new Join(node0, node0, newTerm, initialTerm, initialVersion))));
|
joinNodeAndRun(new JoinRequest(node0, newTerm, Optional.of(new Join(node0, node0, newTerm, initialTerm, initialVersion))));
|
||||||
|
@ -337,7 +342,7 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
long initialTerm = randomLongBetween(1, 10);
|
long initialTerm = randomLongBetween(1, 10);
|
||||||
long initialVersion = randomLongBetween(1, 10);
|
long initialVersion = randomLongBetween(1, 10);
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
||||||
VotingConfiguration.of(node0)));
|
VotingConfiguration.of(node0)), () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
long newTerm = initialTerm + randomLongBetween(1, 10);
|
long newTerm = initialTerm + randomLongBetween(1, 10);
|
||||||
|
|
||||||
joinNodeAndRun(new JoinRequest(node0, newTerm, Optional.of(new Join(node0, node0, newTerm, initialTerm, initialVersion))));
|
joinNodeAndRun(new JoinRequest(node0, newTerm, Optional.of(new Join(node0, node0, newTerm, initialTerm, initialVersion))));
|
||||||
|
@ -356,7 +361,7 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
long initialTerm = randomLongBetween(1, 10);
|
long initialTerm = randomLongBetween(1, 10);
|
||||||
long initialVersion = randomLongBetween(1, 10);
|
long initialVersion = randomLongBetween(1, 10);
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
||||||
VotingConfiguration.of(node2)));
|
VotingConfiguration.of(node2)), () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
assertFalse(isLocalNodeElectedMaster());
|
assertFalse(isLocalNodeElectedMaster());
|
||||||
long newTerm = initialTerm + randomLongBetween(1, 10);
|
long newTerm = initialTerm + randomLongBetween(1, 10);
|
||||||
SimpleFuture futNode0 = joinNodeAsync(new JoinRequest(node0, newTerm, Optional.of(
|
SimpleFuture futNode0 = joinNodeAsync(new JoinRequest(node0, newTerm, Optional.of(
|
||||||
|
@ -383,7 +388,7 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
long initialTerm = randomLongBetween(1, 10);
|
long initialTerm = randomLongBetween(1, 10);
|
||||||
long initialVersion = randomLongBetween(1, 10);
|
long initialVersion = randomLongBetween(1, 10);
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
||||||
VotingConfiguration.of(node0)));
|
VotingConfiguration.of(node0)), () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
long newTerm = initialTerm + randomLongBetween(1, 10);
|
long newTerm = initialTerm + randomLongBetween(1, 10);
|
||||||
handleStartJoinFrom(node1, newTerm);
|
handleStartJoinFrom(node1, newTerm);
|
||||||
handleFollowerCheckFrom(node1, newTerm);
|
handleFollowerCheckFrom(node1, newTerm);
|
||||||
|
@ -402,7 +407,8 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
CoordinationMetadata.VotingConfigExclusion.MISSING_VALUE_MARKER, "knownNodeName");
|
CoordinationMetadata.VotingConfigExclusion.MISSING_VALUE_MARKER, "knownNodeName");
|
||||||
|
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, buildStateWithVotingConfigExclusion(initialNode, initialTerm,
|
setupFakeMasterServiceAndCoordinator(initialTerm, buildStateWithVotingConfigExclusion(initialNode, initialTerm,
|
||||||
initialVersion, votingConfigExclusion));
|
initialVersion, votingConfigExclusion),
|
||||||
|
() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
|
||||||
DiscoveryNode knownJoiningNode = new DiscoveryNode("knownNodeName", "newNodeId", buildNewFakeTransportAddress(),
|
DiscoveryNode knownJoiningNode = new DiscoveryNode("knownNodeName", "newNodeId", buildNewFakeTransportAddress(),
|
||||||
emptyMap(), singleton(DiscoveryNodeRole.MASTER_ROLE), Version.CURRENT);
|
emptyMap(), singleton(DiscoveryNodeRole.MASTER_ROLE), Version.CURRENT);
|
||||||
|
@ -478,7 +484,7 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
long initialTerm = randomLongBetween(1, 10);
|
long initialTerm = randomLongBetween(1, 10);
|
||||||
long initialVersion = randomLongBetween(1, 10);
|
long initialVersion = randomLongBetween(1, 10);
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
||||||
VotingConfiguration.of(node0)));
|
VotingConfiguration.of(node0)), () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
long newTerm = initialTerm + randomLongBetween(1, 10);
|
long newTerm = initialTerm + randomLongBetween(1, 10);
|
||||||
handleStartJoinFrom(node1, newTerm);
|
handleStartJoinFrom(node1, newTerm);
|
||||||
handleFollowerCheckFrom(node1, newTerm);
|
handleFollowerCheckFrom(node1, newTerm);
|
||||||
|
@ -494,7 +500,7 @@ public class NodeJoinTests extends ESTestCase {
|
||||||
long initialTerm = randomLongBetween(1, 10);
|
long initialTerm = randomLongBetween(1, 10);
|
||||||
long initialVersion = randomLongBetween(1, 10);
|
long initialVersion = randomLongBetween(1, 10);
|
||||||
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
setupFakeMasterServiceAndCoordinator(initialTerm, initialState(node0, initialTerm, initialVersion,
|
||||||
VotingConfiguration.of(node1)));
|
VotingConfiguration.of(node1)), () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
long newTerm = initialTerm + randomLongBetween(1, 10);
|
long newTerm = initialTerm + randomLongBetween(1, 10);
|
||||||
SimpleFuture fut = joinNodeAsync(new JoinRequest(node0, newTerm,
|
SimpleFuture fut = joinNodeAsync(new JoinRequest(node0, newTerm,
|
||||||
Optional.of(new Join(node0, node0, newTerm, initialTerm, initialVersion))));
|
Optional.of(new Join(node0, node0, newTerm, initialTerm, initialVersion))));
|
||||||
|
|
|
@ -26,6 +26,7 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
import org.elasticsearch.common.io.stream.StreamInput;
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
import org.elasticsearch.common.lease.Releasable;
|
import org.elasticsearch.common.lease.Releasable;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
import org.elasticsearch.test.transport.MockTransport;
|
import org.elasticsearch.test.transport.MockTransport;
|
||||||
import org.elasticsearch.transport.ConnectTransportException;
|
import org.elasticsearch.transport.ConnectTransportException;
|
||||||
|
@ -46,6 +47,8 @@ import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
import static java.util.Collections.emptySet;
|
import static java.util.Collections.emptySet;
|
||||||
import static org.elasticsearch.cluster.coordination.PreVoteCollector.REQUEST_PRE_VOTE_ACTION_NAME;
|
import static org.elasticsearch.cluster.coordination.PreVoteCollector.REQUEST_PRE_VOTE_ACTION_NAME;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||||
import static org.elasticsearch.threadpool.ThreadPool.Names.SAME;
|
import static org.elasticsearch.threadpool.ThreadPool.Names.SAME;
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
@ -63,6 +66,7 @@ public class PreVoteCollectorTests extends ESTestCase {
|
||||||
private Map<DiscoveryNode, PreVoteResponse> responsesByNode = new HashMap<>();
|
private Map<DiscoveryNode, PreVoteResponse> responsesByNode = new HashMap<>();
|
||||||
private long currentTerm, lastAcceptedTerm, lastAcceptedVersion;
|
private long currentTerm, lastAcceptedTerm, lastAcceptedVersion;
|
||||||
private TransportService transportService;
|
private TransportService transportService;
|
||||||
|
private StatusInfo healthStatus;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void createObjects() {
|
public void createObjects() {
|
||||||
|
@ -95,6 +99,11 @@ public class PreVoteCollectorTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void handleRemoteError(long requestId, Throwable t) {
|
||||||
|
logger.warn("Remote error", t);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
lastAcceptedTerm = randomNonNegativeLong();
|
lastAcceptedTerm = randomNonNegativeLong();
|
||||||
currentTerm = randomLongBetween(lastAcceptedTerm, Long.MAX_VALUE);
|
currentTerm = randomLongBetween(lastAcceptedTerm, Long.MAX_VALUE);
|
||||||
|
@ -102,6 +111,7 @@ public class PreVoteCollectorTests extends ESTestCase {
|
||||||
|
|
||||||
localNode = new DiscoveryNode("local-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
localNode = new DiscoveryNode("local-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
responsesByNode.put(localNode, new PreVoteResponse(currentTerm, lastAcceptedTerm, lastAcceptedVersion));
|
responsesByNode.put(localNode, new PreVoteResponse(currentTerm, lastAcceptedTerm, lastAcceptedVersion));
|
||||||
|
healthStatus = new StatusInfo(HEALTHY, "healthy-info");
|
||||||
transportService = mockTransport.createTransportService(settings,
|
transportService = mockTransport.createTransportService(settings,
|
||||||
deterministicTaskQueue.getThreadPool(), TransportService.NOOP_TRANSPORT_INTERCEPTOR,
|
deterministicTaskQueue.getThreadPool(), TransportService.NOOP_TRANSPORT_INTERCEPTOR,
|
||||||
boundTransportAddress -> localNode, null, emptySet());
|
boundTransportAddress -> localNode, null, emptySet());
|
||||||
|
@ -112,7 +122,7 @@ public class PreVoteCollectorTests extends ESTestCase {
|
||||||
assert electionOccurred == false;
|
assert electionOccurred == false;
|
||||||
electionOccurred = true;
|
electionOccurred = true;
|
||||||
}, l -> {
|
}, l -> {
|
||||||
}, ElectionStrategy.DEFAULT_INSTANCE);
|
}, ElectionStrategy.DEFAULT_INSTANCE, () -> healthStatus);
|
||||||
preVoteCollector.update(getLocalPreVoteResponse(), null);
|
preVoteCollector.update(getLocalPreVoteResponse(), null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -147,6 +157,13 @@ public class PreVoteCollectorTests extends ESTestCase {
|
||||||
assertTrue(electionOccurred);
|
assertTrue(electionOccurred);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testNoElectionStartIfLocalNodeIsOnlyNodeAndUnhealthy() {
|
||||||
|
healthStatus = new StatusInfo(UNHEALTHY, "unhealthy-info");
|
||||||
|
preVoteCollector.update(getLocalPreVoteResponse(), null);
|
||||||
|
startAndRunCollector(localNode);
|
||||||
|
assertFalse(electionOccurred);
|
||||||
|
}
|
||||||
|
|
||||||
public void testStartsElectionIfLocalNodeIsQuorum() {
|
public void testStartsElectionIfLocalNodeIsQuorum() {
|
||||||
final DiscoveryNode otherNode = new DiscoveryNode("other-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode otherNode = new DiscoveryNode("other-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
responsesByNode.put(otherNode, getLocalPreVoteResponse());
|
responsesByNode.put(otherNode, getLocalPreVoteResponse());
|
||||||
|
@ -169,6 +186,15 @@ public class PreVoteCollectorTests extends ESTestCase {
|
||||||
assertFalse(electionOccurred);
|
assertFalse(electionOccurred);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testUnhealthyNodeDoesNotOfferPreVote() {
|
||||||
|
final long term = randomNonNegativeLong();
|
||||||
|
healthStatus = new StatusInfo(UNHEALTHY, "unhealthy-info");
|
||||||
|
final DiscoveryNode otherNode = new DiscoveryNode("other-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
|
RemoteTransportException remoteTransportException = expectThrows(RemoteTransportException.class, () ->
|
||||||
|
handlePreVoteRequestViaTransportService(new PreVoteRequest(otherNode, term)));
|
||||||
|
assertThat(remoteTransportException.getCause(), instanceOf(NodeHealthCheckFailureException.class));
|
||||||
|
}
|
||||||
|
|
||||||
public void testDoesNotStartElectionIfStopped() {
|
public void testDoesNotStartElectionIfStopped() {
|
||||||
final DiscoveryNode otherNode = new DiscoveryNode("other-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
final DiscoveryNode otherNode = new DiscoveryNode("other-node", buildNewFakeTransportAddress(), Version.CURRENT);
|
||||||
responsesByNode.put(otherNode, getLocalPreVoteResponse());
|
responsesByNode.put(otherNode, getLocalPreVoteResponse());
|
||||||
|
|
|
@ -92,7 +92,7 @@ public class DiscoveryModuleTests extends ESTestCase {
|
||||||
private DiscoveryModule newModule(Settings settings, List<DiscoveryPlugin> plugins) {
|
private DiscoveryModule newModule(Settings settings, List<DiscoveryPlugin> plugins) {
|
||||||
return new DiscoveryModule(settings, threadPool, transportService, namedWriteableRegistry, null, masterService,
|
return new DiscoveryModule(settings, threadPool, transportService, namedWriteableRegistry, null, masterService,
|
||||||
clusterApplier, clusterSettings, plugins, null, createTempDir().toAbsolutePath(), gatewayMetaState,
|
clusterApplier, clusterSettings, plugins, null, createTempDir().toAbsolutePath(), gatewayMetaState,
|
||||||
mock(RerouteService.class));
|
mock(RerouteService.class), null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testDefaults() {
|
public void testDefaults() {
|
||||||
|
|
|
@ -0,0 +1,345 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
package org.elasticsearch.monitor.fs;
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.logging.log4j.Level;
|
||||||
|
import org.apache.logging.log4j.LogManager;
|
||||||
|
import org.apache.logging.log4j.Logger;
|
||||||
|
import org.apache.lucene.mockfile.FilterFileChannel;
|
||||||
|
import org.apache.lucene.mockfile.FilterFileSystemProvider;
|
||||||
|
import org.elasticsearch.cluster.coordination.DeterministicTaskQueue;
|
||||||
|
import org.elasticsearch.common.io.PathUtils;
|
||||||
|
import org.elasticsearch.common.io.PathUtilsForTesting;
|
||||||
|
import org.elasticsearch.common.logging.Loggers;
|
||||||
|
import org.elasticsearch.common.settings.ClusterSettings;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.env.NodeEnvironment;
|
||||||
|
import org.elasticsearch.test.ESTestCase;
|
||||||
|
import org.elasticsearch.test.MockLogAppender;
|
||||||
|
import org.elasticsearch.test.junit.annotations.TestLogging;
|
||||||
|
import org.elasticsearch.threadpool.TestThreadPool;
|
||||||
|
import org.elasticsearch.threadpool.ThreadPool;
|
||||||
|
import org.junit.Before;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.file.FileSystem;
|
||||||
|
import java.nio.file.OpenOption;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.attribute.FileAttribute;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
||||||
|
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||||
|
import static org.hamcrest.Matchers.is;
|
||||||
|
|
||||||
|
|
||||||
|
public class FsHealthServiceTests extends ESTestCase {
|
||||||
|
|
||||||
|
private DeterministicTaskQueue deterministicTaskQueue;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void createObjects() {
|
||||||
|
Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), "node").build();
|
||||||
|
deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSchedulesHealthCheckAtRefreshIntervals() throws Exception {
|
||||||
|
long refreshInterval = randomLongBetween(1000, 12000);
|
||||||
|
final Settings settings = Settings.builder().put(FsHealthService.REFRESH_INTERVAL_SETTING.getKey(), refreshInterval + "ms").build();
|
||||||
|
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
|
||||||
|
try (NodeEnvironment env = newNodeEnvironment()) {
|
||||||
|
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, deterministicTaskQueue.getThreadPool(), env);
|
||||||
|
final long startTimeMillis = deterministicTaskQueue.getCurrentTimeMillis();
|
||||||
|
fsHealthService.doStart();
|
||||||
|
assertFalse(deterministicTaskQueue.hasRunnableTasks());
|
||||||
|
assertTrue(deterministicTaskQueue.hasDeferredTasks());
|
||||||
|
int rescheduledCount = 0;
|
||||||
|
for (int i = 1; i <= randomIntBetween(5, 10); i++) {
|
||||||
|
if (deterministicTaskQueue.hasRunnableTasks()) {
|
||||||
|
deterministicTaskQueue.runRandomTask();
|
||||||
|
} else {
|
||||||
|
assertThat(deterministicTaskQueue.getLatestDeferredExecutionTime(), is(refreshInterval * (rescheduledCount + 1)));
|
||||||
|
deterministicTaskQueue.advanceTime();
|
||||||
|
rescheduledCount++;
|
||||||
|
}
|
||||||
|
assertThat(deterministicTaskQueue.getCurrentTimeMillis() - startTimeMillis, is(refreshInterval * rescheduledCount));
|
||||||
|
}
|
||||||
|
|
||||||
|
fsHealthService.doStop();
|
||||||
|
deterministicTaskQueue.runAllTasksInTimeOrder();
|
||||||
|
|
||||||
|
assertFalse(deterministicTaskQueue.hasRunnableTasks());
|
||||||
|
assertFalse(deterministicTaskQueue.hasDeferredTasks());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFailsHealthOnIOException() throws IOException {
|
||||||
|
FileSystem fileSystem = PathUtils.getDefaultFileSystem();
|
||||||
|
FileSystemIOExceptionProvider disruptFileSystemProvider = new FileSystemIOExceptionProvider(fileSystem);
|
||||||
|
fileSystem = disruptFileSystemProvider.getFileSystem(null);
|
||||||
|
PathUtilsForTesting.installMock(fileSystem);
|
||||||
|
final Settings settings = Settings.EMPTY;
|
||||||
|
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
|
||||||
|
TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
|
||||||
|
try (NodeEnvironment env = newNodeEnvironment()) {
|
||||||
|
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
|
||||||
|
fsHealthService.new FsHealthMonitor().run();
|
||||||
|
assertEquals(HEALTHY, fsHealthService.getHealth().getStatus());
|
||||||
|
assertEquals("health check passed", fsHealthService.getHealth().getInfo());
|
||||||
|
|
||||||
|
//disrupt file system
|
||||||
|
disruptFileSystemProvider.injectIOException.set(true);
|
||||||
|
fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
|
||||||
|
fsHealthService.new FsHealthMonitor().run();
|
||||||
|
assertEquals(UNHEALTHY, fsHealthService.getHealth().getStatus());
|
||||||
|
for (Path path : env.nodeDataPaths()) {
|
||||||
|
assertTrue(fsHealthService.getHealth().getInfo().contains(path.toString()));
|
||||||
|
}
|
||||||
|
assertEquals(env.nodeDataPaths().length, disruptFileSystemProvider.getInjectedPathCount());
|
||||||
|
} finally {
|
||||||
|
disruptFileSystemProvider.injectIOException.set(false);
|
||||||
|
PathUtilsForTesting.teardown();
|
||||||
|
ThreadPool.terminate(testThreadPool, 500, TimeUnit.MILLISECONDS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@TestLogging(value = "org.elasticsearch.monitor.fs:WARN", reason = "to ensure that we log on hung IO at WARN level")
|
||||||
|
public void testLoggingOnHungIO() throws Exception {
|
||||||
|
long slowLogThreshold = randomLongBetween(100, 200);
|
||||||
|
final Settings settings = Settings.builder().put(FsHealthService.SLOW_PATH_LOGGING_THRESHOLD_SETTING.getKey(),
|
||||||
|
slowLogThreshold + "ms").build();
|
||||||
|
FileSystem fileSystem = PathUtils.getDefaultFileSystem();
|
||||||
|
TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
|
||||||
|
FileSystemFsyncHungProvider disruptFileSystemProvider = new FileSystemFsyncHungProvider(fileSystem,
|
||||||
|
randomLongBetween(slowLogThreshold + 1 , 400), testThreadPool);
|
||||||
|
fileSystem = disruptFileSystemProvider.getFileSystem(null);
|
||||||
|
PathUtilsForTesting.installMock(fileSystem);
|
||||||
|
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
|
||||||
|
|
||||||
|
MockLogAppender mockAppender = new MockLogAppender();
|
||||||
|
mockAppender.start();
|
||||||
|
|
||||||
|
Logger logger = LogManager.getLogger(FsHealthService.class);
|
||||||
|
Loggers.addAppender(logger, mockAppender);
|
||||||
|
try (NodeEnvironment env = newNodeEnvironment()) {
|
||||||
|
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
|
||||||
|
int counter = 0;
|
||||||
|
for(Path path : env.nodeDataPaths()){
|
||||||
|
mockAppender.addExpectation(
|
||||||
|
new MockLogAppender.SeenEventExpectation(
|
||||||
|
"test" + ++counter,
|
||||||
|
FsHealthService.class.getCanonicalName(),
|
||||||
|
Level.WARN,
|
||||||
|
"health check of [" + path + "] took [*ms] which is above the warn threshold*"));
|
||||||
|
}
|
||||||
|
|
||||||
|
//disrupt file system
|
||||||
|
disruptFileSystemProvider.injectIOException.set(true);
|
||||||
|
fsHealthService.new FsHealthMonitor().run();
|
||||||
|
assertEquals(env.nodeDataPaths().length, disruptFileSystemProvider.getInjectedPathCount());
|
||||||
|
assertBusy(mockAppender::assertAllExpectationsMatched);
|
||||||
|
} finally {
|
||||||
|
Loggers.removeAppender(logger, mockAppender);
|
||||||
|
mockAppender.stop();
|
||||||
|
PathUtilsForTesting.teardown();
|
||||||
|
ThreadPool.terminate(testThreadPool, 500, TimeUnit.MILLISECONDS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFailsHealthOnSinglePathFsyncFailure() throws IOException {
|
||||||
|
FileSystem fileSystem = PathUtils.getDefaultFileSystem();
|
||||||
|
FileSystemFsyncIOExceptionProvider disruptFsyncFileSystemProvider = new FileSystemFsyncIOExceptionProvider(fileSystem);
|
||||||
|
fileSystem = disruptFsyncFileSystemProvider.getFileSystem(null);
|
||||||
|
PathUtilsForTesting.installMock(fileSystem);
|
||||||
|
final Settings settings = Settings.EMPTY;
|
||||||
|
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
|
||||||
|
TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
|
||||||
|
try (NodeEnvironment env = newNodeEnvironment()) {
|
||||||
|
Path[] paths = env.nodeDataPaths();
|
||||||
|
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
|
||||||
|
fsHealthService.new FsHealthMonitor().run();
|
||||||
|
assertEquals(HEALTHY, fsHealthService.getHealth().getStatus());
|
||||||
|
assertEquals("health check passed", fsHealthService.getHealth().getInfo());
|
||||||
|
|
||||||
|
//disrupt file system fsync on single path
|
||||||
|
disruptFsyncFileSystemProvider.injectIOException.set(true);
|
||||||
|
String disruptedPath = randomFrom(paths).toString();
|
||||||
|
disruptFsyncFileSystemProvider.restrictPathPrefix(disruptedPath);
|
||||||
|
fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
|
||||||
|
fsHealthService.new FsHealthMonitor().run();
|
||||||
|
assertEquals(UNHEALTHY, fsHealthService.getHealth().getStatus());
|
||||||
|
assertThat(fsHealthService.getHealth().getInfo(), is("health check failed on [" + disruptedPath + "]"));
|
||||||
|
assertEquals(1, disruptFsyncFileSystemProvider.getInjectedPathCount());
|
||||||
|
} finally {
|
||||||
|
disruptFsyncFileSystemProvider.injectIOException.set(false);
|
||||||
|
PathUtilsForTesting.teardown();
|
||||||
|
ThreadPool.terminate(testThreadPool, 500, TimeUnit.MILLISECONDS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFailsHealthOnSinglePathWriteFailure() throws IOException {
|
||||||
|
FileSystem fileSystem = PathUtils.getDefaultFileSystem();
|
||||||
|
FileSystemIOExceptionProvider disruptWritesFileSystemProvider = new FileSystemIOExceptionProvider(fileSystem);
|
||||||
|
fileSystem = disruptWritesFileSystemProvider.getFileSystem(null);
|
||||||
|
PathUtilsForTesting.installMock(fileSystem);
|
||||||
|
final Settings settings = Settings.EMPTY;
|
||||||
|
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
|
||||||
|
TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
|
||||||
|
try (NodeEnvironment env = newNodeEnvironment()) {
|
||||||
|
Path[] paths = env.nodeDataPaths();
|
||||||
|
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
|
||||||
|
fsHealthService.new FsHealthMonitor().run();
|
||||||
|
assertEquals(HEALTHY, fsHealthService.getHealth().getStatus());
|
||||||
|
assertEquals("health check passed", fsHealthService.getHealth().getInfo());
|
||||||
|
|
||||||
|
//disrupt file system writes on single path
|
||||||
|
disruptWritesFileSystemProvider.injectIOException.set(true);
|
||||||
|
String disruptedPath = randomFrom(paths).toString();
|
||||||
|
disruptWritesFileSystemProvider.restrictPathPrefix(disruptedPath);
|
||||||
|
fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
|
||||||
|
fsHealthService.new FsHealthMonitor().run();
|
||||||
|
assertEquals(UNHEALTHY, fsHealthService.getHealth().getStatus());
|
||||||
|
assertThat(fsHealthService.getHealth().getInfo(), is("health check failed on [" + disruptedPath + "]"));
|
||||||
|
assertEquals(1, disruptWritesFileSystemProvider.getInjectedPathCount());
|
||||||
|
} finally {
|
||||||
|
disruptWritesFileSystemProvider.injectIOException.set(false);
|
||||||
|
PathUtilsForTesting.teardown();
|
||||||
|
ThreadPool.terminate(testThreadPool, 500, TimeUnit.MILLISECONDS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class FileSystemIOExceptionProvider extends FilterFileSystemProvider {
|
||||||
|
|
||||||
|
AtomicBoolean injectIOException = new AtomicBoolean();
|
||||||
|
AtomicInteger injectedPaths = new AtomicInteger();
|
||||||
|
|
||||||
|
private String pathPrefix = "/";
|
||||||
|
|
||||||
|
FileSystemIOExceptionProvider(FileSystem inner) {
|
||||||
|
super("disrupt_fs_health://", inner);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void restrictPathPrefix(String pathPrefix){
|
||||||
|
this.pathPrefix = pathPrefix;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getInjectedPathCount(){
|
||||||
|
return injectedPaths.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public OutputStream newOutputStream(Path path, OpenOption... options) throws IOException {
|
||||||
|
if (injectIOException.get()){
|
||||||
|
if (path.toString().startsWith(pathPrefix) && path.toString().endsWith(".es_temp_file")) {
|
||||||
|
injectedPaths.incrementAndGet();
|
||||||
|
throw new IOException("fake IOException");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return super.newOutputStream(path, options);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class FileSystemFsyncIOExceptionProvider extends FilterFileSystemProvider {
|
||||||
|
|
||||||
|
AtomicBoolean injectIOException = new AtomicBoolean();
|
||||||
|
AtomicInteger injectedPaths = new AtomicInteger();
|
||||||
|
|
||||||
|
private String pathPrefix = "/";
|
||||||
|
|
||||||
|
FileSystemFsyncIOExceptionProvider(FileSystem inner) {
|
||||||
|
super("disrupt_fs_health://", inner);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void restrictPathPrefix(String pathPrefix){
|
||||||
|
this.pathPrefix = pathPrefix;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getInjectedPathCount(){
|
||||||
|
return injectedPaths.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FileChannel newFileChannel(Path path, Set<? extends OpenOption> options, FileAttribute<?>... attrs) throws IOException {
|
||||||
|
return new FilterFileChannel(super.newFileChannel(path, options, attrs)) {
|
||||||
|
@Override
|
||||||
|
public void force(boolean metaData) throws IOException {
|
||||||
|
if (injectIOException.get()) {
|
||||||
|
if (path.toString().startsWith(pathPrefix) && path.toString().endsWith(".es_temp_file")) {
|
||||||
|
injectedPaths.incrementAndGet();
|
||||||
|
throw new IOException("fake IOException");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
super.force(metaData);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class FileSystemFsyncHungProvider extends FilterFileSystemProvider {
|
||||||
|
|
||||||
|
AtomicBoolean injectIOException = new AtomicBoolean();
|
||||||
|
AtomicInteger injectedPaths = new AtomicInteger();
|
||||||
|
|
||||||
|
private String pathPrefix = "/";
|
||||||
|
private long delay;
|
||||||
|
private final ThreadPool threadPool;
|
||||||
|
|
||||||
|
FileSystemFsyncHungProvider(FileSystem inner, long delay, ThreadPool threadPool) {
|
||||||
|
super("disrupt_fs_health://", inner);
|
||||||
|
this.delay = delay;
|
||||||
|
this.threadPool = threadPool;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getInjectedPathCount(){
|
||||||
|
return injectedPaths.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FileChannel newFileChannel(Path path, Set<? extends OpenOption> options, FileAttribute<?>... attrs) throws IOException {
|
||||||
|
return new FilterFileChannel(super.newFileChannel(path, options, attrs)) {
|
||||||
|
@Override
|
||||||
|
public void force(boolean metaData) throws IOException {
|
||||||
|
if (injectIOException.get()) {
|
||||||
|
if (path.toString().startsWith(pathPrefix) && path.toString().endsWith(".es_temp_file")) {
|
||||||
|
injectedPaths.incrementAndGet();
|
||||||
|
final long startTimeMillis = threadPool.relativeTimeInMillis();
|
||||||
|
do {
|
||||||
|
try {
|
||||||
|
Thread.sleep(delay);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new AssertionError(e);
|
||||||
|
}
|
||||||
|
} while (threadPool.relativeTimeInMillis() <= startTimeMillis + delay);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
super.force(metaData);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -168,6 +168,7 @@ import org.elasticsearch.indices.recovery.PeerRecoverySourceService;
|
||||||
import org.elasticsearch.indices.recovery.PeerRecoveryTargetService;
|
import org.elasticsearch.indices.recovery.PeerRecoveryTargetService;
|
||||||
import org.elasticsearch.indices.recovery.RecoverySettings;
|
import org.elasticsearch.indices.recovery.RecoverySettings;
|
||||||
import org.elasticsearch.ingest.IngestService;
|
import org.elasticsearch.ingest.IngestService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.node.ResponseCollectorService;
|
import org.elasticsearch.node.ResponseCollectorService;
|
||||||
import org.elasticsearch.plugins.PluginsService;
|
import org.elasticsearch.plugins.PluginsService;
|
||||||
import org.elasticsearch.repositories.RepositoriesService;
|
import org.elasticsearch.repositories.RepositoriesService;
|
||||||
|
@ -217,6 +218,7 @@ import static java.util.Collections.emptyMap;
|
||||||
import static java.util.Collections.emptySet;
|
import static java.util.Collections.emptySet;
|
||||||
import static org.elasticsearch.action.support.ActionTestUtils.assertNoFailureListener;
|
import static org.elasticsearch.action.support.ActionTestUtils.assertNoFailureListener;
|
||||||
import static org.elasticsearch.env.Environment.PATH_HOME_SETTING;
|
import static org.elasticsearch.env.Environment.PATH_HOME_SETTING;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||||
import static org.hamcrest.Matchers.contains;
|
import static org.hamcrest.Matchers.contains;
|
||||||
import static org.hamcrest.Matchers.containsInAnyOrder;
|
import static org.hamcrest.Matchers.containsInAnyOrder;
|
||||||
|
@ -1649,7 +1651,8 @@ public class SnapshotResiliencyTests extends ESTestCase {
|
||||||
hostsResolver -> nodes.values().stream().filter(n -> n.node.isMasterNode())
|
hostsResolver -> nodes.values().stream().filter(n -> n.node.isMasterNode())
|
||||||
.map(n -> n.node.getAddress()).collect(Collectors.toList()),
|
.map(n -> n.node.getAddress()).collect(Collectors.toList()),
|
||||||
clusterService.getClusterApplierService(), Collections.emptyList(), random(),
|
clusterService.getClusterApplierService(), Collections.emptyList(), random(),
|
||||||
new BatchedRerouteService(clusterService, allocationService::reroute), ElectionStrategy.DEFAULT_INSTANCE);
|
new BatchedRerouteService(clusterService, allocationService::reroute), ElectionStrategy.DEFAULT_INSTANCE,
|
||||||
|
() -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
masterService.setClusterStatePublisher(coordinator);
|
masterService.setClusterStatePublisher(coordinator);
|
||||||
coordinator.start();
|
coordinator.start();
|
||||||
clusterService.getClusterApplierService().setNodeConnectionsService(nodeConnectionsService);
|
clusterService.getClusterApplierService().setNodeConnectionsService(nodeConnectionsService);
|
||||||
|
|
|
@ -65,6 +65,8 @@ import org.elasticsearch.gateway.ClusterStateUpdaters;
|
||||||
import org.elasticsearch.gateway.GatewayService;
|
import org.elasticsearch.gateway.GatewayService;
|
||||||
import org.elasticsearch.gateway.MockGatewayMetaState;
|
import org.elasticsearch.gateway.MockGatewayMetaState;
|
||||||
import org.elasticsearch.gateway.PersistedClusterStateService;
|
import org.elasticsearch.gateway.PersistedClusterStateService;
|
||||||
|
import org.elasticsearch.monitor.NodeHealthService;
|
||||||
|
import org.elasticsearch.monitor.StatusInfo;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
import org.elasticsearch.test.disruption.DisruptableMockTransport;
|
import org.elasticsearch.test.disruption.DisruptableMockTransport;
|
||||||
import org.elasticsearch.test.disruption.DisruptableMockTransport.ConnectionStatus;
|
import org.elasticsearch.test.disruption.DisruptableMockTransport.ConnectionStatus;
|
||||||
|
@ -125,6 +127,7 @@ import static org.elasticsearch.cluster.coordination.NoMasterBlockService.NO_MAS
|
||||||
import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION;
|
import static org.elasticsearch.cluster.coordination.Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION;
|
||||||
import static org.elasticsearch.discovery.PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING;
|
import static org.elasticsearch.discovery.PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING;
|
||||||
import static org.elasticsearch.gateway.GatewayService.STATE_NOT_RECOVERED_BLOCK;
|
import static org.elasticsearch.gateway.GatewayService.STATE_NOT_RECOVERED_BLOCK;
|
||||||
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
||||||
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
import static org.elasticsearch.node.Node.NODE_NAME_SETTING;
|
||||||
import static org.elasticsearch.transport.TransportService.NOOP_TRANSPORT_INTERCEPTOR;
|
import static org.elasticsearch.transport.TransportService.NOOP_TRANSPORT_INTERCEPTOR;
|
||||||
import static org.elasticsearch.transport.TransportSettings.CONNECT_TIMEOUT;
|
import static org.elasticsearch.transport.TransportSettings.CONNECT_TIMEOUT;
|
||||||
|
@ -254,6 +257,7 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
|
||||||
private final Map<Long, ClusterState> committedStatesByVersion = new HashMap<>();
|
private final Map<Long, ClusterState> committedStatesByVersion = new HashMap<>();
|
||||||
private final LinearizabilityChecker linearizabilityChecker = new LinearizabilityChecker();
|
private final LinearizabilityChecker linearizabilityChecker = new LinearizabilityChecker();
|
||||||
private final History history = new History();
|
private final History history = new History();
|
||||||
|
private NodeHealthService nodeHealthService;
|
||||||
|
|
||||||
private final Function<DiscoveryNode, MockPersistedState> defaultPersistedStateSupplier = MockPersistedState::new;
|
private final Function<DiscoveryNode, MockPersistedState> defaultPersistedStateSupplier = MockPersistedState::new;
|
||||||
|
|
||||||
|
@ -265,6 +269,11 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
Cluster(int initialNodeCount, boolean allNodesMasterEligible, Settings nodeSettings) {
|
Cluster(int initialNodeCount, boolean allNodesMasterEligible, Settings nodeSettings) {
|
||||||
|
this(initialNodeCount, allNodesMasterEligible, nodeSettings, () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||||
|
}
|
||||||
|
|
||||||
|
Cluster(int initialNodeCount, boolean allNodesMasterEligible, Settings nodeSettings, NodeHealthService nodeHealthService) {
|
||||||
|
this.nodeHealthService = nodeHealthService;
|
||||||
deterministicTaskQueue.setExecutionDelayVariabilityMillis(DEFAULT_DELAY_VARIABILITY);
|
deterministicTaskQueue.setExecutionDelayVariabilityMillis(DEFAULT_DELAY_VARIABILITY);
|
||||||
|
|
||||||
assertThat(initialNodeCount, greaterThan(0));
|
assertThat(initialNodeCount, greaterThan(0));
|
||||||
|
@ -273,7 +282,7 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
|
||||||
clusterNodes = new ArrayList<>(initialNodeCount);
|
clusterNodes = new ArrayList<>(initialNodeCount);
|
||||||
for (int i = 0; i < initialNodeCount; i++) {
|
for (int i = 0; i < initialNodeCount; i++) {
|
||||||
final ClusterNode clusterNode = new ClusterNode(nextNodeIndex.getAndIncrement(),
|
final ClusterNode clusterNode = new ClusterNode(nextNodeIndex.getAndIncrement(),
|
||||||
allNodesMasterEligible || i == 0 || randomBoolean(), nodeSettings);
|
allNodesMasterEligible || i == 0 || randomBoolean(), nodeSettings, nodeHealthService);
|
||||||
clusterNodes.add(clusterNode);
|
clusterNodes.add(clusterNode);
|
||||||
if (clusterNode.getLocalNode().isMasterNode()) {
|
if (clusterNode.getLocalNode().isMasterNode()) {
|
||||||
masterEligibleNodeIds.add(clusterNode.getId());
|
masterEligibleNodeIds.add(clusterNode.getId());
|
||||||
|
@ -309,7 +318,8 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
|
||||||
|
|
||||||
final List<ClusterNode> addedNodes = new ArrayList<>();
|
final List<ClusterNode> addedNodes = new ArrayList<>();
|
||||||
for (int i = 0; i < newNodesCount; i++) {
|
for (int i = 0; i < newNodesCount; i++) {
|
||||||
final ClusterNode clusterNode = new ClusterNode(nextNodeIndex.getAndIncrement(), true, Settings.EMPTY);
|
final ClusterNode clusterNode = new ClusterNode(nextNodeIndex.getAndIncrement(), true, Settings.EMPTY,
|
||||||
|
nodeHealthService);
|
||||||
addedNodes.add(clusterNode);
|
addedNodes.add(clusterNode);
|
||||||
}
|
}
|
||||||
clusterNodes.addAll(addedNodes);
|
clusterNodes.addAll(addedNodes);
|
||||||
|
@ -635,7 +645,8 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
|
||||||
private boolean isConnectedPair(ClusterNode n1, ClusterNode n2) {
|
private boolean isConnectedPair(ClusterNode n1, ClusterNode n2) {
|
||||||
return n1 == n2 ||
|
return n1 == n2 ||
|
||||||
(getConnectionStatus(n1.getLocalNode(), n2.getLocalNode()) == ConnectionStatus.CONNECTED
|
(getConnectionStatus(n1.getLocalNode(), n2.getLocalNode()) == ConnectionStatus.CONNECTED
|
||||||
&& getConnectionStatus(n2.getLocalNode(), n1.getLocalNode()) == ConnectionStatus.CONNECTED);
|
&& getConnectionStatus(n2.getLocalNode(), n1.getLocalNode()) == ConnectionStatus.CONNECTED) &&
|
||||||
|
(n1.nodeHealthService.getHealth().getStatus() == HEALTHY && n2.nodeHealthService.getHealth().getStatus() == HEALTHY);
|
||||||
}
|
}
|
||||||
|
|
||||||
ClusterNode getAnyLeader() {
|
ClusterNode getAnyLeader() {
|
||||||
|
@ -881,14 +892,18 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
|
||||||
private ClusterService clusterService;
|
private ClusterService clusterService;
|
||||||
TransportService transportService;
|
TransportService transportService;
|
||||||
private DisruptableMockTransport mockTransport;
|
private DisruptableMockTransport mockTransport;
|
||||||
|
private NodeHealthService nodeHealthService;
|
||||||
List<BiConsumer<DiscoveryNode, ClusterState>> extraJoinValidators = new ArrayList<>();
|
List<BiConsumer<DiscoveryNode, ClusterState>> extraJoinValidators = new ArrayList<>();
|
||||||
|
|
||||||
ClusterNode(int nodeIndex, boolean masterEligible, Settings nodeSettings) {
|
|
||||||
this(nodeIndex, createDiscoveryNode(nodeIndex, masterEligible), defaultPersistedStateSupplier, nodeSettings);
|
ClusterNode(int nodeIndex, boolean masterEligible, Settings nodeSettings, NodeHealthService nodeHealthService) {
|
||||||
|
this(nodeIndex, createDiscoveryNode(nodeIndex, masterEligible), defaultPersistedStateSupplier, nodeSettings,
|
||||||
|
nodeHealthService);
|
||||||
}
|
}
|
||||||
|
|
||||||
ClusterNode(int nodeIndex, DiscoveryNode localNode, Function<DiscoveryNode, MockPersistedState> persistedStateSupplier,
|
ClusterNode(int nodeIndex, DiscoveryNode localNode, Function<DiscoveryNode, MockPersistedState> persistedStateSupplier,
|
||||||
Settings nodeSettings) {
|
Settings nodeSettings, NodeHealthService nodeHealthService) {
|
||||||
|
this.nodeHealthService = nodeHealthService;
|
||||||
this.nodeIndex = nodeIndex;
|
this.nodeIndex = nodeIndex;
|
||||||
this.localNode = localNode;
|
this.localNode = localNode;
|
||||||
this.nodeSettings = nodeSettings;
|
this.nodeSettings = nodeSettings;
|
||||||
|
@ -944,7 +959,7 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
|
||||||
coordinator = new Coordinator("test_node", settings, clusterSettings, transportService, writableRegistry(),
|
coordinator = new Coordinator("test_node", settings, clusterSettings, transportService, writableRegistry(),
|
||||||
allocationService, masterService, this::getPersistedState,
|
allocationService, masterService, this::getPersistedState,
|
||||||
Cluster.this::provideSeedHosts, clusterApplierService, onJoinValidators, Randomness.get(), (s, p, r) -> {},
|
Cluster.this::provideSeedHosts, clusterApplierService, onJoinValidators, Randomness.get(), (s, p, r) -> {},
|
||||||
getElectionStrategy());
|
getElectionStrategy(), nodeHealthService);
|
||||||
masterService.setClusterStatePublisher(coordinator);
|
masterService.setClusterStatePublisher(coordinator);
|
||||||
final GatewayService gatewayService
|
final GatewayService gatewayService
|
||||||
= new GatewayService(settings, allocationService, clusterService, threadPool, null, coordinator);
|
= new GatewayService(settings, allocationService, clusterService, threadPool, null, coordinator);
|
||||||
|
@ -984,7 +999,8 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
|
||||||
localNode.isMasterNode() && DiscoveryNode.isMasterNode(nodeSettings)
|
localNode.isMasterNode() && DiscoveryNode.isMasterNode(nodeSettings)
|
||||||
? DiscoveryNodeRole.BUILT_IN_ROLES : emptySet(), Version.CURRENT);
|
? DiscoveryNodeRole.BUILT_IN_ROLES : emptySet(), Version.CURRENT);
|
||||||
return new ClusterNode(nodeIndex, newLocalNode,
|
return new ClusterNode(nodeIndex, newLocalNode,
|
||||||
node -> new MockPersistedState(newLocalNode, persistedState, adaptGlobalMetadata, adaptCurrentTerm), nodeSettings);
|
node -> new MockPersistedState(newLocalNode, persistedState, adaptGlobalMetadata, adaptCurrentTerm), nodeSettings,
|
||||||
|
nodeHealthService);
|
||||||
}
|
}
|
||||||
|
|
||||||
private CoordinationState.PersistedState getPersistedState() {
|
private CoordinationState.PersistedState getPersistedState() {
|
||||||
|
|
Loading…
Reference in New Issue