Fix node health-check-related test failures (#59277)
In #52680 we introduced a new health check mechanism. This commit fixes some sporadic test failures related to that mechanism, and slightly improves the behaviour of the `FollowersChecker` in the case that no retries are configured. Closes #59252. Closes #59172.
This commit is contained in:
parent
c80a9e2ec2
commit
d56fc72ee5
|
@ -344,16 +344,16 @@ public class FollowersChecker {
|
|||
failureCountSinceLastSuccess++;
|
||||
|
||||
final String reason;
|
||||
if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
|
||||
logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
|
||||
reason = "followers check retry count exceeded";
|
||||
} else if (exp instanceof ConnectTransportException
|
||||
if (exp instanceof ConnectTransportException
|
||||
|| exp.getCause() instanceof ConnectTransportException) {
|
||||
logger.debug(() -> new ParameterizedMessage("{} disconnected", FollowerChecker.this), exp);
|
||||
reason = "disconnected";
|
||||
} else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
|
||||
logger.debug(() -> new ParameterizedMessage("{} health check failed", FollowerChecker.this), exp);
|
||||
reason = "health check failed";
|
||||
} else if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
|
||||
logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
|
||||
reason = "followers check retry count exceeded";
|
||||
} else {
|
||||
logger.debug(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp);
|
||||
scheduleNextWakeUp();
|
||||
|
|
|
@ -129,7 +129,7 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
|
|||
|
||||
class FsHealthMonitor implements Runnable {
|
||||
|
||||
private static final String TEMP_FILE_NAME = ".es_temp_file";
|
||||
static final String TEMP_FILE_NAME = ".es_temp_file";
|
||||
private byte[] byteToWrite;
|
||||
|
||||
FsHealthMonitor(){
|
||||
|
|
|
@ -168,18 +168,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testFailsNodeThatDoesNotRespond() {
|
||||
final Builder settingsBuilder = Settings.builder();
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), randomIntBetween(1, 100000) + "ms");
|
||||
}
|
||||
final Settings settings = settingsBuilder.build();
|
||||
|
||||
final Settings settings = randomSettings();
|
||||
testBehaviourOfFailingNode(settings, () -> null,
|
||||
"followers check retry count exceeded",
|
||||
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis()
|
||||
|
@ -188,15 +177,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testFailsNodeThatRejectsCheck() {
|
||||
final Builder settingsBuilder = Settings.builder();
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
|
||||
}
|
||||
final Settings settings = settingsBuilder.build();
|
||||
|
||||
final Settings settings = randomSettings();
|
||||
testBehaviourOfFailingNode(settings, () -> {
|
||||
throw new ElasticsearchException("simulated exception");
|
||||
},
|
||||
|
@ -206,15 +187,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testFailureCounterResetsOnSuccess() {
|
||||
final Builder settingsBuilder = Settings.builder();
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(2, 10));
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
|
||||
}
|
||||
final Settings settings = settingsBuilder.build();
|
||||
|
||||
final Settings settings = randomSettings();
|
||||
final int retryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
|
||||
final int maxRecoveries = randomIntBetween(3, 10);
|
||||
|
||||
|
@ -297,16 +270,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
|||
}
|
||||
|
||||
public void testFailsNodeThatIsUnhealthy() {
|
||||
final Builder settingsBuilder = Settings.builder();
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
|
||||
}
|
||||
final Settings settings = settingsBuilder.build();
|
||||
|
||||
testBehaviourOfFailingNode(settings, () -> {
|
||||
testBehaviourOfFailingNode(randomSettings(), () -> {
|
||||
throw new NodeHealthCheckFailureException("non writable exception");
|
||||
}, "health check failed", 0, () -> new StatusInfo(HEALTHY, "healthy-info"));
|
||||
}
|
||||
|
@ -321,7 +285,7 @@ public class FollowersCheckerTests extends ESTestCase {
|
|||
final MockTransport mockTransport = new MockTransport() {
|
||||
@Override
|
||||
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
|
||||
assertFalse(node.equals(localNode));
|
||||
assertNotEquals(node, localNode);
|
||||
deterministicTaskQueue.scheduleNow(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
|
@ -674,6 +638,20 @@ public class FollowersCheckerTests extends ESTestCase {
|
|||
Version.CURRENT);
|
||||
}
|
||||
|
||||
private static Settings randomSettings() {
|
||||
final Builder settingsBuilder = Settings.builder();
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
|
||||
}
|
||||
if (randomBoolean()) {
|
||||
settingsBuilder.put(FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), randomIntBetween(1, 100000) + "ms");
|
||||
}
|
||||
return settingsBuilder.build();
|
||||
}
|
||||
|
||||
private static class ExpectsSuccess implements TransportResponseHandler<Empty> {
|
||||
private final AtomicBoolean responseReceived = new AtomicBoolean();
|
||||
|
||||
|
|
|
@ -305,8 +305,7 @@ public class FsHealthServiceTests extends ESTestCase {
|
|||
AtomicBoolean injectIOException = new AtomicBoolean();
|
||||
AtomicInteger injectedPaths = new AtomicInteger();
|
||||
|
||||
private String pathPrefix = "/";
|
||||
private long delay;
|
||||
private final long delay;
|
||||
private final ThreadPool threadPool;
|
||||
|
||||
FileSystemFsyncHungProvider(FileSystem inner, long delay, ThreadPool threadPool) {
|
||||
|
@ -325,7 +324,7 @@ public class FsHealthServiceTests extends ESTestCase {
|
|||
@Override
|
||||
public void force(boolean metaData) throws IOException {
|
||||
if (injectIOException.get()) {
|
||||
if (path.toString().startsWith(pathPrefix) && path.toString().endsWith(".es_temp_file")) {
|
||||
if (path.getFileName().toString().equals(FsHealthService.FsHealthMonitor.TEMP_FILE_NAME)) {
|
||||
injectedPaths.incrementAndGet();
|
||||
final long startTimeMillis = threadPool.relativeTimeInMillis();
|
||||
do {
|
||||
|
|
Loading…
Reference in New Issue