Fix node health-check-related test failures (#59277)

In #52680 we introduced a new health check mechanism. This commit fixes
some sporadic test failures related to it, and slightly improves the
behaviour of the `FollowersChecker` when no retries are configured: a
disconnection or a failed health check is now reported as such rather
than as an exceeded retry count.

Closes #59252
Closes #59172
This commit is contained in:
David Turner 2020-07-09 12:45:55 +01:00
parent c80a9e2ec2
commit d56fc72ee5
4 changed files with 26 additions and 49 deletions

View File

@ -344,16 +344,16 @@ public class FollowersChecker {
failureCountSinceLastSuccess++;
final String reason;
if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
reason = "followers check retry count exceeded";
} else if (exp instanceof ConnectTransportException
if (exp instanceof ConnectTransportException
|| exp.getCause() instanceof ConnectTransportException) {
logger.debug(() -> new ParameterizedMessage("{} disconnected", FollowerChecker.this), exp);
reason = "disconnected";
} else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
logger.debug(() -> new ParameterizedMessage("{} health check failed", FollowerChecker.this), exp);
reason = "health check failed";
} else if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
reason = "followers check retry count exceeded";
} else {
logger.debug(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp);
scheduleNextWakeUp();

View File

@ -129,7 +129,7 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
class FsHealthMonitor implements Runnable {
private static final String TEMP_FILE_NAME = ".es_temp_file";
static final String TEMP_FILE_NAME = ".es_temp_file";
private byte[] byteToWrite;
FsHealthMonitor(){

View File

@ -168,18 +168,7 @@ public class FollowersCheckerTests extends ESTestCase {
}
public void testFailsNodeThatDoesNotRespond() {
final Builder settingsBuilder = Settings.builder();
if (randomBoolean()) {
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
}
if (randomBoolean()) {
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
}
if (randomBoolean()) {
settingsBuilder.put(FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), randomIntBetween(1, 100000) + "ms");
}
final Settings settings = settingsBuilder.build();
final Settings settings = randomSettings();
testBehaviourOfFailingNode(settings, () -> null,
"followers check retry count exceeded",
(FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis()
@ -188,15 +177,7 @@ public class FollowersCheckerTests extends ESTestCase {
}
public void testFailsNodeThatRejectsCheck() {
final Builder settingsBuilder = Settings.builder();
if (randomBoolean()) {
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
}
if (randomBoolean()) {
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
}
final Settings settings = settingsBuilder.build();
final Settings settings = randomSettings();
testBehaviourOfFailingNode(settings, () -> {
throw new ElasticsearchException("simulated exception");
},
@ -206,15 +187,7 @@ public class FollowersCheckerTests extends ESTestCase {
}
public void testFailureCounterResetsOnSuccess() {
final Builder settingsBuilder = Settings.builder();
if (randomBoolean()) {
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(2, 10));
}
if (randomBoolean()) {
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
}
final Settings settings = settingsBuilder.build();
final Settings settings = randomSettings();
final int retryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
final int maxRecoveries = randomIntBetween(3, 10);
@ -297,16 +270,7 @@ public class FollowersCheckerTests extends ESTestCase {
}
public void testFailsNodeThatIsUnhealthy() {
final Builder settingsBuilder = Settings.builder();
if (randomBoolean()) {
settingsBuilder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), randomIntBetween(1, 10));
}
if (randomBoolean()) {
settingsBuilder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), randomIntBetween(100, 100000) + "ms");
}
final Settings settings = settingsBuilder.build();
testBehaviourOfFailingNode(settings, () -> {
testBehaviourOfFailingNode(randomSettings(), () -> {
throw new NodeHealthCheckFailureException("non writable exception");
}, "health check failed", 0, () -> new StatusInfo(HEALTHY, "healthy-info"));
}
@ -321,7 +285,7 @@ public class FollowersCheckerTests extends ESTestCase {
final MockTransport mockTransport = new MockTransport() {
@Override
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
assertFalse(node.equals(localNode));
assertNotEquals(node, localNode);
deterministicTaskQueue.scheduleNow(new Runnable() {
@Override
public void run() {
@ -674,6 +638,20 @@ public class FollowersCheckerTests extends ESTestCase {
Version.CURRENT);
}
/**
 * Builds a {@link Settings} instance in which the follower-check retry count, interval and timeout
 * are each, independently and at random, either left unset or assigned a random value.
 *
 * @return the randomly populated settings
 */
private static Settings randomSettings() {
    final Builder builder = Settings.builder();

    // Retry count: between 1 and 10 when overridden.
    final boolean overrideRetryCount = randomBoolean();
    if (overrideRetryCount) {
        final int retryCount = randomIntBetween(1, 10);
        builder.put(FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), retryCount);
    }

    // Check interval: between 100ms and 100000ms when overridden.
    final boolean overrideInterval = randomBoolean();
    if (overrideInterval) {
        final String interval = randomIntBetween(100, 100000) + "ms";
        builder.put(FOLLOWER_CHECK_INTERVAL_SETTING.getKey(), interval);
    }

    // Check timeout: between 1ms and 100000ms when overridden.
    final boolean overrideTimeout = randomBoolean();
    if (overrideTimeout) {
        final String timeout = randomIntBetween(1, 100000) + "ms";
        builder.put(FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), timeout);
    }

    return builder.build();
}
private static class ExpectsSuccess implements TransportResponseHandler<Empty> {
private final AtomicBoolean responseReceived = new AtomicBoolean();

View File

@ -305,8 +305,7 @@ public class FsHealthServiceTests extends ESTestCase {
AtomicBoolean injectIOException = new AtomicBoolean();
AtomicInteger injectedPaths = new AtomicInteger();
private String pathPrefix = "/";
private long delay;
private final long delay;
private final ThreadPool threadPool;
FileSystemFsyncHungProvider(FileSystem inner, long delay, ThreadPool threadPool) {
@ -325,7 +324,7 @@ public class FsHealthServiceTests extends ESTestCase {
@Override
public void force(boolean metaData) throws IOException {
if (injectIOException.get()) {
if (path.toString().startsWith(pathPrefix) && path.toString().endsWith(".es_temp_file")) {
if (path.getFileName().toString().equals(FsHealthService.FsHealthMonitor.TEMP_FILE_NAME)) {
injectedPaths.incrementAndGet();
final long startTimeMillis = threadPool.relativeTimeInMillis();
do {