From c8d59f6f0f3ff54873e6cf8a756c02aa8b6bc2ea Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 19 Feb 2019 08:53:20 +0100 Subject: [PATCH] Fix shard follow task startup error handling (#39053) Prior to this commit, if a fatal error occurred while fetching the leader / follower GCP (global checkpoint) during startup, the shard follow task was removed. This is unexpected, because if such an error occurs during the lifetime of a shard follow task, replication is stopped and the fatal error flag is set. Setting the flag instead of removing the task allows the ccr stats api to report the fatal exception that has occurred (instead of the user grepping through the elasticsearch logs). This issue was found by a rare failure of the `FollowStatsIT#testFollowStatsApiIncludeShardFollowStatsWithRemovedFollowerIndex` test. Closes #38779 --- .../xpack/ccr/action/ShardFollowNodeTask.java | 8 ++++++-- .../xpack/ccr/action/ShardFollowTasksExecutor.java | 2 +- .../java/org/elasticsearch/xpack/ccr/FollowStatsIT.java | 1 - 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowNodeTask.java b/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowNodeTask.java index a4f02707bc4..3918b815e91 100644 --- a/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowNodeTask.java +++ b/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowNodeTask.java @@ -452,11 +452,15 @@ public abstract class ShardFollowNodeTask extends AllocatedPersistentTask { scheduler.accept(TimeValue.timeValueMillis(delay), task); } } else { - fatalException = ExceptionsHelper.convertToElastic(e); - LOGGER.warn("shard follow task encounter non-retryable error", e); + setFatalException(e); } } + void setFatalException(Exception e) { + fatalException = ExceptionsHelper.convertToElastic(e); + LOGGER.warn("shard follow task encounter non-retryable error", e); + } + static long computeDelay(int currentRetry, long 
maxRetryDelayInMillis) { // Cap currentRetry to avoid overflow when computing n variable int maxCurrentRetry = Math.min(currentRetry, 24); diff --git a/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowTasksExecutor.java b/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowTasksExecutor.java index 1a1a60678de..46b3c6e54f5 100644 --- a/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowTasksExecutor.java +++ b/x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/action/ShardFollowTasksExecutor.java @@ -282,7 +282,7 @@ public class ShardFollowTasksExecutor extends PersistentTasksExecutor nodeOperation(task, params, state), params.getMaxRetryDelay(), Ccr.CCR_THREAD_POOL_NAME); } else { - shardFollowNodeTask.markAsFailed(e); + shardFollowNodeTask.setFatalException(e); } }; diff --git a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/FollowStatsIT.java b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/FollowStatsIT.java index e6662f3770d..1f1c6cd5c64 100644 --- a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/FollowStatsIT.java +++ b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/FollowStatsIT.java @@ -149,7 +149,6 @@ public class FollowStatsIT extends CcrSingleNodeTestCase { assertAcked(client().execute(PauseFollowAction.INSTANCE, new PauseFollowAction.Request("follower1")).actionGet()); } - @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/38779") public void testFollowStatsApiIncludeShardFollowStatsWithRemovedFollowerIndex() throws Exception { final String leaderIndexSettings = getIndexSettings(1, 0, singletonMap(IndexSettings.INDEX_SOFT_DELETES_SETTING.getKey(), "true"));