Cancel persistent task recheck when no longer master (#58539)

If a persistent task cannot be assigned on the first attempt
then the master node will schedule periodic rechecks to see
if the assignment requirements have been met.

These periodic rechecks should be cancelled if the node ceases
to be master.  Previously they weren't, leading to exceptions
being logged repeatedly.  This PR cancels the rechecks on
learning that the node is no longer the master.

Fixes #58531
This commit is contained in:
David Roberts 2020-06-25 15:15:52 +01:00
parent 958b21d727
commit 1742b1c39e
2 changed files with 63 additions and 4 deletions

View File

@ -28,6 +28,7 @@ import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateListener;
import org.elasticsearch.cluster.ClusterStateUpdateTask;
import org.elasticsearch.cluster.NotMasterException;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.service.ClusterService;
@ -78,6 +79,11 @@ public class PersistentTasksClusterService implements ClusterStateListener, Clos
periodicRechecker.setInterval(recheckInterval);
}
// visible for testing only
PeriodicRechecker getPeriodicRechecker() {
return periodicRechecker;
}
@Override
public void close() {
periodicRechecker.close();
@ -317,6 +323,8 @@ public class PersistentTasksClusterService implements ClusterStateListener, Clos
logger.trace("checking task reassignment for cluster state {}", event.state().getVersion());
reassignPersistentTasks();
}
} else {
periodicRechecker.cancel();
}
}
@ -333,9 +341,12 @@ public class PersistentTasksClusterService implements ClusterStateListener, Clos
@Override
public void onFailure(String source, Exception e) {
logger.warn("failed to reassign persistent tasks", e);
// There must be a task that's worth rechecking because there was one
// that caused this method to be called and the method failed to assign it
periodicRechecker.rescheduleIfNecessary();
if (e instanceof NotMasterException == false) {
// There must be a task that's worth rechecking because there was one
// that caused this method to be called and the method failed to assign it,
// but only do this if the node is still the master
periodicRechecker.rescheduleIfNecessary();
}
}
@Override
@ -450,7 +461,7 @@ public class PersistentTasksClusterService implements ClusterStateListener, Clos
/**
* Class to periodically try to reassign unassigned persistent tasks.
*/
private class PeriodicRechecker extends AbstractAsyncTask {
class PeriodicRechecker extends AbstractAsyncTask {
PeriodicRechecker(TimeValue recheckInterval) {
super(logger, threadPool, recheckInterval, false);

View File

@ -471,6 +471,54 @@ public class PersistentTasksClusterServiceTests extends ESTestCase {
});
}
public void testPeriodicRecheckOffMaster() {
ClusterState initialState = initialState();
ClusterState.Builder builder = ClusterState.builder(initialState);
PersistentTasksCustomMetadata.Builder tasks = PersistentTasksCustomMetadata.builder(
initialState.metadata().custom(PersistentTasksCustomMetadata.TYPE));
DiscoveryNodes.Builder nodes = DiscoveryNodes.builder(initialState.nodes());
addTestNodes(nodes, randomIntBetween(1, 3));
addTask(tasks, "assign_based_on_non_cluster_state_condition", null);
Metadata.Builder metadata = Metadata.builder(initialState.metadata()).putCustom(PersistentTasksCustomMetadata.TYPE, tasks.build());
ClusterState clusterState = builder.metadata(metadata).nodes(nodes).build();
nonClusterStateCondition = false;
ClusterService recheckTestClusterService = createRecheckTestClusterService(clusterState, false);
PersistentTasksClusterService service = createService(recheckTestClusterService,
(params, currentState) -> assignBasedOnNonClusterStateCondition(currentState.nodes()));
ClusterChangedEvent event = new ClusterChangedEvent("test", clusterState, initialState);
service.clusterChanged(event);
ClusterState newClusterState = recheckTestClusterService.state();
{
PersistentTasksCustomMetadata tasksInProgress = newClusterState.getMetadata().custom(PersistentTasksCustomMetadata.TYPE);
assertThat(tasksInProgress, notNullValue());
for (PersistentTask<?> task : tasksInProgress.tasks()) {
assertThat(task.getExecutorNode(), nullValue());
assertThat(task.isAssigned(), equalTo(false));
assertThat(task.getAssignment().getExplanation(), equalTo("non-cluster state condition prevents assignment"));
}
assertThat(tasksInProgress.tasks().size(), equalTo(1));
}
// The rechecker should recheck indefinitely on the master node as the
// task can never be assigned while nonClusterStateCondition = false
assertTrue(service.getPeriodicRechecker().isScheduled());
// Now simulate the node ceasing to be the master
builder = ClusterState.builder(clusterState);
nodes = DiscoveryNodes.builder(clusterState.nodes());
nodes.add(DiscoveryNode.createLocal(Settings.EMPTY, buildNewFakeTransportAddress(), "a_new_master_node"));
nodes.masterNodeId("a_new_master_node");
ClusterState nonMasterClusterState = builder.nodes(nodes).build();
event = new ClusterChangedEvent("test", nonMasterClusterState, clusterState);
service.clusterChanged(event);
// The service should have cancelled the rechecker on learning it is no longer running on the master node
assertFalse(service.getPeriodicRechecker().isScheduled());
}
public void testUnassignTask() {
ClusterState clusterState = initialState();
ClusterState.Builder builder = ClusterState.builder(clusterState);