lifecycle: Verify whether Watcher was stopped manually under a lock
Previously, if timing permitted, a start and a stop could be executed very close together. This could make the manual-stop check appear valid because the stop had not yet been performed. Original commit: elastic/x-pack-elasticsearch@c1865c1069
This commit is contained in:
parent 8af461daaf
commit c4cb85104e
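The change below makes both the public start()/stop() API calls and the cluster-state-triggered start go through synchronized methods on the same object, and evaluates the manuallyStopped flag inside that lock, so a manual stop that races with an automatic start can no longer be missed. A minimal, self-contained sketch of that pattern (hypothetical class and member names, not the actual Watcher code) could look like this:

// Simplified sketch of the locking pattern this commit adopts; LifecycleSketch
// and its members are illustrative only and do not exist in the Watcher codebase.
public class LifecycleSketch {

    private volatile boolean manuallyStopped;
    private boolean running;

    // Manual API call: always allowed to start.
    public void startManually() {
        start(true);
    }

    // Manual API call: records that the user asked for a stop.
    public void stopManually() {
        stop(true);
    }

    // Cluster state update: must not override an earlier manual stop.
    public void onClusterChanged() {
        start(false);
    }

    private synchronized void start(boolean manual) {
        if (!manual && manuallyStopped) {
            // Because this check runs under the same lock as stop(true), a stop
            // that completed "just before" is guaranteed to be visible here.
            System.out.println("not starting: stopped manually");
            return;
        }
        running = true;
        System.out.println("started (manual=" + manual + ")");
    }

    private synchronized void stop(boolean manual) {
        manuallyStopped = manual;
        running = false;
        System.out.println("stopped (manual=" + manual + ")");
    }

    public static void main(String[] args) {
        LifecycleSketch s = new LifecycleSketch();
        s.startManually();      // starts
        s.stopManually();       // stops, manuallyStopped = true
        s.onClusterChanged();   // skipped: the manual stop is respected
    }
}

As in the commit, only the cluster-state path (manual == false) consults the manuallyStopped flag; an explicit start via the API bypasses it.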
WatcherLifeCycleService.java

@@ -24,7 +24,8 @@ public class WatcherLifeCycleService extends AbstractComponent implements Cluste
     private final WatcherService watcherService;
     private final ClusterService clusterService;
 
-    // Maybe this should be a setting in the cluster settings?
+    // TODO: If Watcher was stopped via api and the master is changed then Watcher will start regardless of the previous
+    // stop command, so at some point this needs to be a cluster setting
     private volatile boolean manuallyStopped;
 
     @Inject
@@ -46,24 +47,65 @@ public class WatcherLifeCycleService extends AbstractComponent implements Cluste
     }
 
     public void start() {
-        start(clusterService.state());
+        start(clusterService.state(), true);
     }
 
     public void stop() {
         stop(true);
     }
 
-    private synchronized void start(ClusterState state) {
-        watcherService.start(state);
-    }
-
     private synchronized void stop(boolean manual) {
         manuallyStopped = manual;
         watcherService.stop();
     }
 
+    private synchronized void start(ClusterState state, boolean manual) {
+        WatcherState watcherState = watcherService.state();
+        if (watcherState != WatcherState.STOPPED) {
+            logger.debug("Not starting, because state [{}] while [{}] is expected", watcherState, WatcherState.STOPPED);
+            return;
+        }
+
+        // If we start from a cluster state update we need to check if previously we stopped manually
+        // otherwise Watcher would start upon the next cluster state update while the user instructed Watcher to not run
+        if (!manual && manuallyStopped) {
+            logger.debug("Not starting, because watcher has been stopped manually, so watcher can't be started automatically");
+            return;
+        }
+
+        if (!watcherService.validate(state)) {
+            logger.debug("Not starting, because the cluster state isn't valid");
+            return;
+        }
+
+        int attempts = 0;
+        while(true) {
+            try {
+                logger.debug("Start attempt [{}], based on cluster state version [{}]", attempts, state.getVersion());
+                watcherService.start(state);
+                return;
+            } catch (Exception e) {
+                if (++attempts < 3) {
+                    logger.warn("error occurred while starting, retrying...", e);
+                    try {
+                        Thread.sleep(1000);
+                    } catch (InterruptedException ie) {
+                        Thread.currentThread().interrupt();
+                    }
+                    if (!clusterService.localNode().masterNode()) {
+                        logger.error("abort retry, we are no longer master");
+                        return;
+                    }
+                } else {
+                    logger.error("attempted to start Watcher [{}] times, aborting now, please try to start Watcher manually", attempts);
+                    return;
+                }
+            }
+        }
+    }
+
     @Override
-    public void clusterChanged(ClusterChangedEvent event) {
+    public void clusterChanged(final ClusterChangedEvent event) {
         if (!event.localNodeMaster()) {
             // We're no longer the master so we need to stop the watcher.
             // Stopping the watcher may take a while since it will wait on the scheduler to complete shutdown,
@@ -77,46 +119,18 @@ public class WatcherLifeCycleService extends AbstractComponent implements Cluste
             });
         } else {
             if (event.state().blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) {
-                // wait until the gateway has recovered from disk, otherwise we think may not have .watchs and
-                // a .watch_history index, but they may not have been restored from the cluster state on disk
+                // wait until the gateway has recovered from disk, otherwise we think may not have .watches and
+                // a .triggered_watches index, but they may not have been restored from the cluster state on disk
                 return;
             }
 
             final ClusterState state = event.state();
-            if (!watcherService.validate(state)) {
-                return;
-            }
-
-            if (watcherService.state() == WatcherState.STOPPED && !manuallyStopped) {
-                threadPool.executor(ThreadPool.Names.GENERIC).execute(new Runnable() {
-                    @Override
-                    public void run() {
-                        int attempts = 0;
-                        while(true) {
-                            try {
-                                start(state);
-                                return;
-                            } catch (Exception e) {
-                                if (++attempts < 3) {
-                                    logger.warn("error occurred while starting, retrying...", e);
-                                    try {
-                                        Thread.sleep(1000);
-                                    } catch (InterruptedException ie) {
-                                        Thread.currentThread().interrupt();
-                                    }
-                                    if (!clusterService.localNode().masterNode()) {
-                                        logger.error("abort retry, we are no longer master");
-                                        return;
-                                    }
-                                } else {
-                                    logger.error("attempted to start Watcher [{}] times, aborting now, please try to start Watcher manually", attempts);
-                                    return;
-                                }
-                            }
-                        }
-                    }
-                });
-            }
+            threadPool.executor(ThreadPool.Names.GENERIC).execute(new Runnable() {
+                @Override
+                public void run() {
+                    start(state, false);
+                }
+            });
         }
     }
 }
WatcherLifeCycleServiceTests.java

@@ -26,6 +26,7 @@ import static org.mockito.Mockito.*;
 */
 public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
 
+    private ClusterService clusterService;
     private WatcherService watcherService;
     private WatcherLifeCycleService lifeCycleService;
 
@@ -34,7 +35,7 @@ public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
         ThreadPool threadPool = mock(ThreadPool.class);
         when(threadPool.executor(anyString())).thenReturn(MoreExecutors.newDirectExecutorService());
         watcherService = mock(WatcherService.class);
-        ClusterService clusterService = mock(ClusterService.class);
+        clusterService = mock(ClusterService.class);
         lifeCycleService = new WatcherLifeCycleService(Settings.EMPTY, clusterService, threadPool, watcherService);
     }
 
@@ -78,6 +79,14 @@ public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
 
     @Test
     public void testManualStartStop() {
+        DiscoveryNodes.Builder nodes = new DiscoveryNodes.Builder().masterNodeId("id1").localNodeId("id1");
+        ClusterState clusterState = ClusterState.builder(new ClusterName("my-cluster"))
+                .nodes(nodes).build();
+        when(clusterService.state()).thenReturn(clusterState);
+        when(watcherService.state()).thenReturn(WatcherState.STOPPED);
+        when(watcherService.validate(clusterState)).thenReturn(true);
+
         lifeCycleService.start();
         verify(watcherService, times(1)).start(any(ClusterState.class));
         verify(watcherService, never()).stop();
@@ -87,9 +96,6 @@ public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
         verify(watcherService, times(1)).stop();
 
         // Starting via cluster state update, we shouldn't start because we have been stopped manually.
-        DiscoveryNodes.Builder nodes = new DiscoveryNodes.Builder().masterNodeId("id1").localNodeId("id1");
-        ClusterState clusterState = ClusterState.builder(new ClusterName("my-cluster"))
-                .nodes(nodes).build();
         when(watcherService.state()).thenReturn(WatcherState.STOPPED);
         lifeCycleService.clusterChanged(new ClusterChangedEvent("any", clusterState, clusterState));
         verify(watcherService, times(1)).start(any(ClusterState.class));
@@ -120,4 +126,34 @@ public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
         verify(watcherService, times(2)).stop();
     }
 
+    @Test
+    public void testManualStartStop_clusterStateNotValid() {
+        DiscoveryNodes.Builder nodes = new DiscoveryNodes.Builder().masterNodeId("id1").localNodeId("id1");
+        ClusterState clusterState = ClusterState.builder(new ClusterName("my-cluster"))
+                .nodes(nodes).build();
+        when(clusterService.state()).thenReturn(clusterState);
+        when(watcherService.state()).thenReturn(WatcherState.STOPPED);
+        when(watcherService.validate(clusterState)).thenReturn(false);
+
+        lifeCycleService.start();
+        verify(watcherService, never()).start(any(ClusterState.class));
+        verify(watcherService, never()).stop();
+    }
+
+    @Test
+    public void testManualStartStop_watcherNotStopped() {
+        DiscoveryNodes.Builder nodes = new DiscoveryNodes.Builder().masterNodeId("id1").localNodeId("id1");
+        ClusterState clusterState = ClusterState.builder(new ClusterName("my-cluster"))
+                .nodes(nodes).build();
+        when(clusterService.state()).thenReturn(clusterState);
+        when(watcherService.state()).thenReturn(WatcherState.STOPPING);
+
+        lifeCycleService.start();
+        verify(watcherService, never()).validate(any(ClusterState.class));
+        verify(watcherService, never()).start(any(ClusterState.class));
+        verify(watcherService, never()).stop();
+    }
+
 }
HistoryStoreSettingsTests.java

@@ -20,7 +20,7 @@ import static org.hamcrest.core.Is.is;
 
 /**
 */
-@TestLogging("cluster:DEBUG,action.admin.cluster.settings:DEBUG")
+@TestLogging("cluster:DEBUG,action.admin.cluster.settings:DEBUG,watcher:DEBUG")
 @ElasticsearchIntegrationTest.ClusterScope(scope = TEST, numClientNodes = 0, transportClientRatio = 0, randomDynamicTemplates = false, numDataNodes = 1)
 public class HistoryStoreSettingsTests extends AbstractWatcherIntegrationTests {
 