lifecycle: Verify whether Watcher was stopped manually under a lock

Before if timing permitted it a start and stop could be executed very close to each other time wise. This could make the manual stopped check seem to be valid, because the stop hadn't yet been performed.

Original commit: elastic/x-pack-elasticsearch@c1865c1069
This commit is contained in:
Martijn van Groningen 2015-06-18 13:45:55 +02:00
parent 8af461daaf
commit c4cb85104e
3 changed files with 98 additions and 48 deletions

View File

@ -24,7 +24,8 @@ public class WatcherLifeCycleService extends AbstractComponent implements Cluste
private final WatcherService watcherService; private final WatcherService watcherService;
private final ClusterService clusterService; private final ClusterService clusterService;
// Maybe this should be a setting in the cluster settings? // TODO: If Watcher was stopped via api and the master is changed then Watcher will start regardless of the previous
// stop command, so at some point this needs to be a cluster setting
private volatile boolean manuallyStopped; private volatile boolean manuallyStopped;
@Inject @Inject
@ -46,24 +47,65 @@ public class WatcherLifeCycleService extends AbstractComponent implements Cluste
} }
public void start() { public void start() {
start(clusterService.state()); start(clusterService.state(), true);
} }
public void stop() { public void stop() {
stop(true); stop(true);
} }
private synchronized void start(ClusterState state) {
watcherService.start(state);
}
private synchronized void stop(boolean manual) { private synchronized void stop(boolean manual) {
manuallyStopped = manual; manuallyStopped = manual;
watcherService.stop(); watcherService.stop();
} }
private synchronized void start(ClusterState state, boolean manual) {
WatcherState watcherState = watcherService.state();
if (watcherState != WatcherState.STOPPED) {
logger.debug("Not starting, because state [{}] while [{}] is expected", watcherState, WatcherState.STOPPED);
return;
}
// If we start from a cluster state update we need to check if previously we stopped manually
// otherwise Watcher would start upon the next cluster state update while the user instructed Watcher to not run
if (!manual && manuallyStopped) {
logger.debug("Not starting, because watcher has been stopped manually, so watcher can't be started automatically");
return;
}
if (!watcherService.validate(state)) {
logger.debug("Not starting, because the cluster state isn't valid");
return;
}
int attempts = 0;
while(true) {
try {
logger.debug("Start attempt [{}], based on cluster state version [{}]", attempts, state.getVersion());
watcherService.start(state);
return;
} catch (Exception e) {
if (++attempts < 3) {
logger.warn("error occurred while starting, retrying...", e);
try {
Thread.sleep(1000);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
}
if (!clusterService.localNode().masterNode()) {
logger.error("abort retry, we are no longer master");
return;
}
} else {
logger.error("attempted to start Watcher [{}] times, aborting now, please try to start Watcher manually", attempts);
return;
}
}
}
}
@Override @Override
public void clusterChanged(ClusterChangedEvent event) { public void clusterChanged(final ClusterChangedEvent event) {
if (!event.localNodeMaster()) { if (!event.localNodeMaster()) {
// We're no longer the master so we need to stop the watcher. // We're no longer the master so we need to stop the watcher.
// Stopping the watcher may take a while since it will wait on the scheduler to complete shutdown, // Stopping the watcher may take a while since it will wait on the scheduler to complete shutdown,
@ -77,46 +119,18 @@ public class WatcherLifeCycleService extends AbstractComponent implements Cluste
}); });
} else { } else {
if (event.state().blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) { if (event.state().blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) {
// wait until the gateway has recovered from disk, otherwise we think may not have .watchs and // wait until the gateway has recovered from disk, otherwise we think may not have .watches and
// a .watch_history index, but they may not have been restored from the cluster state on disk // a .triggered_watches index, but they may not have been restored from the cluster state on disk
return; return;
} }
final ClusterState state = event.state(); final ClusterState state = event.state();
if (!watcherService.validate(state)) { threadPool.executor(ThreadPool.Names.GENERIC).execute(new Runnable() {
return; @Override
} public void run() {
start(state, false);
if (watcherService.state() == WatcherState.STOPPED && !manuallyStopped) { }
threadPool.executor(ThreadPool.Names.GENERIC).execute(new Runnable() { });
@Override
public void run() {
int attempts = 0;
while(true) {
try {
start(state);
return;
} catch (Exception e) {
if (++attempts < 3) {
logger.warn("error occurred while starting, retrying...", e);
try {
Thread.sleep(1000);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
}
if (!clusterService.localNode().masterNode()) {
logger.error("abort retry, we are no longer master");
return;
}
} else {
logger.error("attempted to start Watcher [{}] times, aborting now, please try to start Watcher manually", attempts);
return;
}
}
}
}
});
}
} }
} }
} }

View File

@ -26,6 +26,7 @@ import static org.mockito.Mockito.*;
*/ */
public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase { public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
private ClusterService clusterService;
private WatcherService watcherService; private WatcherService watcherService;
private WatcherLifeCycleService lifeCycleService; private WatcherLifeCycleService lifeCycleService;
@ -34,7 +35,7 @@ public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
ThreadPool threadPool = mock(ThreadPool.class); ThreadPool threadPool = mock(ThreadPool.class);
when(threadPool.executor(anyString())).thenReturn(MoreExecutors.newDirectExecutorService()); when(threadPool.executor(anyString())).thenReturn(MoreExecutors.newDirectExecutorService());
watcherService = mock(WatcherService.class); watcherService = mock(WatcherService.class);
ClusterService clusterService = mock(ClusterService.class); clusterService = mock(ClusterService.class);
lifeCycleService = new WatcherLifeCycleService(Settings.EMPTY, clusterService, threadPool, watcherService); lifeCycleService = new WatcherLifeCycleService(Settings.EMPTY, clusterService, threadPool, watcherService);
} }
@ -78,6 +79,14 @@ public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
@Test @Test
public void testManualStartStop() { public void testManualStartStop() {
DiscoveryNodes.Builder nodes = new DiscoveryNodes.Builder().masterNodeId("id1").localNodeId("id1");
ClusterState clusterState = ClusterState.builder(new ClusterName("my-cluster"))
.nodes(nodes).build();
when(clusterService.state()).thenReturn(clusterState);
when(watcherService.state()).thenReturn(WatcherState.STOPPED);
when(watcherService.validate(clusterState)).thenReturn(true);
lifeCycleService.start(); lifeCycleService.start();
verify(watcherService, times(1)).start(any(ClusterState.class)); verify(watcherService, times(1)).start(any(ClusterState.class));
verify(watcherService, never()).stop(); verify(watcherService, never()).stop();
@ -87,9 +96,6 @@ public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
verify(watcherService, times(1)).stop(); verify(watcherService, times(1)).stop();
// Starting via cluster state update, we shouldn't start because we have been stopped manually. // Starting via cluster state update, we shouldn't start because we have been stopped manually.
DiscoveryNodes.Builder nodes = new DiscoveryNodes.Builder().masterNodeId("id1").localNodeId("id1");
ClusterState clusterState = ClusterState.builder(new ClusterName("my-cluster"))
.nodes(nodes).build();
when(watcherService.state()).thenReturn(WatcherState.STOPPED); when(watcherService.state()).thenReturn(WatcherState.STOPPED);
lifeCycleService.clusterChanged(new ClusterChangedEvent("any", clusterState, clusterState)); lifeCycleService.clusterChanged(new ClusterChangedEvent("any", clusterState, clusterState));
verify(watcherService, times(1)).start(any(ClusterState.class)); verify(watcherService, times(1)).start(any(ClusterState.class));
@ -120,4 +126,34 @@ public class WatcherLifeCycleServiceTests extends ElasticsearchTestCase {
verify(watcherService, times(2)).stop(); verify(watcherService, times(2)).stop();
} }
@Test
public void testManualStartStop_clusterStateNotValid() {
DiscoveryNodes.Builder nodes = new DiscoveryNodes.Builder().masterNodeId("id1").localNodeId("id1");
ClusterState clusterState = ClusterState.builder(new ClusterName("my-cluster"))
.nodes(nodes).build();
when(clusterService.state()).thenReturn(clusterState);
when(watcherService.state()).thenReturn(WatcherState.STOPPED);
when(watcherService.validate(clusterState)).thenReturn(false);
lifeCycleService.start();
verify(watcherService, never()).start(any(ClusterState.class));
verify(watcherService, never()).stop();
}
@Test
public void testManualStartStop_watcherNotStopped() {
DiscoveryNodes.Builder nodes = new DiscoveryNodes.Builder().masterNodeId("id1").localNodeId("id1");
ClusterState clusterState = ClusterState.builder(new ClusterName("my-cluster"))
.nodes(nodes).build();
when(clusterService.state()).thenReturn(clusterState);
when(watcherService.state()).thenReturn(WatcherState.STOPPING);
lifeCycleService.start();
verify(watcherService, never()).validate(any(ClusterState.class));
verify(watcherService, never()).start(any(ClusterState.class));
verify(watcherService, never()).stop();
}
} }

View File

@ -20,7 +20,7 @@ import static org.hamcrest.core.Is.is;
/** /**
*/ */
@TestLogging("cluster:DEBUG,action.admin.cluster.settings:DEBUG") @TestLogging("cluster:DEBUG,action.admin.cluster.settings:DEBUG,watcher:DEBUG")
@ElasticsearchIntegrationTest.ClusterScope(scope = TEST, numClientNodes = 0, transportClientRatio = 0, randomDynamicTemplates = false, numDataNodes = 1) @ElasticsearchIntegrationTest.ClusterScope(scope = TEST, numClientNodes = 0, transportClientRatio = 0, randomDynamicTemplates = false, numDataNodes = 1)
public class HistoryStoreSettingsTests extends AbstractWatcherIntegrationTests { public class HistoryStoreSettingsTests extends AbstractWatcherIntegrationTests {