Discovery: Improve the lifecycle management of the join control thread in zen discovery.
Also added: * Better exception handling in UnicastZenPing#ping and MulticastZenPing#ping * In the join thread that runs the innerJoinCluster loop, remember the last known exception and throw that when assertions are enabled. We loop until inner join has completed and if continues exceptions are thrown we should fail the test, because the exception shouldn't occur in production (at least not too often). Applied feedback 3 Closes #8327
This commit is contained in:
parent
82278bb7bc
commit
4ddb0575b5
|
@ -19,6 +19,7 @@
|
|||
|
||||
package org.elasticsearch;
|
||||
|
||||
import org.elasticsearch.common.Nullable;
|
||||
import org.elasticsearch.common.logging.ESLogger;
|
||||
import org.elasticsearch.common.logging.Loggers;
|
||||
import org.elasticsearch.rest.RestStatus;
|
||||
|
@ -184,4 +185,18 @@ public final class ExceptionsHelper {
|
|||
)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Throws the specified exception. If null if specified then <code>true</code> is returned.
|
||||
*/
|
||||
public static boolean reThrowIfNotNull(@Nullable Throwable e) {
|
||||
if (e != null) {
|
||||
if (e instanceof RuntimeException) {
|
||||
throw (RuntimeException) e;
|
||||
} else {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -231,8 +231,8 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
|
||||
@Override
|
||||
protected void doStart() throws ElasticsearchException {
|
||||
|
||||
nodesFD.setLocalNode(clusterService.localNode());
|
||||
joinThreadControl.start();
|
||||
pingService.start();
|
||||
|
||||
// start the join thread from a cluster state update. See {@link JoinThreadControl} for details.
|
||||
|
@ -249,7 +249,6 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
logger.warn("failed to start initial join process", t);
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -344,8 +343,8 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
}
|
||||
|
||||
/**
|
||||
* returns true if there is a currently a background thread active for (re)joining the cluster
|
||||
* used for testing.
|
||||
* returns true if zen discovery is started and there is a currently a background thread active for (re)joining
|
||||
* the cluster used for testing.
|
||||
*/
|
||||
public boolean joiningCluster() {
|
||||
return joinThreadControl.joinThreadActive();
|
||||
|
@ -1278,21 +1277,22 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
private class JoinThreadControl {
|
||||
|
||||
private final ThreadPool threadPool;
|
||||
private final AtomicBoolean running = new AtomicBoolean(false);
|
||||
private final AtomicReference<Thread> currentJoinThread = new AtomicReference<>();
|
||||
|
||||
public JoinThreadControl(ThreadPool threadPool) {
|
||||
this.threadPool = threadPool;
|
||||
}
|
||||
|
||||
/** returns true if there is currently an active join thread */
|
||||
/** returns true if join thread control is started and there is currently an active join thread */
|
||||
public boolean joinThreadActive() {
|
||||
Thread currentThread = currentJoinThread.get();
|
||||
return currentThread != null && currentThread.isAlive();
|
||||
return running.get() && currentThread != null && currentThread.isAlive();
|
||||
}
|
||||
|
||||
/** returns true if the supplied thread is the currently active joinThread */
|
||||
/** returns true if join thread control is started and the supplied thread is the currently active joinThread */
|
||||
public boolean joinThreadActive(Thread joinThread) {
|
||||
return joinThread.equals(currentJoinThread.get());
|
||||
return running.get() && joinThread.equals(currentJoinThread.get());
|
||||
}
|
||||
|
||||
/** cleans any running joining thread and calls {@link #rejoin} */
|
||||
|
@ -1302,7 +1302,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
return rejoin(clusterState, reason);
|
||||
}
|
||||
|
||||
/** starts a new joining thread if there is no currently active one */
|
||||
/** starts a new joining thread if there is no currently active one and join thread controlling is started */
|
||||
public void startNewThreadIfNotRunning() {
|
||||
assertClusterStateThread();
|
||||
if (joinThreadActive()) {
|
||||
|
@ -1315,15 +1315,18 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
if (!currentJoinThread.compareAndSet(null, currentThread)) {
|
||||
return;
|
||||
}
|
||||
while (joinThreadActive(currentThread)) {
|
||||
while (running.get() && joinThreadActive(currentThread)) {
|
||||
try {
|
||||
innerJoinCluster();
|
||||
return;
|
||||
} catch (Throwable t) {
|
||||
logger.error("unexpected error while joining cluster, trying again", t);
|
||||
} catch (Exception e) {
|
||||
logger.error("unexpected error while joining cluster, trying again", e);
|
||||
// Because we catch any exception here, we want to know in
|
||||
// tests if an uncaught exception got to this point and the test infra uncaught exception
|
||||
// leak detection can catch this. In practise no uncaught exception should leak
|
||||
assert ExceptionsHelper.reThrowIfNotNull(e);
|
||||
}
|
||||
}
|
||||
|
||||
// cleaning the current thread from currentJoinThread is done by explicit calls.
|
||||
}
|
||||
});
|
||||
|
@ -1348,6 +1351,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
}
|
||||
|
||||
public void stop() {
|
||||
running.set(false);
|
||||
Thread joinThread = currentJoinThread.getAndSet(null);
|
||||
if (joinThread != null) {
|
||||
try {
|
||||
|
@ -1355,8 +1359,17 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
} catch (Exception e) {
|
||||
// ignore
|
||||
}
|
||||
try {
|
||||
joinThread.join(10000);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void start() {
|
||||
running.set(true);
|
||||
}
|
||||
|
||||
private void assertClusterStateThread() {
|
||||
assert Thread.currentThread().getName().contains(InternalClusterService.UPDATE_THREAD_NAME) : "not called from the cluster state update thread";
|
||||
|
|
|
@ -185,6 +185,7 @@ public class MulticastZenPing extends AbstractLifecycleComponent<ZenPing> implem
|
|||
return;
|
||||
}
|
||||
final int id = pingIdGenerator.incrementAndGet();
|
||||
try {
|
||||
receivedResponses.put(id, new PingCollection());
|
||||
sendPingRequest(id);
|
||||
// try and send another ping request halfway through (just in case someone woke up during it...)
|
||||
|
@ -229,7 +230,10 @@ public class MulticastZenPing extends AbstractLifecycleComponent<ZenPing> implem
|
|||
});
|
||||
}
|
||||
});
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.warn("failed to ping", e);
|
||||
finalizePingCycle(id, listener);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -201,6 +201,7 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
|
|||
@Override
|
||||
public void ping(final PingListener listener, final TimeValue timeout) throws ElasticsearchException {
|
||||
final SendPingsHandler sendPingsHandler = new SendPingsHandler(pingHandlerIdGenerator.incrementAndGet());
|
||||
try {
|
||||
receivedResponses.put(sendPingsHandler.id(), sendPingsHandler);
|
||||
try {
|
||||
sendPings(timeout, null, sendPingsHandler);
|
||||
|
@ -239,6 +240,10 @@ public class UnicastZenPing extends AbstractLifecycleComponent<ZenPing> implemen
|
|||
sendPingsHandler.close();
|
||||
}
|
||||
});
|
||||
} catch (Exception e) {
|
||||
sendPingsHandler.close();
|
||||
throw new ElasticsearchException("Ping execution failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
class SendPingsHandler implements Closeable {
|
||||
|
|
Loading…
Reference in New Issue