From 83d9dab79882700fb21cb496282bafdb1d28a5ca Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Thu, 6 Nov 2014 11:21:56 +0100 Subject: [PATCH] Discovery: a more lenient wait joinThread when stopping When a node stops, we cancel any ongoing join process. With #8327, we improved this logic to wait for it to complete before shutting down the node. In our tests we typically shut down an entire cluster at once, which makes it very likely for nodes to be joining while shutting down. This introduces a race condition where the joinThread.interrupt can happen before the thread starts waiting on pings, which causes the shutdown logic to be slow. This commit improves this by repeatedly trying to stop the thread, using shorter waits. Another side effect of the change is that we are now more likely to ping ourselves while shutting down, which results in an ugly warn-level log. We now log all remote exceptions during pings at debug level. Closes #8359 --- .../discovery/zen/ZenDiscovery.java | 17 ++++++++--------- .../zen/ping/unicast/UnicastZenPing.java | 3 +++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java index 794215410e9..38dded7599c 100644 --- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java +++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java @@ -253,6 +253,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen @Override protected void doStop() throws ElasticsearchException { + joinThreadControl.stop(); pingService.stop(); masterFD.stop("zen disco stop"); nodesFD.stop(); @@ -282,7 +283,6 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen } } } - joinThreadControl.stop(); } @Override @@ -1354,15 +1354,14 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen running.set(false); Thread joinThread = currentJoinThread.getAndSet(null); if (joinThread != 
null) { - try { + for (int i = 0; i < 10 && joinThread.isAlive(); i++) { joinThread.interrupt(); - } catch (Exception e) { - // ignore - } - try { - joinThread.join(10000); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); + try { + joinThread.join(200); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } } } } diff --git a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java index b35621b78d2..b318a87f3fa 100644 --- a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java +++ b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java @@ -373,6 +373,9 @@ public class UnicastZenPing extends AbstractLifecycleComponent implemen } catch (ConnectTransportException e) { // can't connect to the node - this is a more common path! logger.trace("[{}] failed to connect to {}", e, sendPingsHandler.id(), finalNodeToSend); + } catch (RemoteTransportException e) { + // something went wrong on the other side + logger.debug("[{}] received a remote error as a response to ping {}", e, sendPingsHandler.id(), finalNodeToSend); } catch (Throwable e) { logger.warn("[{}] failed send ping to {}", e, sendPingsHandler.id(), finalNodeToSend); } finally {