From deff09434361b854b0ec554364ec593c13170ace Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Mon, 30 Apr 2012 00:57:52 +0300 Subject: [PATCH] When a node disconnects from the cluster (not enough master nodes, or a client node) and rejoins it might not update its internal routing table, closes # --- .../discovery/zen/ZenDiscovery.java | 2 +- .../zen/ping/unicast/UnicastZenPing.java | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java index dd3e8686e6b..c1be4de6100 100644 --- a/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java +++ b/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java @@ -610,7 +610,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent implemen .build(); // clear the routing table, we have no master, so we need to recreate the routing when we reform the cluster - RoutingTable routingTable = RoutingTable.builder().version(clusterState.routingTable().version()).build(); + RoutingTable routingTable = RoutingTable.builder().build(); // we also clean the metadata, since we are going to recover it if we become master MetaData metaData = MetaData.builder().build(); diff --git a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java index c1b01ff0871..c5be33bd50a 100644 --- a/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java +++ b/src/main/java/org/elasticsearch/discovery/zen/ping/unicast/UnicastZenPing.java @@ -185,6 +185,7 @@ public class UnicastZenPing extends AbstractLifecycleComponent implemen ConcurrentMap responses = receivedResponses.remove(sendPingsHandler.id()); listener.onPing(responses.values().toArray(new PingResponse[responses.size()])); for (DiscoveryNode node : sendPingsHandler.nodeToDisconnect) { + logger.trace("[{}] disconnecting from {}", sendPingsHandler.id(), node); transportService.disconnectFromNode(node); } sendPingsHandler.close(); @@ -268,12 +269,21 @@ public class UnicastZenPing extends AbstractLifecycleComponent implemen try { // connect to the node, see if we manage to do it, if not, bail if (!nodeFoundByAddress) { + logger.trace("[{}] connecting (light) to {}", sendPingsHandler.id(), nodeToSend); transportService.connectToNodeLight(nodeToSend); } else { + logger.trace("[{}] connecting to {}", sendPingsHandler.id(), nodeToSend); transportService.connectToNode(nodeToSend); } - // we are connected, send the ping request - sendPingRequestToNode(sendPingsHandler.id(), timeout, pingRequest, latch, node, nodeToSend); + logger.trace("[{}] connected to {}", sendPingsHandler.id(), node); + if (receivedResponses.containsKey(sendPingsHandler.id())) { + // we are connected and still in progress, send the ping request + sendPingRequestToNode(sendPingsHandler.id(), timeout, pingRequest, latch, node, nodeToSend); + } else { + // connect took too long, just log it and bail + latch.countDown(); + logger.trace("[{}] connect to {} was too long outside of ping window, bailing", sendPingsHandler.id(), node); + } } catch (ConnectTransportException e) { // can't connect to the node logger.trace("[{}] failed to connect to {}", e, sendPingsHandler.id(), nodeToSend); @@ -295,7 +305,7 @@ public class UnicastZenPing extends AbstractLifecycleComponent implemen } private void sendPingRequestToNode(final int id, TimeValue timeout, UnicastPingRequest pingRequest, final CountDownLatch latch, final DiscoveryNode node, final DiscoveryNode nodeToSend) { - logger.trace("[{}] connecting to {}", id, nodeToSend); + logger.trace("[{}] sending to {}", id, nodeToSend); transportService.sendRequest(nodeToSend, UnicastPingRequestHandler.ACTION, pingRequest, TransportRequestOptions.options().withTimeout((long) (timeout.millis() * 1.25)), new BaseTransportResponseHandler() { @Override