From e9c6effd07f5bd14ce87c8cea6ced64a33d8d6b0 Mon Sep 17 00:00:00 2001 From: Christine Poerschke Date: Tue, 17 Jul 2018 17:17:27 +0100 Subject: [PATCH] SOLR-12454: Tweak Overseer leadership transition related logging for easier troubleshooting. --- solr/CHANGES.txt | 2 ++ .../java/org/apache/solr/cloud/Overseer.java | 21 ++++++++++++------- .../solr/cloud/OverseerTaskProcessor.java | 8 +++++-- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 9991e3a7544..5acb0becbe3 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -185,6 +185,8 @@ Other Changes * SOLR-12464: Reduce Overseer.close() logging (for non-Overseer leaders) (Christine Poerschke) +* SOLR-12454: Tweak Overseer leadership transition related logging for easier troubleshooting. (Christine Poerschke) + ================== 7.4.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java index 0a991ffd454..8c7e9646978 100644 --- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java +++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java @@ -132,7 +132,7 @@ public class Overseer implements SolrCloseable { isLeader = amILeader(); // not a no, not a yes, try ask again } - log.debug("Starting to work on the main queue"); + log.info("Starting to work on the main queue : {}", LeaderElector.getNodeName(myId)); try { ZkStateWriter zkStateWriter = null; ClusterState clusterState = null; @@ -302,12 +302,12 @@ public class Overseer implements SolrCloseable { return;//shutting down no need to go further } org.apache.zookeeper.data.Stat stat = new org.apache.zookeeper.data.Stat(); - String path = OVERSEER_ELECT + "/leader"; + final String path = OVERSEER_ELECT + "/leader"; byte[] data; try { data = zkClient.getData(path, null, stat, true); } catch (Exception e) { - log.error("could not read the data" ,e); + log.error("could not read the "+path+" data" ,e); return; } try { @@ -315,16 +315,17 @@ public class Overseer implements SolrCloseable { String id = (String) m.get(ID); if(overseerCollectionConfigSetProcessor.getId().equals(id)){ try { - log.warn("I'm exiting, but I'm still the leader"); + log.warn("I (id={}) am exiting, but I'm still the leader", + overseerCollectionConfigSetProcessor.getId()); zkClient.delete(path,stat.getVersion(),true); } catch (KeeperException.BadVersionException e) { //no problem ignore it some other Overseer has already taken over } catch (Exception e) { - log.error("Could not delete my leader node ", e); + log.error("Could not delete my leader node "+path, e); } } else{ - log.debug("somebody else has already taken up the overseer position"); + log.info("somebody else (id={}) has already taken up the overseer position", id); } } finally { //if I am not shutting down, Then I need to rejoin election @@ -413,10 +414,12 @@ public class Overseer implements SolrCloseable { private LeaderStatus amILeader() { Timer.Context timerContext = stats.time("am_i_leader"); boolean success = true; + String propsId = null; try { ZkNodeProps props = ZkNodeProps.load(zkClient.getData( OVERSEER_ELECT + "/leader", null, null, true)); - if (myId.equals(props.getStr(ID))) { + propsId = props.getStr(ID); + if (myId.equals(propsId)) { return LeaderStatus.YES; } } catch (KeeperException e) { @@ -426,6 +429,8 @@ public class Overseer implements SolrCloseable { return LeaderStatus.DONT_KNOW; } else if (e.code() != KeeperException.Code.SESSIONEXPIRED) { log.warn("", e); + } else { + log.debug("", e); } } catch (InterruptedException e) { success = false; @@ -438,7 +443,7 @@ public class Overseer implements SolrCloseable { stats.error("am_i_leader"); } } - log.info("According to ZK I (id=" + myId + ") am no longer a leader."); + log.info("According to ZK I (id={}) am no longer a leader. propsId={}", myId, propsId); return LeaderStatus.NO; } diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java index 86e356497a1..01afc431cbf 100644 --- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java +++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java @@ -388,10 +388,12 @@ public class OverseerTaskProcessor implements Runnable, Closeable { String statsName = "collection_am_i_leader"; Timer.Context timerContext = stats.time(statsName); boolean success = true; + String propsId = null; try { ZkNodeProps props = ZkNodeProps.load(zkStateReader.getZkClient().getData( Overseer.OVERSEER_ELECT + "/leader", null, null, true)); - if (myId.equals(props.getStr(ID))) { + propsId = props.getStr(ID); + if (myId.equals(propsId)) { return LeaderStatus.YES; } } catch (KeeperException e) { @@ -401,6 +403,8 @@ public class OverseerTaskProcessor implements Runnable, Closeable { return LeaderStatus.DONT_KNOW; } else if (e.code() != KeeperException.Code.SESSIONEXPIRED) { log.warn("", e); + } else { + log.debug("", e); } } catch (InterruptedException e) { success = false; @@ -413,7 +417,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable { stats.error(statsName); } } - log.info("According to ZK I (id=" + myId + ") am no longer a leader."); + log.info("According to ZK I (id={}) am no longer a leader. propsId={}", myId, propsId); return LeaderStatus.NO; }