SOLR-12454: Tweak Overseer leadership transition related logging for easier troubleshooting.

This commit is contained in:
Christine Poerschke 2018-07-17 17:17:27 +01:00
parent f7aed28760
commit e9c6effd07
3 changed files with 21 additions and 10 deletions

View File

@@ -185,6 +185,8 @@ Other Changes
* SOLR-12464: Reduce Overseer.close() logging (for non-Overseer leaders) (Christine Poerschke)
* SOLR-12454: Tweak Overseer leadership transition related logging for easier troubleshooting. (Christine Poerschke)
================== 7.4.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

View File

@@ -132,7 +132,7 @@ public class Overseer implements SolrCloseable {
isLeader = amILeader(); // not a no, not a yes, try ask again
}
log.debug("Starting to work on the main queue");
log.info("Starting to work on the main queue : {}", LeaderElector.getNodeName(myId));
try {
ZkStateWriter zkStateWriter = null;
ClusterState clusterState = null;
@@ -302,12 +302,12 @@ public class Overseer implements SolrCloseable {
return;//shutting down no need to go further
}
org.apache.zookeeper.data.Stat stat = new org.apache.zookeeper.data.Stat();
String path = OVERSEER_ELECT + "/leader";
final String path = OVERSEER_ELECT + "/leader";
byte[] data;
try {
data = zkClient.getData(path, null, stat, true);
} catch (Exception e) {
log.error("could not read the data" ,e);
log.error("could not read the "+path+" data" ,e);
return;
}
try {
@@ -315,16 +315,17 @@ public class Overseer implements SolrCloseable {
String id = (String) m.get(ID);
if(overseerCollectionConfigSetProcessor.getId().equals(id)){
try {
log.warn("I'm exiting, but I'm still the leader");
log.warn("I (id={}) am exiting, but I'm still the leader",
overseerCollectionConfigSetProcessor.getId());
zkClient.delete(path,stat.getVersion(),true);
} catch (KeeperException.BadVersionException e) {
//no problem ignore it some other Overseer has already taken over
} catch (Exception e) {
log.error("Could not delete my leader node ", e);
log.error("Could not delete my leader node "+path, e);
}
} else{
log.debug("somebody else has already taken up the overseer position");
log.info("somebody else (id={}) has already taken up the overseer position", id);
}
} finally {
//if I am not shutting down, Then I need to rejoin election
@@ -413,10 +414,12 @@ public class Overseer implements SolrCloseable {
private LeaderStatus amILeader() {
Timer.Context timerContext = stats.time("am_i_leader");
boolean success = true;
String propsId = null;
try {
ZkNodeProps props = ZkNodeProps.load(zkClient.getData(
OVERSEER_ELECT + "/leader", null, null, true));
if (myId.equals(props.getStr(ID))) {
propsId = props.getStr(ID);
if (myId.equals(propsId)) {
return LeaderStatus.YES;
}
} catch (KeeperException e) {
@@ -426,6 +429,8 @@ public class Overseer implements SolrCloseable {
return LeaderStatus.DONT_KNOW;
} else if (e.code() != KeeperException.Code.SESSIONEXPIRED) {
log.warn("", e);
} else {
log.debug("", e);
}
} catch (InterruptedException e) {
success = false;
@@ -438,7 +443,7 @@ public class Overseer implements SolrCloseable {
stats.error("am_i_leader");
}
}
log.info("According to ZK I (id=" + myId + ") am no longer a leader.");
log.info("According to ZK I (id={}) am no longer a leader. propsId={}", myId, propsId);
return LeaderStatus.NO;
}

View File

@@ -388,10 +388,12 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
String statsName = "collection_am_i_leader";
Timer.Context timerContext = stats.time(statsName);
boolean success = true;
String propsId = null;
try {
ZkNodeProps props = ZkNodeProps.load(zkStateReader.getZkClient().getData(
Overseer.OVERSEER_ELECT + "/leader", null, null, true));
if (myId.equals(props.getStr(ID))) {
propsId = props.getStr(ID);
if (myId.equals(propsId)) {
return LeaderStatus.YES;
}
} catch (KeeperException e) {
@@ -401,6 +403,8 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
return LeaderStatus.DONT_KNOW;
} else if (e.code() != KeeperException.Code.SESSIONEXPIRED) {
log.warn("", e);
} else {
log.debug("", e);
}
} catch (InterruptedException e) {
success = false;
@@ -413,7 +417,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
stats.error(statsName);
}
}
log.info("According to ZK I (id=" + myId + ") am no longer a leader.");
log.info("According to ZK I (id={}) am no longer a leader. propsId={}", myId, propsId);
return LeaderStatus.NO;
}