YARN-4101. RM should print alert messages if ZooKeeper and ResourceManager get connection issues. Contributed by Xuan Gong

This commit is contained in:
Jian He 2015-09-02 17:45:23 -07:00
parent d31a41c359
commit 09c64ba1ba
9 changed files with 43 additions and 3 deletions

View File

@ -1141,4 +1141,8 @@ public String toString() {
((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
" cb=" + appClient;
}
/**
 * Reports the ZooKeeper connection state currently held by this object.
 *
 * @return the result of calling {@code name()} on {@code zkConnectionState}
 */
public String getHAZookeeperConnectionState() {
  final String stateName = this.zkConnectionState.name();
  return stateName;
}
}

View File

@ -871,6 +871,9 @@ Release 2.7.2 - UNRELEASED
YARN-3893. Both RM in active state when Admin#transitionToActive failure
from refreshAll() (Bibin A Chundatt via rohithsharmaks)
YARN-4101. RM should print alert messages if ZooKeeper and ResourceManager
get connection issues. (Xuan Gong via jianhe)
Release 2.7.1 - 2015-07-06
INCOMPATIBLE CHANGES

View File

@ -782,4 +782,13 @@ public RefreshClusterMaxPriorityResponse refreshClusterMaxPriority(
throw logAndWrapException(e, user.getShortUserName(), argName, msg);
}
}
/**
 * Describes the state of the ZooKeeper connection used for ResourceManager HA.
 *
 * @return a fixed explanatory message when HA is not enabled or when auto
 *         failover is not enabled; otherwise the state string reported by
 *         the embedded elector
 */
public String getHAZookeeperConnectionState() {
  // Without HA there is no elector to ask.
  if (!rmContext.isHAEnabled()) {
    return "ResourceManager HA is not enabled.";
  }
  if (!autoFailoverEnabled) {
    return "Auto Failover is not enabled.";
  }
  // NOTE(review): assumes embeddedElector is non-null whenever auto failover
  // is enabled -- confirm against the service initialization code.
  return embeddedElector.getHAZookeeperConnectionState();
}
}

View File

@ -205,4 +205,8 @@ public void resetLeaderElection() {
elector.quitElection(false);
elector.joinElection(localActiveNodeInfo);
}
/**
 * Forwards the query to the wrapped {@code elector}.
 *
 * @return whatever {@code elector.getHAZookeeperConnectionState()} returns
 */
public String getHAZookeeperConnectionState() {
  final String connectionState = elector.getHAZookeeperConnectionState();
  return connectionState;
}
}

View File

@ -44,6 +44,8 @@ protected void render(Block html) {
_("Cluster ID:", cinfo.getClusterId()).
_("ResourceManager state:", cinfo.getState()).
_("ResourceManager HA state:", cinfo.getHAState()).
_("ResourceManager HA zookeeper connection state:",
cinfo.getHAZookeeperConnectionState()).
_("ResourceManager RMStateStore:", cinfo.getRMStateStore()).
_("ResourceManager started on:", Times.format(cinfo.getStartedOn())).
_("ResourceManager version:", cinfo.getRMBuildVersion() +

View File

@ -117,4 +117,9 @@ private String buildRedirectPath() {
}
return path;
}
/**
 * Looks up the HA ZooKeeper connection state through the ResourceManager's
 * context and its admin service.
 *
 * @return the state string returned by the admin service
 */
public String getHAZookeeperConnectionState() {
  final String connectionState =
      rm.getRMContext().getRMAdminService().getHAZookeeperConnectionState();
  return connectionState;
}
}

View File

@ -58,6 +58,7 @@ public class RMWebAppFilter extends GuiceContainer {
private String path;
private static final int BASIC_SLEEP_TIME = 5;
private static final int MAX_SLEEP_TIME = 5 * 60;
private static final Random randnum = new Random();
@Inject
public RMWebAppFilter(Injector injector, Configuration conf) {
@ -126,6 +127,8 @@ && shouldRedirect(rmWebApp, uri)) {
String redirectMsg =
doRetry ? "Can not find any active RM. Will retry in next " + next
+ " seconds." : "There is no active RM right now.";
redirectMsg += "\nHA Zookeeper Connection State: "
+ rmWebApp.getHAZookeeperConnectionState();
PrintWriter out = response.getWriter();
out.println(redirectMsg);
if (doRetry) {
@ -172,6 +175,6 @@ private String appendOrReplaceParamter(String uri, String newQuery) {
/**
 * Computes a randomized, exponentially growing sleep time.
 *
 * The base value is {@code BASIC_SLEEP_TIME * 2^retries}; it is then scaled
 * by a random factor in the range [0.5, 1.5) so that callers do not all
 * retry at the same moment.
 *
 * @param retries exponent applied to the base sleep time
 * @return the scaled sleep time, truncated to an {@code int}
 */
private static int calculateExponentialTime(int retries) {
  long baseTime = BASIC_SLEEP_TIME * (1L << retries);
  // Draw the jitter from the shared class-level generator rather than
  // allocating a new Random on every call. A second return statement
  // after this one would be unreachable and is rejected by javac.
  return (int) (baseTime * (randnum.nextDouble() + 0.5));
}
}

View File

@ -43,6 +43,7 @@ public class ClusterInfo {
protected String hadoopVersion;
protected String hadoopBuildVersion;
protected String hadoopVersionBuiltOn;
protected String haZooKeeperConnectionState;
public ClusterInfo() {
} // JAXB needs this
@ -62,6 +63,8 @@ public ClusterInfo(ResourceManager rm) {
this.hadoopVersion = VersionInfo.getVersion();
this.hadoopBuildVersion = VersionInfo.getBuildVersion();
this.hadoopVersionBuiltOn = VersionInfo.getDate();
this.haZooKeeperConnectionState =
rm.getRMContext().getRMAdminService().getHAZookeeperConnectionState();
}
public String getState() {
@ -108,4 +111,7 @@ public long getStartedOn() {
return this.startedOn;
}
/**
 * Accessor for the HA ZooKeeper connection state held by this object.
 *
 * @return the value of the {@code haZooKeeperConnectionState} field
 */
public String getHAZookeeperConnectionState() {
  return haZooKeeperConnectionState;
}
}

View File

@ -285,6 +285,8 @@ public void verifyClusterInfoXML(String xml) throws JSONException, Exception {
WebServicesTestUtils.getXmlLong(element, "startedOn"),
WebServicesTestUtils.getXmlString(element, "state"),
WebServicesTestUtils.getXmlString(element, "haState"),
WebServicesTestUtils.getXmlString(
element, "haZooKeeperConnectionState"),
WebServicesTestUtils.getXmlString(element, "hadoopVersionBuiltOn"),
WebServicesTestUtils.getXmlString(element, "hadoopBuildVersion"),
WebServicesTestUtils.getXmlString(element, "hadoopVersion"),
@ -300,9 +302,10 @@ public void verifyClusterInfo(JSONObject json) throws JSONException,
Exception {
assertEquals("incorrect number of elements", 1, json.length());
JSONObject info = json.getJSONObject("clusterInfo");
assertEquals("incorrect number of elements", 11, info.length());
assertEquals("incorrect number of elements", 12, info.length());
verifyClusterGeneric(info.getLong("id"), info.getLong("startedOn"),
info.getString("state"), info.getString("haState"),
info.getString("haZooKeeperConnectionState"),
info.getString("hadoopVersionBuiltOn"),
info.getString("hadoopBuildVersion"), info.getString("hadoopVersion"),
info.getString("resourceManagerVersionBuiltOn"),
@ -312,7 +315,8 @@ public void verifyClusterInfo(JSONObject json) throws JSONException,
}
public void verifyClusterGeneric(long clusterid, long startedon,
String state, String haState, String hadoopVersionBuiltOn,
String state, String haState, String haZooKeeperConnectionState,
String hadoopVersionBuiltOn,
String hadoopBuildVersion, String hadoopVersion,
String resourceManagerVersionBuiltOn, String resourceManagerBuildVersion,
String resourceManagerVersion) {