YARN-4101. RM should print alert messages if ZooKeeper and ResourceManager get connection issues. Contributed by Xuan Gong
(cherry picked from commit 09c64ba1ba)
This commit is contained in:
parent
c2ed7e4a09
commit
2d1ff2e1ca
|
@ -1118,4 +1118,8 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
|
||||||
((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
|
((appData == null) ? "null" : StringUtils.byteToHexString(appData)) +
|
||||||
" cb=" + appClient;
|
" cb=" + appClient;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getHAZookeeperConnectionState() {
|
||||||
|
return this.zkConnectionState.name();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -64,6 +64,9 @@ Release 2.7.2 - UNRELEASED
|
||||||
YARN-3893. Both RM in active state when Admin#transitionToActive failure
|
YARN-3893. Both RM in active state when Admin#transitionToActive failure
|
||||||
from refeshAll() (Bibin A Chundatt via rohithsharmaks)
|
from refeshAll() (Bibin A Chundatt via rohithsharmaks)
|
||||||
|
|
||||||
|
YARN-4101. RM should print alert messages if Zookeeper and Resourcemanager
|
||||||
|
gets connection issue. (Xuan Gong via jianhe)
|
||||||
|
|
||||||
Release 2.7.1 - 2015-07-06
|
Release 2.7.1 - 2015-07-06
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -701,4 +701,13 @@ public class AdminService extends CompositeService implements
|
||||||
"AdminService", "Exception " + msg);
|
"AdminService", "Exception " + msg);
|
||||||
return RPCUtil.getRemoteException(ioe);
|
return RPCUtil.getRemoteException(ioe);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getHAZookeeperConnectionState() {
|
||||||
|
if (!rmContext.isHAEnabled()) {
|
||||||
|
return "ResourceManager HA is not enabled.";
|
||||||
|
} else if (!autoFailoverEnabled) {
|
||||||
|
return "Auto Failover is not enabled.";
|
||||||
|
}
|
||||||
|
return this.embeddedElector.getHAZookeeperConnectionState();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -205,4 +205,8 @@ public class EmbeddedElectorService extends AbstractService
|
||||||
elector.quitElection(false);
|
elector.quitElection(false);
|
||||||
elector.joinElection(localActiveNodeInfo);
|
elector.joinElection(localActiveNodeInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getHAZookeeperConnectionState() {
|
||||||
|
return elector.getHAZookeeperConnectionState();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,6 +44,8 @@ public class AboutBlock extends HtmlBlock {
|
||||||
_("Cluster ID:", cinfo.getClusterId()).
|
_("Cluster ID:", cinfo.getClusterId()).
|
||||||
_("ResourceManager state:", cinfo.getState()).
|
_("ResourceManager state:", cinfo.getState()).
|
||||||
_("ResourceManager HA state:", cinfo.getHAState()).
|
_("ResourceManager HA state:", cinfo.getHAState()).
|
||||||
|
_("ResourceManager HA zookeeper connection state:",
|
||||||
|
cinfo.getHAZookeeperConnectionState()).
|
||||||
_("ResourceManager RMStateStore:", cinfo.getRMStateStore()).
|
_("ResourceManager RMStateStore:", cinfo.getRMStateStore()).
|
||||||
_("ResourceManager started on:", Times.format(cinfo.getStartedOn())).
|
_("ResourceManager started on:", Times.format(cinfo.getStartedOn())).
|
||||||
_("ResourceManager version:", cinfo.getRMBuildVersion() +
|
_("ResourceManager version:", cinfo.getRMBuildVersion() +
|
||||||
|
|
|
@ -114,4 +114,9 @@ public class RMWebApp extends WebApp implements YarnWebParams {
|
||||||
}
|
}
|
||||||
return path;
|
return path;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getHAZookeeperConnectionState() {
|
||||||
|
return rm.getRMContext().getRMAdminService()
|
||||||
|
.getHAZookeeperConnectionState();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -58,6 +58,7 @@ public class RMWebAppFilter extends GuiceContainer {
|
||||||
private String path;
|
private String path;
|
||||||
private static final int BASIC_SLEEP_TIME = 5;
|
private static final int BASIC_SLEEP_TIME = 5;
|
||||||
private static final int MAX_SLEEP_TIME = 5 * 60;
|
private static final int MAX_SLEEP_TIME = 5 * 60;
|
||||||
|
private static final Random randnum = new Random();
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public RMWebAppFilter(Injector injector, Configuration conf) {
|
public RMWebAppFilter(Injector injector, Configuration conf) {
|
||||||
|
@ -126,6 +127,8 @@ public class RMWebAppFilter extends GuiceContainer {
|
||||||
String redirectMsg =
|
String redirectMsg =
|
||||||
doRetry ? "Can not find any active RM. Will retry in next " + next
|
doRetry ? "Can not find any active RM. Will retry in next " + next
|
||||||
+ " seconds." : "There is no active RM right now.";
|
+ " seconds." : "There is no active RM right now.";
|
||||||
|
redirectMsg += "\nHA Zookeeper Connection State: "
|
||||||
|
+ rmWebApp.getHAZookeeperConnectionState();
|
||||||
PrintWriter out = response.getWriter();
|
PrintWriter out = response.getWriter();
|
||||||
out.println(redirectMsg);
|
out.println(redirectMsg);
|
||||||
if (doRetry) {
|
if (doRetry) {
|
||||||
|
@ -172,6 +175,6 @@ public class RMWebAppFilter extends GuiceContainer {
|
||||||
|
|
||||||
private static int calculateExponentialTime(int retries) {
|
private static int calculateExponentialTime(int retries) {
|
||||||
long baseTime = BASIC_SLEEP_TIME * (1L << retries);
|
long baseTime = BASIC_SLEEP_TIME * (1L << retries);
|
||||||
return (int) (baseTime * ((new Random()).nextDouble() + 0.5));
|
return (int) (baseTime * (randnum.nextDouble() + 0.5));
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -43,6 +43,7 @@ public class ClusterInfo {
|
||||||
protected String hadoopVersion;
|
protected String hadoopVersion;
|
||||||
protected String hadoopBuildVersion;
|
protected String hadoopBuildVersion;
|
||||||
protected String hadoopVersionBuiltOn;
|
protected String hadoopVersionBuiltOn;
|
||||||
|
protected String haZooKeeperConnectionState;
|
||||||
|
|
||||||
public ClusterInfo() {
|
public ClusterInfo() {
|
||||||
} // JAXB needs this
|
} // JAXB needs this
|
||||||
|
@ -62,6 +63,8 @@ public class ClusterInfo {
|
||||||
this.hadoopVersion = VersionInfo.getVersion();
|
this.hadoopVersion = VersionInfo.getVersion();
|
||||||
this.hadoopBuildVersion = VersionInfo.getBuildVersion();
|
this.hadoopBuildVersion = VersionInfo.getBuildVersion();
|
||||||
this.hadoopVersionBuiltOn = VersionInfo.getDate();
|
this.hadoopVersionBuiltOn = VersionInfo.getDate();
|
||||||
|
this.haZooKeeperConnectionState =
|
||||||
|
rm.getRMContext().getRMAdminService().getHAZookeeperConnectionState();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getState() {
|
public String getState() {
|
||||||
|
@ -108,4 +111,7 @@ public class ClusterInfo {
|
||||||
return this.startedOn;
|
return this.startedOn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getHAZookeeperConnectionState() {
|
||||||
|
return this.haZooKeeperConnectionState;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -277,6 +277,8 @@ public class TestRMWebServices extends JerseyTestBase {
|
||||||
WebServicesTestUtils.getXmlLong(element, "startedOn"),
|
WebServicesTestUtils.getXmlLong(element, "startedOn"),
|
||||||
WebServicesTestUtils.getXmlString(element, "state"),
|
WebServicesTestUtils.getXmlString(element, "state"),
|
||||||
WebServicesTestUtils.getXmlString(element, "haState"),
|
WebServicesTestUtils.getXmlString(element, "haState"),
|
||||||
|
WebServicesTestUtils.getXmlString(
|
||||||
|
element, "haZooKeeperConnectionState"),
|
||||||
WebServicesTestUtils.getXmlString(element, "hadoopVersionBuiltOn"),
|
WebServicesTestUtils.getXmlString(element, "hadoopVersionBuiltOn"),
|
||||||
WebServicesTestUtils.getXmlString(element, "hadoopBuildVersion"),
|
WebServicesTestUtils.getXmlString(element, "hadoopBuildVersion"),
|
||||||
WebServicesTestUtils.getXmlString(element, "hadoopVersion"),
|
WebServicesTestUtils.getXmlString(element, "hadoopVersion"),
|
||||||
|
@ -292,9 +294,10 @@ public class TestRMWebServices extends JerseyTestBase {
|
||||||
Exception {
|
Exception {
|
||||||
assertEquals("incorrect number of elements", 1, json.length());
|
assertEquals("incorrect number of elements", 1, json.length());
|
||||||
JSONObject info = json.getJSONObject("clusterInfo");
|
JSONObject info = json.getJSONObject("clusterInfo");
|
||||||
assertEquals("incorrect number of elements", 11, info.length());
|
assertEquals("incorrect number of elements", 12, info.length());
|
||||||
verifyClusterGeneric(info.getLong("id"), info.getLong("startedOn"),
|
verifyClusterGeneric(info.getLong("id"), info.getLong("startedOn"),
|
||||||
info.getString("state"), info.getString("haState"),
|
info.getString("state"), info.getString("haState"),
|
||||||
|
info.getString("haZooKeeperConnectionState"),
|
||||||
info.getString("hadoopVersionBuiltOn"),
|
info.getString("hadoopVersionBuiltOn"),
|
||||||
info.getString("hadoopBuildVersion"), info.getString("hadoopVersion"),
|
info.getString("hadoopBuildVersion"), info.getString("hadoopVersion"),
|
||||||
info.getString("resourceManagerVersionBuiltOn"),
|
info.getString("resourceManagerVersionBuiltOn"),
|
||||||
|
@ -304,7 +307,8 @@ public class TestRMWebServices extends JerseyTestBase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void verifyClusterGeneric(long clusterid, long startedon,
|
public void verifyClusterGeneric(long clusterid, long startedon,
|
||||||
String state, String haState, String hadoopVersionBuiltOn,
|
String state, String haState, String haZooKeeperConnectionState,
|
||||||
|
String hadoopVersionBuiltOn,
|
||||||
String hadoopBuildVersion, String hadoopVersion,
|
String hadoopBuildVersion, String hadoopVersion,
|
||||||
String resourceManagerVersionBuiltOn, String resourceManagerBuildVersion,
|
String resourceManagerVersionBuiltOn, String resourceManagerBuildVersion,
|
||||||
String resourceManagerVersion) {
|
String resourceManagerVersion) {
|
||||||
|
|
Loading…
Reference in New Issue