HADOOP-13933. Add haadmin -getAllServiceState option to get the HA state of all the NameNodes/ResourceManagers. Contributed by Surendra Singh Lilhore.

(cherry picked from commit e407449ddb)
This commit is contained in:
Akira Ajisaka 2017-01-17 10:10:23 +09:00
parent 861e275646
commit 2d6be7ea23
8 changed files with 144 additions and 17 deletions

View File

@ -80,6 +80,8 @@ public abstract class HAAdmin extends Configured implements Tool {
"--" + FORCEACTIVE + " option is used.")) "--" + FORCEACTIVE + " option is used."))
.put("-getServiceState", .put("-getServiceState",
new UsageInfo("<serviceId>", "Returns the state of the service")) new UsageInfo("<serviceId>", "Returns the state of the service"))
.put("-getAllServiceState",
new UsageInfo(null, "Returns the state of all the services"))
.put("-checkHealth", .put("-checkHealth",
new UsageInfo("<serviceId>", new UsageInfo("<serviceId>",
"Requests that the service perform a health check.\n" + "Requests that the service perform a health check.\n" +
@ -119,8 +121,12 @@ public abstract class HAAdmin extends Configured implements Tool {
String cmd = e.getKey(); String cmd = e.getKey();
UsageInfo usage = e.getValue(); UsageInfo usage = e.getValue();
if (usage.args == null) {
errOut.println(" [" + cmd + "]");
} else {
errOut.println(" [" + cmd + " " + usage.args + "]"); errOut.println(" [" + cmd + " " + usage.args + "]");
} }
}
errOut.println(); errOut.println();
ToolRunner.printGenericCommandUsage(errOut); ToolRunner.printGenericCommandUsage(errOut);
} }
@ -130,8 +136,12 @@ public abstract class HAAdmin extends Configured implements Tool {
if (usage == null) { if (usage == null) {
throw new RuntimeException("No usage for cmd " + cmd); throw new RuntimeException("No usage for cmd " + cmd);
} }
if (usage.args == null) {
errOut.println(getUsageString() + " [" + cmd + "]");
} else {
errOut.println(getUsageString() + " [" + cmd + " " + usage.args + "]"); errOut.println(getUsageString() + " [" + cmd + " " + usage.args + "]");
} }
}
private int transitionToActive(final CommandLine cmd) private int transitionToActive(final CommandLine cmd)
throws IOException, ServiceFailedException { throws IOException, ServiceFailedException {
@ -455,6 +465,8 @@ public abstract class HAAdmin extends Configured implements Tool {
return failover(cmdLine); return failover(cmdLine);
} else if ("-getServiceState".equals(cmd)) { } else if ("-getServiceState".equals(cmd)) {
return getServiceState(cmdLine); return getServiceState(cmdLine);
} else if ("-getAllServiceState".equals(cmd)) {
return getAllServiceState();
} else if ("-checkHealth".equals(cmd)) { } else if ("-checkHealth".equals(cmd)) {
return checkHealth(cmdLine); return checkHealth(cmdLine);
} else if ("-help".equals(cmd)) { } else if ("-help".equals(cmd)) {
@ -466,6 +478,29 @@ public abstract class HAAdmin extends Configured implements Tool {
} }
} }
/**
 * Prints one row per service target: the target's "host:port" address
 * followed by either its reported HA state or a connection-failure note.
 *
 * The target IDs come from {@code getTargetIds(null)}. A target whose proxy
 * call throws {@link IOException} does not abort the listing; its row shows
 * "Failed to connect: " plus the exception message instead of a state.
 *
 * @return -1 if no target IDs are available (a message is written to
 *         {@code errOut}); 0 otherwise, including when individual targets
 *         could not be reached
 */
protected int getAllServiceState() {
  final Collection<String> serviceIds = getTargetIds(null);
  if (serviceIds.isEmpty()) {
    errOut.println("Failed to get service IDs");
    return -1;
  }

  for (String serviceId : serviceIds) {
    HAServiceTarget serviceTarget = resolveTarget(serviceId);
    String hostAndPort = serviceTarget.getAddress().getHostName() + ":"
        + serviceTarget.getAddress().getPort();

    // Work out the second column first so the row is printed in one place.
    String stateColumn;
    try {
      HAServiceProtocol proxy = serviceTarget.getProxy(getConf(),
          rpcTimeoutForChecks);
      stateColumn = String.valueOf(proxy.getServiceStatus().getState());
    } catch (IOException e) {
      stateColumn = "Failed to connect: " + e.getMessage();
    }
    out.println(String.format("%-50s %-10s", hostAndPort, stateColumn));
  }
  return 0;
}
private boolean confirmForceManual() throws IOException { private boolean confirmForceManual() throws IOException {
return ToolRunner.confirmPrompt( return ToolRunner.confirmPrompt(
"You have specified the --" + FORCEMANUAL + " flag. This flag is " + "You have specified the --" + FORCEMANUAL + " flag. This flag is " +
@ -532,7 +567,11 @@ public abstract class HAAdmin extends Configured implements Tool {
return -1; return -1;
} }
if (usageInfo.args == null) {
out.println(cmd + ": " + usageInfo.help);
} else {
out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help); out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
}
return 0; return 0;
} }

View File

@ -434,6 +434,7 @@ Usage:
hdfs haadmin -transitionToStandby <serviceId> hdfs haadmin -transitionToStandby <serviceId>
hdfs haadmin -failover [--forcefence] [--forceactive] <serviceId> <serviceId> hdfs haadmin -failover [--forcefence] [--forceactive] <serviceId> <serviceId>
hdfs haadmin -getServiceState <serviceId> hdfs haadmin -getServiceState <serviceId>
hdfs haadmin -getAllServiceState
hdfs haadmin -checkHealth <serviceId> hdfs haadmin -checkHealth <serviceId>
hdfs haadmin -help <command> hdfs haadmin -help <command>
@ -443,6 +444,7 @@ Usage:
| `-checkHealth` | check the health of the given NameNode | | `-checkHealth` | check the health of the given NameNode |
| `-failover` | initiate a failover between two NameNodes | | `-failover` | initiate a failover between two NameNodes |
| `-getServiceState` | determine whether the given NameNode is Active or Standby | | `-getServiceState` | determine whether the given NameNode is Active or Standby |
| `-getAllServiceState` | returns the state of all the NameNodes |
| `-transitionToActive` | transition the state of the given NameNode to Active (Warning: No fencing is done) | | `-transitionToActive` | transition the state of the given NameNode to Active (Warning: No fencing is done) |
| `-transitionToStandby` | transition the state of the given NameNode to Standby (Warning: No fencing is done) | | `-transitionToStandby` | transition the state of the given NameNode to Standby (Warning: No fencing is done) |
| `-help` [cmd] | Displays help for the given command or all commands if none is specified. | | `-help` [cmd] | Displays help for the given command or all commands if none is specified. |

View File

@ -345,6 +345,7 @@ Now that your HA NameNodes are configured and started, you will have access to s
[-transitionToStandby <serviceId>] [-transitionToStandby <serviceId>]
[-failover [--forcefence] [--forceactive] <serviceId> <serviceId>] [-failover [--forcefence] [--forceactive] <serviceId> <serviceId>]
[-getServiceState <serviceId>] [-getServiceState <serviceId>]
[-getAllServiceState]
[-checkHealth <serviceId>] [-checkHealth <serviceId>]
[-help <command>] [-help <command>]
@ -376,6 +377,11 @@ This guide describes high-level uses of each of these subcommands. For specific
used by cron jobs or monitoring scripts which need to behave differently based used by cron jobs or monitoring scripts which need to behave differently based
on whether the NameNode is currently Active or Standby. on whether the NameNode is currently Active or Standby.
* **getAllServiceState** - returns the state of all the NameNodes
Connect to each configured NameNode to determine its current state, and print
either "standby" or "active" to STDOUT for each one.
* **checkHealth** - check the health of the given NameNode * **checkHealth** - check the health of the given NameNode
Connect to the provided NameNode to check its health. The NameNode is capable Connect to the provided NameNode to check its health. The NameNode is capable

View File

@ -391,6 +391,7 @@ Now that your HA NameNodes are configured and started, you will have access to s
[-transitionToStandby <serviceId>] [-transitionToStandby <serviceId>]
[-failover [--forcefence] [--forceactive] <serviceId> <serviceId>] [-failover [--forcefence] [--forceactive] <serviceId> <serviceId>]
[-getServiceState <serviceId>] [-getServiceState <serviceId>]
[-getAllServiceState]
[-checkHealth <serviceId>] [-checkHealth <serviceId>]
[-help <command>] [-help <command>]
@ -422,6 +423,11 @@ This guide describes high-level uses of each of these subcommands. For specific
used by cron jobs or monitoring scripts which need to behave differently based used by cron jobs or monitoring scripts which need to behave differently based
on whether the NameNode is currently Active or Standby. on whether the NameNode is currently Active or Standby.
* **getAllServiceState** - returns the state of all the NameNodes
Connect to each configured NameNode to determine its current state, and print
either "standby" or "active" to STDOUT for each one.
* **checkHealth** - check the health of the given NameNode * **checkHealth** - check the health of the given NameNode
Connect to the provided NameNode to check its health. The NameNode is capable Connect to the provided NameNode to check its health. The NameNode is capable

View File

@ -174,6 +174,17 @@ public class TestDFSHAAdmin {
assertOutputContains("Transitions the service into Active"); assertOutputContains("Transitions the service into Active");
} }
/**
 * Stubs {@code mockProtocol.getServiceStatus()} to return
 * {@code STANDBY_READY_RESULT}, runs the tool with
 * {@code -getAllServiceState}, and expects exit code 0 plus one formatted
 * row for each of HOST_A and HOST_B on port 12345.
 */
@Test
public void testGetAllServiceState() throws Exception {
  Mockito.doReturn(STANDBY_READY_RESULT).when(mockProtocol)
      .getServiceStatus();

  final int exitCode = runTool("-getAllServiceState");
  assertEquals(0, exitCode);

  // Same row layout as the command itself: address column, then state.
  for (String host : new String[] {HOST_A, HOST_B}) {
    String expectedRow = String.format("%-50s %-10s", (host + ":" + 12345),
        STANDBY_READY_RESULT.getState());
    assertOutputContains(expectedRow);
  }
}
@Test @Test
public void testTransitionToActive() throws Exception { public void testTransitionToActive() throws Exception {
Mockito.doReturn(STANDBY_READY_RESULT).when(mockProtocol).getServiceStatus(); Mockito.doReturn(STANDBY_READY_RESULT).when(mockProtocol).getServiceStatus();

View File

@ -174,6 +174,10 @@ public class RMAdminCLI extends HAAdmin {
this.errOut = errOut; this.errOut = errOut;
} }
/**
 * Replaces the stream stored in this instance's {@code out} field.
 *
 * @param out the stream to assign to {@code out}
 */
protected void setOut(PrintStream out) {
this.out = out;
}
private static void appendHAUsage(final StringBuilder usageBuilder) { private static void appendHAUsage(final StringBuilder usageBuilder) {
for (Map.Entry<String,UsageInfo> cmdEntry : USAGE.entrySet()) { for (Map.Entry<String,UsageInfo> cmdEntry : USAGE.entrySet()) {
if (cmdEntry.getKey().equals("-help") if (cmdEntry.getKey().equals("-help")
@ -181,7 +185,12 @@ public class RMAdminCLI extends HAAdmin {
continue; continue;
} }
UsageInfo usageInfo = cmdEntry.getValue(); UsageInfo usageInfo = cmdEntry.getValue();
usageBuilder.append(" [" + cmdEntry.getKey() + " " + usageInfo.args + "]"); if (usageInfo.args == null) {
usageBuilder.append(" [" + cmdEntry.getKey() + "]");
} else {
usageBuilder.append(" [" + cmdEntry.getKey() + " " + usageInfo.args
+ "]");
}
} }
} }
@ -193,9 +202,13 @@ public class RMAdminCLI extends HAAdmin {
return; return;
} }
} }
if (usageInfo.args == null) {
builder.append(" " + cmd + ": " + usageInfo.help);
} else {
String space = (usageInfo.args == "") ? "" : " "; String space = (usageInfo.args == "") ? "" : " ";
builder.append(" " + cmd + space + usageInfo.args + ": " + builder.append(" " + cmd + space + usageInfo.args + ": "
usageInfo.help); + usageInfo.help);
}
} }
private static void buildIndividualUsageMsg(String cmd, private static void buildIndividualUsageMsg(String cmd,
@ -209,10 +222,13 @@ public class RMAdminCLI extends HAAdmin {
} }
isHACommand = true; isHACommand = true;
} }
if (usageInfo.args == null) {
builder.append("Usage: yarn rmadmin [" + cmd + "]\n");
} else {
String space = (usageInfo.args == "") ? "" : " "; String space = (usageInfo.args == "") ? "" : " ";
builder.append("Usage: yarn rmadmin [" builder.append("Usage: yarn rmadmin [" + cmd + space + usageInfo.args
+ cmd + space + usageInfo.args
+ "]\n"); + "]\n");
}
if (isHACommand) { if (isHACommand) {
builder.append(cmd + " can only be used when RM HA is enabled"); builder.append(cmd + " can only be used when RM HA is enabled");
} }
@ -230,10 +246,14 @@ public class RMAdminCLI extends HAAdmin {
String cmdKey = cmdEntry.getKey(); String cmdKey = cmdEntry.getKey();
if (!cmdKey.equals("-help")) { if (!cmdKey.equals("-help")) {
UsageInfo usageInfo = cmdEntry.getValue(); UsageInfo usageInfo = cmdEntry.getValue();
if (usageInfo.args == null) {
builder.append(" " + cmdKey + "\n");
} else {
builder.append(" " + cmdKey + " " + usageInfo.args + "\n"); builder.append(" " + cmdKey + " " + usageInfo.args + "\n");
} }
} }
} }
}
builder.append(" -help" + " [cmd]\n"); builder.append(" -help" + " [cmd]\n");
} }

View File

@ -45,11 +45,13 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ha.HAServiceProtocol; import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.ha.HAServiceStatus; import org.apache.hadoop.ha.HAServiceStatus;
import org.apache.hadoop.ha.HAServiceTarget; import org.apache.hadoop.ha.HAServiceTarget;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.service.Service.STATE; import org.apache.hadoop.service.Service.STATE;
import org.apache.hadoop.yarn.api.records.DecommissionType; import org.apache.hadoop.yarn.api.records.DecommissionType;
import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceOption; import org.apache.hadoop.yarn.api.records.ResourceOption;
import org.apache.hadoop.yarn.conf.HAUtil;
import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager; import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
@ -67,13 +69,13 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.RefreshServiceAclsReque
import org.apache.hadoop.yarn.server.api.protocolrecords.RefreshSuperUserGroupsConfigurationRequest; import org.apache.hadoop.yarn.server.api.protocolrecords.RefreshSuperUserGroupsConfigurationRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.RefreshUserToGroupsMappingsRequest; import org.apache.hadoop.yarn.server.api.protocolrecords.RefreshUserToGroupsMappingsRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.UpdateNodeResourceRequest; import org.apache.hadoop.yarn.server.api.protocolrecords.UpdateNodeResourceRequest;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.Records; import org.apache.hadoop.yarn.util.Records;
import org.apache.hadoop.yarn.util.resource.Resources; import org.apache.hadoop.yarn.util.resource.Resources;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import org.mockito.ArgumentCaptor; import org.mockito.ArgumentCaptor;
import org.mockito.ArgumentMatcher; import org.mockito.ArgumentMatcher;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock; import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer; import org.mockito.stubbing.Answer;
@ -88,6 +90,8 @@ public class TestRMAdminCLI {
private RMAdminCLI rmAdminCLIWithHAEnabled; private RMAdminCLI rmAdminCLIWithHAEnabled;
private CommonNodeLabelsManager dummyNodeLabelsManager; private CommonNodeLabelsManager dummyNodeLabelsManager;
private boolean remoteAdminServiceAccessed = false; private boolean remoteAdminServiceAccessed = false;
private static final String HOST_A = "1.2.3.1";
private static final String HOST_B = "1.2.3.2";
@SuppressWarnings("static-access") @SuppressWarnings("static-access")
@Before @Before
@ -130,6 +134,14 @@ public class TestRMAdminCLI {
YarnConfiguration conf = new YarnConfiguration(); YarnConfiguration conf = new YarnConfiguration();
conf.setBoolean(YarnConfiguration.RM_HA_ENABLED, true); conf.setBoolean(YarnConfiguration.RM_HA_ENABLED, true);
conf.set(YarnConfiguration.RM_HA_IDS, "rm1,rm2"); conf.set(YarnConfiguration.RM_HA_IDS, "rm1,rm2");
conf.set(HAUtil.addSuffix(YarnConfiguration.RM_ADDRESS, "rm1"), HOST_A
+ ":12345");
conf.set(HAUtil.addSuffix(YarnConfiguration.RM_ADMIN_ADDRESS, "rm1"),
HOST_A + ":12346");
conf.set(HAUtil.addSuffix(YarnConfiguration.RM_ADDRESS, "rm2"), HOST_B
+ ":12345");
conf.set(HAUtil.addSuffix(YarnConfiguration.RM_ADMIN_ADDRESS, "rm2"),
HOST_B + ":12346");
rmAdminCLIWithHAEnabled = new RMAdminCLI(conf) { rmAdminCLIWithHAEnabled = new RMAdminCLI(conf) {
@Override @Override
@ -140,7 +152,17 @@ public class TestRMAdminCLI {
@Override @Override
protected HAServiceTarget resolveTarget(String rmId) { protected HAServiceTarget resolveTarget(String rmId) {
return haServiceTarget; HAServiceTarget target = super.resolveTarget(rmId);
HAServiceTarget spy = Mockito.spy(target);
// Override the target to return our mock protocol
try {
Mockito.doReturn(haadmin).when(spy)
.getProxy(Mockito.<Configuration> any(), Mockito.anyInt());
Mockito.doReturn(false).when(spy).isAutoFailoverEnabled();
} catch (IOException e) {
throw new AssertionError(e); // mock setup doesn't really throw
}
return spy;
} }
}; };
} }
@ -424,6 +446,24 @@ public class TestRMAdminCLI {
verify(haadmin).getServiceStatus(); verify(haadmin).getServiceStatus();
} }
/**
 * Stubs {@code haadmin.getServiceStatus()} to return a ready-to-become-active
 * STANDBY status, runs {@code -getAllServiceState} on the HA-enabled CLI with
 * its output redirected to a buffer, and expects exit code 0 plus one
 * formatted row for each of HOST_A and HOST_B on port 12346.
 */
@Test
public void testGetAllServiceState() throws Exception {
  HAServiceStatus standbyStatus = new HAServiceStatus(
      HAServiceState.STANDBY).setReadyToBecomeActive();
  Mockito.doReturn(standbyStatus).when(haadmin).getServiceStatus();

  ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
  rmAdminCLIWithHAEnabled.setOut(new PrintStream(dataOut));
  try {
    String[] args = {"-getAllServiceState"};
    assertEquals(0, rmAdminCLIWithHAEnabled.run(args));

    // Capture once so both checks (and their messages) see the same text.
    String output = dataOut.toString();
    for (String host : new String[] {HOST_A, HOST_B}) {
      String expectedRow = String.format("%-50s %-10s",
          (host + ":" + 12346), standbyStatus.getState());
      assertTrue("Output: " + output + " doesn't include expected row: "
          + expectedRow, output.contains(expectedRow));
    }
  } finally {
    // Restore the stream even when run() throws or an assertion fails, so
    // the redirected buffer never outlives this test.
    rmAdminCLIWithHAEnabled.setOut(System.out);
  }
}
@Test(timeout = 500) @Test(timeout = 500)
public void testCheckHealth() throws Exception { public void testCheckHealth() throws Exception {
String[] args = {"-checkHealth", "rm1"}; String[] args = {"-checkHealth", "rm1"};
@ -572,7 +612,8 @@ public class TestRMAdminCLI {
+ "([OvercommitTimeout]) " + "([OvercommitTimeout]) "
+ "[-transitionToActive [--forceactive] <serviceId>] " + "[-transitionToActive [--forceactive] <serviceId>] "
+ "[-transitionToStandby <serviceId>] " + "[-transitionToStandby <serviceId>] "
+ "[-getServiceState <serviceId>] [-checkHealth <serviceId>] [-help [cmd]]"; + "[-getServiceState <serviceId>] [-getAllServiceState] "
+ "[-checkHealth <serviceId>] [-help [cmd]]";
String actualHelpMsg = dataOut.toString(); String actualHelpMsg = dataOut.toString();
assertTrue(String.format("Help messages: %n " + actualHelpMsg + " %n doesn't include expected " + assertTrue(String.format("Help messages: %n " + actualHelpMsg + " %n doesn't include expected " +
"messages: %n" + expectedHelpMsg), actualHelpMsg.contains(expectedHelpMsg "messages: %n" + expectedHelpMsg), actualHelpMsg.contains(expectedHelpMsg

View File

@ -226,6 +226,7 @@ Usage:
-transitionToStandby <serviceId> -transitionToStandby <serviceId>
-failover [--forcefence] [--forceactive] <serviceId> <serviceId> -failover [--forcefence] [--forceactive] <serviceId> <serviceId>
-getServiceState <serviceId> -getServiceState <serviceId>
-getAllServiceState
-checkHealth <serviceId> -checkHealth <serviceId>
-help [cmd] -help [cmd]
``` ```
@ -250,6 +251,7 @@ Usage:
| -transitionToStandby [--forcemanual] \<serviceId\> | Transitions the service into Standby state. This command can not be used if automatic failover is enabled. Though you can override this by --forcemanual option, you need caution. | | -transitionToStandby [--forcemanual] \<serviceId\> | Transitions the service into Standby state. This command can not be used if automatic failover is enabled. Though you can override this by --forcemanual option, you need caution. |
| -failover [--forceactive] \<serviceId1\> \<serviceId2\> | Initiate a failover from serviceId1 to serviceId2. Try to failover to the target service even if it is not ready if the --forceactive option is used. This command can not be used if automatic failover is enabled. | | -failover [--forceactive] \<serviceId1\> \<serviceId2\> | Initiate a failover from serviceId1 to serviceId2. Try to failover to the target service even if it is not ready if the --forceactive option is used. This command can not be used if automatic failover is enabled. |
| -getServiceState \<serviceId\> | Returns the state of the service. | | -getServiceState \<serviceId\> | Returns the state of the service. |
| -getAllServiceState | Returns the state of all the services. |
| -checkHealth \<serviceId\> | Requests that the service perform a health check. The RMAdmin tool will exit with a non-zero exit code if the check fails. | | -checkHealth \<serviceId\> | Requests that the service perform a health check. The RMAdmin tool will exit with a non-zero exit code if the check fails. |
| -help [cmd] | Displays help for the given command or all commands if none is specified. | | -help [cmd] | Displays help for the given command or all commands if none is specified. |