From 3833c616e087518196bcb77ac2479c66a0b188d8 Mon Sep 17 00:00:00 2001
From: Chen Liang
Date: Mon, 20 Jul 2020 12:49:58 -0700
Subject: [PATCH] HDFS-15404. ShellCommandFencer should expose info about
 source. Contributed by Chen Liang.

---
 .../apache/hadoop/ha/FailoverController.java  |  2 +-
 .../org/apache/hadoop/ha/HAServiceTarget.java | 15 ++++++
 .../java/org/apache/hadoop/ha/NodeFencer.java | 23 +++++++--
 .../apache/hadoop/ha/ShellCommandFencer.java  | 48 ++++++++++++++++++-
 .../hadoop/ha/TestFailoverController.java     | 10 ++--
 .../hadoop/ha/TestShellCommandFencer.java     | 31 ++++++++++++
 .../apache/hadoop/hdfs/tools/DFSHAAdmin.java  |  5 ++
 .../hdfs/tools/TestDFSHAAdminMiniCluster.java |  8 ++--
 8 files changed, 127 insertions(+), 15 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java
index 4fc52d557cf..5ad71f373f2 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/FailoverController.java
@@ -213,7 +213,7 @@ public class FailoverController {
 
     // Fence fromSvc if it's required or forced by the user
     if (tryFence) {
-      if (!fromSvc.getFencer().fence(fromSvc)) {
+      if (!fromSvc.getFencer().fence(fromSvc, toSvc)) {
         throw new FailoverFailedException("Unable to fence " +
             fromSvc + ". Fencing failed.");
       }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceTarget.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceTarget.java
index 9d5c8e7b7ea..ff9658f1bbc 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceTarget.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAServiceTarget.java
@@ -44,6 +44,12 @@ public abstract class HAServiceTarget {
   private static final String PORT_SUBST_KEY = "port";
   private static final String ADDRESS_SUBST_KEY = "address";
 
+  /**
+   * The HAState this service target is intended to be after transition
+   * is complete.
+   */
+  private HAServiceProtocol.HAServiceState transitionTargetHAStatus;
+
   /**
    * @return the IPC address of the target node.
    */
@@ -93,6 +99,15 @@ public abstract class HAServiceTarget {
     return getProxyForAddress(conf, timeoutMs, getAddress());
   }
 
+  public void setTransitionTargetHAStatus(
+      HAServiceProtocol.HAServiceState status) {
+    this.transitionTargetHAStatus = status;
+  }
+
+  public HAServiceProtocol.HAServiceState getTransitionTargetHAStatus() {
+    return this.transitionTargetHAStatus;
+  }
+
   /**
    * Returns a proxy to connect to the target HA service for health monitoring.
    * If {@link #getHealthMonitorAddress()} is implemented to return a non-null
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java
index 64e73151302..b0cead56ac0 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java
@@ -89,15 +89,32 @@ public class NodeFencer {
   }
 
   public boolean fence(HAServiceTarget fromSvc) {
+    return fence(fromSvc, null);
+  }
+
+  public boolean fence(HAServiceTarget fromSvc, HAServiceTarget toSvc) {
     LOG.info("====== Beginning Service Fencing Process... ======");
     int i = 0;
     for (FenceMethodWithArg method : methods) {
       LOG.info("Trying method " + (++i) + "/" + methods.size() +": " + method);
 
       try {
-        if (method.method.tryFence(fromSvc, method.arg)) {
-          LOG.info("====== Fencing successful by method " + method + " ======");
-          return true;
+        // only true when target node is given, AND fencing on it failed
+        boolean toSvcFencingFailed = false;
+        // if target is given, try to fence on target first. Only if fencing
+        // on target succeeded, do fencing on source node.
+        if (toSvc != null) {
+          toSvcFencingFailed = !method.method.tryFence(toSvc, method.arg);
+        }
+        if (toSvcFencingFailed) {
+          LOG.error("====== Fencing on target failed, skipping fencing " +
+              "on source ======");
+        } else {
+          if (method.method.tryFence(fromSvc, method.arg)) {
+            LOG.info("====== Fencing successful by method " +
+                method + " ======");
+            return true;
+          }
         }
       } catch (BadFencingConfigurationException e) {
         LOG.error("Fencing method " + method + " misconfigured", e);
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java
index 7e4a88f729f..6363063abf2 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ShellCommandFencer.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.ha;
 
 import java.io.IOException;
 import java.lang.reflect.Field;
+import java.util.Arrays;
 import java.util.Map;
 
 import org.apache.hadoop.conf.Configured;
@@ -60,6 +61,11 @@ public class ShellCommandFencer
   /** Prefix for target parameters added to the environment */
   private static final String TARGET_PREFIX = "target_";
 
+  /** Prefix for source parameters added to the environment */
+  private static final String SOURCE_PREFIX = "source_";
+
+  private static final String ARG_DELIMITER = ",";
+
   @VisibleForTesting
   static Logger LOG = LoggerFactory.getLogger(ShellCommandFencer.class);
 
@@ -73,8 +79,9 @@ public class ShellCommandFencer
   }
 
   @Override
-  public boolean tryFence(HAServiceTarget target, String cmd) {
+  public boolean tryFence(HAServiceTarget target, String args) {
     ProcessBuilder builder;
+    String cmd = parseArgs(target.getTransitionTargetHAStatus(), args);
 
     if (!Shell.WINDOWS) {
       builder = new ProcessBuilder("bash", "-e", "-c", cmd);
@@ -127,6 +134,28 @@ public class ShellCommandFencer
     return rc == 0;
   }
 
+  private String parseArgs(HAServiceProtocol.HAServiceState state,
+      String cmd) {
+    String[] args = cmd.split(ARG_DELIMITER);
+    if (args.length == 1) {
+      // only one command is given, assuming both src and dst
+      // will execute the same command/script.
+      return args[0];
+    }
+    if (args.length > 2) {
+      throw new IllegalArgumentException("Expecting arguments size of at most "
+          + "two, getting " + Arrays.asList(args));
+    }
+    if (HAServiceProtocol.HAServiceState.ACTIVE.equals(state)) {
+      return args[0];
+    } else if (HAServiceProtocol.HAServiceState.STANDBY.equals(state)) {
+      return args[1];
+    } else {
+      throw new IllegalArgumentException(
+          "Unexpected HA service state:" + state);
+    }
+  }
+
   /**
    * Abbreviate a string by putting '...' in the middle of it,
    * in an attempt to keep logs from getting too messy.
@@ -190,9 +219,24 @@ public class ShellCommandFencer
    */
   private void addTargetInfoAsEnvVars(HAServiceTarget target,
       Map<String, String> environment) {
+    String prefix;
+    HAServiceProtocol.HAServiceState targetState =
+        target.getTransitionTargetHAStatus();
+    if (targetState == null ||
+        HAServiceProtocol.HAServiceState.ACTIVE.equals(targetState)) {
+      // null is assumed to be same as ACTIVE, this is to be compatible
+      // with existing tests/use cases where target state is not specified
+      // but assuming it's active.
+      prefix = TARGET_PREFIX;
+    } else if (HAServiceProtocol.HAServiceState.STANDBY.equals(targetState)) {
+      prefix = SOURCE_PREFIX;
+    } else {
+      throw new IllegalArgumentException(
+          "Unexpected HA service state:" + targetState);
+    }
     for (Map.Entry<String, String> e :
         target.getFencingParameters().entrySet()) {
-      String key = TARGET_PREFIX + e.getKey();
+      String key = prefix + e.getKey();
       key = key.replace('.', '_');
       environment.put(key, e.getValue());
     }
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java
index 791aaad59e9..3f027fa1c59 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestFailoverController.java
@@ -177,7 +177,7 @@ public class TestFailoverController {
     }
 
     // svc1 still thinks it's active, that's OK, it was fenced
-    assertEquals(1, AlwaysSucceedFencer.fenceCalled);
+    assertEquals(2, AlwaysSucceedFencer.fenceCalled);
     assertSame(svc1, AlwaysSucceedFencer.fencedSvc);
     assertEquals(HAServiceState.ACTIVE, svc1.state);
     assertEquals(HAServiceState.ACTIVE, svc2.state);
@@ -201,7 +201,7 @@ public class TestFailoverController {
     }
 
     assertEquals(1, AlwaysFailFencer.fenceCalled);
-    assertSame(svc1, AlwaysFailFencer.fencedSvc);
+    assertSame(svc2, AlwaysFailFencer.fencedSvc);
     assertEquals(HAServiceState.ACTIVE, svc1.state);
     assertEquals(HAServiceState.STANDBY, svc2.state);
   }
@@ -223,7 +223,7 @@ public class TestFailoverController {
     // If fencing was requested and it failed we don't try to make
     // svc2 active anyway, and we don't failback to svc1.
     assertEquals(1, AlwaysFailFencer.fenceCalled);
-    assertSame(svc1, AlwaysFailFencer.fencedSvc);
+    assertSame(svc2, AlwaysFailFencer.fencedSvc);
     assertEquals(HAServiceState.STANDBY, svc1.state);
     assertEquals(HAServiceState.STANDBY, svc2.state);
   }
@@ -344,7 +344,7 @@ public class TestFailoverController {
     // and we didn't force it, so we failed back to svc1 and fenced svc2.
     // Note svc2 still thinks it's active, that's OK, we fenced it.
     assertEquals(HAServiceState.ACTIVE, svc1.state);
-    assertEquals(1, AlwaysSucceedFencer.fenceCalled);
+    assertEquals(2, AlwaysSucceedFencer.fenceCalled);
     assertSame(svc2, AlwaysSucceedFencer.fencedSvc);
   }
 
@@ -373,7 +373,7 @@ public class TestFailoverController {
     // so we did not failback to svc1, ie it's still standby.
     assertEquals(HAServiceState.STANDBY, svc1.state);
     assertEquals(1, AlwaysFailFencer.fenceCalled);
-    assertSame(svc2, AlwaysFailFencer.fencedSvc);
+    assertSame(svc1, AlwaysFailFencer.fencedSvc);
   }
 
   @Test
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java
index 3a2cf052a60..fc36b1dd846 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestShellCommandFencer.java
@@ -163,6 +163,37 @@ public class TestShellCommandFencer {
     }
   }
 
+  /**
+   * Test if fencing target has peer set, the failover can trigger different
+   * commands on source and destination respectively.
+   */
+  @Test
+  public void testEnvironmentWithPeer() {
+    HAServiceTarget target = new DummyHAService(HAServiceState.ACTIVE,
+        new InetSocketAddress("dummytarget", 1111));
+    HAServiceTarget source = new DummyHAService(HAServiceState.STANDBY,
+        new InetSocketAddress("dummysource", 2222));
+    target.setTransitionTargetHAStatus(HAServiceState.ACTIVE);
+    source.setTransitionTargetHAStatus(HAServiceState.STANDBY);
+    String cmd = "echo $target_host $target_port," +
+        "echo $source_host $source_port";
+    if (!Shell.WINDOWS) {
+      fencer.tryFence(target, cmd);
+      Mockito.verify(ShellCommandFencer.LOG).info(
+          Mockito.contains("echo $ta...rget_port: dummytarget 1111"));
+      fencer.tryFence(source, cmd);
+      Mockito.verify(ShellCommandFencer.LOG).info(
+          Mockito.contains("echo $so...urce_port: dummysource 2222"));
+    } else {
+      fencer.tryFence(target, cmd);
+      Mockito.verify(ShellCommandFencer.LOG).info(
+          Mockito.contains("echo %ta...get_port%: dummytarget 1111"));
+      fencer.tryFence(source, cmd);
+      Mockito.verify(ShellCommandFencer.LOG).info(
+          Mockito.contains("echo %so...urce_port%: dummysource 2222"));
+    }
+  }
+
   /**
    * Test that we properly close off our input to the subprocess
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java
index fcfb47c8c65..db03db6e57c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/DFSHAAdmin.java
@@ -284,6 +284,11 @@ public class DFSHAAdmin extends HAAdmin {
     HAServiceTarget fromNode = resolveTarget(args[0]);
     HAServiceTarget toNode = resolveTarget(args[1]);
 
+    fromNode.setTransitionTargetHAStatus(
+        HAServiceProtocol.HAServiceState.STANDBY);
+    toNode.setTransitionTargetHAStatus(
+        HAServiceProtocol.HAServiceState.ACTIVE);
+
     // Check that auto-failover is consistently configured for both nodes.
     Preconditions.checkState(
         fromNode.isAutoFailoverEnabled() ==
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java
index fc569d0aa7d..7f861374adb 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/tools/TestDFSHAAdminMiniCluster.java
@@ -189,13 +189,13 @@ public class TestDFSHAAdminMiniCluster {
     tmpFile.deleteOnExit();
     if (Shell.WINDOWS) {
       conf.set(DFSConfigKeys.DFS_HA_FENCE_METHODS_KEY,
-          "shell(echo %target_nameserviceid%.%target_namenodeid% " +
-              "%target_port% %dfs_ha_namenode_id% > " +
+          "shell(echo %source_nameserviceid%.%source_namenodeid% " +
+              "%source_port% %dfs_ha_namenode_id% > " +
               tmpFile.getAbsolutePath() + ")");
     } else {
       conf.set(DFSConfigKeys.DFS_HA_FENCE_METHODS_KEY,
-          "shell(echo -n $target_nameserviceid.$target_namenodeid " +
-              "$target_port $dfs_ha_namenode_id > " +
+          "shell(echo -n $source_nameserviceid.$source_namenodeid " +
+              "$source_port $dfs_ha_namenode_id > " +
               tmpFile.getAbsolutePath() + ")");
     }
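
Reviewer note (not part of the patch): a minimal Java sketch of how the pieces above fit together during a manual failover. The class/method names (FailoverFencingSketch, fenceForFailover), the literal "dfs.ha.fencing.methods" key, and the echo commands are illustrative assumptions; setTransitionTargetHAStatus, the two-argument NodeFencer#fence, and the comma-delimited shell(...) argument are what this change introduces, and NodeFencer.create is the pre-existing factory.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ha.BadFencingConfigurationException;
import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.ha.HAServiceTarget;
import org.apache.hadoop.ha.NodeFencer;

public class FailoverFencingSketch {
  // Hypothetical helper mirroring what DFSHAAdmin#failover does after this
  // patch, given two already-resolved service targets.
  static boolean fenceForFailover(HAServiceTarget fromNode,
      HAServiceTarget toNode) throws BadFencingConfigurationException {
    Configuration conf = new Configuration();
    // Comma-delimited form added by this patch: the first command is chosen
    // when fencing the node transitioning to ACTIVE (env vars use the
    // target_ prefix), the second for the node transitioning to STANDBY
    // (env vars use the source_ prefix).
    conf.set("dfs.ha.fencing.methods",
        "shell(echo becoming active: $target_host," +
        "echo becoming standby: $source_host)");
    NodeFencer fencer = NodeFencer.create(conf, "dfs.ha.fencing.methods");

    // As DFSHAAdmin now does: record the state each node is moving to so
    // ShellCommandFencer can pick the matching command and env-var prefix.
    fromNode.setTransitionTargetHAStatus(
        HAServiceProtocol.HAServiceState.STANDBY);
    toNode.setTransitionTargetHAStatus(
        HAServiceProtocol.HAServiceState.ACTIVE);

    // New overload: fences toNode first, and only proceeds to fence
    // fromNode if that succeeds.
    return fencer.fence(fromNode, toNode);
  }
}

A single, non-delimited shell(...) command keeps the old behavior, since parseArgs returns the sole command for both sides.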