diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java index 9018bed80e2..8c0edbb7481 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java @@ -81,6 +81,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration.IntegerRanges; import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; +import org.apache.hadoop.ha.HealthCheckFailedException; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableUtils; @@ -3090,6 +3091,8 @@ public abstract class Server { } this.exceptionsHandler.addTerseLoggingExceptions(StandbyException.class); + this.exceptionsHandler.addTerseLoggingExceptions( + HealthCheckFailedException.class); } public synchronized void addAuxiliaryListener(int auxiliaryPort) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java index f4a8def1771..fb83baf8f67 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java @@ -1016,6 +1016,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final int DFS_HA_ZKFC_PORT_DEFAULT = 8019; public static final String DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY = "dfs.ha.zkfc.nn.http.timeout.ms"; public static final int DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY_DEFAULT = 20000; + public static final String DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE = + "dfs.ha.nn.not-become-active-in-safemode"; + public static final boolean 
DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT = + false; // Security-related configs public static final String DFS_ENCRYPT_DATA_TRANSFER_KEY = "dfs.encrypt.data.transfer"; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java index e4c88563732..126ac0bde1c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java @@ -118,6 +118,8 @@ import java.util.concurrent.atomic.AtomicBoolean; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT; import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_NAMENODE_RPC_PORT_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_CALLER_CONTEXT_ENABLED_KEY; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_CALLER_CONTEXT_ENABLED_DEFAULT; @@ -390,6 +392,7 @@ public class NameNode extends ReconfigurableBase implements private final HAContext haContext; protected final boolean allowStaleStandbyReads; private AtomicBoolean started = new AtomicBoolean(false); + private final boolean notBecomeActiveInSafemode; private final static int HEALTH_MONITOR_WARN_THRESHOLD_MS = 5000; @@ -983,6 +986,9 @@ public class NameNode extends ReconfigurableBase implements this.stopAtException(e); throw e; } + notBecomeActiveInSafemode = conf.getBoolean( + 
DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE, + DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT); this.started.set(true); } @@ -1802,6 +1808,10 @@ public class NameNode extends ReconfigurableBase implements throw new HealthCheckFailedException( "The NameNode has no resources available"); } + if (notBecomeActiveInSafemode && isInSafeMode()) { + throw new HealthCheckFailedException("The NameNode is configured to " + + "report UNHEALTHY to ZKFC in Safemode."); + } } synchronized void transitionToActive() @@ -1815,6 +1825,9 @@ public class NameNode extends ReconfigurableBase implements "Cannot transition from '" + OBSERVER_STATE + "' to '" + ACTIVE_STATE + "'"); } + if (notBecomeActiveInSafemode && isInSafeMode()) { + throw new ServiceFailedException(getRole() + " still not leave safemode"); + } state.setState(haContext, ACTIVE_STATE); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml index d4f8abf9bf3..890d03475a0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml @@ -3192,6 +3192,15 @@ + + dfs.ha.nn.not-become-active-in-safemode + false + + When set to true, this prevents a NameNode in safe mode from becoming active + while other standby NameNodes might be ready to serve requests. 
+ + + dfs.ha.tail-edits.in-progress false diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithNFS.md b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithNFS.md index d607561b6dd..06cda83651e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithNFS.md +++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithNFS.md @@ -296,6 +296,18 @@ The order in which you set these configurations is unimportant, but the values y hdfs://mycluster +* **dfs.ha.nn.not-become-active-in-safemode** - whether to prevent NameNodes in safe mode from becoming active + + Whether to allow a NameNode to become active while it is in safe mode. When + set to true, a NameNode in safe mode will report SERVICE_UNHEALTHY to the ZKFC + if automatic failover is on, or will throw an exception to fail the transition + to active if automatic failover is off. For example: + + + dfs.ha.nn.not-become-active-in-safemode + true + + ### Deployment details After all of the necessary configuration options have been set, one must initially synchronize the two HA NameNodes' on-disk metadata. 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithQJM.md b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithQJM.md index 4f3df275289..eaa1a86db42 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithQJM.md +++ b/hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithQJM.md @@ -347,6 +347,18 @@ The order in which you set these configurations is unimportant, but the values y /path/to/journal/node/local/data +* **dfs.ha.nn.not-become-active-in-safemode** - whether to prevent NameNodes in safe mode from becoming active + + Whether to allow a NameNode to become active while it is in safe mode. When + set to true, a NameNode in safe mode will report SERVICE_UNHEALTHY to the ZKFC + if automatic failover is on, or will throw an exception to fail the transition + to active if automatic failover is off. For example: + + + dfs.ha.nn.not-become-active-in-safemode + true + + ### Deployment details After all of the necessary configuration options have been set, you must start the JournalNode daemons on the set of machines where they will run. This can be done by running the command "*hdfs \--daemon start journalnode*" and waiting for the daemon to start on each of the relevant machines. 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java index f9445fa12ee..3f1a979d1de 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java @@ -17,11 +17,13 @@ */ package org.apache.hadoop.hdfs.server.namenode.ha; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; import java.net.URI; @@ -30,6 +32,8 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import org.apache.hadoop.ha.ServiceFailedException; +import org.apache.hadoop.test.LambdaTestUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -887,4 +891,31 @@ public class TestHASafeMode { cluster.transitionToActive(1); assertSafeMode(nn1, 3, 3, 3, 0); } + + /** + * Test transition to active when namenode in safemode. 
+ * + * @throws IOException + */ + @Test + public void testTransitionToActiveWhenSafeMode() throws Exception { + Configuration config = new Configuration(); + config.setBoolean(DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE, true); + try (MiniDFSCluster miniCluster = new MiniDFSCluster.Builder(config, + new File(GenericTestUtils.getRandomizedTempPath())) + .nnTopology(MiniDFSNNTopology.simpleHATopology()) + .numDataNodes(1) + .build()) { + miniCluster.waitActive(); + miniCluster.transitionToStandby(0); + miniCluster.transitionToStandby(1); + NameNode namenode0 = miniCluster.getNameNode(0); + NameNode namenode1 = miniCluster.getNameNode(1); + NameNodeAdapter.enterSafeMode(namenode0, false); + NameNodeAdapter.enterSafeMode(namenode1, false); + LambdaTestUtils.intercept(ServiceFailedException.class, + "NameNode still not leave safemode", + () -> miniCluster.transitionToActive(0)); + } + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java index e0f794f285d..ab7e0afbca1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.namenode.ha; import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIFELINE_RPC_ADDRESS_KEY; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -31,10 +32,12 @@ import org.apache.hadoop.ha.HealthCheckFailedException; import 
org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSNNTopology; +import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker; import org.apache.hadoop.hdfs.tools.NNHAServiceTarget; import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.test.LambdaTestUtils; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -76,6 +79,33 @@ public class TestNNHealthCheck { doNNHealthCheckTest(); } + @Test + public void testNNHealthCheckWithSafemodeAsUnhealthy() throws Exception { + conf.setBoolean(DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE, true); + + // now bring up just the NameNode. + cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0) + .nnTopology(MiniDFSNNTopology.simpleHATopology()).build(); + cluster.waitActive(); + + // manually set safemode. + cluster.getFileSystem(0) + .setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_ENTER); + + NNHAServiceTarget haTarget = new NNHAServiceTarget(conf, + DFSUtil.getNamenodeNameServiceId(conf), "nn1"); + final String expectedTargetString = haTarget.getAddress().toString(); + + assertTrue("Expected haTarget " + haTarget + " containing " + + expectedTargetString, + haTarget.toString().contains(expectedTargetString)); + HAServiceProtocol rpc = haTarget.getHealthMonitorProxy(conf, 5000); + + LambdaTestUtils.intercept(RemoteException.class, + "The NameNode is configured to report UNHEALTHY to ZKFC in Safemode.", + () -> rpc.monitorHealth()); + } + private void doNNHealthCheckTest() throws IOException { MockNameNodeResourceChecker mockResourceChecker = new MockNameNodeResourceChecker(conf);