HDFS-14201. Ability to disallow safemode NN to become active. Contributed by Xiao Liang and He Xiaoqiao.

Inigo Goiri 2019-06-18 09:58:29 -07:00
parent fb1ce0d50a
commit 3ab77d9bc9
8 changed files with 114 additions and 0 deletions

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java

@@ -81,6 +81,7 @@
import org.apache.hadoop.conf.Configuration.IntegerRanges;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.ha.HealthCheckFailedException;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
@@ -3090,6 +3091,8 @@ protected Server(String bindAddress, int port,
}
this.exceptionsHandler.addTerseLoggingExceptions(StandbyException.class);
this.exceptionsHandler.addTerseLoggingExceptions(
HealthCheckFailedException.class);
}
public synchronized void addAuxiliaryListener(int auxiliaryPort)
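With this flag enabled, a NameNode that stays in safe mode will fail every ZKFC health probe, so the IPC server would otherwise log a full stack trace for each probe; registering HealthCheckFailedException as a terse exception keeps each failure to a single log line. A minimal, self-contained sketch of the terse-logging idea (the class and method names here are illustrative, not Hadoop's actual ExceptionsHandler):

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

// Illustrative sketch only: exceptions registered as "terse" are logged
// as a one-line message; everything else keeps its full stack trace.
class TerseLoggingSketch {
  private final Set<Class<? extends Throwable>> terseExceptions =
      ConcurrentHashMap.newKeySet();

  void addTerseLoggingException(Class<? extends Throwable> cls) {
    terseExceptions.add(cls);
  }

  void logException(Throwable t) {
    if (terseExceptions.contains(t.getClass())) {
      System.err.println(t.toString()); // one line per failed health probe
    } else {
      t.printStackTrace(); // unexpected errors keep the full trace
    }
  }
}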

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

@@ -1016,6 +1016,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final int DFS_HA_ZKFC_PORT_DEFAULT = 8019;
public static final String DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY = "dfs.ha.zkfc.nn.http.timeout.ms";
public static final int DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY_DEFAULT = 20000;
public static final String DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE =
"dfs.ha.nn.not-become-active-in-safemode";
public static final boolean DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT =
false;
// Security-related configs
public static final String DFS_ENCRYPT_DATA_TRANSFER_KEY = "dfs.encrypt.data.transfer";

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java

@@ -118,6 +118,8 @@
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT;
import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_NAMENODE_RPC_PORT_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_CALLER_CONTEXT_ENABLED_KEY;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_CALLER_CONTEXT_ENABLED_DEFAULT;
@@ -390,6 +392,7 @@ public long getProtocolVersion(String protocol,
private final HAContext haContext;
protected final boolean allowStaleStandbyReads;
private AtomicBoolean started = new AtomicBoolean(false);
private final boolean notBecomeActiveInSafemode;
private final static int HEALTH_MONITOR_WARN_THRESHOLD_MS = 5000;
@@ -983,6 +986,9 @@ protected NameNode(Configuration conf, NamenodeRole role)
this.stopAtException(e);
throw e;
}
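// Whether this NameNode should fail ZKFC health checks and refuse to
// transition to active while it is still in safe mode.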
notBecomeActiveInSafemode = conf.getBoolean(
DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE,
DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT);
this.started.set(true);
}
@@ -1802,6 +1808,10 @@ synchronized void monitorHealth()
throw new HealthCheckFailedException(
"The NameNode has no resources available");
}
if (notBecomeActiveInSafemode && isInSafeMode()) {
throw new HealthCheckFailedException("The NameNode is configured to " +
"report UNHEALTHY to ZKFC in Safemode.");
}
}
synchronized void transitionToActive()
@@ -1815,6 +1825,9 @@ synchronized void transitionToActive()
"Cannot transition from '" + OBSERVER_STATE + "' to '" +
ACTIVE_STATE + "'");
}
if (notBecomeActiveInSafemode && isInSafeMode()) {
throw new ServiceFailedException(getRole() + " is still in safemode");
}
state.setState(haContext, ACTIVE_STATE);
}
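For context on how the health-check half of this feature takes effect: the ZKFC periodically calls monitorHealth() over RPC, and a HealthCheckFailedException is what moves the monitored service into the SERVICE_UNHEALTHY state, so the failover controller stops trying to elect that NameNode. A simplified model of that reaction follows (a sketch only; HealthProbe and probeOnce are illustrative stand-ins, while the real org.apache.hadoop.ha.HealthMonitor is an asynchronous, callback-driven loop):

import org.apache.hadoop.ha.HealthCheckFailedException;

// Simplified model of how the ZKFC health monitor reacts to a
// HealthCheckFailedException thrown by NameNode#monitorHealth().
class HealthLoopSketch {
  enum State { SERVICE_HEALTHY, SERVICE_UNHEALTHY }

  // Stand-in for HAServiceProtocol.monitorHealth() invoked over RPC.
  interface HealthProbe {
    void monitorHealth() throws HealthCheckFailedException;
  }

  State probeOnce(HealthProbe target) {
    try {
      target.monitorHealth();
      return State.SERVICE_HEALTHY;
    } catch (HealthCheckFailedException e) {
      // With dfs.ha.nn.not-become-active-in-safemode=true, a safemode
      // NameNode throws here, so it is reported unhealthy and skipped
      // during leader election.
      return State.SERVICE_UNHEALTHY;
    }
  }
}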

hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

@@ -3192,6 +3192,15 @@
</description>
</property>
<property>
<name>dfs.ha.nn.not-become-active-in-safemode</name>
<value>false</value>
<description>
When set to true, this prevents a NameNode in safe mode from becoming
active while other standby NameNodes may be ready to serve requests.
</description>
</property>
<property>
<name>dfs.ha.tail-edits.in-progress</name>
<value>false</value>

hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithNFS.md

@@ -296,6 +296,18 @@ The order in which you set these configurations is unimportant, but the values y
<value>hdfs://mycluster</value>
</property>
* **dfs.ha.nn.not-become-active-in-safemode** - whether to prevent a safe mode NameNode from becoming active
Whether to allow a NameNode to become active while it is in safe mode. When
set to true, a NameNode in safe mode will report SERVICE_UNHEALTHY to the
ZKFC if automatic failover is enabled, or will throw an exception to fail
the transition to active if automatic failover is disabled. For example:
<property>
<name>dfs.ha.nn.not-become-active-in-safemode</name>
<value>true</value>
</property>
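Note that when automatic failover is disabled, a manual transition request
against a NameNode that is still in safe mode (for example via
*hdfs haadmin -transitionToActive*) will fail with a ServiceFailedException
instead of making that NameNode active.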
### Deployment details
After all of the necessary configuration options have been set, one must initially synchronize the two HA NameNodes' on-disk metadata.

hadoop-hdfs-project/hadoop-hdfs/src/site/markdown/HDFSHighAvailabilityWithQJM.md

@@ -347,6 +347,18 @@ The order in which you set these configurations is unimportant, but the values y
<value>/path/to/journal/node/local/data</value>
</property>
* **dfs.ha.nn.not-become-active-in-safemode** - whether to prevent a safe mode NameNode from becoming active
Whether to allow a NameNode to become active while it is in safe mode. When
set to true, a NameNode in safe mode will report SERVICE_UNHEALTHY to the
ZKFC if automatic failover is enabled, or will throw an exception to fail
the transition to active if automatic failover is disabled. For example:
<property>
<name>dfs.ha.nn.not-become-active-in-safemode</name>
<value>true</value>
</property>
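Note that when automatic failover is disabled, a manual transition request
against a NameNode that is still in safe mode (for example via
*hdfs haadmin -transitionToActive*) will fail with a ServiceFailedException
instead of making that NameNode active.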
### Deployment details
After all of the necessary configuration options have been set, you must start the JournalNode daemons on the set of machines where they will run. This can be done by running the command "*hdfs \--daemon start journalnode*" and waiting for the daemon to start on each of the relevant machines.

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHASafeMode.java

@@ -17,11 +17,13 @@
*/
package org.apache.hadoop.hdfs.server.namenode.ha;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.URI;
@@ -30,6 +32,8 @@
import java.util.List;
import java.util.Map;
import org.apache.hadoop.ha.ServiceFailedException;
import org.apache.hadoop.test.LambdaTestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -887,4 +891,31 @@ public Boolean get() {
cluster.transitionToActive(1);
assertSafeMode(nn1, 3, 3, 3, 0);
}
/**
* Test that a NameNode in safemode cannot transition to active.
*/
@Test
public void testTransitionToActiveWhenSafeMode() throws Exception {
Configuration config = new Configuration();
config.setBoolean(DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE, true);
try (MiniDFSCluster miniCluster = new MiniDFSCluster.Builder(config,
new File(GenericTestUtils.getRandomizedTempPath()))
.nnTopology(MiniDFSNNTopology.simpleHATopology())
.numDataNodes(1)
.build()) {
miniCluster.waitActive();
miniCluster.transitionToStandby(0);
miniCluster.transitionToStandby(1);
NameNode namenode0 = miniCluster.getNameNode(0);
NameNode namenode1 = miniCluster.getNameNode(1);
NameNodeAdapter.enterSafeMode(namenode0, false);
NameNodeAdapter.enterSafeMode(namenode1, false);
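// Both NameNodes are standby and in safe mode, so making nn0 active
// must be rejected.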
LambdaTestUtils.intercept(ServiceFailedException.class,
"NameNode still not leave safemode",
() -> miniCluster.transitionToActive(0));
}
}
}

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestNNHealthCheck.java

@@ -19,6 +19,7 @@
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIFELINE_RPC_ADDRESS_KEY;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@@ -31,10 +32,12 @@
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.LambdaTestUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -76,6 +79,33 @@ public void testNNHealthCheckWithLifelineAddress() throws IOException {
doNNHealthCheckTest();
}
@Test
public void testNNHealthCheckWithSafemodeAsUnhealthy() throws Exception {
conf.setBoolean(DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE, true);
// now bring up just the NameNodes (HA topology, no DataNodes).
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
.nnTopology(MiniDFSNNTopology.simpleHATopology()).build();
cluster.waitActive();
// manually set safemode.
cluster.getFileSystem(0)
.setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_ENTER);
NNHAServiceTarget haTarget = new NNHAServiceTarget(conf,
DFSUtil.getNamenodeNameServiceId(conf), "nn1");
final String expectedTargetString = haTarget.getAddress().toString();
assertTrue("Expected haTarget " + haTarget + " containing " +
expectedTargetString,
haTarget.toString().contains(expectedTargetString));
HAServiceProtocol rpc = haTarget.getHealthMonitorProxy(conf, 5000);
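// A direct monitorHealth() RPC should now fail, since the NameNode is
// in safe mode and configured to report unhealthy.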
LambdaTestUtils.intercept(RemoteException.class,
"The NameNode is configured to report UNHEALTHY to ZKFC in Safemode.",
() -> rpc.monitorHealth());
}
private void doNNHealthCheckTest() throws IOException {
MockNameNodeResourceChecker mockResourceChecker =
new MockNameNodeResourceChecker(conf);