HDFS-14201. Ability to disallow safemode NN to become active. Contributed by Xiao Liang and He Xiaoqiao.
commit 3ab77d9bc9 (parent fb1ce0d50a)
@@ -81,6 +81,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configuration.IntegerRanges;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.ha.HealthCheckFailedException;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
@@ -3090,6 +3091,8 @@ public abstract class Server {
    }

    this.exceptionsHandler.addTerseLoggingExceptions(StandbyException.class);
    this.exceptionsHandler.addTerseLoggingExceptions(
        HealthCheckFailedException.class);
  }

  public synchronized void addAuxiliaryListener(int auxiliaryPort)
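Registering `HealthCheckFailedException` as a terse exception means the RPC server logs each failed health probe as a single line rather than a full stack trace; without this, a NameNode parked in safe mode would fail every ZKFC probe and flood the log. A minimal, self-contained sketch of the terse-logging idea (hypothetical stand-in types, not the real `Server` internals):

    import java.util.Set;

    public class TerseLoggingSketch {
      // Hypothetical stand-in for org.apache.hadoop.ha.HealthCheckFailedException.
      static class HealthCheckFailedException extends Exception {
        HealthCheckFailedException(String msg) { super(msg); }
      }

      // Exception classes treated as "expected": log the message, skip the trace.
      private static final Set<Class<?>> TERSE =
          Set.of(HealthCheckFailedException.class);

      static void logException(Exception e) {
        if (TERSE.contains(e.getClass())) {
          System.err.println(e.getClass().getSimpleName() + ": " + e.getMessage());
        } else {
          e.printStackTrace(); // unexpected failure: keep the full stack trace
        }
      }

      public static void main(String[] args) {
        logException(new HealthCheckFailedException(
            "The NameNode is configured to report UNHEALTHY to ZKFC in Safemode."));
      }
    }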
@@ -1016,6 +1016,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
  public static final int DFS_HA_ZKFC_PORT_DEFAULT = 8019;
  public static final String DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY = "dfs.ha.zkfc.nn.http.timeout.ms";
  public static final int DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY_DEFAULT = 20000;
  public static final String DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE =
      "dfs.ha.nn.not-become-active-in-safemode";
  public static final boolean DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT =
      false;

  // Security-related configs
  public static final String DFS_ENCRYPT_DATA_TRANSFER_KEY = "dfs.encrypt.data.transfer";
@@ -118,6 +118,8 @@ import java.util.concurrent.atomic.AtomicBoolean;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_TRASH_INTERVAL_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT;
import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_NAMENODE_RPC_PORT_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_CALLER_CONTEXT_ENABLED_KEY;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_CALLER_CONTEXT_ENABLED_DEFAULT;
@@ -390,6 +392,7 @@ public class NameNode extends ReconfigurableBase implements
  private final HAContext haContext;
  protected final boolean allowStaleStandbyReads;
  private AtomicBoolean started = new AtomicBoolean(false);
  private final boolean notBecomeActiveInSafemode;

  private final static int HEALTH_MONITOR_WARN_THRESHOLD_MS = 5000;
@@ -983,6 +986,9 @@ public class NameNode extends ReconfigurableBase implements
      this.stopAtException(e);
      throw e;
    }
    notBecomeActiveInSafemode = conf.getBoolean(
        DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE,
        DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT);
    this.started.set(true);
  }
@@ -1802,6 +1808,10 @@ public class NameNode extends ReconfigurableBase implements
      throw new HealthCheckFailedException(
          "The NameNode has no resources available");
    }
    if (notBecomeActiveInSafemode && isInSafeMode()) {
      throw new HealthCheckFailedException("The NameNode is configured to " +
          "report UNHEALTHY to ZKFC in Safemode.");
    }
  }

  synchronized void transitionToActive()
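For context on the consumer side: ZKFC's health monitor periodically invokes `HAServiceProtocol#monitorHealth()`, and a `HealthCheckFailedException` marks the NameNode unhealthy so it will not be elected active. A simplified sketch of that probe, assuming the real loop's behavior (the enum and class below are illustrative, not the actual `HealthMonitor` code):

    import java.io.IOException;

    import org.apache.hadoop.ha.HAServiceProtocol;
    import org.apache.hadoop.ha.HealthCheckFailedException;

    class HealthProbeSketch {
      enum State { SERVICE_HEALTHY, SERVICE_UNHEALTHY, SERVICE_NOT_RESPONDING }

      static State probe(HAServiceProtocol proxy) {
        try {
          proxy.monitorHealth(); // reaches NameNode#monitorHealth() shown above
          return State.SERVICE_HEALTHY;
        } catch (HealthCheckFailedException e) {
          // No resources available, or safe mode with the new flag enabled.
          return State.SERVICE_UNHEALTHY;
        } catch (IOException e) {
          return State.SERVICE_NOT_RESPONDING; // RPC-level failure
        }
      }
    }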
@@ -1815,6 +1825,9 @@ public class NameNode extends ReconfigurableBase implements
          "Cannot transition from '" + OBSERVER_STATE + "' to '" +
          ACTIVE_STATE + "'");
    }
    if (notBecomeActiveInSafemode && isInSafeMode()) {
      throw new ServiceFailedException(getRole() + " still not leave safemode");
    }
    state.setState(haContext, ACTIVE_STATE);
  }
@@ -3192,6 +3192,15 @@
  </description>
</property>

<property>
  <name>dfs.ha.nn.not-become-active-in-safemode</name>
  <value>false</value>
  <description>
    If set to true, this prevents a NameNode in safe mode from becoming
    active while other standby NameNodes may be ready to serve requests.
  </description>
</property>

<property>
  <name>dfs.ha.tail-edits.in-progress</name>
  <value>false</value>
@@ -296,6 +296,18 @@ The order in which you set these configurations is unimportant, but the values y
        <value>hdfs://mycluster</value>
    </property>

*   **dfs.ha.nn.not-become-active-in-safemode** - whether to prevent a NameNode in safe mode from becoming active

    Controls whether a NameNode is allowed to become active while it is in safe
    mode. When set to true, a NameNode in safe mode reports SERVICE_UNHEALTHY
    to ZKFC if automatic failover is enabled, or throws an exception to fail
    the transition to active if automatic failover is disabled. For example:

        <property>
          <name>dfs.ha.nn.not-become-active-in-safemode</name>
          <value>true</value>
        </property>

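    With the flag enabled and automatic failover disabled, a manual transition
    into a NameNode that is still in safe mode fails with the
    `ServiceFailedException` shown earlier ("... still not leave safemode")
    instead of promoting the node. An illustrative session (`nn1` is a
    placeholder service ID, and the exact CLI error format may differ):

        $ hdfs dfsadmin -safemode get
        Safe mode is ON
        $ hdfs haadmin -transitionToActive nn1
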
### Deployment details

After all of the necessary configuration options have been set, one must initially synchronize the two HA NameNodes' on-disk metadata.
@@ -347,6 +347,18 @@ The order in which you set these configurations is unimportant, but the values y
        <value>/path/to/journal/node/local/data</value>
    </property>

*   **dfs.ha.nn.not-become-active-in-safemode** - whether to prevent a NameNode in safe mode from becoming active

    Controls whether a NameNode is allowed to become active while it is in safe
    mode. When set to true, a NameNode in safe mode reports SERVICE_UNHEALTHY
    to ZKFC if automatic failover is enabled, or throws an exception to fail
    the transition to active if automatic failover is disabled. For example:

        <property>
          <name>dfs.ha.nn.not-become-active-in-safemode</name>
          <value>true</value>
        </property>

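    The same condition is also visible through the health-check RPC that ZKFC
    uses. Roughly, assuming `haadmin`'s usual "Health check failed:" reporting
    (`nn1` is a placeholder service ID):

        $ hdfs haadmin -checkHealth nn1
        Health check failed: The NameNode is configured to report UNHEALTHY
        to ZKFC in Safemode.
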
### Deployment details

After all of the necessary configuration options have been set, you must start the JournalNode daemons on the set of machines where they will run. This can be done by running the command "*hdfs \--daemon start journalnode*" and waiting for the daemon to start on each of the relevant machines.
@@ -17,11 +17,13 @@
 */
package org.apache.hadoop.hdfs.server.namenode.ha;

import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.File;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.URI;
@@ -30,6 +32,8 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.ha.ServiceFailedException;
import org.apache.hadoop.test.LambdaTestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -887,4 +891,31 @@ public class TestHASafeMode {
    cluster.transitionToActive(1);
    assertSafeMode(nn1, 3, 3, 3, 0);
  }

  /**
   * Test transition to active when the NameNode is in safe mode.
   *
   * @throws Exception
   */
  @Test
  public void testTransitionToActiveWhenSafeMode() throws Exception {
    Configuration config = new Configuration();
    config.setBoolean(DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE, true);
    try (MiniDFSCluster miniCluster = new MiniDFSCluster.Builder(config,
        new File(GenericTestUtils.getRandomizedTempPath()))
        .nnTopology(MiniDFSNNTopology.simpleHATopology())
        .numDataNodes(1)
        .build()) {
      miniCluster.waitActive();
      miniCluster.transitionToStandby(0);
      miniCluster.transitionToStandby(1);
      NameNode namenode0 = miniCluster.getNameNode(0);
      NameNode namenode1 = miniCluster.getNameNode(1);
      NameNodeAdapter.enterSafeMode(namenode0, false);
      NameNodeAdapter.enterSafeMode(namenode1, false);
      LambdaTestUtils.intercept(ServiceFailedException.class,
          "NameNode still not leave safemode",
          () -> miniCluster.transitionToActive(0));
    }
  }
}
@@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.namenode.ha;

import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LIFELINE_RPC_ADDRESS_KEY;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@@ -31,10 +32,12 @@ import org.apache.hadoop.ha.HealthCheckFailedException;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.LambdaTestUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -76,6 +79,33 @@ public class TestNNHealthCheck {
    doNNHealthCheckTest();
  }

  @Test
  public void testNNHealthCheckWithSafemodeAsUnhealthy() throws Exception {
    conf.setBoolean(DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE, true);

    // now bring up just the NameNode.
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
        .nnTopology(MiniDFSNNTopology.simpleHATopology()).build();
    cluster.waitActive();

    // manually set safemode.
    cluster.getFileSystem(0)
        .setSafeMode(HdfsConstants.SafeModeAction.SAFEMODE_ENTER);

    NNHAServiceTarget haTarget = new NNHAServiceTarget(conf,
        DFSUtil.getNamenodeNameServiceId(conf), "nn1");
    final String expectedTargetString = haTarget.getAddress().toString();

    assertTrue("Expected haTarget " + haTarget + " containing " +
        expectedTargetString,
        haTarget.toString().contains(expectedTargetString));
    HAServiceProtocol rpc = haTarget.getHealthMonitorProxy(conf, 5000);

    LambdaTestUtils.intercept(RemoteException.class,
        "The NameNode is configured to report UNHEALTHY to ZKFC in Safemode.",
        () -> rpc.monitorHealth());
  }

  private void doNNHealthCheckTest() throws IOException {
    MockNameNodeResourceChecker mockResourceChecker =
        new MockNameNodeResourceChecker(conf);