HDFS-6184. Capture NN's thread dump when it fails over. Contributed by Ming Ma.
This commit is contained in:
parent
f24452d14e
commit
2463666ecb
|
@ -844,12 +844,11 @@ public abstract class ZKFailoverController {
|
||||||
* @return the last health state passed to the FC
|
* @return the last health state passed to the FC
|
||||||
* by the HealthMonitor.
|
* by the HealthMonitor.
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
protected synchronized State getLastHealthState() {
|
||||||
synchronized State getLastHealthState() {
|
|
||||||
return lastHealthState;
|
return lastHealthState;
|
||||||
}
|
}
|
||||||
|
|
||||||
private synchronized void setLastHealthState(HealthMonitor.State newState) {
|
protected synchronized void setLastHealthState(HealthMonitor.State newState) {
|
||||||
LOG.info("Local service " + localTarget +
|
LOG.info("Local service " + localTarget +
|
||||||
" entered state: " + newState);
|
" entered state: " + newState);
|
||||||
lastHealthState = newState;
|
lastHealthState = newState;
|
||||||
|
|
|
@ -546,6 +546,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
HDFS-8255. Rename getBlockReplication to getPreferredBlockReplication.
|
HDFS-8255. Rename getBlockReplication to getPreferredBlockReplication.
|
||||||
(Contributed by Zhe Zhang)
|
(Contributed by Zhe Zhang)
|
||||||
|
|
||||||
|
HDFS-6184. Capture NN's thread dump when it fails over.
|
||||||
|
(Ming Ma via aajisaka)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than
|
HDFS-8026. Trace FSOutputSummer#writeChecksumChunks rather than
|
||||||
|
|
|
@ -543,6 +543,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
|
||||||
public static final boolean DFS_HA_AUTO_FAILOVER_ENABLED_DEFAULT = false;
|
public static final boolean DFS_HA_AUTO_FAILOVER_ENABLED_DEFAULT = false;
|
||||||
public static final String DFS_HA_ZKFC_PORT_KEY = "dfs.ha.zkfc.port";
|
public static final String DFS_HA_ZKFC_PORT_KEY = "dfs.ha.zkfc.port";
|
||||||
public static final int DFS_HA_ZKFC_PORT_DEFAULT = 8019;
|
public static final int DFS_HA_ZKFC_PORT_DEFAULT = 8019;
|
||||||
|
// Config key for the HTTP timeout (milliseconds, per the ".ms" suffix) that
// DFSZKFailoverController applies as both the connect and the read timeout
// when it fetches the local NN's thread dump. A value of zero turns the
// thread-dump capture off.
public static final String DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY = "dfs.ha.zkfc.nn.http.timeout.ms";
|
||||||
|
// Default value for DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY: 20000 ms.
public static final int DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY_DEFAULT = 20000;
|
||||||
|
|
||||||
// Security-related configs
|
// Security-related configs
|
||||||
public static final String DFS_ENCRYPT_DATA_TRANSFER_KEY = "dfs.encrypt.data.transfer";
|
public static final String DFS_ENCRYPT_DATA_TRANSFER_KEY = "dfs.encrypt.data.transfer";
|
||||||
|
|
|
@ -20,15 +20,20 @@ package org.apache.hadoop.hdfs.tools;
|
||||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY;
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY;
|
||||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY;
|
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY;
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
|
import java.net.URL;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.HadoopIllegalArgumentException;
|
import org.apache.hadoop.HadoopIllegalArgumentException;
|
||||||
import org.apache.hadoop.classification.InterfaceAudience;
|
import org.apache.hadoop.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.ha.HAServiceTarget;
|
import org.apache.hadoop.ha.HAServiceTarget;
|
||||||
|
import org.apache.hadoop.ha.HealthMonitor;
|
||||||
import org.apache.hadoop.ha.ZKFailoverController;
|
import org.apache.hadoop.ha.ZKFailoverController;
|
||||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||||
import org.apache.hadoop.hdfs.DFSUtil;
|
import org.apache.hadoop.hdfs.DFSUtil;
|
||||||
|
@ -37,6 +42,7 @@ import org.apache.hadoop.hdfs.HDFSPolicyProvider;
|
||||||
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
|
import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
|
||||||
|
import org.apache.hadoop.io.IOUtils;
|
||||||
import org.apache.hadoop.ipc.Server;
|
import org.apache.hadoop.ipc.Server;
|
||||||
import org.apache.hadoop.security.AccessControlException;
|
import org.apache.hadoop.security.AccessControlException;
|
||||||
import org.apache.hadoop.security.SecurityUtil;
|
import org.apache.hadoop.security.SecurityUtil;
|
||||||
|
@ -57,6 +63,9 @@ public class DFSZKFailoverController extends ZKFailoverController {
|
||||||
/* the same as superclass's localTarget, but with the more specific NN type */
|
/* the same as superclass's localTarget, but with the more specific NN type */
|
||||||
private final NNHAServiceTarget localNNTarget;
|
private final NNHAServiceTarget localNNTarget;
|
||||||
|
|
||||||
|
// This is used only for unit tests
|
||||||
|
private boolean isThreadDumpCaptured = false;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected HAServiceTarget dataToTarget(byte[] data) {
|
protected HAServiceTarget dataToTarget(byte[] data) {
|
||||||
ActiveNodeInfo proto;
|
ActiveNodeInfo proto;
|
||||||
|
@ -201,4 +210,55 @@ public class DFSZKFailoverController extends ZKFailoverController {
|
||||||
LOG.warn(msg);
|
LOG.warn(msg);
|
||||||
throw new AccessControlException(msg);
|
throw new AccessControlException(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* capture local NN's thread dump and write it to ZKFC's log.
|
||||||
|
*/
|
||||||
|
private void getLocalNNThreadDump() {
|
||||||
|
isThreadDumpCaptured = false;
|
||||||
|
// We use the same timeout value for both connection establishment
|
||||||
|
// timeout and read timeout.
|
||||||
|
int httpTimeOut = conf.getInt(
|
||||||
|
DFSConfigKeys.DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY,
|
||||||
|
DFSConfigKeys.DFS_HA_ZKFC_NN_HTTP_TIMEOUT_KEY_DEFAULT);
|
||||||
|
if (httpTimeOut == 0) {
|
||||||
|
// If timeout value is set to zero, the feature is turned off.
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
String stacksUrl = DFSUtil.getInfoServer(localNNTarget.getAddress(),
|
||||||
|
conf, DFSUtil.getHttpClientScheme(conf)) + "/stacks";
|
||||||
|
URL url = new URL(stacksUrl);
|
||||||
|
HttpURLConnection conn = (HttpURLConnection)url.openConnection();
|
||||||
|
conn.setReadTimeout(httpTimeOut);
|
||||||
|
conn.setConnectTimeout(httpTimeOut);
|
||||||
|
conn.connect();
|
||||||
|
ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||||
|
IOUtils.copyBytes(conn.getInputStream(), out, 4096, true);
|
||||||
|
StringBuilder localNNThreadDumpContent =
|
||||||
|
new StringBuilder("-- Local NN thread dump -- \n");
|
||||||
|
localNNThreadDumpContent.append(out);
|
||||||
|
localNNThreadDumpContent.append("\n -- Local NN thread dump -- ");
|
||||||
|
LOG.info(localNNThreadDumpContent);
|
||||||
|
isThreadDumpCaptured = true;
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.warn("Can't get local NN thread dump due to " + e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected synchronized void setLastHealthState(HealthMonitor.State newState) {
|
||||||
|
super.setLastHealthState(newState);
|
||||||
|
// Capture local NN thread dump when the target NN health state changes.
|
||||||
|
if (getLastHealthState() == HealthMonitor.State.SERVICE_NOT_RESPONDING ||
|
||||||
|
getLastHealthState() == HealthMonitor.State.SERVICE_UNHEALTHY) {
|
||||||
|
getLocalNNThreadDump();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
boolean isThreadDumpCaptured() {
|
||||||
|
return isThreadDumpCaptured;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2334,4 +2334,15 @@
|
||||||
</description>
|
</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>dfs.ha.zkfc.nn.http.timeout.ms</name>
|
||||||
|
<value>20000</value>
|
||||||
|
<description>
|
||||||
|
The HTTP connection and read timeout value (in milliseconds) used when DFS ZKFC
|
||||||
|
tries to get local NN thread dump after local NN becomes
|
||||||
|
SERVICE_NOT_RESPONDING or SERVICE_UNHEALTHY.
|
||||||
|
If it is set to zero, DFS ZKFC won't get local NN thread dump.
|
||||||
|
</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
</configuration>
|
</configuration>
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.hadoop.hdfs.server.namenode.ha;
|
package org.apache.hadoop.hdfs.tools;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
@ -36,9 +36,9 @@ import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream;
|
import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
import org.apache.hadoop.hdfs.tools.DFSHAAdmin;
|
import org.apache.hadoop.hdfs.server.namenode.NameNodeResourceChecker;
|
||||||
import org.apache.hadoop.hdfs.tools.DFSZKFailoverController;
|
|
||||||
import org.apache.hadoop.test.GenericTestUtils;
|
import org.apache.hadoop.test.GenericTestUtils;
|
||||||
import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
|
import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
|
||||||
import org.apache.hadoop.test.MultithreadedTestUtil.TestingThread;
|
import org.apache.hadoop.test.MultithreadedTestUtil.TestingThread;
|
||||||
|
@ -47,6 +47,7 @@ import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import com.google.common.base.Supplier;
|
import com.google.common.base.Supplier;
|
||||||
|
import org.mockito.Mockito;
|
||||||
|
|
||||||
public class TestDFSZKFailoverController extends ClientBaseWithFixes {
|
public class TestDFSZKFailoverController extends ClientBaseWithFixes {
|
||||||
private Configuration conf;
|
private Configuration conf;
|
||||||
|
@ -123,6 +124,22 @@ public class TestDFSZKFailoverController extends ClientBaseWithFixes {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test that thread dump is captured after NN state changes.
|
||||||
|
*/
|
||||||
|
@Test(timeout=60000)
|
||||||
|
public void testThreadDumpCaptureAfterNNStateChange() throws Exception {
|
||||||
|
NameNodeResourceChecker mockResourceChecker = Mockito.mock(
|
||||||
|
NameNodeResourceChecker.class);
|
||||||
|
Mockito.doReturn(false).when(mockResourceChecker).hasAvailableDiskSpace();
|
||||||
|
cluster.getNameNode(0).getNamesystem()
|
||||||
|
.setNNResourceChecker(mockResourceChecker);
|
||||||
|
waitForHAState(0, HAServiceState.STANDBY);
|
||||||
|
while (!thr1.zkfc.isThreadDumpCaptured()) {
|
||||||
|
Thread.sleep(1000);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test that automatic failover is triggered by shutting the
|
* Test that automatic failover is triggered by shutting the
|
||||||
* active NN down.
|
* active NN down.
|
Loading…
Reference in New Issue