HDFS-11560. Expose slow disks via NameNode JMX. Contributed by Hanisha Koneru.

This commit is contained in:
Hanisha Koneru 2017-03-31 13:50:29 -07:00 committed by Arpit Agarwal
parent 5485d93bda
commit 73835c73e2
6 changed files with 85 additions and 13 deletions

View File

@ -1907,5 +1907,14 @@ public class DatanodeManager {
public SlowDiskTracker getSlowDiskTracker() { public SlowDiskTracker getSlowDiskTracker() {
return slowDiskTracker; return slowDiskTracker;
} }
/**
 * Returns the slow disks report of the slow disk tracker as a JSON string.
 *
 * @return the result of
 *         {@code slowDiskTracker.getSlowDiskReportAsJsonString()}, or
 *         {@code null} if there is no slow disk tracker (slow disks are
 *         not being tracked)
 */
public String getSlowDisksReport() {
  // No tracker means the feature is off: report nothing.
  if (slowDiskTracker == null) {
    return null;
  }
  return slowDiskTracker.getSlowDiskReportAsJsonString();
}
} }

View File

@ -256,6 +256,9 @@ public class SlowDiskTracker {
public String getSlowDiskReportAsJsonString() { public String getSlowDiskReportAsJsonString() {
ObjectMapper objectMapper = new ObjectMapper(); ObjectMapper objectMapper = new ObjectMapper();
try { try {
if (slowDisksReport.isEmpty()) {
return null;
}
return objectMapper.writeValueAsString(slowDisksReport); return objectMapper.writeValueAsString(slowDisksReport);
} catch (JsonProcessingException e) { } catch (JsonProcessingException e) {
// Failed to serialize. Don't log the exception call stack. // Failed to serialize. Don't log the exception call stack.

View File

@ -1826,6 +1826,12 @@ public class NameNode extends ReconfigurableBase implements
.getSlowPeersReport(); .getSlowPeersReport();
} }
/**
* Returns the slow disks report by delegating to the DatanodeManager
* reached through this NameNode's namesystem and block manager.
*
* @return whatever {@code getSlowDisksReport()} of that DatanodeManager
* returns
*/
@Override //NameNodeStatusMXBean
public String getSlowDisksReport() {
return namesystem.getBlockManager().getDatanodeManager()
.getSlowDisksReport();
}
/** /**
* Shutdown the NN immediately in an ungraceful way. Used when it would be * Shutdown the NN immediately in an ungraceful way. Used when it would be
* unsafe for the NN to continue operating, e.g. during a failed HA state * unsafe for the NN to continue operating, e.g. during a failed HA state

View File

@ -75,4 +75,12 @@ public interface NameNodeStatusMXBean {
* enabled. The report is in a JSON format. * enabled. The report is in a JSON format.
*/ */
String getSlowPeersReport(); String getSlowPeersReport();
/**
* Gets the topN slow disks in the cluster, if the feature is enabled.
*
* @return JSON string of list of diskIDs and latencies; may be
* {@code null}, e.g. when slow disks are not being tracked or the
* slow disks report is empty
*/
String getSlowDisksReport();
} }

View File

@ -393,18 +393,9 @@ public class TestSlowDiskTracker {
timer.advance(reportValidityMs); timer.advance(reportValidityMs);
tracker.updateSlowDiskReportAsync(timer.monotonicNow()); tracker.updateSlowDiskReportAsync(timer.monotonicNow());
Thread.sleep(OUTLIERS_REPORT_INTERVAL*2);
GenericTestUtils.waitFor(new Supplier<Boolean>() { assertTrue(tracker.getSlowDiskReportAsJsonString() == null);
@Override
public Boolean get() {
return tracker.getSlowDiskReportAsJsonString() != null;
}
}, 500, 5000);
ArrayList<DiskLatency> jsonReport = getAndDeserializeJson(
tracker.getSlowDiskReportAsJsonString());
assertTrue(jsonReport.isEmpty());
} }
private boolean isDiskInReports(ArrayList<DiskLatency> reports, private boolean isDiskInReports(ArrayList<DiskLatency> reports,

View File

@ -17,17 +17,23 @@
*/ */
package org.apache.hadoop.hdfs.server.namenode; package org.apache.hadoop.hdfs.server.namenode;
import com.google.common.base.Supplier;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.datanode.TestDataNodeMXBean; import org.apache.hadoop.test.GenericTestUtils;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import javax.management.MBeanServer; import javax.management.MBeanServer;
import javax.management.ObjectName; import javax.management.ObjectName;
import java.lang.management.ManagementFactory; import java.lang.management.ManagementFactory;
import java.util.List;
import java.util.concurrent.TimeUnit;
/** /**
* Class for testing {@link NameNodeStatusMXBean} implementation. * Class for testing {@link NameNodeStatusMXBean} implementation.
@ -38,7 +44,7 @@ public class TestNameNodeStatusMXBean {
TestNameNodeStatusMXBean.class); TestNameNodeStatusMXBean.class);
@Test(timeout = 120000L) @Test(timeout = 120000L)
public void testDataNodeMXBean() throws Exception { public void testNameNodeStatusMXBean() throws Exception {
Configuration conf = new Configuration(); Configuration conf = new Configuration();
MiniDFSCluster cluster = null; MiniDFSCluster cluster = null;
@ -84,6 +90,55 @@ public class TestNameNodeStatusMXBean {
String slowPeersReport = (String)mbs.getAttribute(mxbeanName, String slowPeersReport = (String)mbs.getAttribute(mxbeanName,
"SlowPeersReport"); "SlowPeersReport");
Assert.assertEquals(nn.getSlowPeersReport(), slowPeersReport); Assert.assertEquals(nn.getSlowPeersReport(), slowPeersReport);
// Get attribute "SlowDisksReport"
String slowDisksReport = (String)mbs.getAttribute(mxbeanName,
"SlowDisksReport");
Assert.assertEquals(nn.getSlowDisksReport(), slowDisksReport);
} finally {
if (cluster != null) {
cluster.shutdown();
}
}
}
@Test
public void testNameNodeMXBeanSlowDisksEnabled() throws Exception {
Configuration conf = new Configuration();
conf.setDouble(
DFSConfigKeys.DFS_DATANODE_FILEIO_PROFILING_SAMPLING_FRACTION_KEY, 1.0);
conf.setTimeDuration(
DFSConfigKeys.DFS_DATANODE_OUTLIERS_REPORT_INTERVAL_KEY,
1000, TimeUnit.MILLISECONDS);
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
try {
List<DataNode> datanodes = cluster.getDataNodes();
Assert.assertEquals(datanodes.size(), 1);
DataNode datanode = datanodes.get(0);
String slowDiskPath = "test/data1/slowVolume";
datanode.getDiskMetrics().addSlowDiskForTesting(slowDiskPath, null);
NameNode nn = cluster.getNameNode();
DatanodeManager datanodeManager = nn.getNamesystem().getBlockManager()
.getDatanodeManager();
MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
ObjectName mxbeanName = new ObjectName(
"Hadoop:service=NameNode,name=NameNodeStatus");
GenericTestUtils.waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
return (datanodeManager.getSlowDisksReport() != null);
}
}, 1000, 100000);
String slowDisksReport = (String)mbs.getAttribute(
mxbeanName, "SlowDisksReport");
Assert.assertEquals(datanodeManager.getSlowDisksReport(),
slowDisksReport);
Assert.assertTrue(slowDisksReport.contains(slowDiskPath));
} finally { } finally {
if (cluster != null) { if (cluster != null) {
cluster.shutdown(); cluster.shutdown();