HDFS-12131. Add some of the FSNamesystem JMX values as metrics. Contributed by Erik Krogen.

This commit is contained in:
Andrew Wang 2017-08-03 15:44:51 -07:00
parent 0542e6f86e
commit f4c6b00a9f
4 changed files with 150 additions and 1 deletions

View File

@ -213,7 +213,15 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
| `PendingDataNodeMessageCount` | (HA-only) Current number of pending block-related messages for later processing in the standby NameNode |
| `MillisSinceLastLoadedEdits` | (HA-only) Time in milliseconds since the standby NameNode last loaded the edit log. Set to 0 on the active NameNode |
| `BlockCapacity` | Current block capacity |
| `NumLiveDataNodes` | Number of datanodes which are currently live |
| `NumDeadDataNodes` | Number of datanodes which are currently dead |
| `NumDecomLiveDataNodes` | Number of datanodes which have been decommissioned and are now live |
| `NumDecomDeadDataNodes` | Number of datanodes which have been decommissioned and are now dead |
| `NumDecommissioningDataNodes` | Number of datanodes in decommissioning state |
| `VolumeFailuresTotal` | Total number of volume failures across all Datanodes |
| `EstimatedCapacityLostTotal` | An estimate of the total capacity lost due to volume failures |
| `StaleDataNodes` | Current number of DataNodes marked stale due to delayed heartbeat |
| `NumStaleStorages` | Number of storages marked as content stale (after NameNode restart/failover before first block report is received) |
| `MissingReplOneBlocks` | Current number of missing blocks with replication factor 1 |
| `NumFilesUnderConstruction` | Current number of files under construction |
| `NumActiveClients` | Current number of active clients holding lease |
@ -224,6 +232,9 @@ Each metrics record contains tags such as HAState and Hostname as additional inf
| `TotalSyncTimes` | Total number of milliseconds spent by various edit logs in sync operations |
| `NameDirSize` | NameNode name directories size in bytes |
| `NumTimedOutPendingReconstructions` | The number of timed out reconstructions. Not the number of unique blocks that timed out. |
| `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state |
| `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state |
| `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state |
| `FSN(Read|Write)Lock`*OperationName*`NumOps` | Total number of times the lock was acquired by the operation |
| `FSN(Read|Write)Lock`*OperationName*`AvgTime` | Average time in milliseconds that the lock was held by the operation |

View File

@ -4843,16 +4843,20 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
}
/**
 * Reports the number of live datanodes as counted by the DatanodeManager.
 * The value is published both as an FSNamesystemMBean attribute and as the
 * {@code NumLiveDataNodes} metric.
 *
 * @return the live datanode count obtained from the DatanodeManager
 */
@Override // FSNamesystemMBean
@Metric({"NumLiveDataNodes", "Number of datanodes which are currently live"})
public int getNumLiveDataNodes() {
  final int liveCount =
      getBlockManager().getDatanodeManager().getNumLiveDataNodes();
  return liveCount;
}
/**
 * Reports the number of dead datanodes as counted by the DatanodeManager.
 * The value is published both as an FSNamesystemMBean attribute and as the
 * {@code NumDeadDataNodes} metric.
 *
 * @return the dead datanode count obtained from the DatanodeManager
 */
@Override // FSNamesystemMBean
@Metric({"NumDeadDataNodes", "Number of datanodes which are currently dead"})
public int getNumDeadDataNodes() {
  final int deadCount =
      getBlockManager().getDatanodeManager().getNumDeadDataNodes();
  return deadCount;
}
@Override // FSNamesystemMBean
@Metric({"NumDecomLiveDataNodes",
"Number of datanodes which have been decommissioned and are now live"})
public int getNumDecomLiveDataNodes() {
final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@ -4864,6 +4868,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
}
@Override // FSNamesystemMBean
@Metric({"NumDecomDeadDataNodes",
"Number of datanodes which have been decommissioned and are now dead"})
public int getNumDecomDeadDataNodes() {
final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, false);
@ -4875,6 +4881,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
}
@Override // FSNamesystemMBean
@Metric({"VolumeFailuresTotal",
"Total number of volume failures across all Datanodes"})
public int getVolumeFailuresTotal() {
List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@ -4886,6 +4894,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
}
@Override // FSNamesystemMBean
@Metric({"EstimatedCapacityLostTotal",
"An estimate of the total capacity lost due to volume failures"})
public long getEstimatedCapacityLostTotal() {
List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
@ -4901,6 +4911,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
}
@Override // FSNamesystemMBean
@Metric({"NumDecommissioningDataNodes",
"Number of datanodes in decommissioning state"})
public int getNumDecommissioningDataNodes() {
return getBlockManager().getDatanodeManager().getDecommissioningNodes()
.size();
@ -4918,6 +4930,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
* before NN receives the first Heartbeat followed by the first Blockreport.
*/
@Override // FSNamesystemMBean
@Metric({"NumStaleStorages", "Number of storages marked as content stale"})
public int getNumStaleStorages() {
  // Pure delegation: the count is read from the DatanodeManager and is
  // published as the NumStaleStorages metric via the annotation above.
  final int staleStorages =
      getBlockManager().getDatanodeManager().getNumStaleStorages();
  return staleStorages;
}
@ -7542,6 +7556,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
@Override // FSNamesystemMBean
@Metric({"NumInMaintenanceLiveDataNodes",
"Number of live Datanodes which are in maintenance state"})
public int getNumInMaintenanceLiveDataNodes() {
final List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
getBlockManager().getDatanodeManager().fetchDatanodes(live, null, true);
@ -7553,6 +7569,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
}
@Override // FSNamesystemMBean
@Metric({"NumInMaintenanceDeadDataNodes",
"Number of dead Datanodes which are in maintenance state"})
public int getNumInMaintenanceDeadDataNodes() {
final List<DatanodeDescriptor> dead = new ArrayList<DatanodeDescriptor>();
getBlockManager().getDatanodeManager().fetchDatanodes(null, dead, true);
@ -7564,6 +7582,8 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
}
@Override // FSNamesystemMBean
@Metric({"NumEnteringMaintenanceDataNodes",
"Number of Datanodes that are entering the maintenance state"})
public int getNumEnteringMaintenanceDataNodes() {
return getBlockManager().getDatanodeManager().getEnteringMaintenanceNodes()
.size();

View File

@ -61,6 +61,7 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import com.google.common.base.Supplier;
@ -148,6 +149,8 @@ public class MiniDFSCluster implements AutoCloseable {
public static final String HDFS_MINIDFS_BASEDIR = "hdfs.minidfs.basedir";
public static final String DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY
= DFS_NAMENODE_SAFEMODE_EXTENSION_KEY + ".testing";
public static final String DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY
= DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY + ".testing";
// Changing this default may break some tests that assume it is 2.
private static final int DEFAULT_STORAGES_PER_DATANODE = 2;
@ -818,7 +821,10 @@ public class MiniDFSCluster implements AutoCloseable {
int safemodeExtension = conf.getInt(
DFS_NAMENODE_SAFEMODE_EXTENSION_TESTING_KEY, 0);
conf.setInt(DFS_NAMENODE_SAFEMODE_EXTENSION_KEY, safemodeExtension);
conf.setInt(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY, 3); // 3 second
long decommissionInterval = conf.getTimeDuration(
DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 3, TimeUnit.SECONDS);
conf.setTimeDuration(DFS_NAMENODE_DECOMMISSION_INTERVAL_KEY,
decommissionInterval, TimeUnit.SECONDS);
if (!useConfiguredTopologyMappingClass) {
conf.setClass(NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY,
StaticMapping.class, DNSToSwitchMapping.class);

View File

@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hdfs.server.namenode.metrics;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.crypto.key.JavaKeyStoreProvider;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.FileSystemTestHelper;
@ -41,7 +42,9 @@ import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Random;
import com.google.common.collect.ImmutableList;
@ -69,12 +72,15 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
import org.apache.hadoop.hdfs.util.HostsFileWriter;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@ -115,6 +121,15 @@ public class TestNameNodeMetrics {
CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1);
CONF.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
DFS_REDUNDANCY_INTERVAL);
// Set it long enough to essentially disable unless we manually call it
// Used for decommissioning DataNode metrics
CONF.setTimeDuration(
MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 999,
TimeUnit.DAYS);
// Next two configs used for checking failed volume metrics
CONF.setTimeDuration(DFSConfigKeys.DFS_DATANODE_DISK_CHECK_MIN_GAP_KEY,
10, TimeUnit.MILLISECONDS);
CONF.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
CONF.setInt(DFSConfigKeys.DFS_NAMENODE_REDUNDANCY_INTERVAL_SECONDS_KEY,
DFS_REDUNDANCY_INTERVAL);
CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
@ -133,6 +148,7 @@ public class TestNameNodeMetrics {
private DistributedFileSystem fs;
private final Random rand = new Random();
private FSNamesystem namesystem;
private HostsFileWriter hostsFileWriter;
private BlockManager bm;
private Path ecDir;
@ -142,6 +158,8 @@ public class TestNameNodeMetrics {
@Before
public void setUp() throws Exception {
hostsFileWriter = new HostsFileWriter();
hostsFileWriter.initialize(CONF, "temp/decommission");
cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(DATANODE_COUNT)
.build();
cluster.waitActive();
@ -161,6 +179,10 @@ public class TestNameNodeMetrics {
MetricsRecordBuilder rb = getMetrics(source);
assertQuantileGauges("GetGroups1s", rb);
}
if (hostsFileWriter != null) {
hostsFileWriter.cleanup();
hostsFileWriter = null;
}
if (cluster != null) {
cluster.shutdown();
cluster = null;
@ -235,6 +257,96 @@ public class TestNameNodeMetrics {
.getBlockManager());
assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS));
}
/**
 * Test metrics associated with volume failures.
 *
 * <p>Verifies that both gauges start at zero, injects a failure into the
 * first volume of the first DataNode, waits for the disk error, triggers a
 * DataNode heartbeat followed by a heartbeat check on the block manager, and
 * then expects exactly one failed volume and a lost capacity equal to the
 * capacity that volume reported before the failure was injected.
 */
@Test
public void testVolumeFailures() throws Exception {
  assertGauge("VolumeFailuresTotal", 0, getMetrics(NS_METRICS));
  assertGauge("EstimatedCapacityLostTotal", 0L, getMetrics(NS_METRICS));
  DataNode dn = cluster.getDataNodes().get(0);

  final FsVolumeImpl fsVolume;
  final File dataDir;
  final long capacity;
  // try-with-resources guarantees the volume references are released even
  // if the lookup, the cast or the capacity query throws. They are still
  // closed before the failure is injected, as before.
  try (FsDatasetSpi.FsVolumeReferences volumeReferences =
      DataNodeTestUtils.getFSDataset(dn).getFsVolumeReferences()) {
    fsVolume = (FsVolumeImpl) volumeReferences.get(0);
    dataDir = new File(fsVolume.getBaseURI());
    // Capture the capacity up front; it is the expected "lost" amount.
    capacity = fsVolume.getCapacity();
  }

  DataNodeTestUtils.injectDataDirFailure(dataDir);
  DataNodeTestUtils.waitForDiskError(dn, fsVolume);
  DataNodeTestUtils.triggerHeartbeat(dn);
  BlockManagerTestUtil.checkHeartbeat(bm);
  assertGauge("VolumeFailuresTotal", 1, getMetrics(NS_METRICS));
  assertGauge("EstimatedCapacityLostTotal", capacity, getMetrics(NS_METRICS));
}
/**
 * Test metrics associated with liveness and decommission status of DataNodes.
 *
 * <p>The test walks the cluster through these stages and checks the gauges
 * after each one: all DataNodes included and live; one DataNode excluded
 * (decommissioning, then decommissioned after a manual recheck); all
 * DataNodes expired by rewinding their last-update time; the decommissioned
 * DataNode dropped from the include list; and finally the remaining DataNodes
 * marked live again.
 */
@Test
public void testDataNodeLivenessAndDecom() throws Exception {
  List<DataNode> dataNodes = cluster.getDataNodes();
  DatanodeDescriptor[] dnDescriptors = new DatanodeDescriptor[DATANODE_COUNT];
  String[] dnAddresses = new String[DATANODE_COUNT];
  for (int i = 0; i < DATANODE_COUNT; i++) {
    dnDescriptors[i] = bm.getDatanodeManager()
        .getDatanode(dataNodes.get(i).getDatanodeId());
    dnAddresses[i] = dnDescriptors[i].getXferAddr();
  }

  // First put all DNs into include
  hostsFileWriter.initIncludeHosts(dnAddresses);
  bm.getDatanodeManager().refreshNodes(CONF);
  assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
  assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));

  // Now decommission one DN
  hostsFileWriter.initExcludeHost(dnAddresses[0]);
  bm.getDatanodeManager().refreshNodes(CONF);
  assertGauge("NumDecommissioningDataNodes", 1, getMetrics(NS_METRICS));
  BlockManagerTestUtil.recheckDecommissionState(bm.getDatanodeManager());
  assertGauge("NumDecommissioningDataNodes", 0, getMetrics(NS_METRICS));
  assertGauge("NumDecomLiveDataNodes", 1, getMetrics(NS_METRICS));
  assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));

  // Now kill all DNs by expiring their heartbeats.
  // The interval depends only on CONF, so compute it once:
  // 2 * recheck interval + 10 * heartbeat interval (seconds -> millis).
  final long expireInterval = CONF.getLong(
      DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
      DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT) * 2L
      + CONF.getLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
      DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 10 * 1000L;
  for (int i = 0; i < DATANODE_COUNT; i++) {
    DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), true);
    // Rewind the last update just past the expiry threshold.
    DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i],
        -(expireInterval + 1));
  }
  BlockManagerTestUtil.checkHeartbeat(bm);
  assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
  assertGauge("NumDecomDeadDataNodes", 1, getMetrics(NS_METRICS));
  assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
  assertGauge("NumDeadDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));

  // Now remove the decommissioned DN altogether: include every address
  // except the first one, and clear the exclude list.
  String[] includeHosts = new String[dnAddresses.length - 1];
  System.arraycopy(dnAddresses, 1, includeHosts, 0, includeHosts.length);
  hostsFileWriter.initIncludeHosts(includeHosts);
  hostsFileWriter.initExcludeHosts(new ArrayList<>());
  bm.getDatanodeManager().refreshNodes(CONF);
  assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
  assertGauge("NumDecomDeadDataNodes", 0, getMetrics(NS_METRICS));
  assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
  assertGauge("NumDeadDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));

  // Finally mark the remaining DNs as live again (index 0 was removed).
  // Bound by DATANODE_COUNT to match the size of dnDescriptors.
  for (int i = 1; i < DATANODE_COUNT; i++) {
    DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), false);
    DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i], 0);
  }
  BlockManagerTestUtil.checkHeartbeat(bm);
  assertGauge("NumLiveDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
  assertGauge("NumDeadDataNodes", 0, getMetrics(NS_METRICS));
}
/** Test metrics associated with addition of a file */
@Test