HDFS-4059. Add number of stale DataNodes to metrics. Contributed by Jing Zhao.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1398949 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6a059dc40e
commit
0ef9c6f71a
|
@ -344,6 +344,8 @@ Release 2.0.3-alpha - Unreleased
|
||||||
HDFS-3912. Detect and avoid stale datanodes for writes.
|
HDFS-3912. Detect and avoid stale datanodes for writes.
|
||||||
(Jing Zhao via suresh)
|
(Jing Zhao via suresh)
|
||||||
|
|
||||||
|
HDFS-4059. Add number of stale DataNodes to metrics. (Jing Zhao via suresh)
|
||||||
|
|
||||||
IMPROVEMENTS
|
IMPROVEMENTS
|
||||||
|
|
||||||
HDFS-3925. Prettify PipelineAck#toString() for printing to a log
|
HDFS-3925. Prettify PipelineAck#toString() for printing to a log
|
||||||
|
|
|
@ -885,7 +885,7 @@ public class DatanodeManager {
|
||||||
* @return Return the current number of stale DataNodes (detected by
|
* @return Return the current number of stale DataNodes (detected by
|
||||||
* HeartbeatManager).
|
* HeartbeatManager).
|
||||||
*/
|
*/
|
||||||
int getNumStaleNodes() {
|
public int getNumStaleNodes() {
|
||||||
return this.numStaleNodes;
|
return this.numStaleNodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4678,6 +4678,13 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
||||||
return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
|
return getBlockManager().getDatanodeManager().getNumDeadDataNodes();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override // FSNamesystemMBean
|
||||||
|
@Metric({"StaleDataNodes",
|
||||||
|
"Number of datanodes marked stale due to delayed heartbeat"})
|
||||||
|
public int getNumStaleDataNodes() {
|
||||||
|
return getBlockManager().getDatanodeManager().getNumStaleNodes();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the generation stamp for this filesystem
|
* Sets the generation stamp for this filesystem
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -112,4 +112,10 @@ public interface FSNamesystemMBean {
|
||||||
* @return number of dead data nodes
|
* @return number of dead data nodes
|
||||||
*/
|
*/
|
||||||
public int getNumDeadDataNodes();
|
public int getNumDeadDataNodes();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of stale data nodes
|
||||||
|
* @return number of stale data nodes
|
||||||
|
*/
|
||||||
|
public int getNumStaleDataNodes();
|
||||||
}
|
}
|
||||||
|
|
|
@ -191,4 +191,12 @@ public class BlockManagerTestUtil {
|
||||||
"Must use default policy, got %s", bpp.getClass());
|
"Must use default policy, got %s", bpp.getClass());
|
||||||
((BlockPlacementPolicyDefault)bpp).setPreferLocalNode(prefer);
|
((BlockPlacementPolicyDefault)bpp).setPreferLocalNode(prefer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call heartbeat check function of HeartbeatManager
|
||||||
|
* @param bm the BlockManager to manipulate
|
||||||
|
*/
|
||||||
|
public static void checkHeartbeat(BlockManager bm) {
|
||||||
|
bm.getDatanodeManager().getHeartbeatManager().heartbeatCheck();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,10 +41,14 @@ import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
|
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
|
||||||
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
||||||
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
||||||
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
|
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
|
||||||
import org.apache.hadoop.test.MetricsAsserts;
|
import org.apache.hadoop.test.MetricsAsserts;
|
||||||
|
import org.apache.hadoop.util.Time;
|
||||||
import org.apache.log4j.Level;
|
import org.apache.log4j.Level;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
@ -77,7 +81,8 @@ public class TestNameNodeMetrics {
|
||||||
DFS_REPLICATION_INTERVAL);
|
DFS_REPLICATION_INTERVAL);
|
||||||
CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
|
CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
|
||||||
"" + PERCENTILES_INTERVAL);
|
"" + PERCENTILES_INTERVAL);
|
||||||
|
// Enable stale DataNodes checking
|
||||||
|
CONF.setBoolean(DFSConfigKeys.DFS_NAMENODE_CHECK_STALE_DATANODE_KEY, true);
|
||||||
((Log4JLogger)LogFactory.getLog(MetricsAsserts.class))
|
((Log4JLogger)LogFactory.getLog(MetricsAsserts.class))
|
||||||
.getLogger().setLevel(Level.DEBUG);
|
.getLogger().setLevel(Level.DEBUG);
|
||||||
}
|
}
|
||||||
|
@ -119,6 +124,40 @@ public class TestNameNodeMetrics {
|
||||||
stm.close();
|
stm.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Test metrics indicating the number of stale DataNodes */
|
||||||
|
@Test
|
||||||
|
public void testStaleNodes() throws Exception {
|
||||||
|
// Set two datanodes as stale
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
DataNode dn = cluster.getDataNodes().get(i);
|
||||||
|
DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, true);
|
||||||
|
long staleInterval = CONF.getLong(
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
|
||||||
|
DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);
|
||||||
|
cluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().getDatanode(dn.getDatanodeId())
|
||||||
|
.setLastUpdate(Time.now() - staleInterval - 1);
|
||||||
|
}
|
||||||
|
// Let HeartbeatManager check the heartbeats
|
||||||
|
BlockManagerTestUtil.checkHeartbeat(cluster.getNameNode().getNamesystem()
|
||||||
|
.getBlockManager());
|
||||||
|
assertGauge("StaleDataNodes", 2, getMetrics(NS_METRICS));
|
||||||
|
|
||||||
|
// Reset stale datanodes
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
DataNode dn = cluster.getDataNodes().get(i);
|
||||||
|
DataNodeTestUtils.setHeartbeatsDisabledForTests(dn, false);
|
||||||
|
cluster.getNameNode().getNamesystem().getBlockManager()
|
||||||
|
.getDatanodeManager().getDatanode(dn.getDatanodeId())
|
||||||
|
.setLastUpdate(Time.now());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Let HeartbeatManager re-check the heartbeats to refresh the stale count
|
||||||
|
BlockManagerTestUtil.checkHeartbeat(cluster.getNameNode().getNamesystem()
|
||||||
|
.getBlockManager());
|
||||||
|
assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS));
|
||||||
|
}
|
||||||
|
|
||||||
/** Test metrics associated with addition of a file */
|
/** Test metrics associated with addition of a file */
|
||||||
@Test
|
@Test
|
||||||
public void testFileAdd() throws Exception {
|
public void testFileAdd() throws Exception {
|
||||||
|
|
Loading…
Reference in New Issue