Make DataNodePeerMetrics#minOutlierDetectionSamples configurable (#1314). Contributed by Lisheng Sun.

Signed-off-by: sunlisheng <sunlisheng@xiaomi.com>
This commit is contained in:
leosunli 2019-08-30 09:35:26 +08:00 committed by Wei-Chiu Chuang
parent dc0acceabb
commit 915271245b
6 changed files with 42 additions and 13 deletions

View File

@ -579,6 +579,12 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final String DFS_DATANODE_PEER_STATS_ENABLED_KEY = public static final String DFS_DATANODE_PEER_STATS_ENABLED_KEY =
"dfs.datanode.peer.stats.enabled"; "dfs.datanode.peer.stats.enabled";
public static final boolean DFS_DATANODE_PEER_STATS_ENABLED_DEFAULT = false; public static final boolean DFS_DATANODE_PEER_STATS_ENABLED_DEFAULT = false;
public static final String
DFS_DATANODE_PEER_METRICS_MIN_OUTLIER_DETECTION_SAMPLES_KEY =
"dfs.datanode.peer.metrics.min.outlier.detection.samples";
public static final long
DFS_DATANODE_PEER_METRICS_MIN_OUTLIER_DETECTION_SAMPLES_DEFAULT =
1000;
public static final String DFS_DATANODE_HOST_NAME_KEY = public static final String DFS_DATANODE_HOST_NAME_KEY =
HdfsClientConfigKeys.DeprecatedKeys.DFS_DATANODE_HOST_NAME_KEY; HdfsClientConfigKeys.DeprecatedKeys.DFS_DATANODE_HOST_NAME_KEY;
public static final String DFS_NAMENODE_CHECKPOINT_DIR_KEY = public static final String DFS_NAMENODE_CHECKPOINT_DIR_KEY =

View File

@ -1448,7 +1448,7 @@ void startDataNode(List<StorageLocation> dataDirectories,
metrics = DataNodeMetrics.create(getConf(), getDisplayName()); metrics = DataNodeMetrics.create(getConf(), getDisplayName());
peerMetrics = dnConf.peerStatsEnabled ? peerMetrics = dnConf.peerStatsEnabled ?
DataNodePeerMetrics.create(getDisplayName()) : null; DataNodePeerMetrics.create(getDisplayName(), getConf()) : null;
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor); metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
ecWorker = new ErasureCodingWorker(getConf(), this); ecWorker = new ErasureCodingWorker(getConf(), this);

View File

@ -19,9 +19,9 @@
package org.apache.hadoop.hdfs.server.datanode.metrics; package org.apache.hadoop.hdfs.server.datanode.metrics;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.metrics2.MetricsJsonBuilder; import org.apache.hadoop.metrics2.MetricsJsonBuilder;
import org.apache.hadoop.metrics2.lib.MutableRollingAverages; import org.apache.hadoop.metrics2.lib.MutableRollingAverages;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -30,6 +30,9 @@
import java.util.Map; import java.util.Map;
import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.ThreadLocalRandom;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PEER_METRICS_MIN_OUTLIER_DETECTION_SAMPLES_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_PEER_METRICS_MIN_OUTLIER_DETECTION_SAMPLES_KEY;
/** /**
* This class maintains DataNode peer metrics (e.g. numOps, AvgTime, etc.) for * This class maintains DataNode peer metrics (e.g. numOps, AvgTime, etc.) for
* various peer operations. * various peer operations.
@ -58,11 +61,13 @@ public class DataNodePeerMetrics {
* for outlier detection. If the number of samples is below this then * for outlier detection. If the number of samples is below this then
* outlier detection is skipped. * outlier detection is skipped.
*/ */
@VisibleForTesting private final long minOutlierDetectionSamples;
static final long MIN_OUTLIER_DETECTION_SAMPLES = 1000;
public DataNodePeerMetrics(final String name) { public DataNodePeerMetrics(final String name, Configuration conf) {
this.name = name; this.name = name;
minOutlierDetectionSamples = conf.getLong(
DFS_DATANODE_PEER_METRICS_MIN_OUTLIER_DETECTION_SAMPLES_KEY,
DFS_DATANODE_PEER_METRICS_MIN_OUTLIER_DETECTION_SAMPLES_DEFAULT);
this.slowNodeDetector = new OutlierDetector(MIN_OUTLIER_DETECTION_NODES, this.slowNodeDetector = new OutlierDetector(MIN_OUTLIER_DETECTION_NODES,
LOW_THRESHOLD_MS); LOW_THRESHOLD_MS);
sendPacketDownstreamRollingAverages = new MutableRollingAverages("Time"); sendPacketDownstreamRollingAverages = new MutableRollingAverages("Time");
@ -72,15 +77,19 @@ public String name() {
return name; return name;
} }
long getMinOutlierDetectionSamples() {
return minOutlierDetectionSamples;
}
/** /**
* Creates an instance of DataNodePeerMetrics, used for registration. * Creates an instance of DataNodePeerMetrics, used for registration.
*/ */
public static DataNodePeerMetrics create(String dnName) { public static DataNodePeerMetrics create(String dnName, Configuration conf) {
final String name = "DataNodePeerActivity-" + (dnName.isEmpty() final String name = "DataNodePeerActivity-" + (dnName.isEmpty()
? "UndefinedDataNodeName" + ThreadLocalRandom.current().nextInt() ? "UndefinedDataNodeName" + ThreadLocalRandom.current().nextInt()
: dnName.replace(':', '-')); : dnName.replace(':', '-'));
return new DataNodePeerMetrics(name); return new DataNodePeerMetrics(name, conf);
} }
/** /**
@ -122,7 +131,7 @@ public Map<String, Double> getOutliers() {
// The metric name is the datanode ID. // The metric name is the datanode ID.
final Map<String, Double> stats = final Map<String, Double> stats =
sendPacketDownstreamRollingAverages.getStats( sendPacketDownstreamRollingAverages.getStats(
MIN_OUTLIER_DETECTION_SAMPLES); minOutlierDetectionSamples);
LOG.trace("DataNodePeerMetrics: Got stats: {}", stats); LOG.trace("DataNodePeerMetrics: Got stats: {}", stats);
return slowNodeDetector.getOutliers(stats); return slowNodeDetector.getOutliers(stats);

View File

@ -2182,6 +2182,15 @@
</description> </description>
</property> </property>
<property>
<name>dfs.datanode.peer.metrics.min.outlier.detection.samples</name>
<value>1000</value>
<description>
Minimum number of packet send samples which are required to qualify for outlier detection.
If the number of samples is below this then outlier detection is skipped.
</description>
</property>
<property> <property>
<name>dfs.datanode.outliers.report.interval</name> <name>dfs.datanode.outliers.report.interval</name>
<value>30m</value> <value>30m</value>

View File

@ -46,7 +46,7 @@ public void testGetSendPacketDownstreamAvgInfo() throws Exception {
conf.setBoolean(DFSConfigKeys.DFS_DATANODE_PEER_STATS_ENABLED_KEY, true); conf.setBoolean(DFSConfigKeys.DFS_DATANODE_PEER_STATS_ENABLED_KEY, true);
final DataNodePeerMetrics peerMetrics = DataNodePeerMetrics.create( final DataNodePeerMetrics peerMetrics = DataNodePeerMetrics.create(
"Sample-DataNode"); "Sample-DataNode", conf);
MetricsTestHelper.replaceRollingAveragesScheduler( MetricsTestHelper.replaceRollingAveragesScheduler(
peerMetrics.getSendPacketDownstreamRollingAverages(), peerMetrics.getSendPacketDownstreamRollingAverages(),
numWindows, windowSize, TimeUnit.SECONDS); numWindows, windowSize, TimeUnit.SECONDS);

View File

@ -19,6 +19,8 @@
package org.apache.hadoop.hdfs.server.datanode.metrics; package org.apache.hadoop.hdfs.server.datanode.metrics;
import com.google.common.base.Supplier; import com.google.common.base.Supplier;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.metrics2.lib.MetricsTestHelper; import org.apache.hadoop.metrics2.lib.MetricsTestHelper;
import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.GenericTestUtils;
import org.apache.log4j.Level; import org.apache.log4j.Level;
@ -61,10 +63,13 @@ public class TestDataNodeOutlierDetectionViaMetrics {
private Random random = new Random(System.currentTimeMillis()); private Random random = new Random(System.currentTimeMillis());
private Configuration conf;
@Before @Before
public void setup() { public void setup() {
GenericTestUtils.setLogLevel(DataNodePeerMetrics.LOG, Level.ALL); GenericTestUtils.setLogLevel(DataNodePeerMetrics.LOG, Level.ALL);
GenericTestUtils.setLogLevel(OutlierDetector.LOG, Level.ALL); GenericTestUtils.setLogLevel(OutlierDetector.LOG, Level.ALL);
conf = new HdfsConfiguration();
} }
/** /**
@ -75,7 +80,7 @@ public void testOutlierIsDetected() throws Exception {
final String slowNodeName = "SlowNode"; final String slowNodeName = "SlowNode";
DataNodePeerMetrics peerMetrics = new DataNodePeerMetrics( DataNodePeerMetrics peerMetrics = new DataNodePeerMetrics(
"PeerMetrics-For-Test"); "PeerMetrics-For-Test", conf);
MetricsTestHelper.replaceRollingAveragesScheduler( MetricsTestHelper.replaceRollingAveragesScheduler(
peerMetrics.getSendPacketDownstreamRollingAverages(), peerMetrics.getSendPacketDownstreamRollingAverages(),
@ -107,7 +112,7 @@ public Boolean get() {
@Test @Test
public void testWithNoOutliers() throws Exception { public void testWithNoOutliers() throws Exception {
DataNodePeerMetrics peerMetrics = new DataNodePeerMetrics( DataNodePeerMetrics peerMetrics = new DataNodePeerMetrics(
"PeerMetrics-For-Test"); "PeerMetrics-For-Test", conf);
MetricsTestHelper.replaceRollingAveragesScheduler( MetricsTestHelper.replaceRollingAveragesScheduler(
peerMetrics.getSendPacketDownstreamRollingAverages(), peerMetrics.getSendPacketDownstreamRollingAverages(),
@ -134,7 +139,7 @@ public void injectFastNodesSamples(DataNodePeerMetrics peerMetrics) {
final String nodeName = "FastNode-" + nodeIndex; final String nodeName = "FastNode-" + nodeIndex;
LOG.info("Generating stats for node {}", nodeName); LOG.info("Generating stats for node {}", nodeName);
for (int i = 0; for (int i = 0;
i < 2 * DataNodePeerMetrics.MIN_OUTLIER_DETECTION_SAMPLES; i < 2 * peerMetrics.getMinOutlierDetectionSamples();
++i) { ++i) {
peerMetrics.addSendPacketDownstream( peerMetrics.addSendPacketDownstream(
nodeName, random.nextInt(FAST_NODE_MAX_LATENCY_MS)); nodeName, random.nextInt(FAST_NODE_MAX_LATENCY_MS));
@ -151,7 +156,7 @@ public void injectSlowNodeSamples(
// And the one slow node. // And the one slow node.
for (int i = 0; for (int i = 0;
i < 2 * DataNodePeerMetrics.MIN_OUTLIER_DETECTION_SAMPLES; i < 2 * peerMetrics.getMinOutlierDetectionSamples();
++i) { ++i) {
peerMetrics.addSendPacketDownstream( peerMetrics.addSendPacketDownstream(
slowNodeName, SLOW_NODE_LATENCY_MS); slowNodeName, SLOW_NODE_LATENCY_MS);