HDFS-2966. TestNameNodeMetrics tests can fail under load. Contributed by Steve Loughran.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1362985 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7221b08042
commit
e8b0614072
|
@ -348,6 +348,8 @@ Release 2.0.1-alpha - UNRELEASED
|
||||||
HDFS-3609. libhdfs: don't force the URI to look like hdfs://hostname:port.
|
HDFS-3609. libhdfs: don't force the URI to look like hdfs://hostname:port.
|
||||||
(Colin Patrick McCabe via eli)
|
(Colin Patrick McCabe via eli)
|
||||||
|
|
||||||
|
HDFS-2966. TestNameNodeMetrics tests can fail under load. (stevel)
|
||||||
|
|
||||||
BREAKDOWN OF HDFS-3042 SUBTASKS
|
BREAKDOWN OF HDFS-3042 SUBTASKS
|
||||||
|
|
||||||
HDFS-2185. HDFS portion of ZK-based FailoverController (todd)
|
HDFS-2185. HDFS portion of ZK-based FailoverController (todd)
|
||||||
|
|
|
@ -62,6 +62,8 @@ public class TestNameNodeMetrics {
|
||||||
|
|
||||||
// Number of datanodes in the cluster
|
// Number of datanodes in the cluster
|
||||||
private static final int DATANODE_COUNT = 3;
|
private static final int DATANODE_COUNT = 3;
|
||||||
|
private static final int WAIT_GAUGE_VALUE_RETRIES = 20;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
CONF.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100);
|
CONF.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100);
|
||||||
CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1);
|
CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1);
|
||||||
|
@ -149,10 +151,7 @@ public class TestNameNodeMetrics {
|
||||||
fs.delete(file, true);
|
fs.delete(file, true);
|
||||||
filesTotal--; // reduce the filecount for deleted file
|
filesTotal--; // reduce the filecount for deleted file
|
||||||
|
|
||||||
waitForDeletion();
|
rb = waitForDnMetricValue(NS_METRICS, "FilesTotal", filesTotal);
|
||||||
updateMetrics();
|
|
||||||
rb = getMetrics(NS_METRICS);
|
|
||||||
assertGauge("FilesTotal", filesTotal, rb);
|
|
||||||
assertGauge("BlocksTotal", 0L, rb);
|
assertGauge("BlocksTotal", 0L, rb);
|
||||||
assertGauge("PendingDeletionBlocks", 0L, rb);
|
assertGauge("PendingDeletionBlocks", 0L, rb);
|
||||||
|
|
||||||
|
@ -185,9 +184,7 @@ public class TestNameNodeMetrics {
|
||||||
assertGauge("PendingReplicationBlocks", 1L, rb);
|
assertGauge("PendingReplicationBlocks", 1L, rb);
|
||||||
assertGauge("ScheduledReplicationBlocks", 1L, rb);
|
assertGauge("ScheduledReplicationBlocks", 1L, rb);
|
||||||
fs.delete(file, true);
|
fs.delete(file, true);
|
||||||
waitForDeletion();
|
rb = waitForDnMetricValue(NS_METRICS, "CorruptBlocks", 0L);
|
||||||
rb = getMetrics(NS_METRICS);
|
|
||||||
assertGauge("CorruptBlocks", 0L, rb);
|
|
||||||
assertGauge("PendingReplicationBlocks", 0L, rb);
|
assertGauge("PendingReplicationBlocks", 0L, rb);
|
||||||
assertGauge("ScheduledReplicationBlocks", 0L, rb);
|
assertGauge("ScheduledReplicationBlocks", 0L, rb);
|
||||||
}
|
}
|
||||||
|
@ -229,8 +226,7 @@ public class TestNameNodeMetrics {
|
||||||
assertGauge("UnderReplicatedBlocks", 1L, rb);
|
assertGauge("UnderReplicatedBlocks", 1L, rb);
|
||||||
assertGauge("MissingBlocks", 1L, rb);
|
assertGauge("MissingBlocks", 1L, rb);
|
||||||
fs.delete(file, true);
|
fs.delete(file, true);
|
||||||
waitForDeletion();
|
waitForDnMetricValue(NS_METRICS, "UnderReplicatedBlocks", 0L);
|
||||||
assertGauge("UnderReplicatedBlocks", 0L, getMetrics(NS_METRICS));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void waitForDeletion() throws InterruptedException {
|
private void waitForDeletion() throws InterruptedException {
|
||||||
|
@ -239,6 +235,43 @@ public class TestNameNodeMetrics {
|
||||||
Thread.sleep(DFS_REPLICATION_INTERVAL * (DATANODE_COUNT + 1) * 1000);
|
Thread.sleep(DFS_REPLICATION_INTERVAL * (DATANODE_COUNT + 1) * 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wait for the named gauge value from the metrics source to reach the
|
||||||
|
* desired value.
|
||||||
|
*
|
||||||
|
* There's an initial delay then a spin cycle of sleep and poll. Because
|
||||||
|
* all the tests use a shared FS instance, these tests are not independent;
|
||||||
|
* that's why the initial sleep is in there.
|
||||||
|
*
|
||||||
|
* @param source metrics source
|
||||||
|
* @param name gauge name
|
||||||
|
* @param expected expected value
|
||||||
|
* @return the last metrics record polled
|
||||||
|
* @throws Exception if the thread is interrupted while sleeping, or on any other failure.
|
||||||
|
*/
|
||||||
|
private MetricsRecordBuilder waitForDnMetricValue(String source,
|
||||||
|
String name,
|
||||||
|
long expected)
|
||||||
|
throws Exception {
|
||||||
|
MetricsRecordBuilder rb;
|
||||||
|
long gauge;
|
||||||
|
//initial wait.
|
||||||
|
waitForDeletion();
|
||||||
|
//lots of retries are allowed for slow systems; fast ones will still
|
||||||
|
//exit early
|
||||||
|
int retries = (DATANODE_COUNT + 1) * WAIT_GAUGE_VALUE_RETRIES;
|
||||||
|
rb = getMetrics(source);
|
||||||
|
gauge = MetricsAsserts.getLongGauge(name, rb);
|
||||||
|
while (gauge != expected && (--retries > 0)) {
|
||||||
|
Thread.sleep(DFS_REPLICATION_INTERVAL * 500);
|
||||||
|
rb = getMetrics(source);
|
||||||
|
gauge = MetricsAsserts.getLongGauge(name, rb);
|
||||||
|
}
|
||||||
|
//at this point the assertion is valid or the retry count ran out
|
||||||
|
assertGauge(name, expected, rb);
|
||||||
|
return rb;
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testRenameMetrics() throws Exception {
|
public void testRenameMetrics() throws Exception {
|
||||||
Path src = getTestPath("src");
|
Path src = getTestPath("src");
|
||||||
|
|
Loading…
Reference in New Issue