HDFS-2966

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1298820 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steve Loughran 2012-03-09 13:17:04 +00:00
parent 07c3b02281
commit 381a9b2d58
2 changed files with 46 additions and 10 deletions

View File

@ -92,6 +92,8 @@ Trunk (unreleased changes)
HDFS-3037. TestMulitipleNNDataBlockScanner#testBlockScannerAfterRestart is HDFS-3037. TestMulitipleNNDataBlockScanner#testBlockScannerAfterRestart is
racy. (atm) racy. (atm)
HDFS-2966 TestNameNodeMetrics tests can fail under load. (stevel)
BREAKDOWN OF HDFS-1623 SUBTASKS BREAKDOWN OF HDFS-1623 SUBTASKS
HDFS-2179. Add fencing framework and mechanisms for NameNode HA. (todd) HDFS-2179. Add fencing framework and mechanisms for NameNode HA. (todd)

View File

@ -62,6 +62,8 @@ public class TestNameNodeMetrics {
// Number of datanodes in the cluster // Number of datanodes in the cluster
private static final int DATANODE_COUNT = 3; private static final int DATANODE_COUNT = 3;
private static final int WAIT_GAUGE_VALUE_RETRIES = 20;
static { static {
CONF.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100); CONF.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 100);
CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1); CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1);
@ -141,9 +143,7 @@ public class TestNameNodeMetrics {
fs.delete(file, true); fs.delete(file, true);
filesTotal--; // reduce the filecount for deleted file filesTotal--; // reduce the filecount for deleted file
waitForDeletion(); rb = waitForDnMetricValue(NS_METRICS, "FilesTotal", filesTotal);
rb = getMetrics(NS_METRICS);
assertGauge("FilesTotal", filesTotal, rb);
assertGauge("BlocksTotal", 0L, rb); assertGauge("BlocksTotal", 0L, rb);
assertGauge("PendingDeletionBlocks", 0L, rb); assertGauge("PendingDeletionBlocks", 0L, rb);
@ -176,9 +176,7 @@ public class TestNameNodeMetrics {
assertGauge("PendingReplicationBlocks", 1L, rb); assertGauge("PendingReplicationBlocks", 1L, rb);
assertGauge("ScheduledReplicationBlocks", 1L, rb); assertGauge("ScheduledReplicationBlocks", 1L, rb);
fs.delete(file, true); fs.delete(file, true);
waitForDeletion(); rb = waitForDnMetricValue(NS_METRICS, "CorruptBlocks", 0L);
rb = getMetrics(NS_METRICS);
assertGauge("CorruptBlocks", 0L, rb);
assertGauge("PendingReplicationBlocks", 0L, rb); assertGauge("PendingReplicationBlocks", 0L, rb);
assertGauge("ScheduledReplicationBlocks", 0L, rb); assertGauge("ScheduledReplicationBlocks", 0L, rb);
} }
@ -219,8 +217,7 @@ public class TestNameNodeMetrics {
assertGauge("UnderReplicatedBlocks", 1L, rb); assertGauge("UnderReplicatedBlocks", 1L, rb);
assertGauge("MissingBlocks", 1L, rb); assertGauge("MissingBlocks", 1L, rb);
fs.delete(file, true); fs.delete(file, true);
waitForDeletion(); waitForDnMetricValue(NS_METRICS, "UnderReplicatedBlocks", 0L);
assertGauge("UnderReplicatedBlocks", 0L, getMetrics(NS_METRICS));
} }
private void waitForDeletion() throws InterruptedException { private void waitForDeletion() throws InterruptedException {
@ -229,6 +226,43 @@ public class TestNameNodeMetrics {
Thread.sleep(DFS_REPLICATION_INTERVAL * (DATANODE_COUNT + 1) * 1000); Thread.sleep(DFS_REPLICATION_INTERVAL * (DATANODE_COUNT + 1) * 1000);
} }
/**
 * Poll a metrics source until the named long gauge equals the expected
 * value, or until the retry budget is used up, then assert on the result.
 *
 * The method always starts with the fixed pause performed by
 * {@link #waitForDeletion()}; the tests share one FS instance and so are
 * not independent of each other, which is why that up-front pause is kept.
 * After the pause the gauge is read once and, while it does not match,
 * re-read after each short sleep.
 *
 * @param source name of the metrics source to query
 * @param name name of the gauge to read
 * @param expected value the gauge should reach
 * @return the metrics record obtained by the final poll
 * @throws Exception if waiting or polling fails; the closing gauge
 * assertion is also made here, so a gauge that never reaches the expected
 * value surfaces as a failure of that assertion.
 */
private MetricsRecordBuilder waitForDnMetricValue(String source,
                                                  String name,
                                                  long expected)
    throws Exception {
  // Unconditional up-front pause before the first poll.
  waitForDeletion();

  // Generous budget so slow machines get enough polls; a fast machine
  // leaves the loop as soon as the gauge matches.
  int attemptsLeft = (DATANODE_COUNT + 1) * WAIT_GAUGE_VALUE_RETRIES;

  MetricsRecordBuilder latest = getMetrics(source);
  long observed = MetricsAsserts.getLongGauge(name, latest);
  while (observed != expected) {
    if (--attemptsLeft <= 0) {
      // Budget exhausted: stop polling and let the assertion below report.
      break;
    }
    Thread.sleep(DFS_REPLICATION_INTERVAL * 500);
    latest = getMetrics(source);
    observed = MetricsAsserts.getLongGauge(name, latest);
  }

  // Either the gauge matched, or we ran out of retries and this fails.
  assertGauge(name, expected, latest);
  return latest;
}
@Test @Test
public void testRenameMetrics() throws Exception { public void testRenameMetrics() throws Exception {
Path src = getTestPath("src"); Path src = getTestPath("src");