From ffaf0d3c82928896904d901e1eacf8af03d1ddb6 Mon Sep 17 00:00:00 2001 From: Inigo Goiri Date: Tue, 28 Jan 2020 10:10:35 -0800 Subject: [PATCH] HDFS-13179. TestLazyPersistReplicaRecovery#testDnRestartWithSavedReplicas fails intermittently. Contributed by Ahmed Hussein. (cherry picked from commit 1839c467f60cbb8592d446694ec3d7710cda5142) --- .../fsdataset/impl/FsDatasetImpl.java | 5 +++ .../impl/TestLazyPersistReplicaRecovery.java | 41 ++++++++++++++++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java index 1ce29107dc8..ac92ae4fe8f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java @@ -3362,6 +3362,11 @@ class FsDatasetImpl implements FsDatasetSpi { return cacheManager.reserve(bytesNeeded) > 0; } + @VisibleForTesting + public int getNonPersistentReplicas() { + return ramDiskReplicaTracker.numReplicasNotPersisted(); + } + @VisibleForTesting public void setTimer(Timer newTimer) { this.timer = newTimer; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestLazyPersistReplicaRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestLazyPersistReplicaRecovery.java index 537f9e8d621..5fa470c86e0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestLazyPersistReplicaRecovery.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestLazyPersistReplicaRecovery.java @@ -19,6 +19,13 @@ package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.client.BlockReportOptions; +import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; +import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo; +import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; +import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; +import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter; import org.apache.hadoop.test.GenericTestUtils; import org.junit.Test; @@ -27,6 +34,7 @@ import java.util.concurrent.TimeoutException; import static org.apache.hadoop.fs.StorageType.DEFAULT; import static org.apache.hadoop.fs.StorageType.RAM_DISK; +import static org.junit.Assert.assertTrue; public class TestLazyPersistReplicaRecovery extends LazyPersistTestCase { @Test @@ -34,6 +42,10 @@ public class TestLazyPersistReplicaRecovery extends LazyPersistTestCase { throws IOException, InterruptedException, TimeoutException { getClusterBuilder().build(); + FSNamesystem fsn = cluster.getNamesystem(); + final DataNode dn = cluster.getDataNodes().get(0); + DatanodeDescriptor dnd = + NameNodeAdapter.getDatanode(fsn, dn.getDatanodeId()); final String METHOD_NAME = GenericTestUtils.getMethodName(); Path path1 = new Path("/" + METHOD_NAME + ".01.dat"); @@ -42,14 +54,17 @@ public class TestLazyPersistReplicaRecovery extends LazyPersistTestCase { // Sleep for a short time to allow the lazy writer thread to do its job. // However the block replica should not be evicted from RAM_DISK yet. - Thread.sleep(3 * LAZY_WRITER_INTERVAL_SEC * 1000); + FsDatasetImpl fsDImpl = (FsDatasetImpl) DataNodeTestUtils.getFSDataset(dn); + GenericTestUtils + .waitFor(() -> fsDImpl.getNonPersistentReplicas() == 0, 10, + 3 * LAZY_WRITER_INTERVAL_SEC * 1000); ensureFileReplicasOnStorageType(path1, RAM_DISK); LOG.info("Restarting the DataNode"); - cluster.restartDataNode(0, true); - cluster.waitActive(); - triggerBlockReport(); - + assertTrue("DN did not restart properly", + cluster.restartDataNode(0, true)); + // wait for blockreport + waitForBlockReport(dn, dnd); // Ensure that the replica is now on persistent storage. ensureFileReplicasOnStorageType(path1, DEFAULT); } @@ -73,4 +88,20 @@ public class TestLazyPersistReplicaRecovery extends LazyPersistTestCase { // Ensure that the replica is still on transient storage. ensureFileReplicasOnStorageType(path1, RAM_DISK); } + + private boolean waitForBlockReport(final DataNode dn, + final DatanodeDescriptor dnd) throws IOException, InterruptedException { + final DatanodeStorageInfo storage = dnd.getStorageInfos()[0]; + final long lastCount = storage.getBlockReportCount(); + dn.triggerBlockReport( + new BlockReportOptions.Factory().setIncremental(false).build()); + try { + GenericTestUtils + .waitFor(() -> lastCount != storage.getBlockReportCount(), 10, 10000); + } catch (TimeoutException te) { + LOG.error("Timeout waiting for block report for {}", dnd); + return false; + } + return true; + } }