HDFS-13179. TestLazyPersistReplicaRecovery#testDnRestartWithSavedReplicas fails intermittently. Contributed by Ahmed Hussein.

(cherry picked from commit 1839c467f6)
Inigo Goiri 2020-01-28 10:10:35 -08:00
parent 4af7d14ce2
commit ffaf0d3c82
2 changed files with 41 additions and 5 deletions

FsDatasetImpl.java

@@ -3362,6 +3362,11 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
     return cacheManager.reserve(bytesNeeded) > 0;
   }
 
+  @VisibleForTesting
+  public int getNonPersistentReplicas() {
+    return ramDiskReplicaTracker.numReplicasNotPersisted();
+  }
+
   @VisibleForTesting
   public void setTimer(Timer newTimer) {
     this.timer = newTimer;
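
Note on the change above: the new FsDatasetImpl#getNonPersistentReplicas() accessor exposes how many lazy-persist replicas the RAM disk tracker has not yet written to persistent storage, so a test can wait on a real condition instead of sleeping for a fixed interval. A minimal sketch of that usage, assuming a running MiniDFSCluster and a lazy-writer interval like the test's LAZY_WRITER_INTERVAL_SEC; the helper name below is illustrative, not part of the patch:

    import org.apache.hadoop.hdfs.MiniDFSCluster;
    import org.apache.hadoop.hdfs.server.datanode.DataNode;
    import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
    import org.apache.hadoop.test.GenericTestUtils;

    // Wait until the lazy writer on the first DataNode has persisted every
    // RAM_DISK replica, polling every 10 ms and giving up (TimeoutException)
    // after three lazy-writer intervals.
    static void waitForLazyPersist(MiniDFSCluster cluster, int lazyWriterIntervalSec)
        throws Exception {
      DataNode dn = cluster.getDataNodes().get(0);
      FsDatasetImpl fsd = (FsDatasetImpl) DataNodeTestUtils.getFSDataset(dn);
      GenericTestUtils.waitFor(() -> fsd.getNonPersistentReplicas() == 0,
          10, 3 * lazyWriterIntervalSec * 1000);
    }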

TestLazyPersistReplicaRecovery.java

@@ -19,6 +19,13 @@
 package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;
 
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.client.BlockReportOptions;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
+import org.apache.hadoop.hdfs.server.datanode.DataNode;
+import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.junit.Test;
@@ -27,6 +34,7 @@ import java.util.concurrent.TimeoutException;
 import static org.apache.hadoop.fs.StorageType.DEFAULT;
 import static org.apache.hadoop.fs.StorageType.RAM_DISK;
+import static org.junit.Assert.assertTrue;
 
 public class TestLazyPersistReplicaRecovery extends LazyPersistTestCase {
   @Test
@@ -34,6 +42,10 @@ public class TestLazyPersistReplicaRecovery extends LazyPersistTestCase {
       throws IOException, InterruptedException, TimeoutException {
     getClusterBuilder().build();
+    FSNamesystem fsn = cluster.getNamesystem();
+    final DataNode dn = cluster.getDataNodes().get(0);
+    DatanodeDescriptor dnd =
+        NameNodeAdapter.getDatanode(fsn, dn.getDatanodeId());
     final String METHOD_NAME = GenericTestUtils.getMethodName();
     Path path1 = new Path("/" + METHOD_NAME + ".01.dat");
@@ -42,14 +54,17 @@ public class TestLazyPersistReplicaRecovery extends LazyPersistTestCase {
     // Sleep for a short time to allow the lazy writer thread to do its job.
     // However the block replica should not be evicted from RAM_DISK yet.
-    Thread.sleep(3 * LAZY_WRITER_INTERVAL_SEC * 1000);
+    FsDatasetImpl fsDImpl = (FsDatasetImpl) DataNodeTestUtils.getFSDataset(dn);
+    GenericTestUtils
+        .waitFor(() -> fsDImpl.getNonPersistentReplicas() == 0, 10,
+            3 * LAZY_WRITER_INTERVAL_SEC * 1000);
     ensureFileReplicasOnStorageType(path1, RAM_DISK);
 
     LOG.info("Restarting the DataNode");
-    cluster.restartDataNode(0, true);
-    cluster.waitActive();
-    triggerBlockReport();
+    assertTrue("DN did not restart properly",
+        cluster.restartDataNode(0, true));
+    // wait for blockreport
+    waitForBlockReport(dn, dnd);
 
     // Ensure that the replica is now on persistent storage.
     ensureFileReplicasOnStorageType(path1, DEFAULT);
   }
@@ -73,4 +88,20 @@ public class TestLazyPersistReplicaRecovery extends LazyPersistTestCase {
     // Ensure that the replica is still on transient storage.
     ensureFileReplicasOnStorageType(path1, RAM_DISK);
   }
+
+  private boolean waitForBlockReport(final DataNode dn,
+      final DatanodeDescriptor dnd) throws IOException, InterruptedException {
+    final DatanodeStorageInfo storage = dnd.getStorageInfos()[0];
+    final long lastCount = storage.getBlockReportCount();
+    dn.triggerBlockReport(
+        new BlockReportOptions.Factory().setIncremental(false).build());
+    try {
+      GenericTestUtils
+          .waitFor(() -> lastCount != storage.getBlockReportCount(), 10, 10000);
+    } catch (TimeoutException te) {
+      LOG.error("Timeout waiting for block report for {}", dnd);
+      return false;
+    }
+    return true;
+  }
 }
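
Design note on the new helper: waitForBlockReport returns false on timeout rather than throwing, and the call site above ignores the return value, relying on the subsequent ensureFileReplicasOnStorageType check to surface any failure. A caller that prefers to fail fast could assert on the result instead; a small sketch of that hypothetical variant, using the JUnit assertTrue this patch already imports:

    // Fail the test immediately if no full block report arrives after restart.
    assertTrue("No block report received from " + dn.getDatanodeId()
        + " after restart", waitForBlockReport(dn, dnd));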