HDFS-12369. Edit log corruption due to hard lease recovery of not-closed file which has snapshots.

Xiao Chen 2017-09-07 15:54:52 -07:00
parent b0b535d9d5
commit 52b894db33
3 changed files with 91 additions and 0 deletions
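
In short: if a client deletes a file it still has open for write while a snapshot retains the file's inode, the lease manager's hard-limit recovery would still try to force-close the file. That logged operations against an already-deleted file, corrupting the edit log so the NameNode could not restart. The fix makes checkLeases() consult FSNamesystem#isFileDeleted() and drop the lease instead of releasing it.

A minimal client-side sketch of the triggering sequence, condensed from the new test below. It assumes a running cluster whose root has been made snapshottable (hdfs dfsadmin -allowSnapshot /); the class name and file paths are illustrative only:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;

public class HardLimitSnapshotRepro {
  public static void main(String[] args) throws Exception {
    // Assumes fs.defaultFS points at an HDFS cluster.
    Configuration conf = new Configuration();
    DistributedFileSystem fs = (DistributedFileSystem) FileSystem.get(conf);

    FSDataOutputStream out = fs.create(new Path("/testfile"));
    out.write(new byte[1024]);                // partial block; file stays open
    out.hflush();

    fs.createSnapshot(new Path("/"), "snap"); // snapshot pins the inode
    fs.delete(new Path("/testfile"), false);  // inode survives only in the snapshot

    // Deliberately never close 'out'. Once the lease passes the hard limit
    // (one hour by default at the time), the NameNode force-recovers it;
    // before this fix, that wrote a close of the already-deleted file into
    // the edit log, and the next NameNode restart failed while replaying it.
  }
}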


@@ -4993,6 +4993,7 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
  }

  boolean isFileDeleted(INodeFile file) {
    assert hasReadLock();
    // Not in the inodeMap or in the snapshot but marked deleted.
    if (dir.getInode(file.getId()) == null) {
      return true;

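The added comment documents the guard the LeaseManager change below relies on: per that comment, a file counts as deleted both when its inode is gone from the inodeMap and when the inode survives only in a snapshot but is marked deleted.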

@@ -557,6 +557,12 @@ public class LeaseManager {
          if (!p.startsWith("/")) {
            throw new IOException("Invalid path in the lease " + p);
          }
          final INodeFile lastINode = iip.getLastINode().asFile();
          if (fsnamesystem.isFileDeleted(lastINode)) {
            // INode referred by the lease could have been deleted.
            removeLease(lastINode.getId());
            continue;
          }
          boolean completed = false;
          try {
            completed = fsnamesystem.internalReleaseLease(
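Rather than letting internalReleaseLease() run against an inode that only a snapshot keeps alive, the new guard drops the lease outright, so hard-limit recovery never logs a close for a file that no longer exists in the namespace.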


@@ -20,11 +20,13 @@ package org.apache.hadoop.hdfs.server.namenode;

import java.io.FileNotFoundException;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

@@ -49,16 +51,21 @@ import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.InternalDataNodeTestUtils;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
import org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotTestHelper;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.Timeout;
import org.mockito.Mockito;
import org.mockito.internal.util.reflection.Whitebox;

import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_KEY;

/**
 * Test race between delete and other operations. For now only addBlock()
@@ -71,6 +78,9 @@ public class TestDeleteRace {
  private static final Configuration conf = new HdfsConfiguration();
  private MiniDFSCluster cluster;

  @Rule
  public Timeout timeout = new Timeout(60000 * 3);

  @Test
  public void testDeleteAddBlockRace() throws Exception {
    testDeleteAddBlockRace(false);

@@ -358,4 +368,78 @@ public class TestDeleteRace {
      throws Exception {
    testDeleteAndCommitBlockSynchronizationRace(true);
  }
  /**
   * Test the sequence of deleting a file that has snapshot,
   * and lease manager's hard limit recovery.
   */
  @Test
  public void testDeleteAndLeaseRecoveryHardLimitSnapshot() throws Exception {
    final Path rootPath = new Path("/");
    final Configuration config = new Configuration();
    // Disable permissions so that another user can recover the lease.
    config.setBoolean(DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false);
    config.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
    FSDataOutputStream stm = null;
    try {
      cluster = new MiniDFSCluster.Builder(config).numDataNodes(3).build();
      cluster.waitActive();

      final DistributedFileSystem fs = cluster.getFileSystem();
      final Path testPath = new Path("/testfile");
      stm = fs.create(testPath);
      LOG.info("test on " + testPath);

      // write a half block
      AppendTestUtil.write(stm, 0, BLOCK_SIZE / 2);
      stm.hflush();

      // create a snapshot, so delete does not release the file's inode.
      SnapshotTestHelper.createSnapshot(fs, rootPath, "snap");

      // delete the file without closing it.
      fs.delete(testPath, false);

      // write enough bytes to trigger an addBlock, which would fail in
      // the streamer.
      AppendTestUtil.write(stm, 0, BLOCK_SIZE);

      // Mock a scenario that the lease reached hard limit.
      final LeaseManager lm = (LeaseManager) Whitebox
          .getInternalState(cluster.getNameNode().getNamesystem(),
              "leaseManager");
      final TreeSet<Lease> leases =
          (TreeSet<Lease>) Whitebox.getInternalState(lm, "sortedLeases");
      final TreeSet<Lease> spyLeases = new TreeSet<>(new Comparator<Lease>() {
        @Override
        public int compare(Lease o1, Lease o2) {
          return Long.signum(o1.getLastUpdate() - o2.getLastUpdate());
        }
      });
      while (!leases.isEmpty()) {
        final Lease lease = leases.first();
        final Lease spyLease = Mockito.spy(lease);
        Mockito.doReturn(true).when(spyLease).expiredHardLimit();
        spyLeases.add(spyLease);
        leases.remove(lease);
      }
      Whitebox.setInternalState(lm, "sortedLeases", spyLeases);

      // wait for lease manager's background 'Monitor' class to check leases.
      Thread.sleep(2 * conf.getLong(DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_KEY,
          DFS_NAMENODE_LEASE_RECHECK_INTERVAL_MS_DEFAULT));
      LOG.info("Now check we can restart");
      cluster.restartNameNodes();
      LOG.info("Restart finished");
    } finally {
      if (stm != null) {
        IOUtils.closeStream(stm);
      }
      if (cluster != null) {
        cluster.shutdown();
      }
    }
  }
}
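
The regression test can be run on its own with Surefire's test filter, e.g. mvn test -Dtest=TestDeleteRace#testDeleteAndLeaseRecoveryHardLimitSnapshot from the hadoop-hdfs module. Without the LeaseManager change above, the cluster.restartNameNodes() call is expected to fail while replaying the corrupted edit log.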